import json
import os
import random
import time

import requests
from joblib import Parallel, delayed

# Seconds a successful lookup stays fresh in the cache: 15 minutes.
ideal_delta = 60 * 15
# Per-request timeout (seconds) for interverse fetches.
# (Previously declared as an unused `0.` while the requests calls hard-coded 1;
# the constant now carries the value actually used.)
request_timeout = 1
# Origin crawled first when the cache is built.
cache_origin = 'https://libresolutions.network'
crawler_header = {'User-agent': 'interverse-crawler',
                  'info': 'https://libresolutions.network/videos/interverse-demo-1/'}
schemes = ['https://']
# Well-known paths probed, in order, on each host.
locations = [
    '/.well-known/discover.json',
    '/.well-known/interverse',
    '/interverse.json'
]


def dictify(lst):
    """Index a list of interverse documents by their 'location' key.

    None entries (failed lookups) are skipped.
    """
    return {item['location']: item for item in lst if item is not None}


class Cache:
    """In-memory cache of interverse discovery documents, keyed by URL.

    self.links maps url -> {'data': parsed JSON (or None), 'time': timestamp}.
    Entries older than `delta` seconds are refreshed on access; failed
    lookups are negative-cached with a long, randomized lifetime.
    """

    def __init__(self, delta=None):
        # delta: cache freshness window in seconds; defaults to ideal_delta.
        self.delta = ideal_delta if delta is None else delta
        self.links = {}
        self.build_cache()

    def load_data(self, url):
        """Fetch (or return cached) interverse data for `url`.

        Probes each scheme/location combination until one yields JSON.
        Returns the parsed document, or None when every probe fails.
        """
        print(f"Loading interverse data for :{url}")
        data = None
        t = time.time()
        if url in self.links:
            if t - self.links[url]['time'] <= self.delta:
                print(f"Using cached result for {url}")
                return self.links[url]['data']
        # Reduce the URL to a bare host before appending well-known paths.
        host = url.replace('https://', '').replace('http://', '').replace('/', '')
        for scheme in schemes:
            for loc in locations:
                try:
                    data = requests.get(
                        scheme + host + loc,
                        headers=crawler_header,
                        timeout=request_timeout).json()
                    # BUGFIX: the original used `if loc.find('discover'):`,
                    # which is truthy for -1 (not found) and so translated
                    # every document, not just discover.json ones.
                    if 'discover' in loc:
                        # Translate the legacy "discover" schema to interverse.
                        data = json.loads(json.dumps(data).replace(
                            "preview_connections", "connection_groups"))
                    print(f"Interverse connection found at {loc}")
                    self.links[url] = {
                        'time': time.time(),
                        'data': data,
                    }
                    return data
                except (requests.RequestException, ValueError):
                    # Connection/timeout errors or non-JSON bodies: try the
                    # next location/scheme.
                    pass
        if data is not None:
            # Defensive: only reachable if the translation step raised after a
            # successful fetch; cache what we got with an extended lifetime.
            self.links[url] = {
                'time': time.time() + ideal_delta,
                'data': data,
            }
        if data is None:
            # Negative-cache failures for 8-16 hours so dead hosts are not
            # re-polled every cycle.
            self.links[url] = {
                'data': None,
                'time': t + (60 * 60) * random.randint(8, 16)
            }
        return data

    @staticmethod
    def _collect_connections(origin):
        """Gather connection URLs from a document's 'connections' list and
        'connection_groups' mapping; tolerates a None/malformed document."""
        connections = []
        try:
            for con in origin['connections']:
                connections.append(con)
        except (TypeError, KeyError):
            pass
        try:
            for group in origin['connection_groups']:
                for con in origin['connection_groups'][group]:
                    connections.append(con)
        except (TypeError, KeyError):
            pass
        return connections

    def get_interverse_data(self, url):
        """Return {'main': document, 'connections': {location: document}}
        for `url`, loading each linked connection (cache-aware)."""
        origin = self.load_data(url)
        connections = self._collect_connections(origin)
        results = Parallel()(delayed(self.load_data)(c) for c in connections)
        return {
            'main': origin,
            'connections': dictify(results)
        }

    def build_cache(self):
        """Warm the cache: load the origin document, then every connection's
        own interverse data. Results are discarded — only the side effect of
        populating self.links matters."""
        print("Building cache..\nThis may take some time")
        origin = self.load_data(cache_origin)
        connections = self._collect_connections(origin)
        Parallel()(delayed(self.get_interverse_data)(c) for c in connections)


if __name__ == '__main__':
    cache = Cache()