# **************************************************************************** #
#                                                                              #
#                                                         :::      ::::::::    #
#    scrap.py                                           :+:      :+:    :+:    #
#                                                     +:+ +:+         +:+      #
#    By: tomoron                                    +#+  +:+       +#+         #
#                                                 +#+#+#+#+#+   +#+            #
#    Created: 2024/11/25 16:39:19 by tomoron           #+#    #+#              #
#    Updated: 2024/11/26 14:54:43 by tomoron          ###   ########.fr        #
#                                                                              #
# **************************************************************************** #

import json
import re
import threading
from getpass import getpass

from tqdm import tqdm

from login import Intra42

# Marker identifying the line of a project page that links to the subject PDF.
_PDF_URL_MARKER = "https://cdn.intra.42.fr/pdf/pdf"
# First run of 3 to 10 digits on that line; it is stored as the subject id.
_SUBJECT_ID_RE = re.compile(r"([0-9]{3,10})")


def _extract_subject_id(page_data):
    """Return the subject id found in *page_data*, or None if there is none.

    Only the first line containing the PDF URL marker is inspected; the id is
    the first 3-10 digit run on that line.
    """
    for line in page_data.split("\n"):
        if _PDF_URL_MARKER in line:
            match = _SUBJECT_ID_RE.search(line)
            return match.group(1) if match else None
    return None


def getUrl(project_list, x, checkPrev, conn=None):
    """Fetch the subject id of project number *x* and append it to its row.

    Exactly one element is appended to ``project_list[x]``:
      * the subject id (a string of digits) when one is found;
      * ``None`` when no subject PDF link is found on the page;
      * ``None`` when *checkPrev* is true and the id equals
        ``project_list[x][2]`` (the value stored by the previous pass).

    Args:
        project_list: list of rows; ``row[0]`` is handed to
            ``get_project_page``.
            NOTE(review): ``row[2]`` is assumed to be the id appended by the
            first pass, i.e. input rows hold two fields - confirm against
            intra_projects.json.
        x: index of the row to process.
        checkPrev: compare the new id with ``row[2]`` and store ``None`` on
            equality.
        conn: object exposing ``get_project_page``; defaults to the
            module-level ``connIntra`` so existing three-argument callers keep
            working.
    """
    if conn is None:
        conn = connIntra
    row = project_list[x]
    subject_url = _extract_subject_id(conn.get_project_page(row[0]))

    if subject_url is None:
        # Keep every row the same length even when the page has no subject;
        # tqdm.write avoids garbling the progress bar.
        tqdm.write("no subject pdf found for project {!r}".format(row[0]))
        row.append(None)
    elif checkPrev and subject_url == row[2]:
        row.append(None)
    else:
        row.append(subject_url)


def getUrls(project_list, connIntra, checkPrev):
    """Run getUrl for every row of *project_list*, one thread per row.

    Each worker only appends to its own row. All threads are started first,
    then joined in order while a tqdm bar shows the progress of the joins.

    Args:
        project_list: list of rows, mutated in place (one element appended to
            each row).
        connIntra: connection object forwarded to every getUrl call.
        checkPrev: forwarded to getUrl.
    """
    threads = []
    for x in range(len(project_list)):
        worker = threading.Thread(
            target=getUrl,
            args=(project_list, x, checkPrev, connIntra),
        )
        worker.start()
        threads.append(worker)

    for worker in tqdm(threads):
        worker.join()


with open("intra_projects.json", 'r', encoding="utf-8") as f:
    project_list = json.load(f)

connIntra = Intra42()

# The page language is switched by hand between the two passes, hence the
# prompts. The second pass stores None wherever the id matches the first one.
input("set language to english and press enter (fuck l'intra)...")
getUrls(project_list, connIntra, False)
input("set language to french and press enter (fuck l'intra)...")
getUrls(project_list, connIntra, True)

with open("result.json", 'w') as f:
    f.write(json.dumps(project_list, indent=4))