# Files
# 42intra_login/scrap.py
# 2025-08-06 20:16:29 +02:00

# 50 lines
# 2.1 KiB
# Python

# **************************************************************************** #
# #
# ::: :::::::: #
# scrap.py :+: :+: :+: #
# +:+ +:+ +:+ #
# By: tomoron <tomoron@student.42angouleme.fr> +#+ +:+ +#+ #
# +#+#+#+#+#+ +#+ #
# Created: 2024/11/25 16:39:19 by tomoron #+# #+# #
# Updated: 2024/11/26 14:54:43 by tomoron ### ########.fr #
# #
# **************************************************************************** #
from login import Intra42
from getpass import getpass
from tqdm import tqdm
import re
import json
import threading
# Prefix shared by every subject PDF link on a project page.
_SUBJECT_URL_MARKER = "https://cdn.intra.42.fr/pdf/pdf"
# The subject id is the first run of 3 to 10 digits after that prefix.
_SUBJECT_ID_RE = re.compile(r"[0-9]{3,10}")


def _find_subject_id(page_data):
    """Return the numeric id following the subject PDF URL, or None if absent."""
    for line in page_data.split("\n"):
        pos = line.find(_SUBJECT_URL_MARKER)
        if pos == -1:
            continue
        # Search only after the URL prefix so unrelated digits earlier on the
        # same line (attributes, other links) cannot be mistaken for the id.
        match = _SUBJECT_ID_RE.search(line, pos + len(_SUBJECT_URL_MARKER))
        if match:
            return match.group(0)
    return None


def getUrl(project_list, x, checkPrev, conn=None):
    """Fetch the page of project ``x`` and append its subject PDF id to its row.

    Args:
        project_list: list of mutable rows; ``project_list[x][0]`` is passed
            to ``get_project_page`` to fetch the page.
        x: index of the row to process.
        checkPrev: when true, the id found is compared with
            ``project_list[x][2]`` and ``None`` is appended if they are equal
            (presumably meaning "same subject as the previous pass" -- this
            assumes the previous pass stored its result at index 2; confirm
            against the layout of the input rows).
        conn: object exposing ``get_project_page``; defaults to the
            module-level ``connIntra`` so existing three-argument callers
            keep working.

    Exactly one value is always appended to ``project_list[x]``: the id as a
    string, or ``None`` when the page has no subject link (or when
    ``checkPrev`` detects a duplicate). Appending even on a miss keeps every
    row the same length, so a later pass can still index the earlier result.
    """
    if conn is None:
        conn = connIntra
    page_data = conn.get_project_page(project_list[x][0])
    subject_id = _find_subject_id(page_data)
    if checkPrev and subject_id == project_list[x][2]:
        project_list[x].append(None)
    else:
        project_list[x].append(subject_id)
def getUrls(project_list, connIntra, checkPrev):
    """Run ``getUrl`` once per row of ``project_list``, each in its own thread.

    Args:
        project_list: list of rows; every index is handed to ``getUrl``.
        connIntra: accepted for the caller's convenience but not used in this
            function body; only ``project_list``, the row index and
            ``checkPrev`` are forwarded to ``getUrl``.
        checkPrev: forwarded unchanged to ``getUrl``.

    Returns once every thread has been joined; results are written into
    ``project_list`` by the workers themselves.
    """
    workers = [
        threading.Thread(target=getUrl, args=(project_list, index, checkPrev))
        for index in range(len(project_list))
    ]
    for worker in workers:
        worker.start()
    # tqdm only draws a progress bar as the threads are joined in start order.
    for worker in tqdm(workers):
        worker.join()
# Load the project rows. The encoding is given explicitly so that parsing the
# JSON text does not depend on the platform's locale default.
with open("intra_projects.json", 'r', encoding="utf-8") as f:
    project_list = json.load(f)

# Module-level session object; getUrl reads this name as a global.
connIntra = Intra42()

# Two passes over the same rows. The operator switches the account language
# by hand between them, as the prompts request. The first pass appends a
# value to every row; the second pass runs with checkPrev=True.
input("set language to english and press enter (fuck l'intra)...")
getUrls(project_list, connIntra, False)
input("set language to french and press enter (fuck l'intra)...")
getUrls(project_list, connIntra, True)

# Persist the augmented rows as indented JSON.
with open("result.json", 'w', encoding="utf-8") as f:
    json.dump(project_list, f, indent=4)