50 lines
2.1 KiB
Python
50 lines
2.1 KiB
Python
# **************************************************************************** #
|
|
# #
|
|
# ::: :::::::: #
|
|
# scrap.py :+: :+: :+: #
|
|
# +:+ +:+ +:+ #
|
|
# By: tomoron <tomoron@student.42angouleme.fr> +#+ +:+ +#+ #
|
|
# +#+#+#+#+#+ +#+ #
|
|
# Created: 2024/11/25 16:39:19 by tomoron #+# #+# #
|
|
# Updated: 2024/11/26 14:54:43 by tomoron ### ########.fr #
|
|
# #
|
|
# **************************************************************************** #
|
|
|
|
from login import Intra42
|
|
from getpass import getpass
|
|
from tqdm import tqdm
|
|
import re
|
|
import json
|
|
import threading
|
|
|
|
# Marker identifying the page line that carries the subject PDF link.
_SUBJECT_PDF_MARKER = "https://cdn.intra.42.fr/pdf/pdf"
# First run of 3 to 10 digits on that line; compiled once and shared by all
# worker threads instead of being recompiled on every call.
_SUBJECT_ID_RE = re.compile(r"([0-9]{3,10})")


def getUrl(project_list, x, checkPrev, conn=None):
    """Fetch one project's page and append its subject PDF id to its entry.

    The page for ``project_list[x][0]`` is requested through ``conn``, the
    first line containing the subject PDF CDN prefix is located, and the first
    3-10 digit number on that line is appended to ``project_list[x]``.

    Args:
        project_list: List of mutable per-project lists; entry ``x`` is
            modified in place (exactly one element is appended).
        x: Index of the project to process.
        checkPrev: When true, ``None`` is appended instead of the id if the
            id equals ``project_list[x][2]`` (presumably the id stored by a
            previous pass -- verify against the input file layout).
        conn: Object exposing ``get_project_page``. Defaults to the
            module-level ``connIntra`` so existing three-argument callers
            keep working.

    Notes:
        If the page has no subject PDF line, or that line has no numeric id,
        ``None`` is appended rather than raising. An exception here would only
        kill the worker thread and leave this entry one element shorter than
        its siblings.
    """
    if conn is None:
        # Backward compatibility with callers relying on the global session.
        conn = connIntra

    entry = project_list[x]
    page_data = conn.get_project_page(entry[0])

    link_line = next(
        (line for line in page_data.split("\n") if _SUBJECT_PDF_MARKER in line),
        None,
    )
    match = _SUBJECT_ID_RE.search(link_line) if link_line is not None else None
    if match is None:
        # No subject link found: keep the row shape consistent.
        entry.append(None)
        return

    subject_url = match.group(1)
    if checkPrev and subject_url == entry[2]:
        entry.append(None)
    else:
        entry.append(subject_url)


def getUrls(project_list, connIntra, checkPrev):
    """Run ``getUrl`` for every project, one thread per entry, and wait.

    Args:
        project_list: List of per-project lists; each entry is extended in
            place by ``getUrl``.
        connIntra: Connection object forwarded to every ``getUrl`` call.
        checkPrev: Forwarded unchanged to ``getUrl``.
    """
    threads = []
    for index in range(len(project_list)):
        worker = threading.Thread(
            target=getUrl,
            args=(project_list, index, checkPrev, connIntra),
        )
        worker.start()
        threads.append(worker)

    # tqdm advances once per joined thread, giving a completion progress bar.
    for worker in tqdm(threads):
        worker.join()
|
|
|
|
# Load the project entries that the two scraping passes will extend in place.
with open("intra_projects.json", 'r') as projects_file:
    project_list = json.load(projects_file)

# Kept at module level: getUrl reads this name as a global.
connIntra = Intra42()

# First pass: the operator switches the site language manually, then every
# project's subject id is appended unconditionally.
input("set language to english and press enter (fuck l'intra)...")
getUrls(project_list, connIntra, False)

# Second pass: ids equal to the value at index 2 of an entry are recorded
# as None.
input("set language to french and press enter (fuck l'intra)...")
getUrls(project_list, connIntra, True)

# Persist the extended entries as indented JSON.
with open("result.json", 'w') as result_file:
    json.dump(project_list, result_file, indent=4)
|