initial commit
This commit is contained in:
49
scrap.py
Normal file
49
scrap.py
Normal file
@ -0,0 +1,49 @@
|
||||
# **************************************************************************** #
|
||||
# #
|
||||
# ::: :::::::: #
|
||||
# scrap.py :+: :+: :+: #
|
||||
# +:+ +:+ +:+ #
|
||||
# By: tomoron <tomoron@student.42angouleme.fr> +#+ +:+ +#+ #
|
||||
# +#+#+#+#+#+ +#+ #
|
||||
# Created: 2024/11/25 16:39:19 by tomoron #+# #+# #
|
||||
# Updated: 2024/11/26 14:54:43 by tomoron ### ########.fr #
|
||||
# #
|
||||
# **************************************************************************** #
|
||||
|
||||
from login import Intra42
|
||||
from getpass import getpass
|
||||
from tqdm import tqdm
|
||||
import re
|
||||
import json
|
||||
import threading
|
||||
|
||||
# Marker identifying the page line that links to the subject PDF.
_SUBJECT_CDN_MARKER = "https://cdn.intra.42.fr/pdf/pdf"

# First run of 3 to 10 digits on the marker line; compiled once, not per call.
_SUBJECT_ID_RE = re.compile(r"([0-9]{3,10})")


def _extract_subject_id(page_data):
    """Return the subject id found in *page_data*, or None if there is none.

    Only the first line containing the subject CDN marker is inspected, and
    the first 3-10 digit run on that line is returned as a string.
    """
    for line in page_data.split("\n"):
        if _SUBJECT_CDN_MARKER in line:
            match = _SUBJECT_ID_RE.search(line)
            return match.group(1) if match else None
    return None


def getUrl(project_list, x, checkPrev, conn=None):
    """Fetch one project page and append its subject id to ``project_list[x]``.

    Args:
        project_list: list of mutable rows; ``project_list[x][0]`` is passed
            to ``get_project_page``. The row is extended in place.
        x: index of the row to process.
        checkPrev: when true, the id is compared with ``project_list[x][2]``
            (presumably the value appended by an earlier pass - confirm
            against the layout of the input rows) and ``None`` is appended
            if they are equal.
        conn: object exposing ``get_project_page``. When omitted, the
            module-level ``connIntra`` is used, as older callers expect.

    ``None`` is also appended when the page holds no subject link or no id
    can be parsed from it, so every row grows by exactly one element per
    call instead of the worker dying with IndexError/AttributeError.
    """
    if conn is None:
        # Backward compatibility: three-argument callers rely on the global.
        conn = connIntra

    page_data = conn.get_project_page(project_list[x][0])
    subject_id = _extract_subject_id(page_data)

    if subject_id is not None and checkPrev and subject_id == project_list[x][2]:
        subject_id = None
    project_list[x].append(subject_id)


def getUrls(project_list, connIntra, checkPrev):
    """Run ``getUrl`` for every row of *project_list*, one thread per row.

    Args:
        project_list: list of rows, each extended in place by ``getUrl``.
        connIntra: object exposing ``get_project_page``; handed to every
            worker explicitly rather than read from a global.
        checkPrev: forwarded unchanged to ``getUrl``.

    Blocks until every worker has finished, showing a tqdm progress bar
    over the joins.
    """
    threads = [
        threading.Thread(
            target=getUrl,
            args=(project_list, index, checkPrev, connIntra),
        )
        for index in range(len(project_list))
    ]
    for worker in threads:
        worker.start()
    for worker in tqdm(threads):
        worker.join()
|
||||
|
||||
# Load the project rows; both passes below extend each row in place.
with open("intra_projects.json", 'r') as f:
    project_list = json.load(f)

connIntra = Intra42()

# Two passes over the same rows: the first records whatever getUrls finds,
# the second runs with checkPrev=True. The operator is prompted before each
# pass (presumably to switch the site language in their account - confirm).
for prompt, check_previous in (
    ("set language to english and press enter (fuck l'intra)...", False),
    ("set language to french and press enter (fuck l'intra)...", True),
):
    input(prompt)
    getUrls(project_list, connIntra, check_previous)

# Persist the extended rows as indented JSON.
with open("result.json", 'w') as f:
    json.dump(project_list, f, indent=4)
|
Reference in New Issue
Block a user