From fd2e5cc112a1a73107316dc2c89a019680e68ee8 Mon Sep 17 00:00:00 2001
From: tomoron
Date: Wed, 6 Aug 2025 20:16:29 +0200
Subject: [PATCH] initial commit

---
Review notes (this section is ignored when the patch is applied):
* The "From:" header and the "By:" line of the file banners carry no e-mail
  address; restore the author's address before applying.
* The "index" hashes below are those of the original submission, not of the
  revised file contents.

 login.py | 145 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 scrap.py |  83 +++++++++++++++++++++++++++++++
 test.py  |   4 ++
 3 files changed, 232 insertions(+)
 create mode 100755 login.py
 create mode 100644 scrap.py
 create mode 100644 test.py

diff --git a/login.py b/login.py
new file mode 100755
index 0000000..4ff5090
--- /dev/null
+++ b/login.py
@@ -0,0 +1,145 @@
+# **************************************************************************** #
+#                                                                              #
+#                                                         :::      ::::::::    #
+#    login.py                                           :+:      :+:    :+:    #
+#                                                     +:+ +:+         +:+      #
+#    By: tomoron                                    +#+  +:+       +#+         #
+#                                                 +#+#+#+#+#+   +#+            #
+#    Created: 2024/11/25 16:22:08 by tomoron           #+#    #+#              #
+#    Updated: 2025/06/03 03:51:21 by tomoron          ###   ########.fr        #
+#                                                                              #
+# **************************************************************************** #
+
+import requests
+import time
+import subprocess
+import os
+import re
+import json
+import urllib.parse
+from getpass import getpass
+from bs4 import BeautifulSoup
+from collections import defaultdict
+
+
+class Intra42():
+    """HTTP client for the 42 intranet that keeps its login cookies on disk.
+
+    Creating an instance restores cookies saved by a previous run (if any)
+    and then makes sure the session is logged in, asking for credentials on
+    the terminal when it is not.
+    """
+
+    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.6167.160 Safari/537.36"
+    # File (relative to the working directory) where cookies are persisted.
+    cookie_file = ".cookies.json"
+
+    def __init__(self):
+        """Create the HTTP session, restore saved cookies and log in."""
+        self.s = requests.Session()
+        self.s.headers.update({"User-Agent": Intra42.user_agent})
+
+        self.load_session()
+        self.login_user()
+
+    def load_session(self):
+        """Load the cookies stored in ``cookie_file``, if it exists."""
+        if not os.path.isfile(self.cookie_file):
+            return
+
+        with open(self.cookie_file, 'r') as f:
+            data = json.load(f)
+
+        # The file maps each domain to a {cookie name: cookie value} dict.
+        for domain, domain_cookies in data.items():
+            for cookie_name, cookie_value in domain_cookies.items():
+                new_cookie = requests.cookies.create_cookie(
+                    domain=domain, name=cookie_name, value=cookie_value)
+                self.s.cookies.set_cookie(new_cookie)
+
+    def save_session(self):
+        """Write the session cookies to ``cookie_file``, grouped by domain."""
+        cookies = defaultdict(dict)
+        for cookie in self.s.cookies:
+            cookies[cookie.domain][cookie.name] = cookie.value
+
+        with open(self.cookie_file, 'w') as f:
+            json.dump(dict(cookies), f)
+
+    def login_user(self):
+        """Make sure the session is logged in, prompting if necessary.
+
+        Returns at once when https://intra.42.fr/ lands on a profile page.
+        Otherwise the login and password are read from the terminal, the
+        login form is submitted and the resulting cookies are saved.
+
+        Raises:
+            RuntimeError: the login form is missing from the page.
+            SystemExit: the credentials were rejected (exit status 1).
+        """
+        response = self.s.get("https://intra.42.fr/")
+
+        # Ending up on a profile page means the session is already valid.
+        if response.url in ('https://profile-v3.intra.42.fr/',
+                            'https://profile.intra.42.fr/'):
+            return
+
+        page = BeautifulSoup(response.text, "html.parser")
+        form_html = page.find("form", id="kc-form-login")
+        if form_html is None or not form_html.get("action"):
+            raise RuntimeError("login form not found at " + response.url)
+        # BeautifulSoup decodes entities such as "&amp;" in attribute values,
+        # so the form's action URL can be used as it is.
+        url = form_html["action"]
+
+        username = input("login : ")
+        password = getpass("password : ")
+
+        # Given a dict, requests form-encodes the values and sets the
+        # Content-Type header itself.
+        req_body = {
+            "username": username,
+            "password": password,
+            "rememberMe": "on",
+            "credentialId": "",
+        }
+
+        response = self.s.post(url, data=req_body, allow_redirects=False)
+        # A 200 answer is treated as rejected credentials; a successful
+        # login presumably answers with a redirect instead.
+        if response.status_code == 200:
+            print("invalid username or password")
+            raise SystemExit(1)
+        print("sucessfuly logged in")
+
+        self.save_session()
+
+    def request_url(self, url, allow_redirects=False):
+        """GET ``url`` with the session, logging in again once on a 302.
+
+        On a 302 answer login_user() is called and the request is retried a
+        single time, so a URL that keeps redirecting cannot recurse without
+        end.
+
+        Returns the requests.Response of the last attempt.
+        """
+        res = self.s.get(url, allow_redirects=allow_redirects)
+        if res.status_code == 302:
+            self.login_user()
+            res = self.s.get(url, allow_redirects=allow_redirects)
+        return res
+
+    def get_intra_home(self):
+        """Return the body of https://profile.intra.42.fr/ as text."""
+        response = self.request_url("https://profile.intra.42.fr/")
+        return response.text
+
+    def get_goals(self):
+        """Return the body of the current user's 42cursus goals page as text."""
+        response = self.request_url("https://profile.intra.42.fr/users/me/goals?cursus=42cursus")
+        return response.text
+
+    def get_project_page(self, slug):
+        """Return the body of the project page for ``slug`` as text."""
+        response = self.request_url(f"https://projects.intra.42.fr/projects/{slug}")
+        return response.text
diff --git a/scrap.py b/scrap.py
new file mode 100644
index 0000000..2a155cd
--- /dev/null
+++ b/scrap.py
@@ -0,0 +1,83 @@
+# **************************************************************************** #
+#                                                                              #
+#                                                         :::      ::::::::    #
+#    scrap.py                                           :+:      :+:    :+:    #
+#                                                     +:+ +:+         +:+      #
+#    By: tomoron                                    +#+  +:+       +#+         #
+#                                                 +#+#+#+#+#+   +#+            #
+#    Created: 2024/11/25 16:39:19 by tomoron           #+#    #+#              #
+#    Updated: 2024/11/26 14:54:43 by tomoron          ###   ########.fr        #
+#                                                                              #
+# **************************************************************************** #
+
+from login import Intra42
+from getpass import getpass
+from tqdm import tqdm
+import re
+import json
+import threading
+
+# Text that identifies the line carrying the link to a project's subject PDF.
+SUBJECT_URL_MARKER = "https://cdn.intra.42.fr/pdf/pdf"
+# First run of 3 to 10 digits on that line.
+SUBJECT_ID_RE = re.compile(r"([0-9]{3,10})")
+
+
+def getUrl(project_list, x, checkPrev, conn=None):
+    """Append the subject PDF number of project ``x`` to ``project_list[x]``.
+
+    Downloads the page of the project whose slug is ``project_list[x][0]``,
+    takes the first line containing the CDN PDF link and extracts the first
+    run of 3 to 10 digits from it. Exactly one item is always appended to
+    ``project_list[x]``: that number, or None when no subject link is found
+    or when ``checkPrev`` is set and the number equals
+    ``project_list[x][2]``.
+
+    ``conn`` is the Intra42 client to use; when omitted, the module-level
+    ``connIntra`` is used.
+    """
+    if conn is None:
+        conn = connIntra
+    page_data = conn.get_project_page(project_list[x][0])
+
+    subject_url = None
+    for line in page_data.split("\n"):
+        if SUBJECT_URL_MARKER in line:
+            found = SUBJECT_ID_RE.search(line)
+            if found:
+                subject_url = found.group(1)
+            break  # only the first matching line is considered
+
+    if checkPrev and subject_url == project_list[x][2]:
+        subject_url = None
+    project_list[x].append(subject_url)
+
+
+def getUrls(project_list, connIntra, checkPrev):
+    """Run getUrl for every project, one thread each, and wait for them all.
+
+    The tqdm progress bar advances as the threads are joined, in the order
+    they were started.
+    """
+    threads = []
+    for x in range(len(project_list)):
+        worker = threading.Thread(
+            target=getUrl, args=(project_list, x, checkPrev, connIntra))
+        worker.start()
+        threads.append(worker)
+    for worker in tqdm(threads):
+        worker.join()
+
+
+with open("intra_projects.json", 'r') as f:
+    project_list = json.load(f)
+
+connIntra = Intra42()
+input("set language to english and press enter (fuck l'intra)...")
+getUrls(project_list, connIntra, False)
+input("set language to french and press enter (fuck l'intra)...")
+getUrls(project_list, connIntra, True)
+
+# Each entry has gained two items, one per pass above, in that order.
+with open("result.json", 'w') as f:
+    json.dump(project_list, f, indent=4)
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..46798c4
--- /dev/null
+++ b/test.py
@@ -0,0 +1,4 @@
+from login import Intra42
+from getpass import getpass
+
+a = Intra42()