From 3956b4d6868515e54f4dd7f2204bc371fa4d91a0 Mon Sep 17 00:00:00 2001
From: Mylloon
Date: Mon, 23 Aug 2021 18:19:04 +0200
Subject: [PATCH] adding parser

---
 main.py | 34 ++++++++++++++++++++++++++++++----
 1 file changed, 30 insertions(+), 4 deletions(-)

diff --git a/main.py b/main.py
index b77c7b4..9e0859e 100644
--- a/main.py
+++ b/main.py
@@ -2,6 +2,7 @@ from sys import argv
 from os import environ
 from dotenv import load_dotenv
 from cloudscraper import CloudScraper, create_scraper
+from re import findall
 
 class Scraper:
     def __init__(self, pseudo, password, app, debug = False):
@@ -43,13 +44,38 @@ class Scraper:
     def search(self, session) -> str:
         if self.debug: print("Going to search page...", end = " ")
         reponse = session.get(f"{self.url}/search.php", params = {"keywords": self.requested_app, "sr": "topics", "sf": "titleonly"})
-        if self.debug: print(reponse.status_code, reponse.url)
-        with open("temp.html", "w") as f: # debug
-            f.writelines(reponse.text)
+
+        if self.debug: print("Results retrieval...", end = " ")
+        linkList = self.parse(reponse.text)
+        # if self.debug: print(reponse.status_code, reponse.url)
+        # with open("temp2.log", "w") as f: # debug
+        #     f.writelines(res)
 
         link = "No link for your application was found."
         return link
 
+    def parse(self, htmlPage: str) -> list:
+        elements = htmlPage.split("<li class=\"row")[1:] # one chunk per result row
+        elements[-1] = elements[-1].split("</ul>")[0] # drop markup after the last row
+        for i in range(0, len(elements)):
+            try:
+                _title = findall(r"class=\"topictitle\">(.*)<\/a>", elements[i])[0]
+            except IndexError:
+                _title = None
+            try:
+                _author = findall(r"\nby <a href=\"\./memberlist\.php\?mode=viewprofile&amp;u=(\d*)[^\"]*\"[^>]*>(.*)</a>", elements[i])[0][-1]
+            except IndexError:
+                _author = None
+                print(elements[i]) # debug
+            try:
+                _link = findall(r"\./viewtopic\.php\?f=(\d*)&amp;t=(\d*)&amp;", elements[i])[0]
+                _link = {"f": _link[0], "t": _link[1]}
+            except IndexError:
+                _link = None
+            elements[i] = {"title": _title, "author": _author, "linkParams": _link}
+
+        return elements
+
     def work(self) -> str:
         session = self.connect()
         link = self.search(session)
@@ -58,7 +84,7 @@ class Scraper:
 
 if __name__ == "__main__":
     argv = argv[1:]
-    if len(argv) == 3:
+    if len(argv) >= 3 and len(argv) <= 4:
         print(Scraper(*argv).work())
     else:
         try:
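
Note: search() computes linkList but still returns the placeholder string, so
wiring the parsed results to an actual link is left for a later commit. Below
is a minimal sketch of what that step could look like, assuming the first
result whose title contains the requested app is the wanted one; pick_link is
a hypothetical helper, while self.url and self.requested_app already exist on
Scraper:

    # Hypothetical helper, not part of the patch: turn parse() output into a URL.
    def pick_link(self, linkList: list) -> str:
        for element in linkList:
            params = element["linkParams"]
            # skip rows where the title or link regex matched nothing
            if element["title"] and params and self.requested_app.lower() in element["title"].lower():
                # rebuild the topic URL from the captured f/t query parameters
                return f"{self.url}/viewtopic.php?f={params['f']}&t={params['t']}"
        return "No link for your application was found."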
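
Note: regexes over HTML are fragile; any change to the phpBB markup silently
breaks the capture groups. An equivalent sketch built on BeautifulSoup (an
extra dependency this patch does not use) that relies only on the
class="topictitle" anchors of the result rows:

    # Alternative sketch using BeautifulSoup instead of re.findall.
    from bs4 import BeautifulSoup

    def parse_soup(htmlPage: str) -> list:
        soup = BeautifulSoup(htmlPage, "html.parser")
        results = []
        for title in soup.find_all("a", class_="topictitle"):
            # href looks like ./viewtopic.php?f=..&t=..&hilit=..
            results.append({"title": title.get_text(), "link": title.get("href", "")})
        return results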
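
Note: the widened check in __main__ now accepts an optional fourth CLI
argument, which lands on the debug parameter of Scraper.__init__ through
Scraper(*argv). Example invocation (the credentials and app name are made up;
beware that any non-empty string, even "False", is truthy, so passing a fourth
argument always enables debug output):

    python main.py myPseudo myPassword someApp True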