adding parser

parent 82b261104c
commit 3956b4d686

1 changed file with 30 additions and 4 deletions

main.py (34 lines changed)
@@ -2,6 +2,7 @@ from sys import argv
 from os import environ
 from dotenv import load_dotenv
 from cloudscraper import CloudScraper, create_scraper
+from re import findall
 
 class Scraper:
     def __init__(self, pseudo, password, app, debug = False):
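The new findall import powers every extraction in parse() below. One behaviour worth knowing: when a pattern contains several capture groups, findall returns a list of tuples rather than plain strings, which is why the author lookup in parse() ends in [0][-1]. A minimal standalone sketch (the HTML snippet here is invented):

from re import findall

# Invented snippet shaped like the profile links parse() matches.
snippet = '<a href="./memberlist.php?mode=viewprofile&u=42" class="username-coloured">alice</a>'

# With two or more capture groups, findall yields one tuple per match.
matches = findall(r"u=(\d+)\"( class=\"username-coloured\")?>(.*)</a>", snippet)
print(matches)         # [('42', ' class="username-coloured"', 'alice')]
print(matches[0][-1])  # 'alice' - the last group of the first match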
@@ -43,13 +44,38 @@ class Scraper:
     def search(self, session) -> str:
         if self.debug: print("Going to search page...", end = " ")
         reponse = session.get(f"{self.url}/search.php", params = {"keywords": self.requested_app, "sr": "topics", "sf": "titleonly"})
-        if self.debug: print(reponse.status_code, reponse.url)
-        with open("temp.html", "w") as f: # debug
-            f.writelines(reponse.text)
+        if self.debug: print("Results retrieval...", end = " ")
+        linkList = self.parse(reponse.text)
+        # if self.debug: print(reponse.status_code, reponse.url)
+        # with open("temp2.log", "w") as f: # debug
+        #     f.writelines(res)
+
         link = "No link for your application was found."
         return link
 
+    def parse(self, htmlPage: str) -> list:
+        elements = htmlPage.split("<tr>\n<td>")[1:]
+        elements[-1] = elements[-1].split("</td>\n</tr>")[0]
+        for i in range(0, len(elements)):
+            try:
+                _title = findall(r"class=\"topictitle\">(.*)<\/a>", elements[i])[0]
+            except:
+                _title = None
+            try:
+                _author = findall(r"<br />\n<i class=\"icon-user\"></i> by <a href=\"\./memberlist\.php\?mode=viewprofile&u=\d+\"( style=\"color: #.*;\" class=\"username-coloured\")?>(.*)</a>", elements[i])[0][-1]
+            except:
+                _author = None
+            print(elements[i])
+            try:
+                _link = findall(r"\./viewtopic\.php\?f=(\d*)&t=(\d*)&", elements[i])[0]
+                _link = {"f": _link[0], "t": _link[1]}
+            except:
+                _link = None
+            elements[i] = {"title": _title, "author": _author, "linkParams": _link}
+
+        return elements
+
     def work(self) -> str:
         session = self.connect()
         link = self.search(session)
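At this stage search() only computes linkList; the returned link is still the hard-coded placeholder string. Assuming __init__ does no more than store its arguments (it is not shown in this diff), the new parse() can be exercised on its own with made-up phpBB-style markup:

# Made-up row shaped like the search results parse() expects.
page = (
    "<tr>\n<td>"
    '<a class="topictitle">MyApp v1.2</a>\n'
    '<br />\n<i class="icon-user"></i> by '
    '<a href="./memberlist.php?mode=viewprofile&u=7">bob</a>\n'
    '<a href="./viewtopic.php?f=12&t=345&sid=x">thread</a>'
    "</td>\n</tr>"
)

result = Scraper("pseudo", "password", "MyApp").parse(page)
# Besides the leftover print(elements[i]) debug output, this returns:
# [{'title': 'MyApp v1.2', 'author': 'bob', 'linkParams': {'f': '12', 't': '345'}}]

Any field whose regex fails falls back to None, so a caller can filter incomplete rows instead of crashing on malformed markup.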
@@ -58,7 +84,7 @@ class Scraper:
 
 if __name__ == "__main__":
     argv = argv[1:]
-    if len(argv) == 3:
+    if len(argv) >= 3 and len(argv) <= 4:
         print(Scraper(*argv).work())
     else:
         try:
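The relaxed length check matches the optional debug parameter of Scraper.__init__: a fourth command-line argument, if present, is forwarded through Scraper(*argv). Note that it arrives as a string, so any non-empty value, even "0", is truthy and switches debug printing on. Invocations would look like:

python main.py pseudo password SomeApp      # debug off (defaults to False)
python main.py pseudo password SomeApp 1    # debug on: "1" is a non-empty string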