This repository has been archived on 2022-04-07. You can view files and clone it, but cannot push or open issues or pull requests.
mobilismScrap/main.py

120 lines
5.2 KiB
Python
Raw Normal View History

2021-08-23 12:03:00 +02:00
from sys import argv
from os import environ
from dotenv import load_dotenv
2021-08-23 13:56:36 +02:00
from cloudscraper import CloudScraper, create_scraper
2021-08-23 18:19:04 +02:00
from re import findall
2021-08-23 12:03:00 +02:00
class Scraper:
    """Log in to the Mobilism forum and search topics whose title matches an app.

    Typical use is ``Scraper(pseudo, password, app).work()``, which returns a
    list of dictionaries with the keys ``title``, ``author``, ``link`` and
    ``linkParams`` (in that order).
    """

    # Patterns applied to the HTML of each result row of the search page.
    _TITLE_PATTERN = r'class="topictitle">(.*)</a>'
    _AUTHOR_PATTERN = (
        r'(<br />|</strong>)\n\n?<i class="icon-user"></i> by '
        r'<a href="\./memberlist\.php\?mode=viewprofile&amp;u=\d+"'
        r'( style="color: #.*;" class="username-coloured")?>(.*)</a>'
    )
    _LINK_PATTERN = r"\./viewtopic\.php\?f=(\d*)&amp;t=(\d*)&amp"

    def __init__(self, pseudo, password, app, debug = False):
        """Store the credentials, the search keywords and the debug flag.

        Args:
            pseudo: forum username.
            password: forum password.
            app: keywords to look for in topic titles.
            debug: when true, progress messages are printed.
        """
        self.debug = debug
        self.url = "https://forum.mobilism.org"
        self.requested_app = app
        # Form fields posted to the login page.
        self.loginData = {
            "username": pseudo,
            "password": password,
            "login": "Login",
        }

    def _debug(self, text: str, end: str = "\n") -> None:
        """Print a progress message, only when debug mode is enabled."""
        if self.debug:
            print(text, end=end)

    def errorFormat(self, code: int = None, message: str = "") -> str:
        """Pretty error message: ``[code] message.``, the code being optional."""
        prefix = f"[{code}]" if code else ""
        # Separate code and message only when both are present.
        separator = " " if message and code else ""
        return f"{prefix}{separator}{message}."

    def connect(self) -> "CloudScraper":
        """Login to the forum using credentials.

        Returns:
            The session on which the login request was sent.

        Raises:
            SystemError: the session could not be created.
            ConnectionRefusedError: the login request did not answer 200.
        """
        # Cloudflare bypasser presenting itself as Chrome on Windows.
        session = create_scraper(browser={"browser": "chrome", "platform": "windows"})
        if not session:
            raise SystemError(self.errorFormat(message="The creation of the session failed"))

        self._debug("Connection attempt...")
        # "mode=login" is passed explicitly in case the forum default changes.
        response = session.post(
            f"{self.url}/ucp.php",
            data=self.loginData,
            params={"mode": "login"},
        )
        if response.status_code != 200:
            raise ConnectionRefusedError(
                self.errorFormat(code=response.status_code, message="Unable to connect")
            )
        # A 200 here is not checked any further: bad credentials are detected
        # by search(), through the forum's "not permitted" message.
        return session

    def search(self, session) -> list:
        """Run the title search and return the parsed results.

        Args:
            session: session returned by ``connect``.

        Raises:
            ConnectionError: the forum refuses the search (login failed) or
                the search page did not answer 200.
        """
        self._debug("Going to search page and check connection...", end=" ")
        response = session.get(
            f"{self.url}/search.php",
            params={"keywords": self.requested_app, "sr": "topics", "sf": "titleonly"},
        )
        if "Sorry but you are not permitted to use the search system. If you're not logged in please" in response.text:
            raise ConnectionError(self.errorFormat(message="Connection failed, check credentials"))
        if response.status_code != 200:
            raise ConnectionError(
                self.errorFormat(code=response.status_code, message="Impossible to make the search")
            )
        self._debug("Connected.")

        self._debug(f"Fetching results for {self.requested_app}...", end=" ")

        return self.parse(response.text)

    def _parse_row(self, row: str) -> dict:
        """Extract title, author and topic link from the HTML of one result row.

        Any piece of information that cannot be found is reported as ``None``.
        """
        titles = findall(self._TITLE_PATTERN, row)
        title = titles[0] if titles else None

        authors = findall(self._AUTHOR_PATTERN, row)
        # The username is the last capture group of the pattern.
        author = authors[0][-1] if authors else None

        links = findall(self._LINK_PATTERN, row)
        if links:
            forum_id, topic_id = links[0]
            link_params = {"f": forum_id, "t": topic_id}
            link = f"{self.url}/viewtopic.php?f={forum_id}&t={topic_id}"
        else:
            # No topic link in this row: report it instead of crashing.
            link_params = None
            link = None

        # Key order matters: "linkParams" stays last.
        return {"title": title, "author": author, "link": link, "linkParams": link_params}

    def parse(self, htmlPage: str) -> list:
        """Parse the HTML response to a clean list.

        Args:
            htmlPage: HTML of the search results page.

        Returns:
            One dictionary per result row (see ``_parse_row``); an empty list
            when the page reports no match or contains no result row.
        """
        if "No suitable matches were found." in htmlPage:
            return []
        # Everything before the first row marker is page header.
        rows = htmlPage.split("<tr>\n<td>")[1:]
        if not rows:
            return []
        # Cut the page footer that follows the last row.
        rows[-1] = rows[-1].split("</td>\n</tr>")[0]
        return [self._parse_row(row) for row in rows]

    def work(self) -> list:
        """Call all the other methods: log in, search, return the results."""
        session = self.connect()
        return self.search(session)
2021-08-23 12:03:00 +02:00
2021-08-23 21:04:31 +02:00
def save(elements, filename="results.csv"):
    """Save all the parsed results to a semicolon-separated CSV file.

    Every key of the first element except ``linkParams`` becomes a column.
    Values are written through ``str()``, so a missing value appears as
    ``None``. Fields containing the delimiter are quoted by the ``csv``
    module so they cannot break the column layout.

    Args:
        elements: list of result dictionaries sharing the same keys.
        filename: path of the file to write (overwritten if it exists).

    Nothing is written when ``elements`` is empty.
    """
    import csv  # local import keeps this function self-contained

    if not elements:
        print("Aucun élément n'a été trouvé avec la recherche.")
        return

    columns = [key for key in elements[0] if key != "linkParams"]
    # newline="" lets the csv module control line endings itself.
    with open(filename, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, delimiter=";", lineterminator="\n")
        writer.writerow(columns)
        writer.writerows(
            [str(element.get(column)) for column in columns] for element in elements
        )
    print(f"{len(elements)} éléments ont étés enrengistés dans le fichier {filename}.")
2021-08-23 12:03:00 +02:00
def read_settings(args, env):
    """Work out the credentials, the search keywords and the debug flag.

    Credentials come from ``PSEUDO_MOBILISM`` / ``PASSWORD_MOBILISM`` in
    ``env``. When either is missing, the first two items of ``args`` are used
    as username and password, which requires at least three arguments
    (username, password, app).

    Args:
        args: command-line arguments, program name excluded.
        env: mapping of environment variables.

    Returns:
        ``(pseudo, password, app, debug)``, ``app`` being the remaining
        arguments joined with spaces.

    Raises:
        KeyError: no credentials in ``env`` and fewer than three arguments.
    """
    debug = env.get("DEBUG_MOBILISM", "").lower() in ("yes", "true", "1")
    try:
        pseudo = env["PSEUDO_MOBILISM"]
        password = env["PASSWORD_MOBILISM"]
    except KeyError:
        if len(args) < 3:
            raise
        # Drop the two credential arguments; everything after them is the app.
        pseudo, password, *args = args
    return pseudo, password, " ".join(args), debug


if __name__ == "__main__":
    cli_args = argv[1:]
    if not cli_args:
        print("No App to retrieve.")
        raise SystemExit(1)
    load_dotenv()
    try:
        pseudoMobilism, passwordMobilism, requested_app, debug = read_settings(cli_args, environ)
    except KeyError:
        print('Please fill in the username and password (with quotes) by args or with .env file and give an app to retrieve.')
        raise SystemExit(1) from None
    # Kept outside the try block so an unrelated KeyError raised while
    # scraping is not reported as a missing-credentials problem.
    save(Scraper(pseudoMobilism, passwordMobilism, requested_app, debug).work())