2021-08-23 12:03:00 +02:00
from sys import argv
from os import environ
from dotenv import load_dotenv
2021-08-23 13:56:36 +02:00
from cloudscraper import CloudScraper , create_scraper
2021-08-23 18:19:04 +02:00
from re import findall
2021-08-23 12:03:00 +02:00
class Scraper:
    """Log in to the Mobilism forum and search its topics for an application.

    Instance attributes (all set in ``__init__``):
        debug: when true, progress messages are printed to stdout.
        url: base URL of the forum, without a trailing slash.
        requested_app: keywords sent to the forum search page.
        loginData: form fields posted to the forum login page.
    """

    # Patterns applied to each result row of the search page.
    _TITLE_PATTERN = r'class="topictitle">(.*)</a>'
    _AUTHOR_PATTERN = (
        r'(<br />|</strong>)\n\n?<i class="icon-user"></i> by '
        r'<a href="\./memberlist\.php\?mode=viewprofile&u=\d+"'
        r'( style="color: #.*;" class="username-coloured")?>(.*)</a>'
    )
    _LINK_PATTERN = r'\./viewtopic\.php\?f=(\d*)&t=(\d*)&'
    _DATE_PATTERN = r'</a> <i class="icon-time"></i> <small>(.*)</small>'

    def __init__(self, pseudo, password, app, debug=False):
        """Store the credentials, the search keywords and the debug flag.

        Args:
            pseudo: forum username.
            password: forum password.
            app: keywords to look for in topic titles.
            debug: print progress messages when true.
        """
        self.debug = debug
        self.url = "https://forum.mobilism.org"
        self.requested_app = app
        self.loginData = {
            "username": pseudo,
            "password": password,
            "login": "Login",
        }

    def _log(self, message: str, **print_kwargs) -> None:
        """Print ``message`` only when debug output is enabled."""
        if self.debug:
            print(message, **print_kwargs)

    @staticmethod
    def _first_match(pattern: str, text: str):
        """Return the first ``findall`` result of ``pattern`` in ``text``, or None."""
        matches = findall(pattern, text)
        return matches[0] if matches else None

    def errorFormat(self, code: int = None, message: str = "") -> str:
        """Build a one-line error message such as ``[404] Unable to connect.``.

        Args:
            code: optional status code; the ``[code]`` prefix is omitted when
                it is falsy (``None`` or ``0``).
            message: human-readable description.

        Returns:
            The formatted message, always terminated by a period.
        """
        prefix = f"[{code}]" if code else ""
        separator = " " if message and code else ""
        return f"{prefix}{separator}{message}."

    def connect(self) -> "CloudScraper":
        """Create a scraper session and post the login form to the forum.

        Only the HTTP status of the login request is checked here; rejected
        credentials are detected later by ``search``.

        Returns:
            The session on which the login request was made.

        Raises:
            SystemError: the session could not be created.
            ConnectionRefusedError: the login request did not answer 200.
        """
        # Cloudflare bypasser presenting itself as Chrome on Windows.
        session = create_scraper(browser={"browser": "chrome", "platform": "windows"})
        if not session:
            raise SystemError(self.errorFormat(message="The creation of the session failed"))

        self._log("Connection attempt...")
        # "mode=login" is passed explicitly in case the forum stops defaulting to it.
        reponse = session.post(
            f"{self.url}/ucp.php",
            data=self.loginData,
            params={"mode": "login"},
        )
        if reponse.status_code != 200:
            raise ConnectionRefusedError(
                self.errorFormat(code=reponse.status_code, message="Unable to connect")
            )
        return session

    def search(self, session) -> list:
        """Run a title-only topic search for ``requested_app`` and parse it.

        Args:
            session: session returned by ``connect``.

        Returns:
            The list of topic dictionaries produced by ``parse``.

        Raises:
            ConnectionError: the forum refused the search (not logged in) or
                the request did not answer 200.
        """
        self._log("Going to search page and check connection...", end=" ")
        reponse = session.get(
            f"{self.url}/search.php",
            params={"keywords": self.requested_app, "sr": "topics", "sf": "titleonly"},
        )
        # The forum answers with this sentence when the login did not succeed.
        if "Sorry but you are not permitted to use the search system. If you're not logged in please" in reponse.text:
            raise ConnectionError(self.errorFormat(message="Connection failed, check credentials"))
        if reponse.status_code != 200:
            raise ConnectionError(
                self.errorFormat(code=reponse.status_code, message="Impossible to make the search")
            )
        self._log("Connected.")

        self._log(f"Fetching results for {self.requested_app}...", end=" ")

        return self.parse(reponse.text)

    def parse(self, htmlPage: str) -> list:
        """Turn the HTML of a search result page into a list of topics.

        Args:
            htmlPage: raw HTML of the search page.

        Returns:
            One dictionary per result row with the keys ``title``, ``author``,
            ``date``, ``link`` and ``linkParams`` (in that order). A field that
            cannot be extracted is ``None``; ``link`` is ``None`` whenever
            ``linkParams`` is. An empty list is returned when the page reports
            no match or contains no result row.
        """
        if "No suitable matches were found." in htmlPage:
            return []

        # Everything before the first row marker is page header, not a result.
        rows = htmlPage.split("<tr>\n<td>")[1:]
        if not rows:
            return []
        # Cut the page footer off the last row.
        rows[-1] = rows[-1].split("</td>\n</tr>")[0]

        topics = []
        for row in rows:
            title = self._first_match(self._TITLE_PATTERN, row)

            # The author pattern has three groups; the name is the last one.
            author_groups = self._first_match(self._AUTHOR_PATTERN, row)
            author = author_groups[-1] if author_groups else None

            ids = self._first_match(self._LINK_PATTERN, row)
            if ids:
                link_params = {"f": ids[0], "t": ids[1]}
                link = f"{self.url}/viewtopic.php?f={link_params['f']}&t={link_params['t']}"
            else:
                link_params = None
                link = None

            date = self._first_match(self._DATE_PATTERN, row)

            topics.append(
                {
                    "title": title,
                    "author": author,
                    "date": date,
                    "link": link,
                    "linkParams": link_params,
                }
            )
        return topics

    def work(self) -> list:
        """Log in, run the search and return the parsed topics."""
        session = self.connect()
        return self.search(session)
2021-08-23 12:03:00 +02:00
2021-08-23 21:04:31 +02:00
def save(elements):
    """Save all the parsed results to a semicolon-separated CSV file.

    The file ``results.csv`` is written in the current working directory,
    encoded as UTF-8, with a header row followed by one row per element. The
    ``linkParams`` entry of each element is not exported. Values are written
    with ``str()``, so a missing field appears as ``None``; values containing
    the delimiter or a quote are quoted by the ``csv`` module.

    Args:
        elements: list of dictionaries sharing the keys of the first one.

    Returns:
        None. When ``elements`` is empty a message is printed and no file is
        written.
    """
    import csv

    taille = len(elements)
    if taille == 0:
        print("Aucun élément n'a été trouvé avec la recherche.")
        return

    filename = "results.csv"
    # Select the exported columns by name rather than by position.
    columns = [key for key in elements[0] if key != "linkParams"]

    # newline="" lets the csv writer control line endings itself.
    with open(filename, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f, delimiter=";", lineterminator="\n")
        writer.writerow(columns)
        writer.writerows(
            [str(element.get(column)) for column in columns] for element in elements
        )

    print(f"{taille} éléments ont étés enrengistés dans le fichier {filename}.")
2021-08-23 12:03:00 +02:00
if __name__ == "__main__":
    # Command-line entry point.
    #   With credentials in the environment / .env file:  script APP WORDS...
    #   Otherwise:                                        script PSEUDO PASSWORD APP WORDS...
    args = argv[1:]
    if not args:  # no args
        print("No App to retrieve.")
        raise SystemExit(1)

    load_dotenv()  # load .env file

    # Debug logs are enabled only by an explicit truthy value.
    debug = environ.get("DEBUG_MOBILISM", "").lower() in ("yes", "true", "1")

    # Credentials come from the environment first, then from the command line.
    pseudoMobilism = environ.get("PSEUDO_MOBILISM")
    passwordMobilism = environ.get("PASSWORD_MOBILISM")
    if pseudoMobilism is None or passwordMobilism is None:
        if len(args) < 3:
            print('Please fill in the username and password (with quotes) by args or with .env file and give an app to retrieve.')
            raise SystemExit(1)
        pseudoMobilism, passwordMobilism = args[0], args[1]
        # Everything after the two credentials is the application name.
        args = args[2:]

    # Run the scraper and write the results; errors raised while scraping are
    # deliberately not caught here so they are not mistaken for missing credentials.
    save(Scraper(pseudoMobilism, passwordMobilism, " ".join(args), debug).work())