skip some steps when nothing was found and remove potential HTML garbage

Mylloon 2021-08-24 13:30:53 +02:00
parent 524d166c6b
commit 6ec0518ff4

main.py

@@ -82,10 +82,13 @@ class Scraper:
     def getInfos(self, session: CloudScraper, elements: list) -> list:
         """Go to the first n pages and get a lot of infos"""
+        size = len(elements)
+        if size == 0:
+            return []
         page = 3
         if self.debug: print(f"Going to the {page} first pages...", end = " ")
         results = []
-        for i in range(0, len(elements)):
+        for i in range(0, size):
             if i < page:
                 reponse = session.get(f"{self.url}/viewtopic.php", params = elements[i]["linkParams"]) # fetch results page
                 results.append(reponse)
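The guard added above makes an empty search short-circuit before any HTTP request is sent. A minimal sketch of the effect, using an invented get_infos stand-in and a plain callable in place of the CloudScraper session (neither is the repository's code):

# Illustrative stand-in for the guard pattern, not the real getInfos.
def get_infos(fetch, elements: list) -> list:
    size = len(elements)
    if size == 0:       # nothing matched the search: skip the fetches entirely
        return []
    return [fetch(e) for e in elements[:3]]  # first 3 pages only, as in main.py

assert get_infos(lambda e: e, []) == []      # no fetch call is ever made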
@@ -122,6 +125,7 @@ class Scraper:
             _downloadLinks = sub(r"\n|<a class=\"postlink\" href=\"|\(Closed Filehost\) ?|<span style=\"font-weight: bold\">|</span>|\">(\S*)</a>", "", _downloadLinks) # remove html garbage
             _downloadLinks = sub(r"<br />\n?", "\n", _downloadLinks) # convert newline html to \n
             _downloadLinks = sub(r"Mirrors(?!:)|Mirror(?!s)(?!:)", "Mirror:", _downloadLinks) # add ":"
+            _downloadLinks = _downloadLinks.split('">')[0]
             elements[i] = {"changelogs": _changelogs, "downloadLinks": _downloadLinks}
         return elements
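In the existing cleanup chain, the \">(\S*)</a> alternative only strips anchor text without whitespace, so a label such as "mirror one" leaves '">mirror one</a>' in the string; the added split('">')[0] cuts everything from the first leftover '">' onwards. A rough illustration on an invented snippet (the real input comes from the fetched topic page):

from re import sub

raw = '<a class="postlink" href="https://example.com/file">mirror one</a>'  # invented sample
links = sub(r"\n|<a class=\"postlink\" href=\"|\(Closed Filehost\) ?|<span style=\"font-weight: bold\">|</span>|\">(\S*)</a>", "", raw)
# "mirror one" contains a space, so \S* fails and '">mirror one</a>' survives:
links = links.split('">')[0]  # the new line: drop the stray '">' and what follows
print(links)                  # -> https://example.com/file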
@@ -129,6 +133,8 @@ class Scraper:
     def prettyPrint(self, topics: tuple[list[dict], list[dict]]) -> list:
         """Show a pretty message with all the specialized infos"""
         topics, topicsInfos = topics
+        if len(topics) == 0:
+            return []
         print("\n")
         result = []
         for i in range(0, len(topicsInfos)):
@@ -154,8 +160,8 @@ class Scraper:
     def save(self, elements: list) -> None:
         """Save all the results parsed to a CSV file."""
-        taille = len(elements)
-        if taille == 0:
+        size = len(elements)
+        if size == 0:
             print("No elements were found with the search.")
             return
         filename = "results.csv"
@@ -168,7 +174,7 @@ class Scraper:
             if element != "linkParams":
                 f.write(";".join(str(e) for e in list(element.values())[:-1]))
                 f.write("\n")
-        print(f"{taille} elements have been registered in the {filename} file.")
+        print(f"{size} elements have been registered in the {filename} file.")
 if __name__ == "__main__":
     argv = argv[1:]