Adding some words automatically

This commit is contained in:
Mylloon 2021-08-06 02:51:36 +02:00
parent ca209798f3
commit 0ef9e3291f

11
main.py
View file

@ -35,15 +35,16 @@ def load(variables) -> dict:
def cleanTweet(tweet: str) -> str:
    """Remove all unwanted elements from the tweet.

    The tweet is lower-cased, then URLs, hashtags, usernames and every
    character that is neither a word character nor whitespace are replaced
    by a space. Key smashing in a few known words is collapsed
    ("quoiiii" -> "quoi") and any word whose last syllable is "si" or "ci"
    is reduced to that syllable ("aussi" -> "si", "merci" -> "ci").

    Args:
        tweet: Raw text of the tweet.

    Returns:
        The cleaned text with surrounding whitespace stripped, or an empty
        string when the tweet contains three or more hashtags.
    """
    tweet = tweet.lower()  # convert to lower case

    # Remove URLs. The dot after "www" is escaped on purpose: an unescaped
    # "." also matches a space, which made "awwww so cute" lose "www so".
    tweet = sub(r"(https?:\/\/\S+|www\.\S+)", " ", tweet)

    # Too many hashtags: ignore the tweet entirely.
    if len(findall(r"#\S+", tweet)) >= 3:
        return ""
    tweet = sub(r"#\S+", " ", tweet)  # remove the (fewer than 3) hashtags

    tweet = sub(r"@\S+", " ", tweet)  # remove usernames
    # Remove everything that is not a letter, a number or a space.
    tweet = sub(r" *?[^\w\s]+", " ", tweet)

    # Remove key smashing in certain words. This runs before the
    # last-syllable trimming below so that "aussiiii" first becomes "aussi"
    # and can then be recognised as ending in "si".
    tweet = sub(
        r"(?<=ui)i+|(?<=na)a+(?<!n)|(?<=quoi)i+|(?<=no)o+(?<!n)|(?<=hei)i+(?<!n)|(?<=si)i+",
        "",
        tweet,
    )

    # Keep only the last syllable of words ending in "si"/"ci", so more words
    # are handled without listing them manually. The trailing \b restricts
    # the match to the END of a word; without it, words merely containing
    # "si"/"ci" (e.g. "decision") were cut in the middle.
    tweet = sub(r"\S+(?=(?:si|ci)\b)", " ", tweet)

    return tweet.strip()
@ -204,7 +205,7 @@ if __name__ == "__main__":
"non": ["non", "nn"], "non": ["non", "nn"],
"nan": ["nan"], "nan": ["nan"],
"hein": ["hein", "1"], "hein": ["hein", "1"],
"ci": ["ci", "si", "aussi"], "ci": ["ci", "si"],
"con": ["con"], "con": ["con"],
"ok": ["ok", "okay", "oké", "k"], "ok": ["ok", "okay", "oké", "k"],
"ouais": ["ouais", "oué"], "ouais": ["ouais", "oué"],