removing emojis
This commit is contained in:
parent
852f5eb0a9
commit
e89a52ff41
2 changed files with 55 additions and 0 deletions
2
main.py
2
main.py
|
@ -6,6 +6,7 @@ from random import choice
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pytz import timezone
|
from pytz import timezone
|
||||||
from queue import Queue
|
from queue import Queue
|
||||||
|
from utils.remove_unicode import emojis
|
||||||
|
|
||||||
def load(variables) -> dict:
|
def load(variables) -> dict:
|
||||||
"""Load environment variables."""
|
"""Load environment variables."""
|
||||||
|
@ -46,6 +47,7 @@ class Listener(StreamListener):
|
||||||
tweet = status.text
|
tweet = status.text
|
||||||
# recovery of the last "usable" word of the tweet
|
# recovery of the last "usable" word of the tweet
|
||||||
regex = r"https?:\/\/\S+| +?\?|\?| +?\!| ?\!|-|~|(?<=ui)i+|@\S+|\.+|(?<=na)a+(?<!n)|(?<=quoi)i+|(?<=no)o+(?<!n)|…|\^+"
|
regex = r"https?:\/\/\S+| +?\?|\?| +?\!| ?\!|-|~|(?<=ui)i+|@\S+|\.+|(?<=na)a+(?<!n)|(?<=quoi)i+|(?<=no)o+(?<!n)|…|\^+"
|
||||||
|
regex += f"|{emojis()}"
|
||||||
tweetText = sub(regex, " ", tweet.lower())
|
tweetText = sub(regex, " ", tweet.lower())
|
||||||
lastWord = tweetText.split()[-1:][0]
|
lastWord = tweetText.split()[-1:][0]
|
||||||
if keys["VERBOSE"]:
|
if keys["VERBOSE"]:
|
||||||
|
|
53
utils/remove_unicode.py
Normal file
53
utils/remove_unicode.py
Normal file
|
@ -0,0 +1,53 @@
|
||||||
|
from functools import lru_cache

from requests import get
|
||||||
|
|
||||||
|
def _emoji_span(code_field: str) -> tuple:
    """Parse ``XXXX`` or ``XXXX..YYYY`` (hex) into an inclusive (low, high) pair.

    Raises:
        ValueError: if the field is not hexadecimal, is reversed, or lies
            outside the Unicode code space.
    """
    first, _, last = code_field.partition("..")
    try:
        low = int(first, 16)
        high = int(last, 16) if last else low
    except ValueError:
        raise ValueError(f"{code_field} is not a valid code point field") from None
    if not 0 <= low <= high <= 0x10FFFF:
        raise ValueError(f"{code_field} is outside the Unicode range")
    return low, high


def _emoji_escape(code_point: int) -> str:
    """Return the 8-digit ``\\U`` regex escape for *code_point*."""
    # The 8-digit form is valid for every code point, so no \u / \U choice
    # (and no manual zero padding) is needed.
    return f"\\U{code_point:08X}"


@lru_cache(maxsize=1)
def emojis(data=None) -> str:
    """Return a regex fragment matching one or more consecutive emojis.

    The fragment is a single character class followed by ``+``; it is meant
    to be appended to a larger pattern as one more alternative.

    The result is cached, so the data file is downloaded at most once per
    process for a given argument instead of on every call.

    Args:
        data: Optional text in the ``emoji-data.txt`` format. When omitted,
            the latest file is downloaded from unicode.org.

    Returns:
        The character-class pattern as a string.

    Raises:
        ValueError: if a data line is malformed or no emoji is found.
        Exception: whatever ``get`` / ``raise_for_status`` raise when the
            download fails.
    """
    if data is None:
        page = get(
            "https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt",
            timeout=10,  # never hang forever on a stalled connection
        )
        page.raise_for_status()  # do not try to parse an HTTP error page
        data = page.text

    # Entries of the data file that are not really emojis, identified by the
    # name that follows the ")" in the trailing comment of each line.
    blacklist = {
        "number sign",
        "digit zero..digit nine",
        "copyright",
        "registered",
        "trade mark",
        "information",
    }

    # A set of (low, high) pairs: removes duplicates across the file's
    # sections without losing ranges that merely share a starting code point.
    spans = set()
    for line in data.split("\n"):
        fields, _, comment = line.partition("#")
        code_field = fields.split(";")[0].strip()  # first column
        if not code_field:  # comment-only or blank line
            continue
        if comment.partition(")")[2].strip() in blacklist:
            continue
        spans.add(_emoji_span(code_field))

    if not spans:
        raise ValueError("no emoji code point found in the emoji data")

    # Sort and merge overlapping/adjacent spans so the output is compact and
    # identical from one run to the next.
    merged = []
    for low, high in sorted(spans):
        if merged and low <= merged[-1][1] + 1:
            merged[-1][1] = max(merged[-1][1], high)
        else:
            merged.append([low, high])

    # Inside a character class the items are simply concatenated; a "|"
    # separator would be taken literally and make the class match pipes.
    parts = [
        _emoji_escape(low)
        if low == high
        else f"{_emoji_escape(low)}-{_emoji_escape(high)}"
        for low, high in merged
    ]
    return "[" + "".join(parts) + "]+"
|
Reference in a new issue