From e89a52ff41f15ca37066367d6e8adce9cc440423 Mon Sep 17 00:00:00 2001 From: Mylloon Date: Thu, 5 Aug 2021 02:26:21 +0200 Subject: [PATCH] removing emojis --- main.py | 2 ++ utils/remove_unicode.py | 53 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 utils/remove_unicode.py diff --git a/main.py b/main.py index a9ba823..b366862 100644 --- a/main.py +++ b/main.py @@ -6,6 +6,7 @@ from random import choice from datetime import datetime from pytz import timezone from queue import Queue +from utils.remove_unicode import emojis def load(variables) -> dict: """Load environment variables.""" @@ -46,6 +47,7 @@ class Listener(StreamListener): tweet = status.text # recovery of the last "usable" word of the tweet regex = r"https?:\/\/\S+| +?\?|\?| +?\!| ?\!|-|~|(?<=ui)i+|@\S+|\.+|(?<=na)a+(? str: + page = get("https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt") + lines = page.text.split("\n") + + blacklist = [ # blacklist of element who are not really emojis + "number sign", + "digit zero..digit nine", + "copyright", + "registered", + "trade mark", + "information" + ] + + unicodes = [] + extendedEmoji = {} + for line in lines: # check all lines + if not line.startswith("#") and len(line) > 0: # ignores comment lines and blank lines + if line.split(')')[1].strip() not in blacklist: # check if the emoji isn't in the blacklist + temp = f"{line.split(';')[0]}".strip() # recovery of the first column + if ".." in temp: # if it is a "list" of emojis, adding to a dict + extendedEmoji[temp.split("..")[0]] = temp.split("..")[1] + else: + unicodes.append(temp) + unicodes = list(set(unicodes) - {""}) # removal of duplicates and especially of extra spaces + + def _uChar(string: str): # choice between \u and \U in addition of the "0" to complete the code + stringLen = len(string) + if stringLen > 7: # Can't be more than 7 anyways + raise Exception(f"{string} is too long! ({stringLen})") + u, totalLong = "U", 7 # Should be 7 characters long if it is a capital U + if stringLen < 4: # 4 characters long if smaller than 4 + u, totalLong = "u", 4 # Should be 4 characters long if it is a lowercase u + resultat = "" + while len(f"{resultat}{string}") <= totalLong: # Adding the 0 + resultat += "0" + return f"\{u}{resultat}" # Return the right "U" with the right number of 0 + + for i in range(0, len(unicodes)): # add unicode syntax to the list + unicodes[i] = f"{_uChar(unicodes[i])}{unicodes[i]}" + + for mot in extendedEmoji.items(): # add unicode syntax to the dict + extendedEmoji[mot[0]] = f"{_uChar(mot[1])}{mot[1]}" + temp = f"{_uChar(mot[0])}{mot[0]}-{extendedEmoji[mot[0]]}" + if temp not in unicodes: # if not already in the list + unicodes.append(temp) # add the item to the list + + resultat = "[" + for code in unicodes: # conversion of the list into a string with | to separate all the emojis + resultat += f"{code}|" + + return f"{resultat[:-1]}]+"