Last active
February 16, 2025 17:17
-
-
Save msenol86/44082269be46aa446ccda9d02202e523 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib.request | |
import re | |
# Local file name the downloaded emoji test data is cached under.
EMOJI_TEST_FILENAME = "emoji-test.txt"
# Source of the emoji test data; pinned to the Emoji 16.0 release.
EMOJI_DATA_URL = "https://unicode.org/Public/emoji/16.0/emoji-test.txt"
def download_latest_emoji_test_data():
    """Download the emoji test data and store it next to the script.

    Fetches ``EMOJI_DATA_URL`` and writes the raw bytes to
    ``EMOJI_TEST_FILENAME``, overwriting any existing file.
    Errors raised by ``urllib.request.urlopen`` or by the file write are
    not handled here and propagate to the caller.
    """
    # Use the response as a context manager so the connection is closed even
    # if reading fails part-way through.
    with urllib.request.urlopen(EMOJI_DATA_URL) as response:
        emoji_test_file = response.read()
    with open(EMOJI_TEST_FILENAME, "wb") as tmp_file:
        tmp_file.write(emoji_test_file)
def convert_unicode_chars_2(p_string_in_unicode):
    r"""Convert whitespace-separated hex code points into the string they encode.

    :param p_string_in_unicode: e.g. '1F469 200D 1F469 200D 1F467 200D 1F466'
    :return: e.g. '\U0001f469\u200d\U0001f469\u200d\U0001f467\u200d\U0001f466';
        an empty or whitespace-only input yields ''
    :raises ValueError: if a token is not a valid hexadecimal number
    """
    # split() without an argument ignores leading/trailing whitespace and
    # collapses runs of whitespace, so no empty token ever reaches int().
    return "".join(
        chr(int(code_point, base=16)) for code_point in p_string_in_unicode.split()
    )
def get_normalize_short_names(p_string_in_unicode):
    """Turn an emoji name into an underscore-separated identifier.

    Every character that is not alphanumeric, a space or a dash is dropped,
    leading/trailing spaces are removed, and each remaining run of spaces
    becomes a single underscore.

    family: woman, woman, girl, girl -> family_woman_woman_girl_girl
    UP! button -> UP_button
    Japanese “free of charge” button -> Japanese_free_of_charge_button
    flag: Cocos (Keeling) Islands -> flag_Cocos_Keeling_Islands
    three o’clock -> three_oclock
    rescue worker’s helmet -> rescue_workers_helmet
    flag: São Tomé & Príncipe -> flag_São_Tomé_Príncipe
    keycap: # -> keycap

    :param p_string_in_unicode: input format
        men holding hands: dark skin tone, medium-dark skin tone
    :return: output format
        men_holding_hands_dark_skin_tone_medium-dark_skin_tone
    """
    # Keep only alphanumerics, spaces and dashes.
    kept = "".join(c for c in p_string_in_unicode if c.isalnum() or c in " -")
    # Dropping punctuation at either end can leave a stray space there;
    # strip it so the result never starts or ends with an underscore.
    return re.sub(r" +", "_", kept.strip(" "))
def load_emoji_lookup():
    """Build an ``emoji -> normalized short name`` dict from the test data file.

    Reads ``EMOJI_TEST_FILENAME`` and keeps the rows whose status is
    ``fully-qualified`` or ``minimally-qualified``. For each such row the
    text after the first ``#`` is parsed as ``<emoji> [E<version>] <name>``;
    the name is normalized with ``get_normalize_short_names``.

    :return: the lookup dict, or ``None`` when the data file was missing.
        In that case the file is downloaded and the user is asked to re-run
        the script.
    """
    try:
        with open(EMOJI_TEST_FILENAME, "r", encoding="utf8") as unicode_data:
            unicode_data_rows = unicode_data.readlines()
    except FileNotFoundError:
        print(EMOJI_TEST_FILENAME + " file not found. Downloading it ...")
        download_latest_emoji_test_data()
        print("File downloaded. Re-run the script")
        return None

    # The status column of emoji-test.txt is space-padded for alignment, so
    # allow any amount of whitespace between the status and the '#'.
    qualified_status = re.compile(r";\s*(?:minimally|fully)-qualified\s*#")
    # Rows in recent releases carry an "E<major>.<minor>" version token
    # between the emoji and its name; it is not part of the name.
    version_token = re.compile(r"E\d+\.\d+")

    qualified_rows = [row for row in unicode_data_rows if qualified_status.search(row)]
    print("Count of quailifed_emojis: " + str(len(qualified_rows)))

    emoji_lookup = {}
    for row in qualified_rows:
        # Split on the FIRST '#' only: the emoji or its name may itself
        # contain '#' (e.g. the "keycap: #" row).
        tokens = row.split("#", 1)[1].split()
        if not tokens:
            continue
        emoji_in_unicode, name_tokens = tokens[0], tokens[1:]
        if name_tokens and version_token.fullmatch(name_tokens[0]):
            name_tokens = name_tokens[1:]
        emoji_lookup[emoji_in_unicode] = get_normalize_short_names(" ".join(name_tokens))
    return emoji_lookup
def is_contains_emoji(p_string_in_unicode):
    """Tell whether any character of the text lies in a known emoji range.

    Instead of looking every character up in an emoji dictionary, this only
    compares each code point against a few contiguous intervals. It reports
    whether the text contains such a character; it does not return the
    characters found. Because the intervals are contiguous, every code point
    inside them matches, whether or not it is an emoji.

    :param p_string_in_unicode: text to scan; ``None`` or an empty string
        yields ``False``
    :return: ``True`` as soon as one character falls in a range, else ``False``
    """
    # Inclusive (low, high) code-point intervals.
    emoji_ranges = (
        (0x1F300, 0x1FAF8),  # 127744 .. 129784
        (126980, 127569),    # U+1F004 .. U+1F251
        (169, 174),          # U+00A9 .. U+00AE
        (8205, 12953),       # U+200D .. U+3299
    )
    if not p_string_in_unicode:
        return False
    for a_char in p_string_in_unicode:
        char_code = ord(a_char)
        if any(low <= char_code <= high for low, high in emoji_ranges):
            return True
    return False
def test_emoji_range_with(emoji_dict: dict):
    """Return the entries of ``emoji_dict`` that the range check does not detect.

    :param emoji_dict: mapping whose keys are emoji strings
    :return: set of ``(key, value)`` pairs for which ``is_contains_emoji(key)``
        is false; an empty set means every key was detected
    """
    # Equivalent to the symmetric difference between all items and the
    # detected items, since the detected items are a subset of all items.
    return {
        (emoji, short_name)
        for emoji, short_name in emoji_dict.items()
        if not is_contains_emoji(emoji)
    }
# Self-check: load the emoji table and verify that is_contains_emoji detects
# every emoji listed in it.
emoji_dict = load_emoji_lookup()
if emoji_dict is not None:
    # Pairs whose emoji key is the empty string are ignored.
    missing_emojis_list = [p_key for p_key in test_emoji_range_with(emoji_dict) if p_key[0] != ""]
    if missing_emojis_list:
        # Reuse the list computed above instead of running the check again.
        print("List of emojis which are not in the range: " + str(missing_emojis_list))
    else:
        print("Range values are correct and detects all emojis")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Better use the "latest" source: https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt