Created
May 19, 2019 17:58
-
-
Save alexott/8d07abb61775bf56efea9f054d4bca18 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
test ♈ up ☝️ light skin ☝🏻 Zimbabwe 🇿🇼 England 🏴 keycap0 0️⃣end 0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Module to work with emojis in text""" | |
import pickle | |
# TODO: add a function that loads all emojis together with their names, etc.
def add_emoji(emojis, ch1, ch2=''):
    """Register one emoji in the lookup table ``emojis`` (mutated in place).

    The table is keyed by the first character of an emoji. A value is either
    ``True`` (only the single-character emoji is known) or a dict whose keys
    are the possible continuations after that first character; in such a
    dict the ``''`` key marks that the first character is also an emoji on
    its own.

    Args:
        emojis: the lookup table to update.
        ch1: first character of the emoji.
        ch2: the remaining characters of the emoji, or ``''`` for a
            single-character emoji.
    """
    known = emojis.get(ch1)
    if isinstance(known, dict):
        # Already have continuations for ch1: just record one more
        # ('' included, meaning "ch1 alone is an emoji too").
        known[ch2] = True
    elif known:
        # ch1 was only known as a standalone emoji so far. Promote the entry
        # to a dict and store it back in the table; binding it to a local
        # name only would silently drop the sequence.
        if ch2:
            emojis[ch1] = {ch2: True, '': True}
    elif ch2:
        emojis[ch1] = {ch2: True}
    else:
        emojis[ch1] = True
# emoji-all.txt is the concatenation of emoji-zwj-sequences.txt, emoji-data.txt
# and emoji-sequences.txt, downloaded from https://unicode.org/Public/emoji/12.0/,
# with the entries for #, numbers, and copyright/trademark/regmark removed by hand.
# TODO: make a list of exclusions, and load all files without manual editing
def load_emojis(fname="emoji-all.txt"):
    """Build the emoji lookup table from a Unicode emoji data file.

    The file is expected to be in the format of the emoji data files published at
    https://unicode.org/Public/emoji/12.0/ (emoji-data.txt,
    emoji-sequences.txt, emoji-zwj-sequences.txt): each data line holds
    either a single hex code point, a ``XXXX..YYYY`` range of code points,
    or a space-separated sequence of code points, followed by ``;`` and
    further fields. Lines starting with ``#`` and lines without ``;`` are
    skipped.

    Args:
        fname: path of the data file to read.

    Returns:
        A dict filled through ``add_emoji``: the key is the first character
        of an emoji; every code point of a range is registered as a
        single-character emoji, and for a sequence the characters after the
        first one are registered as a continuation of the first.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a code point field is not valid hexadecimal.
    """
    emojis = {}
    # Read as UTF-8 explicitly: the published data files carry the literal
    # emoji in their trailing comments, so the platform default encoding
    # may fail to decode them.
    with open(fname, "r", encoding="utf-8") as f:
        for line in f:
            if line.startswith('#'):
                continue
            spec, semicolon, _ = line.partition(';')
            if not semicolon:
                continue
            spec = spec.strip()
            if not spec:
                continue
            first, dots, last = spec.partition('..')
            if dots:
                # Inclusive range of single-character emojis.
                for code in range(int(first, 16), int(last, 16) + 1):
                    add_emoji(emojis, chr(code))
                continue
            # One code point, or a sequence of them; split() tolerates
            # repeated whitespace between code points.
            chars = [chr(int(token, 16)) for token in spec.split()]
            add_emoji(emojis, chars[0], ''.join(chars[1:]))
    return emojis
def generate_pickle(pickle_file="emojis.pickle", emoji_file="emoji-all.txt"):
    """Parse ``emoji_file`` with ``load_emojis`` and pickle the table.

    Args:
        pickle_file: path of the pickle file to (over)write.
        emoji_file: path of the emoji data file handed to ``load_emojis``.
    """
    table = load_emojis(emoji_file)
    with open(pickle_file, "wb") as sink:
        pickle.dump(table, sink)
def load_pickle(pickle_file="emojis.pickle"):
    """Load the emoji lookup table from a pickle file.

    Best-effort: if the file cannot be opened or its content cannot be
    unpickled, a message is printed and an empty dict is returned.

    Args:
        pickle_file: path of the pickle file to read.

    Returns:
        The unpickled object, or ``{}`` on failure.
    """
    emojis = {}
    try:
        with open(pickle_file, "rb") as f:
            # NOTE: unpickling can execute arbitrary code; only load files
            # from a trusted source.
            emojis = pickle.load(f)
    except pickle.PickleError as ex:
        # str.format (was misspelled, which raised AttributeError here).
        print('Pickling error: {}'.format(ex))
    except IOError:
        print('Cannot open ' + pickle_file)
    return emojis
def strip_emojis(emojis, txt):
    """Return ``txt`` with every emoji found in the table ``emojis`` removed.

    Args:
        emojis: lookup table keyed by the first character of an emoji. A
            non-dict value means that character alone is an emoji. A dict
            value maps the possible continuations after that character; its
            ``''`` key, when present, means the character is also an emoji
            on its own.
        txt: the text to clean.

    Returns:
        The text without the recognised emojis. A character that only
        starts multi-character emojis is kept when none of its
        continuations follows it.
    """
    kept = []
    pos = 0
    end = len(txt)
    while pos < end:
        ch = txt[pos]
        pos += 1
        if ch not in emojis:
            kept.append(ch)
            continue
        tails = emojis[ch]
        if not isinstance(tails, dict):
            # Single-character emoji: drop it.
            continue
        # Consume only the longest matching continuation. Continuations can
        # be prefixes of one another; skipping the summed length of every
        # match would swallow the text that follows the emoji.
        longest = 0
        for tail in tails:
            if len(tail) > longest and txt.startswith(tail, pos):
                longest = len(tail)
        if longest:
            pos += longest
        elif '' not in tails:
            # Not an emoji by itself and no continuation matched: keep it.
            kept.append(ch)
    return ''.join(kept)
# Test:
# with open('emoji-test.txt', encoding='utf8') as f:
#     emoji_test = f.read().strip()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment