Last active
January 7, 2021 10:05
-
-
Save howard-haowen/7cd623cfd11400c81f0ad63b1035f2d6 to your computer and use it in GitHub Desktop.
Clean a list of tokens
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!pip install zhon | |
import re | |
import zhon.hanzi as hanzi | |
from string import punctuation as en_punc | |
from string import ascii_letters as roman_letters | |
# Punctuation to strip: the zhon hanzi punctuation plus string.punctuation.
zh_punc = hanzi.punctuation
punc_set = set(zh_punc) | set(en_punc)
# Extra symbols not covered by either source; add additional ones here.
punc_list = [*punc_set, '※', '≧']
# The eleven Chinese numeral characters, one list entry per character.
zh_num = [*'一二三四五六七八九十零']
# Every letter in string.ascii_letters, one list entry per letter.
en_alpha = [*roman_letters]
def remove_punc(myStr):
    """Return ``myStr`` with every character listed in ``punc_list`` deleted.

    The characters are removed outright (mapped to the empty string); they
    are not replaced by a space.
    """
    deletions = dict.fromkeys(map(ord, punc_list), "")
    return myStr.translate(deletions)
def remove_url(myStr):
    """Return ``myStr`` with every ``http...`` run deleted.

    A match starts at the literal text ``http`` and extends over the
    non-whitespace characters that follow (at least one is required), so a
    bare ``http`` with nothing after it is left untouched.
    """
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", myStr)
def remove_single_alpha(myStr):
    """Return ``myStr`` with every character listed in ``en_alpha`` deleted.

    NOTE: despite the name, the listed letters are stripped wherever they
    occur in the string; this does not merely drop single-letter strings.
    """
    deletions = dict.fromkeys(map(ord, en_alpha), "")
    return myStr.translate(deletions)
def has_en_num(myStr):
    """Return True if at least one character of ``myStr`` passes ``str.isdigit``."""
    for ch in myStr:
        if ch.isdigit():
            return True
    return False
def clean_tokens(tokenList):
    """Clean each token in ``tokenList`` and join the survivors with spaces.

    Every token is first passed through ``remove_punc``, ``remove_url`` and
    ``remove_single_alpha`` (in that order). The cleaned token is then
    discarded if any of the checks below applies; the remaining tokens are
    returned as one space-separated string.
    """
    kept = []
    for raw in tokenList:
        tok = remove_single_alpha(remove_url(remove_punc(raw)))

        # Contains a digit character, or consists solely of numeric
        # characters (str.isnumeric also covers e.g. Chinese numerals).
        if has_en_num(tok) or tok.isnumeric():
            continue
        # Is exactly one of the entries in zh_num.
        if tok in zh_num:
            continue
        # Chinese numeral(s) followed by 年/月/日: year, month or day expressions.
        if re.search(r'(一|二|三|四|五|六|七|八|九|十|零)+(年|月|日)+', tok):
            continue
        # 第 followed by Chinese numeral(s): ordinals.
        if re.search(r'第(一|二|三|四|五|六|七|八|九|十|零)+', tok):
            continue
        # Contains whitespace anywhere, or became empty after cleaning.
        if re.search(r'\s+', tok) or tok == '':
            continue

        kept.append(tok)
    return " ".join(kept)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment