Skip to content

Instantly share code, notes, and snippets.

@howard-haowen
Last active January 7, 2021 10:05
Show Gist options
  • Save howard-haowen/7cd623cfd11400c81f0ad63b1035f2d6 to your computer and use it in GitHub Desktop.
Save howard-haowen/7cd623cfd11400c81f0ad63b1035f2d6 to your computer and use it in GitHub Desktop.
Clean a list of tokens
#!pip install zhon
import re
import zhon.hanzi as hanzi
from string import punctuation as en_punc
from string import ascii_letters as roman_letters
# Character inventories shared by the cleaning helpers below.
zh_punc = hanzi.punctuation
# Punctuation marks from both the Chinese (zhon) and English (string) sets.
punc_set = set(zh_punc) | set(en_punc)
# Extra symbols missing from both sets go at the end; add additional ones here.
punc_list = [*punc_set, '※', '≧']
# The eleven single-character Chinese numerals (one..ten plus zero).
zh_num = ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十', '零']
# Every ASCII letter, lower and upper case, one character per entry.
en_alpha = [*roman_letters]
def remove_punc(myStr):
    """Return ``myStr`` with every character found in ``punc_list`` deleted."""
    # Each punctuation code point maps to the empty string, so str.translate
    # drops the character rather than substituting a space for it.
    deletion_table = dict.fromkeys(map(ord, punc_list), "")
    return myStr.translate(deletion_table)
def remove_url(myStr):
    """Return ``myStr`` with URL-like runs removed.

    A run is the literal text ``http`` followed by one or more
    non-whitespace characters; whitespace around the run is left in place.
    """
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", myStr)
def remove_single_alpha(myStr):
    """Return ``myStr`` with every character found in ``en_alpha`` deleted.

    NOTE: despite the name, this does not only target single-letter
    strings; each listed letter is stripped wherever it occurs in ``myStr``.
    """
    # str.maketrans turns the one-character keys into code points; mapping
    # them to "" makes translate delete those characters.
    letter_table = str.maketrans({letter: "" for letter in en_alpha})
    return myStr.translate(letter_table)
def has_en_num(myStr):
    """Return True if at least one character of ``myStr`` passes ``str.isdigit()``."""
    for ch in myStr:
        if ch.isdigit():
            return True
    return False
def clean_tokens(tokenList):
    """Clean each token, drop unwanted ones, and join the rest with spaces.

    Every token is first passed through ``remove_punc``, ``remove_url`` and
    ``remove_single_alpha`` (in that order).  The cleaned token is then
    discarded when any of the checks in the loop below applies.

    Returns a single string with the surviving tokens separated by one space.
    """
    numeral_date = r'(一|二|三|四|五|六|七|八|九|十|零)+(年|月|日)+'
    numeral_ordinal = r'第(一|二|三|四|五|六|七|八|九|十|零)+'

    kept = []
    for raw in tokenList:
        token = remove_single_alpha(remove_url(remove_punc(raw)))
        # Nothing left after stripping punctuation, URLs and letters.
        if token == '':
            continue
        # Contains a digit character, or is made up solely of numeric
        # characters (decimals, super/subscripts, fractions, roman numerals...).
        if has_en_num(token) or token.isnumeric():
            continue
        # Exactly one of the stand-alone Chinese numerals.
        if token in zh_num:
            continue
        # Chinese numerals followed by a year/month/day marker.
        if re.search(numeral_date, token):
            continue
        # Ordinals: the prefix 第 followed by Chinese numerals.
        if re.search(numeral_ordinal, token):
            continue
        # Any whitespace inside the token disqualifies the whole token.
        if re.search(r'\s+', token):
            continue
        kept.append(token)

    return " ".join(kept)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment