howard-haowen · January 7, 2021 10:05
diff --git a/CleanTokens.py b/CleanTokens.py
 #!pip install zhon

 import re
 import zhon.hanzi as hanzi
 from string import punctuation as en_punc
 from string import ascii_letters as roman_letters

 # Punctuation to strip: Chinese marks from zhon plus ASCII punctuation.
 zh_punc = hanzi.punctuation
 punc_set = set(zh_punc) | set(en_punc)
 # Extra symbols covered by neither source; add further ones to this literal.
 punc_list = [*punc_set, '※', '≧']
 # The eleven single-character Chinese numerals.
 zh_num = [*'一二三四五六七八九十零']
 # Every ASCII letter, lower- and upper-case.
 en_alpha = [*roman_letters]

 def remove_punc(myStr):
     """Return ``myStr`` with every character listed in ``punc_list`` deleted."""
     # Mapping a code point to the empty string makes translate() drop it.
     deletion_table = dict.fromkeys(map(ord, punc_list), "")
     return myStr.translate(deletion_table)

 def remove_url(myStr):
     """Return ``myStr`` with each ``http``-prefixed run of non-whitespace removed.

     Only the matched text is dropped; whitespace around it is left in place.
     """
     url_pattern = re.compile(r"http\S+")
     return url_pattern.sub("", myStr)

 def remove_single_alpha(myStr):
     """Return ``myStr`` with every letter listed in ``en_alpha`` deleted.

     NOTE(review): despite the name, this strips each listed letter wherever
     it occurs (whole words included), not only one-letter tokens -- confirm
     that this is the intended behaviour.
     """
     letter_table = dict.fromkeys(map(ord, en_alpha), "")
     return myStr.translate(letter_table)

 def has_en_num(myStr):
     """Return True when at least one character of ``myStr`` is a digit.

     The check is ``str.isdigit``, so non-ASCII digit characters count too.
     """
     for ch in myStr:
         if ch.isdigit():
             return True
     return False

 def clean_tokens(tokenList):
     """Clean each token, drop unwanted ones, and join the rest with spaces.

     Every token first has punctuation, ``http...`` runs and ASCII letters
     stripped.  The stripped token is then discarded when it contains a digit,
     is numeric, is a lone Chinese numeral, looks like a date or an ordinal,
     contains whitespace, or has become empty.

     Returns a single string (not a list) of the surviving tokens separated
     by one space each.
     """
     date_pattern = re.compile(r'(一|二|三|四|五|六|七|八|九|十|零)+(年|月|日)+')
     ordinal_pattern = re.compile(r'第(一|二|三|四|五|六|七|八|九|十|零)+')
     whitespace_pattern = re.compile(r'\s+')

     kept = []
     for raw in tokenList:
         cleaned = remove_single_alpha(remove_url(remove_punc(raw)))
         # Anything containing a digit, or numeric as a whole (str.isnumeric
         # also covers fractions, Roman numerals and CJK numerals).
         if has_en_num(cleaned) or cleaned.isnumeric():
             continue
         # A token that is exactly one of the eleven Chinese numerals.
         if cleaned in zh_num:
             continue
         # Chinese numerals followed by 年/月/日 (year / month / day).
         if date_pattern.search(cleaned):
             continue
         # 第 followed by Chinese numerals, i.e. ordinals such as 第一.
         if ordinal_pattern.search(cleaned):
             continue
         # Tokens that still contain whitespace are dropped entirely.
         if whitespace_pattern.search(cleaned):
             continue
         # Stripping may have left nothing behind.
         if cleaned == '':
             continue
         kept.append(cleaned)
     return " ".join(kept)
	#!pip install zhon

	import re
	import zhon.hanzi as hanzi
	from string import punctuation as en_punc
	from string import ascii_letters as roman_letters

	# Punctuation to strip: Chinese marks from zhon plus ASCII punctuation.
	zh_punc = hanzi.punctuation
	punc_set = set(zh_punc) | set(en_punc)
	# Extra symbols covered by neither source; add further ones to this literal.
	punc_list = [*punc_set, '※', '≧']
	# The eleven single-character Chinese numerals.
	zh_num = [*'一二三四五六七八九十零']
	# Every ASCII letter, lower- and upper-case.
	en_alpha = [*roman_letters]

	def remove_punc(myStr):
	    """Return ``myStr`` with every character listed in ``punc_list`` deleted."""
	    # The body must be indented under the def; mapping each code point to
	    # the empty string makes translate() drop that character.
	    res = myStr.translate({ord(p): "" for p in punc_list})
	    return res

	def remove_url(myStr):
	    """Return ``myStr`` with each ``http``-prefixed run of non-whitespace removed.

	    Only the matched text is dropped; whitespace around it is left in place.
	    """
	    res = re.sub(r"http\S+", "", myStr)
	    return res

	def remove_single_alpha(myStr):
	    """Return ``myStr`` with every letter listed in ``en_alpha`` deleted.

	    NOTE(review): despite the name, this strips each listed letter wherever
	    it occurs (whole words included), not only one-letter tokens -- confirm
	    that this is the intended behaviour.
	    """
	    # Mapping a code point to the empty string makes translate() drop it.
	    res = myStr.translate({ord(p): "" for p in en_alpha})
	    return res

	def has_en_num(myStr):
	    """Return True when at least one character of ``myStr`` is a digit.

	    The check is ``str.isdigit``, so non-ASCII digit characters count too.
	    """
	    return any(char.isdigit() for char in myStr)

	def clean_tokens(tokenList):
	    """Clean each token, drop unwanted ones, and join the rest with spaces.

	    Every token first has punctuation, ``http...`` runs and ASCII letters
	    stripped.  The stripped token is then discarded when it contains a digit,
	    is numeric, is a lone Chinese numeral, looks like a date or an ordinal,
	    contains whitespace, or has become empty.

	    Returns a single string (not a list) of the surviving tokens separated
	    by one space each.
	    """
	    # Character classes are used instead of ``\|``: an escaped pipe matches
	    # a literal "|" rather than acting as alternation, so the numeral
	    # patterns would never match real tokens such as "三月" or "第一".
	    date_pattern = re.compile(r'[一二三四五六七八九十零]+[年月日]+')
	    ordinal_pattern = re.compile(r'第[一二三四五六七八九十零]+')

	    res = [remove_punc(tok) for tok in tokenList]  # strip punctuation
	    res = [remove_url(tok) for tok in res]  # strip http... runs
	    res = [remove_single_alpha(tok) for tok in res]  # strip ASCII letters
	    res = [tok for tok in res if not has_en_num(tok)]  # drop tokens containing digits
	    # str.isnumeric also covers fractions, Roman numerals and CJK numerals.
	    res = [tok for tok in res if not tok.isnumeric()]
	    res = [tok for tok in res if tok not in zh_num]  # drop lone Chinese numerals
	    # Chinese numerals followed by 年/月/日 (year / month / day).
	    res = [tok for tok in res if not date_pattern.search(tok)]
	    # 第 followed by Chinese numerals, i.e. ordinals such as 第一.
	    res = [tok for tok in res if not ordinal_pattern.search(tok)]
	    res = [tok for tok in res if not re.search(r'\s+', tok)]  # drop tokens containing whitespace
	    res = [tok for tok in res if tok != '']  # drop tokens emptied by stripping
	    res_str = " ".join(res)
	    return res_str