Skip to content

Instantly share code, notes, and snippets.

@iamalbert
Created September 29, 2016 08:43
Show Gist options
  • Save iamalbert/771fe9e80be56e582668535e68556291 to your computer and use it in GitHub Desktop.
Save iamalbert/771fe9e80be56e582668535e68556291 to your computer and use it in GitHub Desktop.
normalize_text
# Map each full-width character to its half-width (ASCII) counterpart.
#
# The full-width forms U+FF01..U+FF5E mirror printable ASCII 0x21..0x7E at a
# fixed offset of 0xFEE0, and the ideographic space U+3000 pairs with the
# ordinary space.  The table is built from code points instead of literal
# characters so it cannot be silently turned into an identity map by editors
# or tools that normalize full-width text, and so the quote and backslash
# entries need no special escaping.
FULL2HALF = {"\u3000": " "}  # full space and halfspace
FULL2HALF.update(
    (chr(code + 0xFEE0), chr(code)) for code in range(0x21, 0x7F)
)

# Same mapping keyed by code point, which is the form str.translate() expects.
_FULL2HALF = {ord(full): half for full, half in FULL2HALF.items()}
def full2half(s):
    """Return *s* with full-width characters converted to half-width.

    Every character that has an entry in the module-level ``_FULL2HALF``
    translation table is replaced by its mapped value; all other characters
    are passed through unchanged.
    """
    return s.translate(_FULL2HALF)
def normalizedText(sent):
    """Normalize *sent* and tokenize it at character and word level.

    Normalization steps, in order:

    1. every match of ``URL_REG`` is replaced by the literal ``<URL>``;
    2. HTML entities are unescaped;
    3. full-width characters are folded to half-width, then the text is
       passed through ``HanziConv.toTraditional``;
    4. any run of four or more identical characters is shortened to three;
    5. any run of whitespace is collapsed to a single space.

    The result is split into chunks with ``LING_REG``.  Chunks that match
    ``CJK_REG`` are segmented with ``jieba.cut`` for the word-level output
    and split into individual characters for the character-level output.
    Other chunks are appended whole to both outputs, except that a
    single-space chunk is kept only when it sits directly between two CJK
    chunks.

    Returns:
        A ``(char, seg)`` tuple of lists: the character-level tokens and the
        word-level tokens.
    """
    sent = re.sub(URL_REG, "<URL>", sent)
    sent = html.unescape(sent)
    sent = HanziConv.toTraditional(full2half(sent))
    # Group 2 is a single character; three or more further repeats of it
    # (four or more in total) are squeezed down to exactly three.
    sent = re.sub(r"((.)\2{3,})", r"\2\2\2", sent)
    sent = re.sub(r"\s+", r" ", sent)

    arr = re.findall(LING_REG, sent)
    # Match object (truthy) for CJK chunks, None otherwise; indexed in
    # parallel with ``arr`` so neighbours can be inspected below.
    isCJK = [re.match(CJK_REG, chunk) for chunk in arr]

    char = []
    seg = []
    # NOTE(review): the nesting of the space-handling branches below was
    # reconstructed from a copy whose indentation had been lost -- confirm
    # against the intended behavior.
    for i, chunk in enumerate(arr):
        if isCJK[i]:
            seg.extend(jieba.cut(chunk))
            char.extend(chunk)  # extending with a str adds one item per character
        elif chunk != " ":
            seg.append(chunk)
            char.append(chunk)
        elif 0 < i < len(arr) - 1 and isCJK[i - 1] and isCJK[i + 1]:
            # A lone space is preserved only between two CJK chunks.
            seg.append(chunk)
            char.append(chunk)
    return char, seg
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment