Last active
October 13, 2018 03:02
-
-
Save jiqiujia/0546377019d24a229a8298561b275506 to your computer and use it in GitHub Desktop.
nlp
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
###byte pair encoding | |
###Neural Machine Translation of Rare Words with Subword Units | |
###from https://plmsmile.github.io/2017/10/19/subword-units/ | |
import re | |
def process_raw_words(words, endtag='-'):
    """Split each word into space-separated symbols and append an end tag.

    A space is inserted in front of every ASCII letter, so a word made of
    letters becomes one symbol per letter (with a leading space). Characters
    that are not ASCII letters stay attached to whatever precedes them.

    Args:
        words: dict mapping a raw word to its count.
        endtag: end-of-word symbol appended after a single space.

    Returns:
        dict mapping the space-separated word (ending in ``endtag``) to the
        count of the raw word.
    """
    letter = re.compile(r'([a-zA-Z])')
    return {
        letter.sub(r' \1', raw) + ' ' + endtag: freq
        for raw, freq in words.items()
    }
def get_symbol_pairs(vocabs):
    """Count every pair of adjacent symbols across the vocabulary.

    Args:
        vocabs: dict mapping a word (symbols separated by whitespace) to its
            count.

    Returns:
        dict mapping ``(symbol_1, symbol_2)`` to the summed count of all
        words in which the two symbols appear next to each other, weighted
        by how often the pair occurs in each word.
    """
    totals = {}
    for entry, weight in vocabs.items():
        tokens = entry.split()
        # Pairing the token list with itself shifted by one yields neighbours.
        for bigram in zip(tokens, tokens[1:]):
            if bigram in totals:
                totals[bigram] += weight
            else:
                totals[bigram] = weight
    return totals
def merge_symbols(symbol_pair, vocabs):
    """Replace every occurrence of the symbol pair 'a b' with 'ab'.

    Args:
        symbol_pair: ``(a, b)``, the two symbols to merge.
        vocabs: dict mapping a word (symbols separated by spaces) to its
            count.

    Returns:
        A new dict with the same counts whose keys have each whole-symbol
        occurrence of ``'a b'`` replaced by ``'ab'``.
    """
    raw = ' '.join(symbol_pair)
    merged = ''.join(symbol_pair)
    # Escape the pair so regex metacharacters in symbols match literally; the
    # lookarounds require whitespace (or string edge) on both sides so only
    # whole symbols are merged.
    pattern = re.compile(r'(?<!\S)' + re.escape(raw) + r'(?!\S)')

    def _replacement(_match):
        # A callable bypasses re's replacement-template processing, so a
        # backslash inside a symbol is inserted literally instead of being
        # read as an escape sequence.
        return merged

    return {
        pattern.sub(_replacement, word): count
        for word, count in vocabs.items()
    }
# Toy corpus for the byte-pair-encoding demo: raw word -> frequency.
raw_words = {"low":5, "lower":2, "newest":6, "widest":3}
vocabs = process_raw_words(raw_words)
print(vocabs)

num_merges = 10
for i in range(num_merges):
    pairs = get_symbol_pairs(vocabs)
    if not pairs:
        # Every word is already a single symbol; max() on an empty dict
        # would raise ValueError, so stop merging early.
        break
    # Pick the most frequent adjacent pair and merge it everywhere.
    symbol_pair = max(pairs, key=pairs.get)
    vocabs = merge_symbols(symbol_pair, vocabs)
print(vocabs)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding:GBK -*- | |
"""汉字处理的工具: | |
判断unicode是否是汉字,数字,英文,或者其他字符。 | |
全角符号转半角符号。""" | |
def is_chinese(uchar):
    """Return True if the single unicode character is in U+4E00..U+9FA5."""
    # The bounds must be the escaped code points; without the backslash they
    # are the 5-character strings 'u4e00'/'u9fa5' and the test never matches.
    return u'\u4e00' <= uchar <= u'\u9fa5'
def is_number(uchar):
    """Return True if the single unicode character is an ASCII digit 0-9."""
    # Compare against the real code points U+0030..U+0039; the unescaped
    # literals 'u0030'/'u0039' made this test always false.
    return u'\u0030' <= uchar <= u'\u0039'
def is_alphabet(uchar):
    """Return True if the single unicode character is an ASCII letter."""
    # U+0041..U+005A is A-Z and U+0061..U+007A is a-z. The bounds need the
    # backslash escape; as plain 'u0041'-style strings nothing ever matched.
    is_upper = u'\u0041' <= uchar <= u'\u005a'
    is_lower = u'\u0061' <= uchar <= u'\u007a'
    return is_upper or is_lower
def is_other(uchar):
    """Return True if the character is none of: Chinese, digit, English letter."""
    if is_chinese(uchar):
        return False
    if is_number(uchar):
        return False
    if is_alphabet(uchar):
        return False
    return True
def B2Q(uchar):
    """Convert a half-width character to its full-width form.

    Characters outside 0x20..0x7e are returned unchanged.
    """
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3: unichr no longer exists
    inside_code = ord(uchar)
    if inside_code < 0x0020 or inside_code > 0x7e:
        # Not a half-width character: return it as is.
        return uchar
    if inside_code == 0x0020:
        # The space is the exception and maps to U+3000.
        inside_code = 0x3000
    else:
        # Every other character obeys: half-width = full-width - 0xfee0.
        inside_code += 0xfee0
    return to_char(inside_code)
def Q2B(uchar):
    """Convert a full-width character to its half-width form.

    If the converted code point is not in 0x20..0x7e, the original character
    is returned unchanged.
    """
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3: unichr no longer exists
    inside_code = ord(uchar)
    if inside_code == 0x3000:
        # U+3000 is the one case outside the fixed offset; it maps to space.
        inside_code = 0x0020
    else:
        inside_code -= 0xfee0
    if inside_code < 0x0020 or inside_code > 0x7e:
        # The result is not a half-width character: keep the input.
        return uchar
    return to_char(inside_code)
def stringQ2B(ustring):
    """Convert every full-width character in the string to half-width."""
    converted = []
    for ch in ustring:
        converted.append(Q2B(ch))
    return "".join(converted)
def uniform(ustring):
    """Normalize a string: full-width to half-width, then lower-case."""
    half_width = stringQ2B(ustring)
    return half_width.lower()
def string2List(ustring):
    """Split the string on every character for which is_other() is true.

    Returns the list of non-empty runs of consecutive characters for which
    is_other() is false; the separator characters themselves are dropped.
    """
    pieces = []
    run = []
    for ch in ustring:
        if not is_other(ch):
            run.append(ch)
        elif run:
            # A separator ends the current run; consecutive separators
            # produce no empty entries.
            pieces.append("".join(run))
            run = []
    if run:
        pieces.append("".join(run))
    return pieces
if __name__ == "__main__":
    # Python 3 has no unichr; define it at module level so this self-test
    # (and the conversion helpers it calls) run on both major versions.
    try:
        unichr
    except NameError:
        unichr = chr

    # test Q2B and B2Q: round-trip every printable half-width character
    for i in range(0x0020, 0x007F):
        # A single pre-formatted argument prints the same text whether
        # print is a statement (Python 2) or a function (Python 3).
        print(u"%s %s" % (Q2B(B2Q(unichr(i))), B2Q(unichr(i))))

    # test uniform
    ustring = u'中国 人名a高频A'
    ustring = uniform(ustring)
    ret = string2List(ustring)
    print(ret)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import unicodedata | |
import string | |
# Characters that survive the ASCII conversion: letters plus a little punctuation.
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)


# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return ``s`` reduced to the characters listed in ``all_letters``.

    The string is NFD-normalized so accented letters split into a base
    letter plus combining marks; the marks (category 'Mn') are dropped, as
    is every character not present in ``all_letters``.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            continue
        if ch in all_letters:
            kept.append(ch)
    return ''.join(kept)


print(unicodeToAscii('Ślusàrski'))
#python2 and python3 print unicode | |
import sys, codecs | |
print(sys.stdout.encoding)
# When stdout reports no encoding or the 'ANSI_X3.4-1968' name, wrap it in
# a UTF-8 writer that substitutes unencodable characters rather than raising.
if sys.stdout.encoding in (None, 'ANSI_X3.4-1968'):
    utf8_writer = codecs.getwriter('UTF-8')
    # On Python 3 the writer is attached to sys.stdout.buffer; on Python 2
    # it wraps sys.stdout itself.
    sys.stdout = utf8_writer(
        sys.stdout if sys.version_info.major < 3 else sys.stdout.buffer,
        errors='replace'
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment