Skip to content

Instantly share code, notes, and snippets.

@jiqiujia
Last active October 13, 2018 03:02
Show Gist options
  • Save jiqiujia/0546377019d24a229a8298561b275506 to your computer and use it in GitHub Desktop.
Save jiqiujia/0546377019d24a229a8298561b275506 to your computer and use it in GitHub Desktop.
nlp
###byte pair encoding
###Neural Machine Translation of Rare Words with Subword Units
###from https://plmsmile.github.io/2017/10/19/subword-units/
import re
def process_raw_words(words, endtag='-'):
    """Split every word into single-character symbols and append an end tag.

    Each character of a word becomes its own space-prefixed symbol and
    ``endtag`` is appended as the last symbol, so ``'low'`` becomes
    ``' l o w -'``.  Every character is split, not only ASCII letters, so
    digits, punctuation and non-ASCII characters become separate symbols
    instead of staying glued to the preceding one.

    Args:
        words: dict mapping a raw word to its occurrence count.
        endtag: symbol appended to mark the end of each word.  Pick a tag
            that cannot occur inside a word, otherwise it is
            indistinguishable from an ordinary symbol.

    Returns:
        dict mapping the space-separated symbol string to the word's count.
    """
    vocabs = {}
    for word, count in words.items():
        # Prefix every character (including the first) with a space; this
        # keeps the key format identical to what letter-only words always
        # produced.
        symbols = ''.join(' ' + char for char in word)
        vocabs[symbols + ' ' + endtag] = count
    return vocabs
def get_symbol_pairs(vocabs):
    """Count every pair of adjacent symbols across the vocabulary.

    Args:
        vocabs: dict mapping a word, written as whitespace-separated
            symbols, to the word's occurrence count.

    Returns:
        dict mapping ``(symbol1, symbol2)`` to its total count; each
        occurrence of a pair contributes the count of the word it is in.
    """
    pairs = {}
    for word, freq in vocabs.items():
        symbols = word.split()
        # Zipping the sequence with its one-step shift yields the adjacent
        # pairs in left-to-right order.
        for pair in zip(symbols, symbols[1:]):
            pairs[pair] = pairs.get(pair, 0) + freq
    return pairs
def merge_symbols(symbol_pair, vocabs):
    """Replace every occurrence of the symbol pair 'a b' with 'ab'.

    Args:
        symbol_pair: ``(a, b)`` tuple of the two symbols to merge.
        vocabs: dict mapping a word, written as space-separated symbols,
            to its occurrence count.

    Returns:
        A new dict in which the pair has been merged in every word.  If two
        words become identical after merging, their counts are added.
    """
    raw = ' '.join(symbol_pair)
    merged = ''.join(symbol_pair)
    # Escape the pair so regex metacharacters in symbols match literally.
    # The lookarounds make the match start and end on symbol boundaries, so
    # 'e s' is not found inside 'ne s' or 'e st'.
    pattern = re.compile(r'(?<!\S)' + re.escape(raw) + r'(?!\S)')

    def replacement(_match):
        # A callable replacement inserts the text verbatim; passing the
        # string itself would make re interpret backslashes in it as
        # escape sequences.
        return merged

    vocabs_new = {}
    for word, count in vocabs.items():
        word_new = pattern.sub(replacement, word)
        vocabs_new[word_new] = vocabs_new.get(word_new, 0) + count
    return vocabs_new
# Demo: learn byte-pair-encoding merges on a toy vocabulary.
raw_words = {"low": 5, "lower": 2, "newest": 6, "widest": 3}
vocabs = process_raw_words(raw_words)
print(vocabs)
num_merges = 10
for _ in range(num_merges):
    pairs = get_symbol_pairs(vocabs)
    if not pairs:
        # Every word is already a single symbol, so there is nothing left
        # to merge (and max() would raise on an empty table).
        break
    # Pick the most frequent pair.
    symbol_pair = max(pairs, key=pairs.get)
    vocabs = merge_symbols(symbol_pair, vocabs)
# NOTE(review): the original indentation was lost; the final vocabulary is
# assumed to be printed once, after all merges -- confirm.
print(vocabs)
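#!/usr/bin/env python
# -*- coding:GBK -*-
"""Utilities for handling Chinese text:
check whether a unicode character is a Chinese character, a digit, an English letter, or some other character;
convert full-width characters to half-width ones."""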
def is_chinese(uchar):
    """Return True if the unicode character is a Chinese character.

    A character counts as Chinese when its code point lies in the range
    U+4E00..U+9FA5 (inclusive).
    """
    # The bounds must be real code-point escapes; without the backslash
    # they are five-character strings and the comparison is meaningless.
    return u'\u4e00' <= uchar <= u'\u9fa5'
def is_number(uchar):
    """Return True if the unicode character is an ASCII digit (0-9)."""
    # U+0030 is '0' and U+0039 is '9'; the bounds must be real code-point
    # escapes rather than the literal text 'u0030' / 'u0039'.
    return u'\u0030' <= uchar <= u'\u0039'
def is_alphabet(uchar):
    """Return True if the unicode character is an ASCII letter (A-Z or a-z)."""
    # U+0041..U+005A is 'A'..'Z' and U+0061..U+007A is 'a'..'z'; the bounds
    # must be real code-point escapes, not the literal text 'u0041' etc.
    is_upper = u'\u0041' <= uchar <= u'\u005a'
    is_lower = u'\u0061' <= uchar <= u'\u007a'
    return is_upper or is_lower
def is_other(uchar):
    """Return True if the character is not Chinese, a digit or a letter.

    The three categories are decided by ``is_chinese``, ``is_number`` and
    ``is_alphabet``.
    """
    return not (is_chinese(uchar) or is_number(uchar) or is_alphabet(uchar))
try:
    unichr  # Python 2 builtin
except NameError:
    unichr = chr  # Python 3: chr() already returns a unicode character


def B2Q(uchar):
    """Convert a half-width character to its full-width counterpart.

    Args:
        uchar: a single character.

    Returns:
        The full-width character for code points 0x20..0x7E; any other
        character is returned unchanged.
    """
    inside_code = ord(uchar)
    if inside_code < 0x0020 or inside_code > 0x7e:
        # Not a half-width character: return it as is.
        return uchar
    if inside_code == 0x0020:
        # The space is the one exception to the fixed offset below.
        inside_code = 0x3000
    else:
        # Every other character: full-width = half-width + 0xFEE0.
        inside_code += 0xfee0
    return unichr(inside_code)
try:
    unichr  # Python 2 builtin
except NameError:
    unichr = chr  # Python 3: chr() already returns a unicode character


def Q2B(uchar):
    """Convert a full-width character to its half-width counterpart.

    Args:
        uchar: a single character.

    Returns:
        A space for the ideographic space U+3000, the matching ASCII
        character for U+FF01..U+FF5E, and the character itself otherwise.
    """
    inside_code = ord(uchar)
    if inside_code == 0x3000:
        # The full-width space does not follow the fixed offset.
        return u' '
    # Only U+FF01..U+FF5E have half-width counterparts (0x21..0x7E); the
    # explicit range also keeps the unassigned U+FF00 from turning into a
    # space.
    if 0xff01 <= inside_code <= 0xff5e:
        return unichr(inside_code - 0xfee0)
    return uchar
def stringQ2B(ustring):
    """Convert every full-width character in the string to half-width.

    Each character is passed through ``Q2B`` and the results are joined
    back into one string.
    """
    return "".join(Q2B(uchar) for uchar in ustring)
def uniform(ustring):
    """Normalize a string: full-width to half-width, then lower-case."""
    return stringQ2B(ustring).lower()
def string2List(ustring):
    """Split the string into runs of Chinese characters, letters and digits.

    Every character for which ``is_other`` is true acts as a separator and
    is dropped; the characters between separators are kept together as one
    item, so a run may mix Chinese characters, letters and digits.

    Args:
        ustring: the string to split.

    Returns:
        list of the non-empty runs, in their original order.
    """
    retList = []
    utmp = []
    for uchar in ustring:
        if not is_other(uchar):
            utmp.append(uchar)
        elif utmp:
            # A separator closes the current run; consecutive separators
            # produce no empty items.
            retList.append("".join(utmp))
            utmp = []
    if utmp:
        # Flush the run that reaches the end of the string.
        retList.append("".join(utmp))
    return retList
if __name__ == "__main__":
    # Test Q2B and B2Q: round-trip every printable ASCII character.
    for i in range(0x0020, 0x007F):
        # chr() yields the same character on Python 2 and 3 for this range.
        full_width = B2Q(chr(i))
        # Format explicitly so the output is "half full" on both Python 2
        # and 3 (print with two arguments would print a tuple on Python 2).
        print("%s %s" % (Q2B(full_width), full_width))
    # Test uniform.
    ustring = u'中国 人名a高频A'
    ustring = uniform(ustring)
    ret = string2List(ustring)
    print(ret)
import unicodedata
import string
# Characters kept by unicodeToAscii: the ASCII letters plus space and a few
# punctuation marks.
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)


# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Strip accents from *s* and drop every character not in all_letters.

    The string is decomposed with NFD normalization so that an accented
    letter becomes a base letter followed by combining marks; the marks
    (Unicode category 'Mn') are discarded, and so is any remaining
    character that is not listed in ``all_letters``.
    """
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )


print(unicodeToAscii('Ślusàrski'))
# Let print() emit unicode on both Python 2 and Python 3 when stdout has no
# encoding or reports the ASCII locale encoding.
import codecs
import sys

print(sys.stdout.encoding)
if sys.stdout.encoding is None or sys.stdout.encoding == 'ANSI_X3.4-1968':
    utf8_writer = codecs.getwriter('UTF-8')
    if sys.version_info.major < 3:
        # Python 2: sys.stdout accepts bytes, so it is wrapped directly.
        sys.stdout = utf8_writer(sys.stdout, errors='replace')
    else:
        # Python 3: the writer produces bytes, which must go to the
        # underlying binary buffer.
        sys.stdout = utf8_writer(sys.stdout.buffer, errors='replace')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment