Created
May 8, 2011 19:06
-
-
Save utahta/961598 to your computer and use it in GitHub Desktop.
huge data book chapter 8 AhoCorasick
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# vim:fileencoding=utf8
"""Smoke test for the ``AhoCorasick`` matcher from ``pyahocorasick``.

Builds a matcher from the keywords in ``keyword.utf8.uniq.txt`` (read as
UTF-8, one stripped line per keyword), runs it over a fixed Japanese sample
text, and checks that the reported matches are exactly the expected
keywords, in the expected order.  Prints ``ok.`` on success and fails with
``AssertionError`` otherwise.
"""
from pyahocorasick import AhoCorasick

# Read the keyword dictionary.  The ``with`` block closes the file even when
# a line fails to decode, which the manual open()/close() pair did not.
with open('keyword.utf8.uniq.txt', 'rb') as fp:
    terms = [line.strip().decode('utf8') for line in fp]

ac = AhoCorasick(terms)
# Debug aid: print unicode(ac)

# NOTE(review): the '>' after "その" and the line break after "を" look like
# paste artifacts.  Neither falls inside an expected keyword below, so the
# text is kept verbatim -- confirm against the original sample.
text = u"""今日は天気がよかったので、近くの海まで愛犬のしなもんと一緒にお散歩。写真は海辺を楽しそうに歩くしなもん。その>あとついでにお買い物にも行ってきました。「はてなの本」を買ったので、はてなダイアリーの便利な商品紹介ツール「はまぞう」を
使って紹介してみるよ。とてもおもしろいのでみんなも読んでみてね。"""
results = ac.match(text)

# Expected matched substrings, in the order the matcher must report them.
# Overlapping keywords (e.g. the three that start at "はてなダイアリー") are
# each listed separately.
rows = [
    u'今日',
    u'天気',
    u'しなもん',
    u'散歩',
    u'写真',
    u'海辺',
    u'しなもん',
    u'はてな',
    u'はてなの本',
    u'はてな',
    u'はてなダイアリ',
    u'はてなダイアリー',
    u'ダイアリー',
    u'商品',
    u'はまぞう',
    u'おもしろい',
]

# Check the count first: zip() below stops at the shorter sequence, so a
# missing or extra match would otherwise go unnoticed.
assert len(results) == len(rows)
for result, expected in zip(results, rows):
    # Each result is indexed as (start offset, length) into ``text``.
    lindex = result[0]
    rindex = lindex + result[1]
    assert text[lindex:rindex] == expected

# Parenthesised single argument: same output under the Python 2 print
# statement, and also valid as a Python 3 function call.
print('ok.')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment