Created
October 25, 2013 13:51
-
-
Save chezou/7155025 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import Mykytea | |
import sys, codecs | |
class WordSegmenter:
    """Wrap a Mykytea analyzer and expose simple word-segmentation helpers.

    The analyzer is built once in the constructor and stored on ``self.mk``;
    ``segment_words`` reuses that same instance on every call.
    """

    def __init__(self, opt="-out tok"):
        """Create the underlying analyzer.

        Args:
            opt: Option string passed unchanged to ``Mykytea.Mykytea``.
        """
        self.mk = Mykytea.Mykytea(opt)

    def showTags(self, t):
        """Print one line per word: its surface followed by its tags.

        Args:
            t: Iterable of word objects exposing ``surface`` and ``tag``,
               where ``tag`` is a nested iterable (groups of tag entries,
               each entry itself iterable).

        Every value inside a tag entry is prefixed with ``|``; a tab is
        appended after each entry and another after each group.
        """
        for word in t:
            out = word.surface + "\t"
            for tag_group in word.tag:
                for tag_entry in tag_group:
                    out += "".join("|" + str(value) for value in tag_entry)
                    out += "\t"
                out += "\t"
            # Single-argument call form prints identically on Python 2 and 3.
            print(out)

    def segment_words(self, text):
        """Segment ``text`` into words and join them with ``|``.

        Args:
            text: The string handed to the analyzer's ``getWS``.

        Returns:
            The segmented words separated by ``|``; an empty string when the
            analyzer yields no words.
        """
        # join() avoids rebuilding the string on every word and needs no
        # trailing-separator cleanup.
        return "|".join(self.mk.getWS(text))
if __name__ == "__main__":
    # Usage: python <this script> INPUT_FILE
    #
    # Reads INPUT_FILE line by line, treats each line as tab-separated
    # columns, replaces the first column with its "|"-joined segmentation,
    # and writes the result to INPUT_FILE + "_segmented".
    if len(sys.argv) < 2:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit("usage: %s INPUT_FILE" % sys.argv[0])

    # The first argument is the input file.
    input_path = sys.argv[1]

    # Build the segmenter once and reuse it for every line.
    ws = WordSegmenter()

    # Context managers guarantee both files are flushed and closed, even if
    # segmentation raises part-way through.
    with open(input_path, "r") as f:
        with codecs.open(input_path + "_segmented", "w") as f_out:
            for line in f:
                elements = line.split("\t")
                # Only the first column is segmented; the remaining columns
                # (and the line's trailing newline) pass through unchanged.
                elements[0] = ws.segment_words(elements[0])
                f_out.write("\t".join(elements))

    # Other analyzer calls kept for reference (commented out; `mk` and `s`
    # are not defined in this script):
    # Get the word segmentation
    # Get the analysis result as a string
    #print mk.getTagsToString(s)
    # Get the top-ranked tag
    #t = mk.getTags(s)
    #showTags(t)
    # Get all tags
    #tt = mk.getAllTags(s)
    #showTags(tt)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ time python MyKytea_sample.py sample.txt | |
real 1m42.646s | |
user 1m31.952s | |
sys 0m9.497s | |
$ time python MyKytea_sample.new.py sample.txt | |
real 0m3.380s | |
user 0m3.043s | |
sys 0m0.319s |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment