Skip to content

Instantly share code, notes, and snippets.

@chezou
Created October 25, 2013 13:51
Show Gist options
  • Save chezou/7155025 to your computer and use it in GitHub Desktop.
Save chezou/7155025 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import Mykytea
import sys, codecs
class WordSegmenter(object):
    """Thin wrapper around a ``Mykytea.Mykytea`` analyzer.

    The analyzer is built once in ``__init__`` and reused for every call,
    so per-line processing does not pay the construction cost again.
    """

    def __init__(self, opt = "-out tok"):
        """Create the underlying analyzer.

        opt -- option string handed unchanged to ``Mykytea.Mykytea``.
        """
        self.mk = Mykytea.Mykytea(opt)

    def showTags(self, t):
        """Print one line per word in ``t``: its surface followed by its tags.

        Each element of ``t`` must expose ``surface`` and ``tag``; ``tag`` is
        walked three levels deep and every innermost value is appended as
        ``"|" + str(value)``.

        NOTE(review): the placement of the two tab separators relative to
        the nested loops was reconstructed from a copy of this file that had
        lost its indentation -- confirm against the upstream Mykytea sample.
        """
        for word in t:
            pieces = [word.surface, "\t"]
            for tag_group in word.tag:
                for candidate in tag_group:
                    pieces.extend("|" + str(field) for field in candidate)
                    pieces.append("\t")
                pieces.append("\t")
            # Parenthesised single-argument print works on Python 2 and 3.
            print("".join(pieces))

    def segment_words(self, text):
        """Return the words of ``text`` joined with ``"|"``.

        The words are whatever ``self.mk.getWS(text)`` yields; an empty
        result gives an empty string.
        """
        # str.join replaces the manual "+= word, += '|', strip last char" loop.
        return "|".join(self.mk.getWS(text))
if __name__ == "__main__":
    # Segment the first tab-separated column of every line of the input file
    # and write the result to "<input>_segmented"; other columns are copied
    # through unchanged.
    argvs = sys.argv
    if len(argvs) < 2:
        # Fail with a readable message instead of a bare IndexError.
        sys.stderr.write("usage: %s INPUT_FILE\n" % argvs[0])
        sys.exit(1)
    # The first argument is the input file.
    ws = WordSegmenter()
    # Context managers guarantee both files are flushed and closed, even if
    # segmentation raises part-way through.
    with open(argvs[1], "r") as f:
        with codecs.open(argvs[1] + "_segmented", "w") as f_out:
            for line in f:
                # Split off the line terminator so that a single-column line
                # does not feed its newline to the segmenter; it is written
                # back unchanged below.
                body = line.rstrip("\r\n")
                ending = line[len(body):]
                elements = body.split("\t")
                # Only the first column is segmented.
                elements[0] = ws.segment_words(elements[0])
                f_out.write("\t".join(elements) + ending)
    # Other Mykytea calls kept for reference (from the upstream sample):
    #   get the word segmentation
    #   get the analysis result as a string:
    #     print mk.getTagsToString(s)
    #   get the top-ranked tag:
    #     t = mk.getTags(s)
    #     showTags(t)
    #   get all tags:
    #     tt = mk.getAllTags(s)
    #     showTags(tt)
$ time python MyKytea_sample.py sample.txt
real 1m42.646s
user 1m31.952s
sys 0m9.497s
$ time python MyKytea_sample.new.py sample.txt
real 0m3.380s
user 0m3.043s
sys 0m0.319s
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment