Created
December 24, 2014 15:01
-
-
Save ianlewis/3e07abf05c96d3bee948 to your computer and use it in GitHub Desktop.
A work-in-progress tool for converting the Eijiro database into something more usable.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A tool for converting the Eijiro database into something more usable.
# This is a work in progress.
import sys | |
import MeCab | |
from lxml import etree | |
from multiprocessing import Process, Pipe | |
# Leading two fields of a MeCab feature string (part of speech, then
# sub-category) for the tokens that get_words() keeps.  Entries stay lists
# because they are compared against the result of ``str.split(",")[:2]``.
KEEP_WORD_TYPES = [
    ["名詞", "一般"],          # noun, general
    ["名詞", "形容動詞語幹"],  # noun, adjectival-noun stem
    ["名詞", "固有名詞"],      # noun, proper noun
    ["名詞", "サ変接続"],      # noun, suru-verb connecting
    ["名詞", "代名詞"],        # noun, pronoun
]
def get_words(tagger, data, keep_types=None):
    """Return the surfaces of the tokens in *data* whose word type is kept.

    Only the part of *data* before the first U+25C6 (BLACK DIAMOND)
    separator is analysed; anything after it is ignored.

    Args:
        tagger: Object exposing ``parseToNode(text)`` which returns the head
            of a linked list of nodes with ``feature``, ``surface`` and
            ``next`` attributes (e.g. a ``MeCab.Tagger``).
        data: Text to analyse, either a byte string (UTF-8) or a text string.
        keep_types: Optional iterable of ``[part_of_speech, sub_category]``
            pairs to keep.  Defaults to the module-level ``KEEP_WORD_TYPES``.

    Returns:
        A list of ``node.surface`` values, in order of appearance.
    """
    if keep_types is None:
        keep_types = KEEP_WORD_TYPES
    # Hashable copies give O(1) membership tests inside the loop.
    wanted = set(tuple(pair) for pair in keep_types)

    # Pick a separator of the same string type as the input so that neither
    # byte strings nor text strings trigger an implicit encode/decode.
    separator = b'\xe2\x97\x86' if isinstance(data, bytes) else u'\u25c6'

    # Bind the parsed text to a local so it stays referenced for the whole
    # node walk.  NOTE(review): the MeCab binding is reported to return
    # garbage surfaces when the input string is a temporary - confirm.
    text = data.split(separator)[0]

    words = []
    # The first node is skipped, as in the original (presumably the
    # beginning-of-sentence marker).
    node = tagger.parseToNode(text).next
    while node:
        if tuple(node.feature.split(",")[:2]) in wanted:
            words.append(node.surface)
        node = node.next
    return words
def process_trans(conn):
    """Worker loop: tokenize each text received on *conn* and reply.

    Receives items from ``conn.recv()`` until a ``None`` sentinel arrives.
    For every item, the kept words (see ``get_words``) are sent back with
    ``conn.send()``, one reply per received item.

    Args:
        conn: Connection-like object with ``recv()`` and ``send(obj)``,
            e.g. one end of a ``multiprocessing.Pipe``.
    """
    tagger = MeCab.Tagger("-Ochasen")
    # iter() with a sentinel stops the loop as soon as recv() yields None.
    for data in iter(conn.recv, None):
        # The original walked the nodes without collecting anything and then
        # called conn.send() with no payload, which raises TypeError; reply
        # with the extracted words instead.
        conn.send(get_words(tagger, data))
def aggregate(connections):
    """Placeholder for combining results from *connections*.

    Not implemented yet; currently ignores its argument and returns None.
    """
    # TODO
    return None
class Eijiro(object):
    """Event-style parser target tracking the current word and translation.

    Exposes ``start``/``end``/``data`` callbacks.  NOTE(review): presumably
    meant to be passed as ``target`` to an lxml ``XMLParser`` - confirm.

    Attributes are class-level defaults that are shadowed per instance on
    first assignment.
    """

    # True while inside a <word> element.
    in_word = False
    # Text most recently seen inside <word>.
    word = None
    # True while inside a <trans> element.
    in_trans = False
    # Pieces of the text most recently seen inside <trans>, split on U+25C6.
    trans = None

    def start(self, tag, attrib):
        """Record that a ``word`` or ``trans`` element has been opened."""
        if tag == "word":
            self.in_word = True
        elif tag == "trans":
            self.in_trans = True

    def end(self, tag, attrib=None):
        """Record that an element has been closed.

        ``attrib`` is optional and unused: target-parser ``end`` callbacks
        are invoked with the tag only, so requiring a second argument would
        raise TypeError.  It is kept for callers that still pass it.
        """
        if tag == "word":
            self.in_word = False
        elif tag == "trans":
            self.in_trans = False
        elif tag == "record":
            self.in_word = False
            self.in_trans = False
            # TODO: Do stuff

    def data(self, data):
        """Store text content for the element currently being read.

        NOTE(review): parsers may deliver one text node in several chunks;
        this keeps only the latest chunk - confirm whether accumulation is
        needed.
        """
        if self.in_word:
            self.word = data
        elif self.in_trans:
            # Match the separator type to the input so byte strings and text
            # strings are both split without an implicit encode/decode.
            if isinstance(data, bytes):
                separator = b'\xe2\x97\x86'
            else:
                separator = u'\u25c6'
            self.trans = data.split(separator)
def main():
    """Script entry point; not implemented yet and currently a no-op."""
    return None
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment