Skip to content

Instantly share code, notes, and snippets.

@ianlewis
Created December 24, 2014 15:01
Show Gist options
  • Save ianlewis/3e07abf05c96d3bee948 to your computer and use it in GitHub Desktop.
Save ianlewis/3e07abf05c96d3bee948 to your computer and use it in GitHub Desktop.
A work-in-progress tool for converting the Eijiro database into something more usable.
# A tool for converting the Eijiro database into something more usable.
# This is a work in progress.
import sys
import MeCab
from lxml import etree
from multiprocessing import Process, Pipe
# Word types worth keeping, expressed as the first two comma-separated
# fields of a MeCab node's ``feature`` string.  Each entry stays a list
# (not a tuple) because get_words() tests membership with a list slice.
KEEP_WORD_TYPES = [
    ["名詞", "一般"],          # noun, general
    ["名詞", "形容動詞語幹"],  # noun, adjectival-noun (na-adjective) stem
    ["名詞", "固有名詞"],      # noun, proper noun
    ["名詞", "サ変接続"],      # noun, verbal noun that combines with "suru"
    ["名詞", "代名詞"],        # noun, pronoun
]
def get_words(tagger, data):
    """Return the surface forms of the nodes whose word type is kept.

    Only the part of ``data`` that precedes the first delimiter is
    analysed.  A node is kept when the first two comma-separated fields
    of its ``feature`` string appear in ``KEEP_WORD_TYPES``.

    Args:
        tagger: An object exposing ``parseToNode(text)`` that returns the
            head of a linked list of nodes with ``feature``, ``surface``
            and ``next`` attributes (process_trans builds a
            ``MeCab.Tagger`` for this).
        data: The text to analyse.

    Returns:
        A list of ``surface`` values, in node order.
    """
    # The delimiter is the UTF-8 byte sequence of U+25C6 (black diamond).
    # NOTE(review): presumably it separates the main entry text from
    # trailing notes in the Eijiro data -- confirm against the data.
    text = data.split('\xe2\x97\x86')[0]

    words = []
    # Skip the first node returned by the tagger before walking the list.
    node = tagger.parseToNode(text).next
    while node:
        # Split once per node; the old code split twice and also printed
        # every node's features to stdout as leftover debug output.
        word_type = node.feature.split(",")[:2]
        if word_type in KEEP_WORD_TYPES:
            words.append(node.surface)
        node = node.next
    return words
def process_trans(conn):
    """Worker loop: receive texts on ``conn`` and send back their words.

    Creates one ``MeCab.Tagger("-Ochasen")`` and then, for every message
    received from ``conn``, sends back the list produced by
    ``get_words``.  The loop ends when ``None`` is received.

    Args:
        conn: A connection-like object with ``recv()`` and ``send(obj)``
            (the file imports ``multiprocessing.Pipe`` for this).
    """
    tagger = MeCab.Tagger("-Ochasen")
    data = conn.recv()
    while data is not None:
        # The previous code walked the nodes without collecting anything
        # and then called ``conn.send()`` with no argument, which raises
        # TypeError.  Reuse get_words() and send its result instead.
        # NOTE(review): sending the word list is the assumed intent of
        # this work-in-progress function -- confirm the desired payload.
        conn.send(get_words(tagger, data))
        data = conn.recv()
def aggregate(connections):
    """Placeholder; not implemented yet.

    Accepts ``connections``, does nothing with it and returns ``None``.
    """
    # TODO: implement.
    return None
class Eijiro(object):
    """Event handler that records the text of ``word`` and ``trans`` tags.

    The ``start`` / ``end`` / ``data`` methods match the callback names of
    the ElementTree-style parser-target protocol.
    NOTE(review): presumably meant to be passed as the ``target`` of an
    lxml parser (the file imports ``lxml.etree``); the wiring is not
    shown in this file.
    """

    # Class-level defaults; an instance gets its own value on first
    # assignment in the methods below.
    in_word = False   # True while inside a "word" element
    word = None       # last text seen inside a "word" element
    in_trans = False  # True while inside a "trans" element
    trans = None      # last "trans" text, split on the delimiter

    def start(self, tag, attrib):
        """Note that a ``word`` or ``trans`` element has been opened."""
        if tag == "word":
            self.in_word = True
        elif tag == "trans":
            self.in_trans = True

    def end(self, tag, attrib=None):
        """Note that an element has been closed.

        ``attrib`` is unused and optional: the parser-target protocol
        calls ``end(tag)`` with the tag only, so requiring a second
        argument would raise TypeError.  Callers that still pass one
        keep working.
        """
        if tag == "word":
            self.in_word = False
        elif tag == "trans":
            self.in_trans = False
        elif tag == "record":
            # A finished record resets both flags.
            self.in_word = False
            self.in_trans = False
            # TODO: Do stuff

    def data(self, data):
        """Store ``data`` according to the element currently open.

        Inside ``word`` the text replaces ``self.word``; otherwise, inside
        ``trans``, the text is split on the UTF-8 byte sequence of U+25C6
        (black diamond) and the pieces replace ``self.trans``.  Each call
        overwrites the previous value rather than appending to it.
        """
        if self.in_word:
            self.word = data
        elif self.in_trans:
            self.trans = data.split('\xe2\x97\x86')
def main():
    """Entry-point placeholder; currently a no-op that returns ``None``."""
    return None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment