Created
January 16, 2020 16:08
-
-
Save mkyt/746ce66c418fb82951cf79c6ba871b43 to your computer and use it in GitHub Desktop.
convert EIJIRO text data into JSON format
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
"""eijiro.py - convert EIJIRO text data into JSON format | |
Input: Raw EIJIRO text data (can be purchased from https://booth.pm/ja/items/777563 ) | |
Output: Dictionary data in JSON format | |
< Data Schema > | |
top level: Word[] | |
Word: { | |
title: string | |
meanings: Definition[] | |
level: int? // SVL (standard vocabulary level) | |
phonetic: string? // phonetic transcription | |
kana: string? // pronunciation represented in kana | |
syllables: string? // syllabification | |
forms: string? // different forms (sg vs pl nouns, verb tenses, etc) | |
same_sound: string? | |
region_sound: string? | |
spell_warning: string? | |
} | |
Definition: { | |
wc: string? | |
body: string | |
} | |
""" | |
import sys | |
import re | |
import json | |
# Path of the raw EIJIRO text file that main() passes to load_file().
INPUT_FILE = 'EIJIRO-1448.TXT'
# Path of the JSON file that main() writes the converted dictionary to.
OUTPUT_FILE = 'eijiro-1448.json'
def load_file(fname):
    """Read a raw EIJIRO text file and split it into entries.

    The file is decoded as cp932.  An entry starts at every line that begins
    with the marker character '■' and runs up to the next such line; the
    marker itself is not included in the returned strings.  Any text that
    precedes the first marker is discarded.

    Args:
        fname: Path of the EIJIRO text file.

    Returns:
        A list of entry strings in file order.  The final entry keeps
        whatever trailing newline the file ends with.
    """
    # The context manager guarantees the handle is closed, even if decoding
    # the file raises.
    with open(fname, 'r', encoding='cp932') as f:
        text = f.read()
    # Prepending a newline lets the very first entry be matched by the same
    # '\n■' delimiter as all the others; [1:] then drops the text before it.
    return ('\n' + text).split('\n■')[1:]
def collect_words(entries):
    """Group raw entry strings by headword.

    Each entry has the form ``title : definition``, where the title may end
    with a word-class tag in braces, e.g. ``apple {名-1}``.  The tag is
    removed from the title and returned alongside the definition.

    Args:
        entries: Iterable of entry strings as returned by load_file().

    Returns:
        A dict mapping each title to a list of ``(wc, definition)`` tuples
        in input order.  ``wc`` is the text inside the trailing braces, or
        None when the title has no such tag.

    Raises:
        ValueError: If an entry does not contain the ' : ' separator.
    """
    wc_pattern = re.compile(r' \{(.+)\}$')
    words = {}
    for entry in entries:
        # Split on the FIRST separator only: the definition text may itself
        # contain ' : ', which must stay part of the definition.
        title, sep, defin = entry.partition(' : ')
        if not sep:
            raise ValueError(
                'entry has no " : " separator: {!r}'.format(entry))
        title = title.strip()
        defin = defin.strip()
        wc = None
        m = wc_pattern.search(title)
        if m is not None:
            wc = m.group(1)
            # Drop the matched ' {wc}' suffix from the title.
            title = title[:m.start()].strip()
        words.setdefault(title, []).append((wc, defin))
    return words
# Maps the label found between '【' and '】' on an info line to the key under
# which the value is stored in the output Word object.
kls2key = {
    'レベル': 'level',
    '発音': 'phonetic',
    '発音!': 'phonetic',
    '@': 'kana',
    '分節': 'syllables',
    '変化': 'forms',
    '同音': 'same_sound',
    '発音の地域差': 'region_sound',
    'スペリングに注意': 'spell_warning',
}


def _parse_info_line(defin):
    """Parse one '【label】value、【label】value...' line into a dict of fields."""
    fields = {}
    # defin[1:] drops the leading '【' so every chunk looks like 'label】value'.
    for chunk in defin[1:].split('、【'):
        # Split at the first '】' only, so a value that itself contains '】'
        # is kept whole.
        label, sep, content = chunk.partition('】')
        if not sep:
            raise ValueError(
                'malformed info field {!r} in {!r}'.format(chunk, defin))
        if content.startswith('】'):
            # Workaround for a data bug in `piranha`, whose label is closed
            # by a doubled '】】'.
            content = content[1:]
        try:
            key = kls2key[label]
        except KeyError:
            raise KeyError(
                'unknown info label {!r} in {!r}'.format(label, defin)
            ) from None
        if key == 'level':
            content = int(content)
        fields[key] = content
    return fields


def objectify(dct):
    """Convert the grouped entries into a list of Word objects.

    Args:
        dct: Mapping of title to a list of ``(wc, definition)`` tuples, as
            produced by collect_words().

    Returns:
        A list with one dict per title, in the mapping's iteration order.
        Each dict has 'title', any info fields found (keys are the values of
        kls2key; 'level' is converted to int) and 'meanings', a list of
        ``{'body': str}`` dicts that also carry 'wc' when the word class is
        known.

    Raises:
        KeyError: If an info line uses a label that is not in kls2key.
        ValueError: If an info field has no closing '】' or a level is not
            an integer.
    """
    res = []
    for title, items in dct.items():
        entry = {'title': title}
        meanings = []
        for wc, defin in items:
            if wc is None and defin.startswith('【'):
                # Info line: metadata fields rather than a definition.
                entry.update(_parse_info_line(defin))
                continue
            # Definition line.
            meaning = {'body': defin}
            if wc is not None:
                meaning['wc'] = wc
            meanings.append(meaning)
        entry['meanings'] = meanings
        res.append(entry)
    return res
def main():
    """Convert INPUT_FILE into OUTPUT_FILE and report progress on stdout.

    Runs the pipeline load_file() -> collect_words() -> objectify() and
    serialises the result as JSON.

    Returns:
        0, used as the process exit status.
    """
    print('loading dictionary file')
    entries = load_file(INPUT_FILE)
    print('collecting entry for each word')
    dct = collect_words(entries)
    print('generating word object')
    res = objectify(dct)
    print('writing json')
    # ensure_ascii=False emits Japanese and phonetic characters verbatim, so
    # the file must be opened as UTF-8 explicitly; the platform default
    # encoding may be unable to represent them.  The context manager makes
    # sure the output is flushed and closed.
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(res, f, ensure_ascii=False)
    return 0


if __name__ == '__main__':
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment