mkyt · January 16, 2020 16:08
diff --git a/eijiro.py b/eijiro.py
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """eijiro.py  - convert EIJIRO text data into JSON format

 Input: Raw EIJIRO text data (can be purchased from https://booth.pm/ja/items/777563 )
 Output: Dictionary data in JSON format


 < Data Schema >

 top level: Word[]

 Word: {
  title: string
  meanings: Definition[]
  level: int?          // SVL (standard vocabulary level)
  phonetic: string?    // phonetic transcription
  kana: string?        // pronunciation represented in kana
  syllables: string?     // syllabification
  forms: string?         // different forms (sg vs pl nouns, verb tenses, etc)
  same_sound: string?
  region_sound: string?
  spell_warning: string?
 }

 Definition: {
  wc: string?
  body: string
 }

 """

 import sys
 import re
 import json


 # Path of the raw EIJIRO text file that main() reads.
 INPUT_FILE = 'EIJIRO-1448.TXT'
 # Path of the JSON file that main() writes.
 OUTPUT_FILE = 'eijiro-1448.json'


 def load_file(fname):
    """Read a raw EIJIRO text file and split it into entries.

    The file is decoded as cp932 and cut at every line that begins with
    the entry marker '■'; the marker itself is removed from each entry.

    Args:
        fname: path of the text file to read.

    Returns:
        list of str, one per marker, in file order.  Each entry runs up
        to the next marker line.  Text before the first marker is
        discarded.
    """
    # The context manager closes the handle even if decoding fails.
    with open(fname, 'r', encoding='cp932') as f:
        text = f.read()
    # Prepend a newline so the first entry is preceded by '\n■' like all
    # the others; the element before the first marker is then dropped.
    return ('\n' + text).split('\n■')[1:]


 def collect_words(entries):
    """Group raw entries by title.

    Each entry is expected to look like ``title {wc} : definition``,
    where the `` {wc}`` suffix on the title is optional.

    Args:
        entries: iterable of entry strings (main() passes the result of
            load_file).

    Returns:
        dict mapping each title to a list of (wc, definition) tuples in
        input order; wc is None when the title has no ``{...}`` suffix.

    Raises:
        ValueError: if an entry contains no ' : ' separator.
    """
    expr = re.compile(r' \{(.+)\}$')
    words = {}
    for entry in entries:
        # Split on the first separator only: a definition that itself
        # contains ' : ' must not break the two-way unpacking.
        title, defin = entry.split(' : ', 1)
        title = title.strip()
        defin = defin.strip()
        m = expr.search(title)
        if m is None:
            wc = None
        else:
            wc = m.group(1)
            # The pattern is anchored at the end, so everything before
            # the match is the bare title.
            title = title[:m.start()].strip()
        words.setdefault(title, []).append((wc, defin))
    return words


 # Maps the Japanese label found inside 【...】 on an info line to the key
 # under which objectify() stores that field in the output Word object.
 kls2key = {
    'レベル': 'level',                    # "level"
    '発音': 'phonetic',                   # "pronunciation"
    '発音！': 'phonetic',                 # "pronunciation" with an exclamation mark
    '＠': 'kana',                         # full-width at sign
    '分節': 'syllables',                  # "syllabification"
    '変化': 'forms',                      # "inflection"
    '同音': 'same_sound',                 # "same sound"
    '発音の地域差': 'region_sound',       # "regional difference in pronunciation"
    'スペリングに注意': 'spell_warning',  # "mind the spelling"
 }


 def objectify(dct):
    """Convert the grouped dictionary into a list of Word objects.

    Args:
        dct: dict mapping a title to a list of (wc, definition) tuples
            (main() passes the result of collect_words).

    Returns:
        list of dicts, one per title in dct's iteration order.  Each has
        'title' and 'meanings' (a list of dicts with 'body' and, when wc
        is not None, 'wc'), plus one key per field found on the title's
        info lines.

    Raises:
        KeyError: if an info line carries a label missing from kls2key.
        ValueError: if an info field cannot be split into label and
            content, or a level value is not an integer.
    """
    res = []
    for title, items in dct.items():
        entry = {'title': title}
        meanings = []
        for wc, defin in items:
            if wc is None and defin.startswith('【'):  # info line
                # Drop the leading '【' so every field has the form
                # 'label】content' after splitting on '、【'.
                for field in defin[1:].split('、【'):
                    try:
                        kls, content = field.split('】')
                    except ValueError:
                        # workaround for bug in `piranha`: collapse a
                        # doubled closing bracket and retry
                        field = field.replace('】】', '】')
                        kls, content = field.split('】')
                    key = kls2key[kls]
                    if key == 'level':
                        content = int(content)
                    entry[key] = content
            else:  # definition line
                dfn = {'body': defin}
                if wc is not None:
                    dfn['wc'] = wc
                meanings.append(dfn)
        entry['meanings'] = meanings
        res.append(entry)
    return res


 def main():
    """Run the whole conversion from INPUT_FILE to OUTPUT_FILE.

    Loads the raw text, groups entries per title, builds the Word
    objects and writes them as a single JSON array.

    Returns:
        0, which the script entry point passes to sys.exit.
    """
    print('loading dictionary file')
    entries = load_file(INPUT_FILE)
    print('collecting entry for each word')
    dct = collect_words(entries)
    print('generating word object')
    res = objectify(dct)
    print('writing json')
    # ensure_ascii=False emits non-ASCII text verbatim, so pin the file
    # encoding to UTF-8 instead of relying on the platform default, and
    # let the context manager flush and close the file.
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(res, f, ensure_ascii=False)
    return 0


 if __name__ == '__main__':
    # Run the conversion only when executed as a script; main()'s return
    # value becomes the process exit status.
    raise SystemExit(main())
	#!/usr/bin/env python3
	# -*- coding: utf-8 -*-
	"""eijiro.py - convert EIJIRO text data into JSON format

	Input: Raw EIJIRO text data (can be purchased from https://booth.pm/ja/items/777563 )
	Output: Dictionary data in JSON format


	< Data Schema >

	top level: Word[]

	Word: {
	  title: string
	  meanings: Definition[]
	  level: int?          // SVL (standard vocabulary level)
	  phonetic: string?    // phonetic transcription
	  kana: string?        // pronunciation represented in kana
	  syllables: string?   // syllabification
	  forms: string?       // different forms (sg vs pl nouns, verb tenses, etc)
	  same_sound: string?
	  region_sound: string?
	  spell_warning: string?
	}

	Definition: {
	  wc: string?
	  body: string
	}

	"""

	import sys
	import re
	import json


	# Path of the raw EIJIRO text file that main() reads.
	INPUT_FILE = 'EIJIRO-1448.TXT'
	# Path of the JSON file that main() writes.
	OUTPUT_FILE = 'eijiro-1448.json'


	def load_file(fname):
	    """Read a raw EIJIRO text file and split it into entries.

	    The file is decoded as cp932 and cut at every line that begins
	    with the entry marker '■'; the marker itself is removed.

	    Args:
	        fname: path of the text file to read.

	    Returns:
	        list of str, one per marker, in file order.  Each entry runs
	        up to the next marker line.  Text before the first marker is
	        discarded.
	    """
	    # The context manager closes the handle even if decoding fails.
	    with open(fname, 'r', encoding='cp932') as f:
	        text = f.read()
	    # Prepend a newline so the first entry is preceded by '\n■' like
	    # all the others; the element before the first marker is dropped.
	    return ('\n' + text).split('\n■')[1:]


	def collect_words(entries):
	    """Group raw entries by title.

	    Each entry is expected to look like ``title {wc} : definition``,
	    where the `` {wc}`` suffix on the title is optional.

	    Args:
	        entries: iterable of entry strings (main() passes the result
	            of load_file).

	    Returns:
	        dict mapping each title to a list of (wc, definition) tuples
	        in input order; wc is None when the title has no ``{...}``
	        suffix.

	    Raises:
	        ValueError: if an entry contains no ' : ' separator.
	    """
	    expr = re.compile(r' \{(.+)\}$')
	    words = {}
	    for entry in entries:
	        # Split on the first separator only: a definition that itself
	        # contains ' : ' must not break the two-way unpacking.
	        title, defin = entry.split(' : ', 1)
	        title = title.strip()
	        defin = defin.strip()
	        m = expr.search(title)
	        if m is None:
	            wc = None
	        else:
	            wc = m.group(1)
	            # The pattern is anchored at the end, so everything before
	            # the match is the bare title.
	            title = title[:m.start()].strip()
	        words.setdefault(title, []).append((wc, defin))
	    return words


	# Maps the Japanese label found inside 【...】 on an info line to the
	# key under which objectify() stores that field in the Word object.
	kls2key = {
	    'レベル': 'level',                    # "level"
	    '発音': 'phonetic',                   # "pronunciation"
	    '発音！': 'phonetic',                 # "pronunciation" with an exclamation mark
	    '＠': 'kana',                         # full-width at sign
	    '分節': 'syllables',                  # "syllabification"
	    '変化': 'forms',                      # "inflection"
	    '同音': 'same_sound',                 # "same sound"
	    '発音の地域差': 'region_sound',       # "regional difference in pronunciation"
	    'スペリングに注意': 'spell_warning',  # "mind the spelling"
	}


	def objectify(dct):
	    """Convert the grouped dictionary into a list of Word objects.

	    Args:
	        dct: dict mapping a title to a list of (wc, definition)
	            tuples (main() passes the result of collect_words).

	    Returns:
	        list of dicts, one per title in dct's iteration order.  Each
	        has 'title' and 'meanings' (a list of dicts with 'body' and,
	        when wc is not None, 'wc'), plus one key per field found on
	        the title's info lines.

	    Raises:
	        KeyError: if an info line carries a label missing from
	            kls2key.
	        ValueError: if an info field cannot be split into label and
	            content, or a level value is not an integer.
	    """
	    res = []
	    for title, items in dct.items():
	        entry = {'title': title}
	        meanings = []
	        for wc, defin in items:
	            if wc is None and defin.startswith('【'):  # info line
	                # Drop the leading '【' so every field has the form
	                # 'label】content' after splitting on '、【'.
	                for field in defin[1:].split('、【'):
	                    try:
	                        kls, content = field.split('】')
	                    except ValueError:
	                        # workaround for bug in `piranha`: collapse a
	                        # doubled closing bracket and retry
	                        field = field.replace('】】', '】')
	                        kls, content = field.split('】')
	                    key = kls2key[kls]
	                    if key == 'level':
	                        content = int(content)
	                    entry[key] = content
	            else:  # definition line
	                dfn = {'body': defin}
	                if wc is not None:
	                    dfn['wc'] = wc
	                meanings.append(dfn)
	        entry['meanings'] = meanings
	        res.append(entry)
	    return res


	def main():
	    """Run the whole conversion from INPUT_FILE to OUTPUT_FILE.

	    Loads the raw text, groups entries per title, builds the Word
	    objects and writes them as a single JSON array.

	    Returns:
	        0, which the script entry point passes to sys.exit.
	    """
	    print('loading dictionary file')
	    entries = load_file(INPUT_FILE)
	    print('collecting entry for each word')
	    dct = collect_words(entries)
	    print('generating word object')
	    res = objectify(dct)
	    print('writing json')
	    # ensure_ascii=False emits non-ASCII text verbatim, so pin the
	    # file encoding to UTF-8 instead of relying on the platform
	    # default, and let the context manager flush and close the file.
	    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
	        json.dump(res, f, ensure_ascii=False)
	    return 0


	if __name__ == '__main__':
	    # Run the conversion only when executed as a script; main()'s
	    # return value becomes the process exit status.
	    sys.exit(main())