Skip to content

Instantly share code, notes, and snippets.

@yusuke024
Created August 15, 2016 14:05
Show Gist options
  • Save yusuke024/9aaed808ad07c2bbf2230442f2b32c1b to your computer and use it in GitHub Desktop.
Save yusuke024/9aaed808ad07c2bbf2230442f2b32c1b to your computer and use it in GitHub Desktop.
import io
import json
import sys
import xml.etree.ElementTree as ET
def _load_kanji_info(path):
    """Build a lookup table of kanji metadata from a kanji dictionary XML file.

    Every ``<character>`` element contributes one entry holding its UCS
    codepoint (the hexadecimal string found in the XML) and its stroke count
    (0 when the element has no ``misc/stroke_count`` child).

    Args:
        path: Location of the XML file to parse.

    Returns:
        A dict mapping the integer codepoint of each character to a dict
        with the keys ``'codepoint'`` and ``'stroke_count'``.
    """
    kanji_info = {}
    for character in ET.parse(path).getroot().iter('character'):
        codepoint = character.find('codepoint/cp_value[@cp_type="ucs"]').text
        stroke_count = character.find('misc/stroke_count')
        kanji_info[int(codepoint, 16)] = {
            'codepoint': codepoint,
            'stroke_count': int(stroke_count.text) if stroke_count is not None else 0,
        }
    return kanji_info


def _read_kanji_list(path, kanji_info):
    """Return the metadata of every kanji listed (one per line) in *path*.

    Args:
        path: UTF-8 text file with a single character on each line.
        kanji_info: Lookup table as produced by ``_load_kanji_info``.

    Returns:
        A list of ``kanji_info`` entries in file order.

    Raises:
        KeyError: A listed character is missing from ``kanji_info``.
        TypeError: A non-blank line holds more than one character.
    """
    # io.open decodes to text on both Python 2 and Python 3.
    with io.open(path, encoding='utf8') as f:
        # Strip *before* testing for emptiness: raw lines still carry their
        # newline, so a blank line would otherwise reach ord('') and crash.
        stripped = (line.strip() for line in f)
        return [kanji_info[ord(ch)] for ch in stripped if ch]


def main():
    """Write the JLPT N1-N5 kanji lists, with stroke counts, to stdout as JSON.

    Reads ``./res/kanji.xml`` and ``./res/jlpt-n1.txt`` .. ``./res/jlpt-n5.txt``
    relative to the current working directory. The output is a compact JSON
    array with one object per level, each of the form
    ``{"characters": [...], "title": "N<level>"}``. Progress messages go to
    stderr so that stdout carries nothing but the JSON document.

    Returns:
        None, which the script entry point turns into exit status 0.
    """
    kanji_info = _load_kanji_info('./res/kanji.xml')
    sys.stderr.write('Finish processing Kanjidict file\n')

    jlpt = []
    for level in range(1, 6):
        characters = _read_kanji_list(
            './res/jlpt-n{:d}.txt'.format(level), kanji_info)
        jlpt.append({'title': 'N%d' % level, 'characters': characters})
    # NOTE: a radical list (./res/radical.txt) was sketched here but left
    # disabled; that draft skipped characters absent from kanji_info.

    json.dump(jlpt, sys.stdout, separators=(',', ':'), sort_keys=True)
    sys.stderr.write('Finish processing JLPT files\n')
if __name__ == '__main__':
    # Script entry point: a falsy result from main() (None included) is
    # reported as exit status 0; anything else is converted with int().
    exit_status = main() or 0
    sys.exit(int(exit_status))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment