Skip to content

Instantly share code, notes, and snippets.

@yusuke024
Last active July 27, 2016 22:19
Show Gist options
  • Save yusuke024/e0f3e605ee1535fd392135436c6acf99 to your computer and use it in GitHub Desktop.
Save yusuke024/e0f3e605ee1535fd392135436c6acf99 to your computer and use it in GitHub Desktop.
#!/bin/sh
# Download the raw data files read by the JSON-generation script into ./res.
# Each file is fetched only when it is not already present, so the script can
# be re-run without repeating completed downloads.
#
# Requires: curl, gzip, and pup (command-line HTML selector tool).

DEST_DIR=./res

# mkdir -p is a no-op when the directory already exists, so no -d test needed.
mkdir -p "${DEST_DIR}"

# Kanji dictionary: gzip-compressed XML, decompressed on the fly.
if [ ! -f "${DEST_DIR}/kanji.xml" ]
then
    curl -L "http://www.csse.monash.edu.au/~jwb/kanjidic2/kanjidic2.xml.gz" | gzip -d > "${DEST_DIR}/kanji.xml"
fi

# Radical list, extracted from the HTML radical table.
# The guard tests radical.txt, the file that is actually written; testing a
# different name would make this download run on every invocation.
if [ ! -f "${DEST_DIR}/radical.txt" ]
then
    # Alternative source (disabled):
    # curl -L "http://www.kanjicafe.com/downloads/kradfile-u.gz" | gzip -d > ${DEST_DIR}/radical.txt
    curl -L "http://kanji.sljfaq.org/radicals.html" | \
        pup "#choice_table .choice text{}" > "${DEST_DIR}/radical.txt"
fi

# Per-level JLPT kanji lists (levels 1 through 5).
# An explicit word list is used because {1..5} brace expansion is a bash
# extension; a strict POSIX /bin/sh would loop once over the literal string.
for i in 1 2 3 4 5
do
    if [ ! -f "${DEST_DIR}/jlpt-n${i}.txt" ]
    then
        curl "http://www.tanos.co.uk/jlpt/jlpt${i}/kanji/" | \
            pup '#contentright table:nth-of-type(2) tbody tr td:first-child a text{}' > "${DEST_DIR}/jlpt-n${i}.txt"
    fi
done
import json
import sys
import xml.etree.ElementTree as ET
def main():
# preprocess
kanji_info = {}
# from kanji dict xml file
tree = ET.parse('./res/kanji.xml')
root = tree.getroot()
for ch in root.iter('character'):
cp_string = ch.find('codepoint/cp_value[@cp_type="ucs"]').text
cp = int(cp_string, 16)
kanji_info[cp] = cp_string
print >> sys.stderr, 'Finish processing Kanjidict file'
# from jlpt files
jlpt = {}
for level in range(5):
with open('./res/jlpt-n{:d}.txt'.format(level+1)) as f:
kanji = [kanji_info[ord(unicode(ch.strip(), 'utf8'))] for ch in f if len(ch) > 0]
jlpt["n%d" % (level+1)] = kanji
with open('./res/radical.txt') as f:
kanji = [kanji_info[k] for k in [ord(unicode(ch.strip(), 'utf8')) for ch in f if len(ch) > 0] if k in kanji_info]
jlpt["rad"] = kanji
json.dump(jlpt, sys.stdout, separators=(',', ':'), sort_keys=True)
print >> sys.stderr, 'Finish processing JLPT files'
if __name__ == '__main__':
    # Script entry point: a falsy result from main() (such as None) is
    # reported to the shell as exit status 0.
    exit_status = main() or 0
    sys.exit(int(exit_status))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment