Skip to content

Instantly share code, notes, and snippets.

@maowug
Created May 28, 2013 02:32
Show Gist options
  • Save maowug/5660181 to your computer and use it in GitHub Desktop.
Save maowug/5660181 to your computer and use it in GitHub Desktop.
Reformat JMdict_e with xmltodict (and jsonpickle for a pretty JSON view)
#!/usr/bin/env python
#encoding: utf-8
#__author__ = 'actor2019'
import xmltodict
import cPickle
import simplejson as json
# Convert every JMdict entry (as loaded from the pickle below) into a flatter
# dict and collect the results in ``nes``.  Each new entry looks like:
#   {'_id': 'JMdict_<ent_seq>', 'pt': 0, 'title': <first reb>,
#    'forms': [...], 'ps': {...}, 'senses': [...]}

# Reading-element children that are stored as (list-valued) properties.
_READING_PROP_KEYS = ('re_nokanji', 're_inf', 're_pri')
# Sense children that are copied through unchanged, always as a list.
# e.g. ant: [u"×・ばつ・1"]; field: food term (おでん).
_PLAIN_SENSE_KEYS = ('ant', 'misc', 'pos', 'dial', 's_inf', 'example', 'field')
# Sense children that are dropped silently.
_IGNORED_SENSE_KEYS = ('stagk', 'stagr')


def _as_list(value):
    """Return *value* itself if it is a list, else wrap it in a one-item list.

    A child element is either a single value or a list of values in the
    loaded data; this normalises both shapes to a list.
    """
    return value if isinstance(value, list) else [value]


def _read_readings(r_eles, new_entry, forms, props):
    """Fold the ``r_ele`` element(s) of one entry into the output containers.

    The ``reb`` of the first reading becomes ``new_entry['title']``; the
    ``reb`` of every later reading is appended to *forms*.  Any of
    ``re_nokanji`` / ``re_inf`` / ``re_pri`` is stored in *props* as a list
    (``re_pri`` can already be a list such as ["ichi1", "news1", "nf10"]).
    A later reading overwrites the same property of an earlier one.
    """
    # NOTE: the position variable must not be called ``idx``: the original
    # script reused the entry index here and broke the progress report.
    for position, r_ele in enumerate(_as_list(r_eles)):
        for key in r_ele:
            if key == 'reb':
                if position == 0:
                    # reb -> title
                    new_entry['title'] = r_ele['reb']
                else:
                    forms.append(r_ele['reb'])
            elif key in _READING_PROP_KEYS:
                props[key] = _as_list(r_ele[key])


def _read_kanji(k_eles, forms, props):
    """Fold the ``k_ele`` element(s) of one entry into *forms* and *props*.

    Every ``keb`` is appended to *forms*.  Each distinct ``ke_inf`` value is
    collected once in ``props['ke_inf']``.
    """
    for k_ele in _as_list(k_eles):
        forms.append(k_ele['keb'])
        if 'ke_inf' not in k_ele:
            continue
        known_infs = props.setdefault('ke_inf', [])
        if k_ele['ke_inf'] not in known_infs:
            known_infs.append(k_ele['ke_inf'])


def _convert_sense(sense, entry):
    """Return the reformatted dict for one ``sense`` element of *entry*.

    * keys in ``_PLAIN_SENSE_KEYS`` -> list of the original values
    * ``lsource`` -> list of plain dicts (some have no '#text', e.g. only
      u'@xml:lang')
    * ``xref``    -> list of {'pt': 0, 'ref': <xref>}
    * ``gloss``   -> list of the '#text' of each gloss
    * ``stagk`` / ``stagr`` are skipped; any other key is reported on stdout
      together with the entry's ``ent_seq`` and is not stored.
    """
    converted = {}
    for key in sense:
        values = _as_list(sense[key])
        if key in _PLAIN_SENSE_KEYS:
            converted[key] = values
        elif key == 'lsource':
            converted[key] = [dict(lsource.items()) for lsource in values]
        elif key == 'xref':
            converted[key] = [{'pt': 0, 'ref': xref} for xref in values]
        elif key == 'gloss':
            converted[key] = [gloss['#text'] for gloss in values]
        elif key not in _IGNORED_SENSE_KEYS:
            # Unexpected sense child: report it so it can be handled later.
            print('%s %s %s' % (key, entry['ent_seq'], values))
    return converted


def _convert_entry(e):
    """Return the reformatted dict for one JMdict entry *e*.

    Children are processed in the order they appear in *e*, so the order of
    ``forms`` follows the order of the ``k_ele`` / ``r_ele`` keys in *e*.
    """
    new_entry = {'pt': 0}
    props = {}    # properties that are not very common
    forms = []    # the other written forms of the title
    senses = []   # meanings
    for k in e:
        if k == 'ent_seq':
            # _id: JMdict_1000110
            new_entry['_id'] = 'JMdict' + '_' + e['ent_seq']
        elif k == 'r_ele':
            _read_readings(e[k], new_entry, forms, props)
        elif k == 'info':
            pass
        elif k == 'k_ele':
            _read_kanji(e[k], forms, props)
        elif k == 'sense':
            for sense in _as_list(e[k]):
                senses.append(_convert_sense(sense, e))
        else:
            print("OTHER than: entry (ent_seq, k_ele*, r_ele+, info?, sense+), #" + k)
            # Keep the unknown element under its own name.  (The original
            # indexed with ``key``, a stale variable from an earlier loop.)
            props[k] = _as_list(e[k])
    new_entry['ps'] = props
    new_entry['forms'] = forms
    new_entry['senses'] = senses
    return new_entry


# Load the full dictionary.  A small sample can be loaded instead with:
# entries= cPickle.load(open('JMdict_e_sample100_cPickle.data','rb'))
# NOTE(security): unpickling can execute arbitrary code -- only load a data
# file that you generated yourself.
with open('JMdict_e_typeDict.data', 'rb') as data_file:
    data = cPickle.load(data_file)
entries = data['JMdict']['entry']

nes = []
total = len(entries)
for idx, e in enumerate(entries):
    nes.append(_convert_entry(e))
    # Progress report every 10000 entries (including the very first one).
    if idx % 10000 == 0:
        print(' '.join([str(idx), 'of', str(total), 'done.']))
# f=open('JMdict_e_NewEntries.data','w')
# cPickle.dump(nes,f)
# f.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment