Created
          May 28, 2013 02:32 
        
      - 
      
- 
        Save maowug/5660181 to your computer and use it in GitHub Desktop. 
    Reformat JMdict_e with xmltodict (and jsonpickle for a pretty JSON view)
  
        
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | #!/usr/bin/env python | |
| #encoding: utf-8 | |
| #__author__ = 'actor2019' | |
| import xmltodict | |
| import cPickle | |
| import simplejson as json | |
# Reshape JMdict entries (as parsed by xmltodict and pickled) into flatter
# dicts: {'_id', 'title', 'pt', 'ps', 'forms', 'senses'}.
#
# xmltodict yields a single dict for an element that occurs once and a list
# when it repeats, so every repeatable element is normalised with _as_list().

# Reading-element children copied into the entry's "ps" properties.
_READING_PROP_KEYS = ('re_nokanji', 're_inf', 're_pri')
# Sense children stored as plain lists of their values.
_LIST_SENSE_KEYS = ('ant', 'misc', 'pos', 'dial', 's_inf', 'example', 'field')
# Sense children that are deliberately dropped without a warning.
_IGNORED_SENSE_KEYS = ('stagk', 'stagr')


def _as_list(value):
    """Return value unchanged if it is a list, otherwise wrap it in one."""
    return value if isinstance(value, list) else [value]


def _convert_readings(readings, ne, ps, forms):
    """Fold the r_ele elements into the new entry.

    The first reading's 'reb' becomes ne['title']; later ones are appended
    to forms. Rarely used properties (re_nokanji, re_inf, re_pri) are stored
    in ps as lists; when several readings carry the same property the last
    one wins, as in the original script.
    """
    for position, r_ele in enumerate(_as_list(readings)):
        for key in r_ele:
            if key == 'reb':
                if position == 0:
                    # reb -> title
                    ne['title'] = r_ele['reb']
                else:
                    forms.append(r_ele['reb'])
            elif key in _READING_PROP_KEYS:
                # 're_pri' may already be a list, e.g.
                # ["ichi1", "news1", "nf10"]
                ps[key] = _as_list(r_ele[key])


def _convert_kanji(kanji, ps, forms):
    """Append every 'keb' to forms and collect distinct 'ke_inf' values."""
    for k_ele in _as_list(kanji):
        forms.append(k_ele['keb'])
        if 'ke_inf' in k_ele:
            seen = ps.setdefault('ke_inf', [])
            # NOTE(review): a k_ele with several ke_inf children would be
            # stored here as one nested list - confirm that is intended.
            if k_ele['ke_inf'] not in seen:
                seen.append(k_ele['ke_inf'])


def _convert_sense(sense, entry):
    """Return the reshaped dict for one sense element.

    entry is only consulted (for its 'ent_seq') when an unexpected child
    element has to be reported.
    """
    converted = {}
    for key in sense:
        if key in _IGNORED_SENSE_KEYS:
            continue
        values = _as_list(sense[key])
        if key in _LIST_SENSE_KEYS:
            converted[key] = values
        elif key == 'lsource':
            # Copy each lsource mapping as-is; some have no '#text',
            # e.g. OrderedDict([(u'@xml:lang', u'kor')]).
            converted[key] = [dict(lsource.items()) for lsource in values]
        elif key == 'xref':
            converted[key] = [{'pt': 0, 'ref': xref} for xref in values]
        elif key == 'gloss':
            converted[key] = [gloss['#text'] for gloss in values]
        else:
            # Unknown child: report it, but do not store it.
            print('%s %s %s' % (key, entry['ent_seq'], values))
    return converted


def convert_entry(entry):
    """Convert one JMdict entry mapping into the new entry layout.

    Returns a dict with:
      'pt'     - always initialised to 0,
      '_id'    - 'JMdict_' + ent_seq (only if ent_seq is present),
      'title'  - 'reb' of the first reading (only if r_ele is present),
      'ps'     - less common properties, each a list,
      'forms'  - remaining readings and all kanji forms, in the order the
                 entry's keys are iterated,
      'senses' - list of reshaped sense dicts.
    """
    ne = {'pt': 0}
    ps = {}
    forms = []
    senses = []
    for k in entry:
        if k == 'ent_seq':
            # _id: JMdict_1000110
            ne['_id'] = 'JMdict' + '_' + entry['ent_seq']
        elif k == 'r_ele':
            _convert_readings(entry[k], ne, ps, forms)
        elif k == 'info':
            pass
        elif k == 'k_ele':
            _convert_kanji(entry[k], ps, forms)
        elif k == 'sense':
            for sense in _as_list(entry[k]):
                senses.append(_convert_sense(sense, entry))
        else:
            print("OTHER than: entry (ent_seq, k_ele*, r_ele+, info?, sense+), #" + k)
            # Fix: the original indexed with a stale/undefined inner-loop
            # variable here; keep the unknown element under its own name.
            ps[k] = _as_list(entry[k])
    ne['ps'] = ps
    ne['forms'] = forms
    ne['senses'] = senses
    return ne


def convert_entries(entries, progress_every=10000):
    """Convert all entries and print a progress line every progress_every.

    Fix: the progress counter is the entry index itself; the original
    reused the loop variable of the inner reading loop, so the progress
    line was printed for almost every entry.
    """
    total = len(entries)
    converted = []
    for index, entry in enumerate(entries):
        converted.append(convert_entry(entry))
        if index % progress_every == 0:
            print(' '.join([str(index), 'of', str(total), 'done.']))
    return converted


if __name__ == '__main__':
    # load sample100 from JMdict_e_typeDict
    # entries= cPickle.load(open('JMdict_e_sample100_cPickle.data','rb'))
    # NOTE: unpickling runs arbitrary code; only load files you trust.
    with open('JMdict_e_typeDict.data', 'rb') as source:
        data = cPickle.load(source)
    entries = data['JMdict']['entry']
    nes = convert_entries(entries)
    # with open('JMdict_e_NewEntries.data','w') as f:
    #     cPickle.dump(nes,f)
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment