Created
June 30, 2013 14:58
-
-
Save maowug/5895465 to your computer and use it in GitHub Desktop.
refine sensesJa gloss list after updating ss_titles. jp_3_2_sensesJa_refineGlossList.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Refine the gloss list of every sense in ``sensesJa`` and store the result on
# the same document as ``new_sensesJa``.
#
# For each sense the script:
#   1. re-joins gloss fragments that were split in the wrong place,
#   2. joins the fragments with u'@ ' and cuts the text at the numbered
#      division markers (see _DIVISION_MARKERS),
#   3. emits one gloss dict per non-empty '@'-separated piece.
#
# ``temp`` and ``isKanjiKana`` are not defined in this part of the file.
# NOTE(review): ``temp`` is presumably a MongoDB collection (find()/update()
# with a '$set' document) -- confirm against the setup code.

# Markers that open a numbered division inside a joined gloss string.
_DIVISION_MARKERS = (u'一(', u'二(', u'三(', u'四(')


def _merge_split_glosses(glosses):
    """Return a new list where wrongly split gloss fragments are re-joined.

    ``glosses`` is the original list of gloss strings; it is not modified.
    Fragments that were merged into a neighbour are dropped from the result.
    """
    merged = list(glosses)
    last = len(glosses) - 1
    for idx, gls in enumerate(glosses):
        # A fragment that ends with a kanji/kana character and is not the last
        # one was cut mid-sentence: pull the following fragment into it.
        # e.g. "一(動バ上二)⇒のびる二(動バ下二)⇒のべる"
        if merged[idx] and isKanjiKana(gls[-1]) and idx != last:
            merged[idx] += merged[idx + 1]
            merged[idx + 1] = None

        # A fragment (other than the first) that starts with a non kanji/kana
        # character continues the previous fragment.
        # e.g. "「さしゅ(叉手)", "」に同じ。"
        if idx > 0 and merged[idx] and not isKanjiKana(gls[0]):
            # e.g. (「おあいにくさま」の形で)相手の期待がはずれたことを...
            # A leading "(" starts a gloss of its own; leave it alone.
            if gls[0] == u'(':
                continue
            target = idx - 1
            if not merged[target]:
                # The previous fragment is gone (or empty): fall back to the
                # one before it. Guard against idx - 2 going negative, which
                # would silently wrap around to the end of the list.
                target = idx - 2
                if target < 0 or not merged[target]:
                    continue
            merged[target] += merged[idx]
            merged[idx] = None
    return [g for g in merged if g is not None]


def _locate_divisions(gloss_join):
    """Return the start offsets of the division markers found in ``gloss_join``.

    The markers are searched in order and the search stops at the first one
    that is missing. If the offsets are not ascending, the first offset is
    retried with the alternative marker u'一['. Returns ``None`` when the
    offsets are still out of order, or when the alternative marker is absent.
    Returns ``[-2]`` when no marker is present at all, so that ``offset + 2``
    addresses the start of the string.
    """
    offsets = []
    for marker in _DIVISION_MARKERS:
        found = gloss_join.find(marker)
        if found == -1:
            break
        offsets.append(found)

    if sorted(offsets) != offsets:
        retry = list(offsets)
        retry[0] = gloss_join.find(u'一[')
        # find() returns -1 when the marker is absent; -1 would pass the
        # ordering check and then be used as a real offset, so reject it.
        if retry[0] == -1 or sorted(retry) != retry:
            return None
        offsets = retry

    if not offsets:
        offsets.append(-2)
    return offsets


def _build_glosses(gloss_join, idx_div, sense_pos):
    """Cut ``gloss_join`` at the offsets in ``idx_div`` and build gloss dicts.

    Each division contributes one dict per non-empty '@'-separated piece.
    ``sense_pos`` is the ``pos`` value of the sense being processed.
    """
    built = []
    for ind, start in enumerate(idx_div):
        end = len(gloss_join) if ind == len(idx_div) - 1 else idx_div[ind + 1]
        # Nothing follows the two-character marker: stop.
        if end <= start + 2:
            break
        thisstr = gloss_join[start + 2:end]
        if start >= 0:
            # The text up to the first ")" is the part of speech of the
            # division; the glosses follow it.
            # NOTE(review): if no ")" is present, find() returns -1 and the
            # slices below are taken relative to -1 -- confirm this is wanted.
            close = thisstr.find(u')')
            pos = thisstr[:close].strip()
            pieces = thisstr[close + 1:].split(u'@')
        else:
            # Sense without any division marker (offset -2).
            pos = u''
            pieces = thisstr.split(u'@')
        for piece in pieces:
            text = piece.strip()
            if text:
                built.append(dict(
                    content=[text],
                    # NOTE(review): this stores a string in one case and a
                    # list in the other -- confirm consumers accept both.
                    pos=sense_pos if sense_pos.startswith(u'(') else [pos],
                    field=[u''],
                    related={},
                ))
    return built


for e in iter(temp.find()):
    new_sensesJa = []
    for ss in e['sensesJa']:
        gloss_join = u'@ '.join(_merge_split_glosses(ss['gloss']))

        idx_div = _locate_divisions(gloss_join)
        if idx_div is None:
            # Skip senses whose division markers cannot be ordered.
            print(u'idx_div disorder. %s' % e['_id'])
            continue

        # Debug aid for one specific entry.
        if e['_id'] == "JMdict_1586200":
            print(u'%s %s' % (idx_div, gloss_join))

        sense_pos = ss['pos']
        new_ss = dict(
            glosses=_build_glosses(gloss_join, idx_div, sense_pos),
            # NOTE(review): the key is spelled 'idoms' while the source key is
            # 'idioms'; kept as is because it is part of the stored schema.
            idoms=ss['idioms'],
            # A pos starting with u'〔' is kept as a postscript, e.g.
            # "〔感動詞「あや」に形容詞語幹「にく」の付いた語。「生」は当て字〕"
            ps=[sense_pos] if sense_pos.startswith(u'〔') else [],
        )
        new_sensesJa.append(new_ss)
    # end of: for ss in e['sensesJa']

    temp.update(
        {'_id': e['_id']},
        {'$set': {'new_sensesJa': new_sensesJa,
                  'last': '#new_sensesJa'}}
    )

# step1:
# idx_div disorder. JMdict_2148680
# idx_div disorder. JMdict_1288340
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment