Skip to content

Instantly share code, notes, and snippets.

@maowug
Created June 30, 2013 14:58
Show Gist options
  • Save maowug/5895465 to your computer and use it in GitHub Desktop.
Save maowug/5895465 to your computer and use it in GitHub Desktop.
Refine the sensesJa gloss list after updating ss_titles. File: jp_3_2_sensesJa_refineGlossList.py
# Rebuild every entry's `sensesJa` list into a `new_sensesJa` list and store
# it back on the same document.  Relies on names defined outside this
# section: `temp` (a collection-like object with `find`/`update`) and
# `isKanjiKana` (a single-character predicate).

# Markers that introduce a numbered division inside a joined gloss string,
# e.g. u'一(動バ上二)⇒のびる二(動バ下二)⇒のべる'.  Each marker is 2 chars long,
# which is why the splitting code skips `offset + 2`.
_DIV_MARKERS = [u'一(', u'二(', u'三(', u'四(']


def _merge_split_glosses(glosses):
    """Return a copy of *glosses* with mis-split fragments glued together.

    Two repairs are applied while walking the original list:

    * a fragment whose last character satisfies ``isKanjiKana`` and that has
      a successor absorbs that successor;
    * a fragment (not the first) whose first character does not satisfy
      ``isKanjiKana`` is appended to the nearest preceding surviving
      fragment, unless it starts with ``u'('``.

    Absorbed slots are dropped from the result; the input is not modified.
    """
    # Shallow copy is enough: slots are only rebound, never mutated in place.
    merged = list(glosses)
    last = len(glosses) - 1
    for idx, gls in enumerate(glosses):
        # case: "一(動バ上二)⇒のびる二(動バ下二)⇒のべる" split in the middle
        if merged[idx] and isKanjiKana(gls[-1]) and idx != last:
            merged[idx] += merged[idx + 1]
            merged[idx + 1] = None

        # case: "「さしゅ(叉手)", "」に同じ。"
        if idx > 0 and merged[idx] and not isKanjiKana(gls[0]):
            # case: (「おあいにくさま」の形で)相手の期待がはずれたことを...
            # A leading parenthesis opens a remark of its own; keep it.
            if gls[0] == u'(':
                continue
            if merged[idx - 1]:
                target = idx - 1
            elif idx >= 2 and merged[idx - 2]:
                # The previous slot was already absorbed; go one further back.
                # `idx >= 2` keeps `idx - 2` from wrapping around to the END
                # of the list (index -1) when idx == 1, which would glue the
                # fragment onto the last element -- or onto itself.
                target = idx - 2
            else:
                continue
            merged[target] += merged[idx]
            merged[idx] = None
    return [g for g in merged if g is not None]


def _find_division_offsets(gloss_join):
    """Return the start offsets of the division markers in *gloss_join*.

    Markers are searched in order and the search stops at the first one that
    is missing.  If the offsets are not ascending, the first offset is
    replaced by the position of ``u'一['`` and the order is checked again;
    ``None`` is returned when it is still not ascending.  When no marker is
    found at all the result is ``[-2]``: since callers skip ``offset + 2``
    characters, -2 makes the single segment start at index 0.
    """
    offsets = []
    for marker in _DIV_MARKERS:
        found = gloss_join.find(marker)
        if found == -1:
            break
        offsets.append(found)

    if sorted(offsets) != offsets:
        retry = list(offsets)
        # NOTE(review): find() yields -1 when u'一[' is absent; -1 sorts first
        # and is therefore accepted as an offset here -- confirm intended.
        retry[0] = gloss_join.find(u'一[')
        if sorted(retry) != retry:
            return None
        offsets = retry

    if not offsets:
        offsets.append(-2)
    return offsets


def _split_into_glosses(gloss_join, offsets, sense_pos):
    """Cut *gloss_join* at *offsets* and return the list of gloss dicts.

    Each segment runs from just after its 2-character marker to the next
    offset (or the end of the string).  For a real division (offset >= 0)
    the text up to the first ``u')'`` is the division's part of speech and
    the remainder is split on ``u'@'``; otherwise the whole segment is split.
    Blank pieces are dropped.
    """
    new_gloss = []
    for ind, start in enumerate(offsets):
        end = len(gloss_join) if ind == len(offsets) - 1 else offsets[ind + 1]
        if end <= start + 2:
            # Nothing between this marker and the next one; stop here.
            break
        segment = gloss_join[start + 2:end]
        if start >= 0:
            # NOTE(review): if u')' is missing, find() is -1 and the slice
            # drops the last character of the segment -- confirm acceptable.
            close = segment.find(u')')
            pos = segment[:close].strip()
            pieces = segment[close + 1:].split(u'@')
        else:
            pos = u''
            pieces = segment.split(u'@')
        for piece in pieces:
            text = piece.strip()
            if text:
                new_gloss.append(dict(
                    content=[text],
                    # NOTE(review): this is a plain string in the first case
                    # and a one-element list in the second -- confirm readers
                    # of `pos` expect both shapes.
                    pos=sense_pos if sense_pos.startswith(u'(') else [pos],
                    field=[u''],
                    related={}
                ))
    return new_gloss


# NOTE(review): documents are updated while the same collection is being
# iterated; verify the cursor cannot hand back an already-updated document.
for e in temp.find():
    new_sensesJa = []
    for ss in e['sensesJa']:
        gloss_join = u'@ '.join(_merge_split_glosses(ss['gloss']))

        idx_div = _find_division_offsets(gloss_join)
        if idx_div is None:
            # The sense is skipped entirely and will be absent from
            # new_sensesJa.  Single-argument print() behaves the same on
            # Python 2 and Python 3.
            print(u'idx_div disorder. %s' % e['_id'])
            continue

        # Debug trace for one specific entry.
        if e['_id'] == "JMdict_1586200":
            print(u'%s %s' % (idx_div, gloss_join))

        new_gloss = _split_into_glosses(gloss_join, idx_div, ss['pos'])

        # save pos to ps.: "〔感動詞「あや」に形容詞語幹「にく」の付いた語。「生」は当て字〕"
        new_ss = dict(
            glosses=new_gloss,
            # NOTE(review): the stored key is spelled 'idoms' (source key is
            # 'idioms'); kept as-is because downstream readers may rely on it.
            idoms=ss['idioms'],
            ps=[ss['pos']] if ss['pos'].startswith(u'〔') else []
        )
        new_sensesJa.append(new_ss)
    # eof: for ss in e['sensesJa']
    temp.update(
        {'_id': e['_id']},
        {'$set': {'new_sensesJa': new_sensesJa,
                  'last': '#new_sensesJa'}}
    )
# step1:
# idx_div disorder. JMdict_2148680
# idx_div disorder. JMdict_1288340
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment