maowug · June 30, 2013 14:58
diff --git a/jp_3_2_sensesJa_refineGlossList.py b/jp_3_2_sensesJa_refineGlossList.py
 # Rebuild the Japanese senses of every document in `temp`:
 #   1. re-join gloss fragments that were split in the wrong place,
 #   2. cut each sense at its numbered markers (u'一（' ... u'四（'),
 #   3. store the result under 'new_sensesJa' (the original 'sensesJa'
 #      field is only read, never modified).
 # Each sense `ss` is read through its 'gloss', 'pos' and 'idioms' keys.
 for e in temp.find():
    new_sensesJa = []
    for ss in e['sensesJa']:
        new_gloss = []
        # Working copy: merged-away fragments are replaced by None here,
        # while ss['gloss'] keeps supplying the untouched fragments.
        gloss_cp = list(ss['gloss'])
        last_idx = len(gloss_cp) - 1

        # combine mis-split glosses
        for idx_gls, gls in enumerate(ss['gloss']):
            # A fragment that is not the last one and whose final character
            # is accepted by isKanjiKana absorbs the fragment after it.
            # case: "一（動バ上二）⇒のびる二（動バ下二）⇒のべる"
            if (gloss_cp[idx_gls] and idx_gls != last_idx
                    and isKanjiKana(gls[-1])):
                gloss_cp[idx_gls] += gloss_cp[idx_gls + 1]
                gloss_cp[idx_gls + 1] = None

            # A fragment that is not the first one and whose first character
            # is rejected by isKanjiKana is appended to an earlier fragment.
            # case:  "「さしゅ（叉手）",
            #        "」に同じ。"
            if idx_gls > 0 and gloss_cp[idx_gls] and not isKanjiKana(gls[0]):

                # A fragment opening with a full-width parenthesis stays
                # on its own.
                # case: （「おあいにくさま」の形で）相手の期待がはずれたことを...
                if gls[0] == u'（':
                    continue

                # The previous slot is empty (already merged away): fall back
                # to the slot before it. The idx_gls >= 2 guard stops the
                # index from going negative and wrapping to the list's end.
                if not gloss_cp[idx_gls - 1]:
                    if idx_gls >= 2 and gloss_cp[idx_gls - 2]:
                        gloss_cp[idx_gls - 2] += gloss_cp[idx_gls]
                        gloss_cp[idx_gls] = None
                    continue

                # default: append to the previous fragment
                gloss_cp[idx_gls - 1] += gloss_cp[idx_gls]
                gloss_cp[idx_gls] = None

        gloss_cp = [g for g in gloss_cp if g is not None]
        gloss_join = u'@ '.join(gloss_cp)

        # Offsets of the numbered markers, collected until the first marker
        # that does not occur in the joined text.
        idx_div = []
        for thisdiv in [u'一（', u'二（', u'三（', u'四（']:
            idx_thisdiv = gloss_join.find(thisdiv)
            if idx_thisdiv == -1:
                break
            idx_div.append(idx_thisdiv)

        # check idx order
        if sorted(idx_div) != idx_div:
            # Retry with the bracket form of the first marker.
            temp_idx_div = list(idx_div)
            temp_idx_div[0] = gloss_join.find(u'一［')
            # find() returns -1 when the bracket form is absent; -1 would
            # sort first and look "ordered", so reject it explicitly.
            if temp_idx_div[0] == -1 or sorted(temp_idx_div) != temp_idx_div:
                print(u'idx_div disorder. %s' % e['_id'])
                # The whole sense is skipped: nothing is appended for it.
                continue
            idx_div = temp_idx_div

        # sense with no u'一（',u'二（', ...
        # -2 is a sentinel: the slice below starts at idx + 2, i.e. at 0.
        if len(idx_div) == 0:
            idx_div.append(-2)

        # split gloss_join with idx_div and @

        # Debug trace for one specific entry.
        if e['_id'] == "JMdict_1586200":
            print(u'%s %s' % (idx_div, gloss_join))

        # 'pos' of the source sense; reused for every gloss built below.
        sense_pos = ss['pos']

        # NOTE(review): text located before the first marker is not copied
        # into any gloss, because slicing starts at the first marker.
        for ind, idx_thisdiv in enumerate(idx_div):
            if ind == len(idx_div) - 1:
                idx_nextdiv = len(gloss_join)
            else:
                idx_nextdiv = idx_div[ind + 1]
            # Nothing after the two-character marker: stop splitting.
            if idx_nextdiv <= idx_thisdiv + 2:
                break
            thisstr = gloss_join[idx_thisdiv + 2:idx_nextdiv]

            pos = u''
            gloss_text = thisstr
            # idx_thisdiv not -2: the text up to the closing parenthesis is
            # the part of speech, the remainder holds the glosses.
            if idx_thisdiv >= 0:
                idx_close = thisstr.find(u'）')
                # Without a closing parenthesis there is no part of speech
                # to extract; keep the whole text as gloss content.
                if idx_close != -1:
                    pos = thisstr[:idx_close].strip()
                    gloss_text = thisstr[idx_close + 1:]

            for tg in gloss_text.split(u'@'):
                tgs = tg.strip()
                if tgs:
                    # NOTE(review): 'pos' is a plain string in the first
                    # branch but a one-element list in the second; confirm
                    # which shape readers of 'new_sensesJa' expect.
                    new_g = dict(
                        content=[tgs],
                        pos=sense_pos if sense_pos.startswith(u'（') else [pos],
                        field=[u''],
                        related={}
                    )
                    new_gloss.append(new_g)

        # save pos to ps.: "〔感動詞「あや」に形容詞語幹「にく」の付いた語。「生」は当て字〕"
        # NOTE(review): the key is spelled 'idoms' although it is read from
        # 'idioms'; left unchanged because it is part of the stored schema.
        new_ss = dict(
            glosses=new_gloss,
            idoms=ss['idioms'],
            ps=[sense_pos] if sense_pos.startswith(u'〔') else []
        )
        new_sensesJa.append(new_ss)

        #eof: for ss in e['sensesJa']
    temp.update(
        {'_id': e['_id']},
        {'$set': {'new_sensesJa': new_sensesJa,
                  'last': '#new_sensesJa'}}
    )

    # step1 run log -- entries reported by the "idx_div disorder." check:
    # idx_div disorder. JMdict_2148680
    # idx_div disorder. JMdict_1288340
	# Rebuild the Japanese senses of every document in `temp`:
	#   1. re-join gloss fragments that were split in the wrong place,
	#   2. cut each sense at its numbered markers (u'一（' ... u'四（'),
	#   3. store the result under 'new_sensesJa' (the original 'sensesJa'
	#      field is only read, never modified).
	# Each sense `ss` is read through its 'gloss', 'pos' and 'idioms' keys.
	for e in temp.find():
		new_sensesJa = []
		for ss in e['sensesJa']:
			new_gloss = []
			# Working copy: merged-away fragments are replaced by None here,
			# while ss['gloss'] keeps supplying the untouched fragments.
			gloss_cp = list(ss['gloss'])
			last_idx = len(gloss_cp) - 1

			# combine mis-split glosses
			for idx_gls, gls in enumerate(ss['gloss']):
				# A fragment that is not the last one and whose final character
				# is accepted by isKanjiKana absorbs the fragment after it.
				# case: "一（動バ上二）⇒のびる二（動バ下二）⇒のべる"
				if (gloss_cp[idx_gls] and idx_gls != last_idx
						and isKanjiKana(gls[-1])):
					gloss_cp[idx_gls] += gloss_cp[idx_gls + 1]
					gloss_cp[idx_gls + 1] = None

				# A fragment that is not the first one and whose first character
				# is rejected by isKanjiKana is appended to an earlier fragment.
				# case: "「さしゅ（叉手）",
				#       "」に同じ。"
				if idx_gls > 0 and gloss_cp[idx_gls] and not isKanjiKana(gls[0]):

					# A fragment opening with a full-width parenthesis stays
					# on its own.
					# case: （「おあいにくさま」の形で）相手の期待がはずれたことを...
					if gls[0] == u'（':
						continue

					# The previous slot is empty (already merged away): fall
					# back to the slot before it. The idx_gls >= 2 guard stops
					# the index from going negative and wrapping to the end.
					if not gloss_cp[idx_gls - 1]:
						if idx_gls >= 2 and gloss_cp[idx_gls - 2]:
							gloss_cp[idx_gls - 2] += gloss_cp[idx_gls]
							gloss_cp[idx_gls] = None
						continue

					# default: append to the previous fragment
					gloss_cp[idx_gls - 1] += gloss_cp[idx_gls]
					gloss_cp[idx_gls] = None

			gloss_cp = [g for g in gloss_cp if g is not None]
			gloss_join = u'@ '.join(gloss_cp)

			# Offsets of the numbered markers, collected until the first
			# marker that does not occur in the joined text.
			idx_div = []
			for thisdiv in [u'一（', u'二（', u'三（', u'四（']:
				idx_thisdiv = gloss_join.find(thisdiv)
				if idx_thisdiv == -1:
					break
				idx_div.append(idx_thisdiv)

			# check idx order
			if sorted(idx_div) != idx_div:
				# Retry with the bracket form of the first marker.
				temp_idx_div = list(idx_div)
				temp_idx_div[0] = gloss_join.find(u'一［')
				# find() returns -1 when the bracket form is absent; -1 would
				# sort first and look "ordered", so reject it explicitly.
				if temp_idx_div[0] == -1 or sorted(temp_idx_div) != temp_idx_div:
					print(u'idx_div disorder. %s' % e['_id'])
					# The whole sense is skipped: nothing is appended for it.
					continue
				idx_div = temp_idx_div

			# sense with no u'一（',u'二（', ...
			# -2 is a sentinel: the slice below starts at idx + 2, i.e. at 0.
			if len(idx_div) == 0:
				idx_div.append(-2)

			# split gloss_join with idx_div and @

			# Debug trace for one specific entry.
			if e['_id'] == "JMdict_1586200":
				print(u'%s %s' % (idx_div, gloss_join))

			# 'pos' of the source sense; reused for every gloss built below.
			sense_pos = ss['pos']

			# NOTE(review): text located before the first marker is not copied
			# into any gloss, because slicing starts at the first marker.
			for ind, idx_thisdiv in enumerate(idx_div):
				if ind == len(idx_div) - 1:
					idx_nextdiv = len(gloss_join)
				else:
					idx_nextdiv = idx_div[ind + 1]
				# Nothing after the two-character marker: stop splitting.
				if idx_nextdiv <= idx_thisdiv + 2:
					break
				thisstr = gloss_join[idx_thisdiv + 2:idx_nextdiv]

				pos = u''
				gloss_text = thisstr
				# idx_thisdiv not -2: the text up to the closing parenthesis
				# is the part of speech, the remainder holds the glosses.
				if idx_thisdiv >= 0:
					idx_close = thisstr.find(u'）')
					# Without a closing parenthesis there is no part of speech
					# to extract; keep the whole text as gloss content.
					if idx_close != -1:
						pos = thisstr[:idx_close].strip()
						gloss_text = thisstr[idx_close + 1:]

				for tg in gloss_text.split(u'@'):
					tgs = tg.strip()
					if tgs:
						# NOTE(review): 'pos' is a plain string in the first
						# branch but a one-element list in the second; confirm
						# which shape readers of 'new_sensesJa' expect.
						new_g = dict(
							content=[tgs],
							pos=sense_pos if sense_pos.startswith(u'（') else [pos],
							field=[u''],
							related={}
						)
						new_gloss.append(new_g)

			# save pos to ps.: "〔感動詞「あや」に形容詞語幹「にく」の付いた語。「生」は当て字〕"
			# NOTE(review): the key is spelled 'idoms' although it is read from
			# 'idioms'; left unchanged because it is part of the stored schema.
			new_ss = dict(
				glosses=new_gloss,
				idoms=ss['idioms'],
				ps=[sense_pos] if sense_pos.startswith(u'〔') else []
			)
			new_sensesJa.append(new_ss)

			#eof: for ss in e['sensesJa']
		temp.update(
			{'_id': e['_id']},
			{'$set': {'new_sensesJa': new_sensesJa,
				'last': '#new_sensesJa'}}
		)

	# step1 run log -- entries reported by the "idx_div disorder." check:
	# idx_div disorder. JMdict_2148680
	# idx_div disorder. JMdict_1288340