maowug · December 19, 2015 02:29
diff --git a/temp_get_shibashiba.py b/temp_get_shibashiba.py
 #!/usr/bin/env python
 #encoding: utf-8
 #__author__ = 'actor2019'

 # daijirin is much neater than `daijisen`

 #conn
 from pymongo import *

 conn = MongoClient()
 db=conn.dict3
 jp=db.jp
 daijirin2=db.daijirin2


 import requests
 from bs4 import BeautifulSoup as bs
 from bs4 import NavigableString
 from dateutil.parser import *



 # todo:
 # http://kotobank.jp/dictionary/daijirin/1313/  {title:"しばしばめ"} => no sensesJa ??



 import re
 import ngram

 def getWord(urlWord):
    """Download one word page and extract its dictionary entries.

    Every ``li.word`` element inside the ``ul.word_dic`` list yields one
    sense record and one title.

    :param urlWord: URL of the word page to fetch.
    :return: tuple ``(sensesJa, titles)``.  ``sensesJa`` is a list of dicts
        with the keys ``gloss`` (list of sense strings), ``idioms`` (list of
        strings following the last ``［句］`` marker) and ``pos`` (text of the
        first node in the entry body, or ``u''``).  ``titles`` holds, in the
        same order, each headword cut at the first ``【``.
    """
    # Sense markers: a circled digit followed by two spaces.
    marker_re = re.compile(u'[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮❶❷❸❹❺❻]  ')

    page = requests.get(urlWord)
    entries = bs(page.content).find('ul', class_='word_dic').select('li.word')

    sensesJa = []
    titles = []
    for entry in entries:
        # Headword with all whitespace removed, e.g. "べつび【別火】".
        headword = u''.join(entry.find('b').string.split())

        # The part of speech, when present, is the bare text node that comes
        # first inside the body div.  An exact type check is used so that
        # NavigableString subclasses do not count.
        first_node = entry.find('div', class_='NetDicBody').next_element
        if type(first_node) == NavigableString:
            raw_pos = u''.join(first_node.string.split())
        else:
            raw_pos = u''

        gloss = []
        idioms = []
        for position, chunk in enumerate(marker_re.split(entry.get_text())):
            sense = u''.join(chunk.split())

            if position == 0:
                # Only the leading chunk can contain the headword and the
                # part of speech; strip both and drop near-empty leftovers.
                sense = sense.replace(headword, u'').replace(raw_pos, u'').strip()
                if len(sense) <= 2:
                    continue

            if sense == u'':
                continue

            if u'［句］' in sense:
                idioms = sense[sense.rfind(u'［句］') + 3:].split(u'・')

            if u'［可能］' in sense:
                sense = sense[:sense.rfind(u'［可能］')]

            gloss.append(sense)

        sensesJa.append({'gloss': gloss, 'idioms': idioms, 'pos': raw_pos})
        titles.append(headword.partition(u'【')[0].strip())

    return sensesJa, titles



 def getPage(urlPage):
    """
    :param urlPage:
    :return:
    """
    res = requests.get(urlPage)
    urlWords = bs(res.content).select('div#listWrapper a')
    BASE_url=r'http://kotobank.jp'
    for idx_word,uw in enumerate(urlWords):


        # titles look like:
        #   刀折れ，矢尽きる
        #   寬・漢・乾・冠 ......
        titles=uw.string.strip().split(u'・')
        flagNotOnly=True
        flagNotFound=True


        # if find sensesJa in r, continue
        ssJaFlag=False
        for t in titles:
            rts= list(jp.find({'title':t}))
            if rts:
                te=rts[0] if rts[0]['pt']==0 else jp.find_one({'_id':rts[0]['pt']})
                if te.has_key('sensesJa'):
                    ssJaFlag=True
                    break
        if ssJaFlag:
            continue
        else:
            print u'/'.join(titles)


        # bug of daijisen: http://kotobank.jp/word?dic=daijirin
        urlWord= BASE_url+uw['href']+'?dic=daijirin'
        if urlWord==r'http://kotobank.jp/word?dic=daijirin':
            continue


        # try every word, if failed, continue to next word
        try:
            sensesJa,titles_fromWord=getWord(urlWord)
            if len(titles_fromWord)>1:

                len_sj=[len(str(sj)) for sj in sensesJa]
                idx_max=len_sj.index(max(len_sj))
                title=titles_fromWord[idx_max]
            else:
                # default to: title(hiragana) with most meaningful definition
                title=titles_fromWord[0]
        except Exception,exc:
            print urlPage,u'/'.join(titles),exc
            continue


        # print title
        # print sensesJa


        # for t in titles: 刀掛け・刀懸け
        for t in titles:
            es= list(jp.find({'title':t}))
            #if find this title
            if es:
                flagNotFound=False
                # : and the title have only one match
                the_e=None

                # len(es)==1, no matter `title` and e['title'] sa
                if len(es)==1:
                    e=es[0]
                    the_e = e if e['pt']==0 else jp.find_one({'_id':e['pt']})
                    flagNotOnly=False
                else:
                    for e in es:
                        e_title= e['pt_title'] if e.has_key('pt_title') else e['title']
                        # if the title return from getWord==e['title] (should be hiragana)
                        if e_title in titles_fromWord:
                            title=e_title
                            # print u' find: '+ t+u' of '+ u';'.join(titles)
                            the_e = e if e['pt']==0 else jp.find_one({'_id':e['pt']})

                            print '---------------'
                            print jp.find(the_e).count()

                            flagNotOnly=False
                            break
                        else:
                            continue

                e=the_e
                if e:

                    # # check title in case `if len(es)==1` : if not equal, append it to forms
                    #       偏照り・片照り 【かたでり】 but in jp: `title: "へんでり"`
                    # titles_candidates= set(titles+[title]) if title!=e['title'] else set(titles)

                    # import copy
                    # titles_candidates= set(titles+titles_fromWord) \
                    #     if e['title'] not in titles_fromWord \
                    #     else set(titles+copy.deepcopy(titles_fromWord).remove(title))

                    # remove title already exists (already in the `e['forms'] + e['title']` )
                    #   =note that= `e.get('forms',[]).append(e['title']))` returns None
                    forms_to_push = list(set(titles+titles_fromWord) - set(e.get('forms', [])+[e['title']]))
                    # print forms_to_push

                    #some have forms==[]：
                    # so check if len_forms=(e.get('forms',[])) equals 0 won't make sense
                    len_forms=0
                    if e.has_key('forms'):
                        len_forms=len(e['forms'])


                        if not e.has_key('sensesJa'):
                            print 'jp.update 1',e['title'],e['_id'],forms_to_push

                            print jp.find(e).count()
                            jp.update({"_id":e['_id']},{'$set':{'sensesJa':sensesJa,'last':'#pushForms'},
                                         '$push':{'forms':{'$each':forms_to_push}}})
                            # print db.get_last_error()
                    else:
                        # print 'jp.update 2'
                        if not e.has_key('sensesJa'):
                            jp.update(e,{'$set':{'sensesJa':sensesJa,'forms':forms_to_push,'last':'#setForms'}})

                    for idx,fm in enumerate(forms_to_push):

                        if not e.has_key('sensesJa'):
                            jp.save(dict(
                                _id=e['_id']+'['+str(idx+1+len_forms)+']',
                                freq=0,
                                pt=e['_id'],
                                pt_title=e['title'],
                                title=fm,
                                last='#pushedForm'
                            ))

                    #update the only one
                    break

                # : and the title have many matches, continue to next title
                else:
                    continue
            # : didn't find any matches of this title, continue to next title
            else:
                continue
        # loop of `for t in titles` exhausts or break
        if flagNotFound:
            # print u'Not find: '+ u';'.join(titles)

            # save the word didn't find
            _id='daijirin_'+str(np)+'_'+str(idx_word+1)

            if jp.find_one({'_id':_id})!=None:
                # todo note that: set([title]) and set(title) would be very different when title is an unicode/str
                forms_besides=list(set(titles+titles_fromWord)-set([title]))
                ne=dict(
                    _id=_id,
                    sensesJa=sensesJa,
                    pt=0,
                    freq=0,
                    title=title,
                    forms=forms_besides,
                    last='#flagNotFound'
                )
                jp.save(ne)
                # print 'flagNotFound'

                if forms_besides:
                    for idx_fm,fm in enumerate(forms_besides):
                        fm_id=_id+'['+str(idx_fm+1)+']'
                        fm_new=dict(
                            _id=fm_id,
                            freq= 0,
                            pt=_id,
                            pt_title=title,
                            title=fm
                        )
                        jp.save(fm_new)
                        # print 'flagNotFound forms'
                continue
                # eof - flagNotFound

        if flagNotOnly:
            # print u'Not only: '+ u';'.join(titles)
            _id='daijirin_'+str(np)+'_'+str(idx_word+1)
            ne=dict(
                _id=_id,
                sensesJa=sensesJa,
                pt=0,
                freq=0,
                title=title,
                forms=titles,
                last='#flagNotOnly'
            )
            # print 'daijirin.save(ne)'
            daijirin2.save(ne)



 # http://kotobank.jp/dictionary/daijirin/5/
 # アイコラ



 for np in xrange(1,3249+1):
    urlPage=r'http://kotobank.jp/dictionary/daijirin/'+str(np)+'/'
    print urlPage
    getPage(urlPage)

diff --git a/temp_shibashiba.py b/temp_shibashiba.py
 #!/usr/bin/env python
 #encoding: utf-8
 #__author__ = 'actor2019'

 # daijirin is much neater than `daijisen`

 #conn
 from pymongo import *

 conn = MongoClient()
 db=conn.dict3
 jp=db.jp
 daijirin=db.daijirin


 import requests
 from bs4 import BeautifulSoup as bs
 from bs4 import NavigableString
 from dateutil.parser import *



 # todo:
 # http://kotobank.jp/dictionary/daijirin/1313/  {title:"しばしばめ"} => no sensesJa ??



 import re
 import ngram

 def getWord(urlWord):
    """Download one word page and extract its dictionary entries.

    Every ``li.word`` element inside the ``ul.word_dic`` list yields one
    sense record and one title.

    :param urlWord: URL of the word page to fetch.
    :return: tuple ``(sensesJa, titles)``.  ``sensesJa`` is a list of dicts
        with the keys ``gloss`` (list of sense strings), ``idioms`` (list of
        strings following the last ``［句］`` marker) and ``pos`` (text of the
        first node in the entry body, or ``u''``).  ``titles`` holds, in the
        same order, each headword cut at the first ``【``.
    """
    # Sense markers: a circled digit followed by two spaces.
    marker_re = re.compile(u'[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮❶❷❸❹❺❻]  ')

    page = requests.get(urlWord)
    entries = bs(page.content).find('ul', class_='word_dic').select('li.word')

    sensesJa = []
    titles = []
    for entry in entries:
        # Headword with all whitespace removed, e.g. "べつび【別火】".
        headword = u''.join(entry.find('b').string.split())

        # The part of speech, when present, is the bare text node that comes
        # first inside the body div.  An exact type check is used so that
        # NavigableString subclasses do not count.
        first_node = entry.find('div', class_='NetDicBody').next_element
        if type(first_node) == NavigableString:
            raw_pos = u''.join(first_node.string.split())
        else:
            raw_pos = u''

        gloss = []
        idioms = []
        for position, chunk in enumerate(marker_re.split(entry.get_text())):
            sense = u''.join(chunk.split())

            if position == 0:
                # Only the leading chunk can contain the headword and the
                # part of speech; strip both and drop near-empty leftovers.
                sense = sense.replace(headword, u'').replace(raw_pos, u'').strip()
                if len(sense) <= 2:
                    continue

            if sense == u'':
                continue

            if u'［句］' in sense:
                idioms = sense[sense.rfind(u'［句］') + 3:].split(u'・')

            if u'［可能］' in sense:
                sense = sense[:sense.rfind(u'［可能］')]

            gloss.append(sense)

        sensesJa.append({'gloss': gloss, 'idioms': idioms, 'pos': raw_pos})
        titles.append(headword.partition(u'【')[0].strip())

    return sensesJa, titles



 def getPage(urlPage):
    """
    :param urlPage:
    :return:
    """
    res = requests.get(urlPage)
    urlWords = bs(res.content).select('div#listWrapper a')
    BASE_url=r'http://kotobank.jp'
    for idx_word,uw in enumerate(urlWords):

         # see what's going on when: http://kotobank.jp/dictionary/daijirin/1313/  {title:"しばしばめ"}
        if idx_word != 4:
            continue

        # titles look like:
        #   刀折れ，矢尽きる
        #   寬・漢・乾・冠 ......
        titles=uw.string.strip().split(u'・')
        flagNotOnly=True
        flagNotFound=True

        # bug of daijisen: http://kotobank.jp/word?dic=daijirin
        urlWord= BASE_url+uw['href']+'?dic=daijirin'
        if urlWord==r'http://kotobank.jp/word?dic=daijirin':
            continue

        print urlWord

        # try every word, if failed, continue to next word
        try:
            sensesJa,titles_fromWord=getWord(urlWord)
            if len(titles_fromWord)>1:

                len_sj=[len(str(sj)) for sj in sensesJa]
                idx_max=len_sj.index(max(len_sj))
                title=titles_fromWord[idx_max]
            else:
                # default to: title(hiragana) with most meaningful definition
                title=titles_fromWord[0]
        except Exception,exc:
            print urlPage,u'/'.join(titles),exc
            continue

        print title
        # print sensesJa


        # for t in titles: 刀掛け・刀懸け
        for t in titles:
            es= list(jp.find({'title':t}))
            #if find this title
            if es:
                flagNotFound=False
                # : and the title have only one match
                the_e=None

                # len(es)==1, no matter `title` and e['title'] sa
                if len(es)==1:
                    e=es[0]
                    the_e = e if e['pt']==0 else jp.find_one({'_id':e['pt']})
                    flagNotOnly=False
                else:
                    for e in es:
                        e_title= e['pt_title'] if e.has_key('pt_title') else e['title']
                        # if the title return from getWord==e['title] (should be hiragana)
                        if e_title in titles_fromWord:
                            title=e_title
                            # print u' find: '+ t+u' of '+ u';'.join(titles)
                            the_e = e if e['pt']==0 else jp.find_one({'_id':e['pt']})
                            flagNotOnly=False
                            break
                        else:
                            continue

                e=the_e
                if e:

                    # # check title in case `if len(es)==1` : if not equal, append it to forms
                    #       偏照り・片照り 【かたでり】 but in jp: `title: "へんでり"`
                    # titles_candidates= set(titles+[title]) if title!=e['title'] else set(titles)

                    # import copy
                    # titles_candidates= set(titles+titles_fromWord) \
                    #     if e['title'] not in titles_fromWord \
                    #     else set(titles+copy.deepcopy(titles_fromWord).remove(title))

                    # remove title already exists (already in the `e['forms'] + e['title']` )
                    #   =note that= `e.get('forms',[]).append(e['title']))` returns None
                    forms_to_push = list(set(titles+titles_fromWord) - set(e.get('forms', [])+[e['title']]))
                    print forms_to_push

                    #some have forms==[]：
                    # so check if len_forms=(e.get('forms',[])) equals 0 won't make sense
                    len_forms=0
                    if e.has_key('forms'):
                        len_forms=len(e['forms'])
                        print 'jp.update 1'
                        print e['_id']
                        jp.update(e,{'$set':{'sensesJa':sensesJa,'last':'#pushForms'},
                                     '$push':{'forms':{'$each':forms_to_push}}})
                    else:
                        print 'jp.update 2'
                        # jp.update(e,{'$set':{'sensesJa':sensesJa,'forms':forms_to_push,'last':'#setForms'}})

                    for idx,fm in enumerate(forms_to_push):
                        print 'jp.save form 3'
                        # jp.save(dict(
                        #     _id=e['_id']+'['+str(idx+1+len_forms)+']',
                        #     freq=0,
                        #     pt=e['_id'],
                        #     pt_title=e['title'],
                        #     title=fm,
                        #     last='#pushedForm'
                        # ))

                    #update the only one
                    break

                # : and the title have many matches, continue to next title
                else:
                    continue
            # : didn't find any matches of this title, continue to next title
            else:
                continue
        # loop of `for t in titles` exhausts or break
        if flagNotFound:
            # print u'Not find: '+ u';'.join(titles)

            # save the word didn't find
            _id='daijirin_'+str(np)+'_'+str(idx_word+1)

            # todo note that: set([title]) and set(title) would be very different when title is an unicode/str
            forms_besides=list(set(titles+titles_fromWord)-set([title]))
            ne=dict(
                _id=_id,
                sensesJa=sensesJa,
                pt=0,
                freq=0,
                title=title,
                forms=forms_besides,
                last='#flagNotFound'
            )
            # jp.save(ne)
            print 'flagNotFound'

            if forms_besides:
                for idx_fm,fm in enumerate(forms_besides):
                    fm_id=_id+'['+str(idx_fm+1)+']'
                    fm_new=dict(
                        _id=fm_id,
                        freq= 0,
                        pt=_id,
                        pt_title=title,
                        title=fm
                    )
                    # jp.save(fm_new)
                    print 'flagNotFound forms'
            continue
            # eof - flagNotFound

        if flagNotOnly:
            # print u'Not only: '+ u';'.join(titles)
            _id='daijirin_'+str(np)+'_'+str(idx_word+1)
            ne=dict(
                _id=_id,
                sensesJa=sensesJa,
                pt=0,
                freq=0,
                title=title,
                forms=titles,
                last='#flagNotOnly'
            )
            print 'daijirin.save(ne)'
            # daijirin.save(ne)



 # urlWord=r'http://kotobank.jp/word/%E7%89%87%E7%A9%8D%E3%81%BF?dic=daijirin'
 # print getWord(urlWord)



 # Debug run over the single index page 1313.
 for np in xrange(1313, 1314):
    urlPage = ''.join([r'http://kotobank.jp/dictionary/daijirin/', str(np), '/'])
    getPage(urlPage)
	#!/usr/bin/env python
	#encoding: utf-8
	#__author__ = 'actor2019'

	# daijirin is much neater than `daijisen`

	#conn
	from pymongo import *

	conn = MongoClient()
	db=conn.dict3
	jp=db.jp
	daijirin2=db.daijirin2


	import requests
	from bs4 import BeautifulSoup as bs
	from bs4 import NavigableString
	from dateutil.parser import *



	# todo:
	# http://kotobank.jp/dictionary/daijirin/1313/ {title:"しばしばめ"} => no sensesJa ??



	import re
	import ngram

	def getWord(urlWord):
	    """Download one word page and extract its dictionary entries.

	    Every ``li.word`` element inside the ``ul.word_dic`` list yields one
	    sense record and one title.

	    :param urlWord: URL of the word page to fetch.
	    :return: tuple ``(sensesJa, titles)``.  ``sensesJa`` is a list of dicts
	        with the keys ``gloss`` (list of sense strings), ``idioms`` (list of
	        strings following the last ``［句］`` marker) and ``pos`` (text of the
	        first node in the entry body, or ``u''``).  ``titles`` holds, in the
	        same order, each headword cut at the first ``【``.
	    """
	    # Sense markers: a circled digit followed by a space.
	    # NOTE(review): whitespace in this copy looks collapsed; confirm how
	    # many spaces the pattern is meant to require.
	    marker_re = re.compile(u'[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮❶❷❸❹❺❻] ')

	    page = requests.get(urlWord)
	    entries = bs(page.content).find('ul', class_='word_dic').select('li.word')

	    sensesJa = []
	    titles = []
	    for entry in entries:
	        # Headword with all whitespace removed, e.g. "べつび【別火】".
	        headword = u''.join(entry.find('b').string.split())

	        # The part of speech, when present, is the bare text node that
	        # comes first inside the body div.  An exact type check is used so
	        # that NavigableString subclasses do not count.
	        first_node = entry.find('div', class_='NetDicBody').next_element
	        if type(first_node) == NavigableString:
	            raw_pos = u''.join(first_node.string.split())
	        else:
	            raw_pos = u''

	        gloss = []
	        idioms = []
	        for position, chunk in enumerate(marker_re.split(entry.get_text())):
	            sense = u''.join(chunk.split())

	            if position == 0:
	                # Only the leading chunk can contain the headword and the
	                # part of speech; strip both and drop near-empty leftovers.
	                sense = sense.replace(headword, u'').replace(raw_pos, u'').strip()
	                if len(sense) <= 2:
	                    continue

	            if sense == u'':
	                continue

	            if u'［句］' in sense:
	                idioms = sense[sense.rfind(u'［句］') + 3:].split(u'・')

	            if u'［可能］' in sense:
	                sense = sense[:sense.rfind(u'［可能］')]

	            gloss.append(sense)

	        sensesJa.append({'gloss': gloss, 'idioms': idioms, 'pos': raw_pos})
	        titles.append(headword.partition(u'【')[0].strip())

	    return sensesJa, titles



	def getPage(urlPage):
	"""
	:param urlPage:
	:return:
	"""
	res = requests.get(urlPage)
	urlWords = bs(res.content).select('div#listWrapper a')
	BASE_url=r'http://kotobank.jp'
	for idx_word,uw in enumerate(urlWords):


	# titles look like:
	# 刀折れ，矢尽きる
	# 寬・漢・乾・冠 ......
	titles=uw.string.strip().split(u'・')
	flagNotOnly=True
	flagNotFound=True


	# if find sensesJa in r, continue
	ssJaFlag=False
	for t in titles:
	rts= list(jp.find({'title':t}))
	if rts:
	te=rts[0] if rts[0]['pt']==0 else jp.find_one({'_id':rts[0]['pt']})
	if te.has_key('sensesJa'):
	ssJaFlag=True
	break
	if ssJaFlag:
	continue
	else:
	print u'/'.join(titles)


	# bug of daijisen: http://kotobank.jp/word?dic=daijirin
	urlWord= BASE_url+uw['href']+'?dic=daijirin'
	if urlWord==r'http://kotobank.jp/word?dic=daijirin':
	continue


	# try every word, if failed, continue to next word
	try:
	sensesJa,titles_fromWord=getWord(urlWord)
	if len(titles_fromWord)>1:

	len_sj=[len(str(sj)) for sj in sensesJa]
	idx_max=len_sj.index(max(len_sj))
	title=titles_fromWord[idx_max]
	else:
	# default to: title(hiragana) with most meaningful definition
	title=titles_fromWord[0]
	except Exception,exc:
	print urlPage,u'/'.join(titles),exc
	continue


	# print title
	# print sensesJa


	# for t in titles: 刀掛け・刀懸け
	for t in titles:
	es= list(jp.find({'title':t}))
	#if find this title
	if es:
	flagNotFound=False
	# : and the title have only one match
	the_e=None

	# len(es)==1, no matter `title` and e['title'] sa
	if len(es)==1:
	e=es[0]
	the_e = e if e['pt']==0 else jp.find_one({'_id':e['pt']})
	flagNotOnly=False
	else:
	for e in es:
	e_title= e['pt_title'] if e.has_key('pt_title') else e['title']
	# if the title return from getWord==e['title] (should be hiragana)
	if e_title in titles_fromWord:
	title=e_title
	# print u' find: '+ t+u' of '+ u';'.join(titles)
	the_e = e if e['pt']==0 else jp.find_one({'_id':e['pt']})

	print '---------------'
	print jp.find(the_e).count()

	flagNotOnly=False
	break
	else:
	continue

	e=the_e
	if e:

	# # check title in case `if len(es)==1` : if not equal, append it to forms
	# 偏照り・片照り【かたでり】 but in jp: `title: "へんでり"`
	# titles_candidates= set(titles+[title]) if title!=e['title'] else set(titles)

	# import copy
	# titles_candidates= set(titles+titles_fromWord) \
	# if e['title'] not in titles_fromWord \
	# else set(titles+copy.deepcopy(titles_fromWord).remove(title))

	# remove title already exists (already in the `e['forms'] + e['title']` )
	# =note that= `e.get('forms',[]).append(e['title']))` returns None
	forms_to_push = list(set(titles+titles_fromWord) - set(e.get('forms', [])+[e['title']]))
	# print forms_to_push

	#some have forms==[]：
	# so check if len_forms=(e.get('forms',[])) equals 0 won't make sense
	len_forms=0
	if e.has_key('forms'):
	len_forms=len(e['forms'])


	if not e.has_key('sensesJa'):
	print 'jp.update 1',e['title'],e['_id'],forms_to_push

	print jp.find(e).count()
	jp.update({"_id":e['_id']},{'$set':{'sensesJa':sensesJa,'last':'#pushForms'},
	'$push':{'forms':{'$each':forms_to_push}}})
	# print db.get_last_error()
	else:
	# print 'jp.update 2'
	if not e.has_key('sensesJa'):
	jp.update(e,{'$set':{'sensesJa':sensesJa,'forms':forms_to_push,'last':'#setForms'}})

	for idx,fm in enumerate(forms_to_push):

	if not e.has_key('sensesJa'):
	jp.save(dict(
	_id=e['_id']+'['+str(idx+1+len_forms)+']',
	freq=0,
	pt=e['_id'],
	pt_title=e['title'],
	title=fm,
	last='#pushedForm'
	))

	#update the only one
	break

	# : and the title have many matches, continue to next title
	else:
	continue
	# : didn't find any matches of this title, continue to next title
	else:
	continue
	# loop of `for t in titles` exhausts or break
	if flagNotFound:
	# print u'Not find: '+ u';'.join(titles)

	# save the word didn't find
	_id='daijirin_'+str(np)+'_'+str(idx_word+1)

	if jp.find_one({'_id':_id})!=None:
	# todo note that: set([title]) and set(title) would be very different when title is an unicode/str
	forms_besides=list(set(titles+titles_fromWord)-set([title]))
	ne=dict(
	_id=_id,
	sensesJa=sensesJa,
	pt=0,
	freq=0,
	title=title,
	forms=forms_besides,
	last='#flagNotFound'
	)
	jp.save(ne)
	# print 'flagNotFound'

	if forms_besides:
	for idx_fm,fm in enumerate(forms_besides):
	fm_id=_id+'['+str(idx_fm+1)+']'
	fm_new=dict(
	_id=fm_id,
	freq= 0,
	pt=_id,
	pt_title=title,
	title=fm
	)
	jp.save(fm_new)
	# print 'flagNotFound forms'
	continue
	# eof - flagNotFound

	if flagNotOnly:
	# print u'Not only: '+ u';'.join(titles)
	_id='daijirin_'+str(np)+'_'+str(idx_word+1)
	ne=dict(
	_id=_id,
	sensesJa=sensesJa,
	pt=0,
	freq=0,
	title=title,
	forms=titles,
	last='#flagNotOnly'
	)
	# print 'daijirin.save(ne)'
	daijirin2.save(ne)



	# http://kotobank.jp/dictionary/daijirin/5/
	# アイコラ



	for np in xrange(1,3249+1):
	urlPage=r'http://kotobank.jp/dictionary/daijirin/'+str(np)+'/'
	print urlPage
	getPage(urlPage)