Created
June 24, 2013 03:10
-
-
Save maowug/5847533 to your computer and use it in GitHub Desktop.
Note: `getPage` for the kotobank Daijirin dictionary (`?dic=daijirin`).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def getPage(urlPage):
    """Import the Daijirin entries linked from one kotobank index page.

    Fetches ``urlPage``, follows every anchor under ``div#listWrapper`` with
    ``?dic=daijirin`` appended, and merges what ``getWord`` returns for it
    into the database:

    * If one of the entry's written forms (the link text split on ``u'・'``)
      resolves to a single ``jp`` document, that document (or its parent,
      when ``pt`` is non-zero) receives ``sensesJa`` plus any forms it does
      not have yet, and one child document is saved per new form.
    * If no form matches any ``jp`` document, a new ``daijirin_<np>_<n>``
      parent is saved to ``jp`` together with one child per alternative form.
    * If some form matched but every match was ambiguous (several documents,
      none whose ``pt_title``/``title`` equals the title returned by
      ``getWord``), the entry is saved to the ``daijirin`` collection.

    Relies on module-level names that are not defined in this function:
    ``requests``, ``bs``, ``getWord``, ``jp``, ``daijirin`` and ``np``.
    NOTE(review): ``np`` looks like the current page number -- confirm.

    :param urlPage: URL of the index page to process.
    :return: None; all results are written to the database.
    """
    res = requests.get(urlPage)
    urlWords = bs(res.content).select('div#listWrapper a')
    BASE_url = r'http://kotobank.jp'
    for idx_word, uw in enumerate(urlWords):
        urlWord = BASE_url + uw['href'] + '?dic=daijirin'
        # The anchor whose href is just "/word" is not an entry; skip it.
        if urlWord == r'http://kotobank.jp/word?dic=daijirin':
            continue
        # Best effort: a failure on one word is reported and the remaining
        # words on the page are still processed.
        try:
            sensesJa, title = getWord(urlWord)
        except Exception as exc:
            print('%s %s %s' % (np, urlPage, exc))
            continue
        # The link text lists the written forms separated by a middle dot,
        # e.g. u'刀掛け・刀懸け'.
        titles = uw.string.strip().split(u'・')
        flagNotOnly = True
        flagNotFound = True
        for t in titles:
            es = list(jp.find({'title': t}))
            if not es:
                # No document carries this form; try the next one.
                continue
            flagNotFound = False
            the_e = None
            if len(es) == 1:
                # A single hit is accepted regardless of whether `title`
                # agrees with the stored title.
                e = es[0]
                the_e = e if e['pt'] == 0 else jp.find_one({'_id': e['pt']})
                flagNotOnly = False
            else:
                # Several hits: disambiguate by comparing the title returned
                # by getWord with the parent title (or own title) of each hit.
                for e in es:
                    e_title = e['pt_title'] if 'pt_title' in e else e['title']
                    if title == e_title:
                        the_e = e if e['pt'] == 0 else jp.find_one({'_id': e['pt']})
                        flagNotOnly = False
                        break
            e = the_e
            if not e:
                # Ambiguous (or parent lookup returned nothing); next form.
                continue
            # When the Daijirin title differs from the stored one, keep it as
            # an additional form instead of overwriting the stored title.
            if title != e['title']:
                titles_candidates = set(titles + [title])
            else:
                titles_candidates = set(titles)
            # Drop everything the document already has, either as a form or
            # as its title.
            forms_to_push = list(
                titles_candidates - set(e.get('forms', []) + [e['title']])
            )
            # Branch on the presence of the key, not on the list length:
            # some documents have `forms == []`.
            len_forms = 0
            if 'forms' in e:
                len_forms = len(e['forms'])
                jp.update(e, {
                    '$set': {'sensesJa': sensesJa, 'last': '#pushForms'},
                    '$push': {'forms': {'$each': forms_to_push}},
                })
            else:
                jp.update(e, {'$set': {
                    'sensesJa': sensesJa,
                    'forms': forms_to_push,
                    'last': '#setForms',
                }})
            # One child document per new form; the index continues after the
            # forms the parent already had.
            for idx, fm in enumerate(forms_to_push):
                jp.save(dict(
                    _id=e['_id'] + '[' + str(idx + 1 + len_forms) + ']',
                    freq=0,
                    pt=e['_id'],
                    pt_title=e['title'],
                    title=fm,
                    last='#pushedForm'
                ))
            # Only the first resolvable form updates a document.
            break
        # Reached when the loop over `titles` is exhausted or broken out of.
        if flagNotFound:
            # None of the forms exists yet: create a new parent entry.
            _id = 'daijirin_' + str(np) + '_' + str(idx_word + 1)
            # set([title]), not set(title): the latter would be the set of
            # the title's individual characters, so the title itself would
            # never be removed from the alternative forms.
            forms_besides = list(set(titles) - set([title]))
            ne = dict(
                _id=_id,
                sensesJa=sensesJa,
                pt=0,
                freq=0,
                title=title,
                forms=forms_besides,
                last='#flagNotFound'
            )
            jp.save(ne)
            for idx_fm, fm in enumerate(forms_besides):
                fm_id = _id + '[' + str(idx_fm + 1) + ']'
                fm_new = dict(
                    _id=fm_id,
                    freq=0,
                    pt=_id,
                    pt_title=title,
                    title=fm
                )
                jp.save(fm_new)
            continue
        if flagNotOnly:
            # Forms matched but could not be resolved to one document.
            # NOTE(review): this goes to `daijirin`, not `jp` -- presumably a
            # separate holding collection for manual review; confirm.
            _id = 'daijirin_' + str(np) + '_' + str(idx_word + 1)
            ne = dict(
                _id=_id,
                sensesJa=sensesJa,
                pt=0,
                freq=0,
                title=title,
                forms=titles,
                last='#flagNotOnly'
            )
            daijirin.save(ne)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment