Skip to content

Instantly share code, notes, and snippets.

@maowug
Created June 24, 2013 03:10
Show Gist options
  • Save maowug/5847533 to your computer and use it in GitHub Desktop.
Save maowug/5847533 to your computer and use it in GitHub Desktop.
note: getPage of `kotobank daijirin`
def _resolve_parent(entry):
    """Return ``entry`` when its ``pt`` field is 0, else the entry whose ``_id`` equals ``pt``.

    The lookup goes through ``jp.find_one`` and may therefore return ``None``
    when no such entry exists.
    """
    if entry['pt'] == 0:
        return entry
    return jp.find_one({'_id': entry['pt']})


def getPage(urlPage):
    """Merge every word linked from one kotobank index page into the ``jp`` collection.

    For each ``div#listWrapper a`` link on the page, the word's daijirin
    article is fetched with ``getWord`` (which yields ``(sensesJa, title)``)
    and the link text is split on ``・`` into its written forms, e.g.
    ``刀掛け・刀懸け``. Each form is looked up by ``title`` in ``jp``:

    * exactly one hit, or one hit among several whose reading equals the
      ``title`` returned by ``getWord``: that entry (or the entry its ``pt``
      points to) gets ``sensesJa`` and any missing forms, and one extra
      document is saved per newly added form;
    * no hit for any form: a new ``daijirin_<np>_<n>`` entry plus one
      document per form is saved to ``jp`` (``last='#flagNotFound'``);
    * hits exist but none could be singled out: the word is saved to
      ``daijirin`` instead (``last='#flagNotOnly'``).

    NOTE(review): ``np``, ``jp``, ``daijirin``, ``bs``, ``requests`` and
    ``getWord`` are not defined in this function; they must exist in the
    enclosing module. ``jp``/``daijirin`` look like MongoDB collections and
    ``np`` like the current page number -- confirm against the caller.

    :param urlPage: URL of the index page to process.
    :return: None; the function only writes to ``jp`` / ``daijirin``.
    """
    res = requests.get(urlPage)
    urlWords = bs(res.content).select('div#listWrapper a')
    BASE_url = r'http://kotobank.jp'

    for idx_word, uw in enumerate(urlWords):
        urlWord = BASE_url + uw['href'] + '?dic=daijirin'
        # Skip the link that resolves to the bare /word URL.
        if urlWord == r'http://kotobank.jp/word?dic=daijirin':
            continue

        # Best effort per word: report the failure and move on to the next link.
        try:
            sensesJa, title = getWord(urlWord)
        except Exception as exc:
            # Single formatted string so the output is the same on Python 2 and 3.
            print('%s %s %s' % (np, urlPage, exc))
            continue

        # Link text lists the written forms separated by a middle dot.
        titles = uw.string.strip().split(u'・')
        flagNotOnly = True   # stays True until one entry has been singled out
        flagNotFound = True  # stays True until some form has at least one hit

        for t in titles:
            es = list(jp.find({'title': t}))
            if not es:
                # No entry carries this form; try the next one.
                continue
            flagNotFound = False

            the_e = None
            if len(es) == 1:
                # A single hit is accepted without comparing readings.
                the_e = _resolve_parent(es[0])
                flagNotOnly = False
            else:
                # Several hits: pick the one whose reading equals the
                # reading returned by getWord.
                for e in es:
                    e_title = e['pt_title'] if 'pt_title' in e else e['title']
                    if title == e_title:
                        the_e = _resolve_parent(e)
                        flagNotOnly = False
                        break

            e = the_e
            if not e:
                # Nothing singled out (or the parent lookup returned
                # nothing); try the next form.
                continue

            # When the daijirin reading differs from the stored title
            # (e.g. 偏照り・片照り read かたでり but stored as へんでり),
            # keep the reading as an additional form.
            if title != e['title']:
                titles_candidates = set(titles + [title])
            else:
                titles_candidates = set(titles)
            # Drop what the entry already has in 'forms' or as its title.
            forms_to_push = list(
                titles_candidates - set(e.get('forms', []) + [e['title']])
            )

            # Branch on key presence rather than length: an existing but
            # empty 'forms' list must still take the $push path.
            len_forms = 0
            if 'forms' in e:
                len_forms = len(e['forms'])
                jp.update(e, {
                    '$set': {'sensesJa': sensesJa, 'last': '#pushForms'},
                    '$push': {'forms': {'$each': forms_to_push}},
                })
            else:
                jp.update(e, {
                    '$set': {
                        'sensesJa': sensesJa,
                        'forms': forms_to_push,
                        'last': '#setForms',
                    },
                })

            # One child document per new form, numbered after existing forms.
            for idx, fm in enumerate(forms_to_push):
                jp.save(dict(
                    _id=e['_id'] + '[' + str(idx + 1 + len_forms) + ']',
                    freq=0,
                    pt=e['_id'],
                    pt_title=e['title'],
                    title=fm,
                    last='#pushedForm',
                ))
            # Only the first resolved entry is updated.
            break

        if flagNotFound:
            # No form matched anything: create the entry and its forms.
            _id = 'daijirin_' + str(np) + '_' + str(idx_word + 1)
            # Remove the title as a whole string; set(title) would split it
            # into single characters and never remove the title itself.
            forms_besides = list(set(titles) - {title})
            jp.save(dict(
                _id=_id,
                sensesJa=sensesJa,
                pt=0,
                freq=0,
                title=title,
                forms=forms_besides,
                last='#flagNotFound',
            ))
            for idx_fm, fm in enumerate(forms_besides):
                jp.save(dict(
                    _id=_id + '[' + str(idx_fm + 1) + ']',
                    freq=0,
                    pt=_id,
                    pt_title=title,
                    title=fm,
                ))
            continue

        if flagNotOnly:
            # Hits existed but none could be singled out; this goes to
            # `daijirin`, not `jp`.
            _id = 'daijirin_' + str(np) + '_' + str(idx_word + 1)
            daijirin.save(dict(
                _id=_id,
                sensesJa=sensesJa,
                pt=0,
                freq=0,
                title=title,
                forms=titles,
                last='#flagNotOnly',
            ))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment