Skip to content

Instantly share code, notes, and snippets.

@maowug
Last active December 19, 2015 02:29
Show Gist options
  • Save maowug/5883317 to your computer and use it in GitHub Desktop.
Save maowug/5883317 to your computer and use it in GitHub Desktop.
mongodb (pymongo) `update` didn't work (using jp.update(_id,{...}) instead) & init from cPickle
#!/usr/bin/env python
#encoding: utf-8
#__author__ = 'actor2019'
# daijirin is much neater than `daijisen`
#conn
from pymongo import *
# Shared MongoDB handles: default local server, database `dict3`.
conn = MongoClient()
db = conn['dict3']
# `jp` holds the dictionary entries; `daijirin2` receives the ambiguous ones.
jp = db['jp']
daijirin2 = db['daijirin2']
import requests
from bs4 import BeautifulSoup as bs
from bs4 import NavigableString
from dateutil.parser import *
# todo:
# http://kotobank.jp/dictionary/daijirin/1313/ {title:"しばしばめ"} => no sensesJa ??
import re
import ngram
def getWord(urlWord):
    """Fetch one kotobank word page and extract its Daijirin senses.

    Every ``li.word`` item inside the page's ``ul.word_dic`` list is treated
    as one headword. The item's text is split on circled sense numbers and
    each piece becomes one gloss.

    :param urlWord: URL of the word page,
        e.g. ``http://kotobank.jp/word/...?dic=daijirin``
    :return: tuple ``(sensesJa, titles)`` of two parallel lists with one
        element per headword. ``sensesJa[i]`` is a dict with the keys
        ``'gloss'`` (list of sense strings), ``'idioms'`` (list of strings
        that followed a ``[句]`` marker) and ``'pos'`` (raw part-of-speech
        text, possibly empty). ``titles[i]`` is the headword with any
        ``【...】`` part cut off.
    :raises ValueError: if the page contains no ``ul.word_dic`` element.
    """
    # The trailing space is part of the separator: a circled number only
    # starts a new sense when it is followed by a space.
    num_ptn = re.compile(u'[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮❶❷❸❹❺❻] ')  #!!!imp
    res = requests.get(urlWord)
    # the <ul> that holds the dictionary entries
    thisDict = bs(res.content).find('ul', class_='word_dic')
    if thisDict is None:
        # fail with a readable message instead of an AttributeError on None
        raise ValueError("no <ul class='word_dic'> found at %s" % urlWord)

    sensesJa = []
    titles = []
    for rs in thisDict.select('li.word'):
        # title, e.g. "べつび【別火】" (all whitespace removed)
        title = u''.join(rs.find('b').string.split())
        # Part of speech, e.g. "(名)スル", "( 感 )", "( 動サ変 ) [文] サ変 ...".
        # It is only present when the first node inside the body <div> is a
        # plain NavigableString (exact type match, subclasses excluded).
        pos_pos = rs.find('div', class_='NetDicBody').next_element
        if type(pos_pos) is NavigableString:
            raw_pos = u''.join(pos_pos.string.split())
        else:
            raw_pos = u''

        gloss = []
        idioms = []
        # unicode text of the whole item, cut at every sense number
        ss = re.split(num_ptn, rs.get_text())
        for idx, s in enumerate(ss):
            # drop every whitespace character
            sense = u''.join(s.split())
            # TODO: remove the quoted examples, keep the definition simple
            if idx == 0:
                # Only the first chunk carries the title and the part of
                # speech, e.g. "あいそわらい【愛想笑い】(名)"; strip them.
                sense = sense.replace(title, u'').replace(raw_pos, u'').strip()
                # what is left of a pure header chunk is not a definition
                if len(sense) <= 2:
                    continue
            # かたをたたく【肩を叩く】: ②上役が部下に退職を勧める。 → 肩叩き②
            if sense == u'':
                continue
            if u'[句]' in sense:
                # everything after the last "[句]" is a "・"-separated idiom list
                idioms = sense[sense.rfind(u'[句]') + 3:].split(u'・')
            if u'[可能]' in sense:
                # cut the "[可能]" note (and what follows it) off the gloss
                sense = sense[:sense.rfind(u'[可能]')]
            gloss.append(sense)
            # TODO: handle u"⇒" / u"→" in a sense (synonym, not a form), e.g.
            #   間作・相作 【あいさく】 -> ⇒ かんさく(間作)
            #   あいそめつけ【藍染め付け・藍染付け】 ⇒ 染(そ)め付(つ)け② ③
            #   べつび。 → 合い火(び)
            #   「 あいちゃく(愛着) 」に同じ。

        # one sense dict per headword, appended after all its chunks are read
        sensesJa.append({
            'gloss': gloss,
            'idioms': idioms,
            'pos': raw_pos,
        })
        # keep only the reading in front of "【"
        bracket_at = title.find(u'【')
        title_return = title if bracket_at == -1 else title[:bracket_at]
        titles.append(title_return.strip())
    return sensesJa, titles
def getPage(urlPage):
    """Scrape one Daijirin index page and merge its words into MongoDB.

    For every link under ``div#listWrapper`` the link text is split on "・"
    into candidate titles, then:

    * the word is skipped when an existing ``jp`` entry for one of the titles
      (or the entry its ``pt`` field points to) already has ``sensesJa``;
    * otherwise the senses are fetched with ``getWord`` and merged into the
      matching ``jp`` entry, adding the new spellings as ``forms``;
    * when no title exists in ``jp`` the word goes through the "not found"
      branch, and when the match is ambiguous it is stored in ``daijirin2``.

    NOTE(review): the block nesting was reconstructed from a copy of this
    script whose indentation had been lost; verify it against the original.

    :param urlPage: URL of the index page,
        e.g. ``http://kotobank.jp/dictionary/daijirin/5/``. Its last path
        segment (the page number) is used to build the ``_id`` of new entries.
    :return: None
    """
    res = requests.get(urlPage)
    urlWords = bs(res.content).select('div#listWrapper a')
    BASE_url = r'http://kotobank.jp'
    # Page number taken from the URL itself rather than from a module-level
    # loop variable, so the function also works when called on its own.
    page_no = urlPage.rstrip('/').rsplit('/', 1)[-1]

    for idx_word, uw in enumerate(urlWords):
        # titles look like:
        #   刀折れ,矢尽きる
        #   寬・漢・乾・冠 ......
        titles = uw.string.strip().split(u'・')
        flagNotOnly = True
        flagNotFound = True

        # Skip the word if one of its titles already resolves to an entry
        # that has `sensesJa`.
        ssJaFlag = False
        for t in titles:
            rts = list(jp.find({'title': t}))
            if rts:
                # pt == 0: the entry is a main entry; otherwise `pt` holds the
                # _id of the main entry this form belongs to
                te = rts[0] if rts[0]['pt'] == 0 else jp.find_one({'_id': rts[0]['pt']})
                # find_one() returns None for a dangling `pt`; do not crash
                if te is not None and 'sensesJa' in te:
                    ssJaFlag = True
                    break
        if ssJaFlag:
            continue
        print(u'/'.join(titles))

        # some links have an empty href: http://kotobank.jp/word?dic=daijirin
        urlWord = BASE_url + uw['href'] + '?dic=daijirin'
        if urlWord == r'http://kotobank.jp/word?dic=daijirin':
            continue

        # try every word; if it fails, report it and go on with the next one
        try:
            sensesJa, titles_fromWord = getWord(urlWord)
            if len(titles_fromWord) > 1:
                # several headwords: take the one with the longest sense dict
                len_sj = [len(str(sj)) for sj in sensesJa]
                idx_max = len_sj.index(max(len_sj))
                title = titles_fromWord[idx_max]
            else:
                title = titles_fromWord[0]
        except Exception as exc:
            print(u'%s %s %s' % (urlPage, u'/'.join(titles), exc))
            continue

        # for t in titles: 刀掛け・刀懸け
        for t in titles:
            es = list(jp.find({'title': t}))
            if not es:
                # no entry with this title, try the next title
                continue
            flagNotFound = False
            the_e = None
            if len(es) == 1:
                # a single match is accepted whatever its own title is
                e = es[0]
                the_e = e if e['pt'] == 0 else jp.find_one({'_id': e['pt']})
                flagNotOnly = False
            else:
                for e in es:
                    e_title = e['pt_title'] if 'pt_title' in e else e['title']
                    # accept the match whose (main) title is one of the
                    # headwords returned by getWord (should be hiragana)
                    if e_title in titles_fromWord:
                        title = e_title
                        the_e = e if e['pt'] == 0 else jp.find_one({'_id': e['pt']})
                        print('---------------')
                        print(jp.find(the_e).count())
                        flagNotOnly = False
                        break
            e = the_e
            if not e:
                # several matches but none usable, try the next title
                continue

            # 偏照り・片照り 【かたでり】 but in jp: `title: "へんでり"`
            # New spellings = everything scraped minus what the entry already
            # knows (its forms and its own title).
            forms_to_push = list(set(titles + titles_fromWord) - set(e.get('forms', []) + [e['title']]))
            needs_senses = 'sensesJa' not in e
            # Some entries have forms == [], so test for the key itself
            # rather than for an empty list.
            len_forms = 0
            if 'forms' in e:
                len_forms = len(e['forms'])
                if needs_senses:
                    print(u'jp.update 1 %s %s %s' % (e['title'], e['_id'], forms_to_push))
                    print(jp.find(e).count())
                    jp.update({'_id': e['_id']},
                              {'$set': {'sensesJa': sensesJa, 'last': '#pushForms'},
                               '$push': {'forms': {'$each': forms_to_push}}})
            elif needs_senses:
                # Address the document by _id: passing the whole document as
                # the query did not reliably match.
                jp.update({'_id': e['_id']},
                          {'$set': {'sensesJa': sensesJa, 'forms': forms_to_push, 'last': '#setForms'}})
            if needs_senses:
                # one child entry per new spelling, numbered after the
                # forms the entry already had
                for idx, fm in enumerate(forms_to_push):
                    jp.save(dict(
                        _id=e['_id'] + '[' + str(idx + 1 + len_forms) + ']',
                        freq=0,
                        pt=e['_id'],
                        pt_title=e['title'],
                        title=fm,
                        last='#pushedForm'
                    ))
            # update only the first usable match
            break

        # the loop over `titles` is exhausted or was left with `break`
        if flagNotFound:
            # none of the titles exists in `jp`
            _id = 'daijirin_' + page_no + '_' + str(idx_word + 1)
            # NOTE(review): as written, the new entry is saved only when a
            # document with this _id ALREADY exists - confirm whether the
            # opposite test was intended.
            if jp.find_one({'_id': _id}) is not None:
                # note: set([title]) and set(title) differ when title is a string
                forms_besides = list(set(titles + titles_fromWord) - set([title]))
                ne = dict(
                    _id=_id,
                    sensesJa=sensesJa,
                    pt=0,
                    freq=0,
                    title=title,
                    forms=forms_besides,
                    last='#flagNotFound'
                )
                jp.save(ne)
                for idx_fm, fm in enumerate(forms_besides):
                    fm_new = dict(
                        _id=_id + '[' + str(idx_fm + 1) + ']',
                        freq=0,
                        pt=_id,
                        pt_title=title,
                        title=fm
                    )
                    jp.save(fm_new)
            continue

        if flagNotOnly:
            # titles were found but no match could be singled out:
            # park the word in `daijirin2`
            _id = 'daijirin_' + page_no + '_' + str(idx_word + 1)
            ne = dict(
                _id=_id,
                sensesJa=sensesJa,
                pt=0,
                freq=0,
                title=title,
                forms=titles,
                last='#flagNotOnly'
            )
            daijirin2.save(ne)
# http://kotobank.jp/dictionary/daijirin/5/
# アイコラ
# Walk every Daijirin index page (1..3249) when run as a script.
# The guard keeps a plain import of this module from starting the scrape.
# `np` stays a module-level global (an `if` block opens no new scope), so code
# that reads the current page number from it keeps working.
if __name__ == '__main__':
    for np in xrange(1, 3249 + 1):
        urlPage = r'http://kotobank.jp/dictionary/daijirin/' + str(np) + '/'
        print(urlPage)
        getPage(urlPage)
#!/usr/bin/env python
#encoding: utf-8
#__author__ = 'actor2019'
# daijirin is much neater than `daijisen`
#conn
from pymongo import *
# Shared MongoDB handles: default local server, database `dict3`.
conn = MongoClient()
db = conn['dict3']
# `jp` holds the dictionary entries; `daijirin` is the side collection.
jp = db['jp']
daijirin = db['daijirin']
import requests
from bs4 import BeautifulSoup as bs
from bs4 import NavigableString
from dateutil.parser import *
# todo:
# http://kotobank.jp/dictionary/daijirin/1313/ {title:"しばしばめ"} => no sensesJa ??
import re
import ngram
def getWord(urlWord):
    """Fetch one kotobank word page and extract its Daijirin senses.

    Every ``li.word`` item inside the page's ``ul.word_dic`` list is treated
    as one headword. The item's text is split on circled sense numbers and
    each piece becomes one gloss.

    :param urlWord: URL of the word page,
        e.g. ``http://kotobank.jp/word/...?dic=daijirin``
    :return: tuple ``(sensesJa, titles)`` of two parallel lists with one
        element per headword. ``sensesJa[i]`` is a dict with the keys
        ``'gloss'`` (list of sense strings), ``'idioms'`` (list of strings
        that followed a ``[句]`` marker) and ``'pos'`` (raw part-of-speech
        text, possibly empty). ``titles[i]`` is the headword with any
        ``【...】`` part cut off.
    :raises ValueError: if the page contains no ``ul.word_dic`` element.
    """
    # The trailing space is part of the separator: a circled number only
    # starts a new sense when it is followed by a space.
    num_ptn = re.compile(u'[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮❶❷❸❹❺❻] ')  #!!!imp
    res = requests.get(urlWord)
    # the <ul> that holds the dictionary entries
    thisDict = bs(res.content).find('ul', class_='word_dic')
    if thisDict is None:
        # fail with a readable message instead of an AttributeError on None
        raise ValueError("no <ul class='word_dic'> found at %s" % urlWord)

    sensesJa = []
    titles = []
    for rs in thisDict.select('li.word'):
        # title, e.g. "べつび【別火】" (all whitespace removed)
        title = u''.join(rs.find('b').string.split())
        # Part of speech, e.g. "(名)スル", "( 感 )", "( 動サ変 ) [文] サ変 ...".
        # It is only present when the first node inside the body <div> is a
        # plain NavigableString (exact type match, subclasses excluded).
        pos_pos = rs.find('div', class_='NetDicBody').next_element
        if type(pos_pos) is NavigableString:
            raw_pos = u''.join(pos_pos.string.split())
        else:
            raw_pos = u''

        gloss = []
        idioms = []
        # unicode text of the whole item, cut at every sense number
        ss = re.split(num_ptn, rs.get_text())
        for idx, s in enumerate(ss):
            # drop every whitespace character
            sense = u''.join(s.split())
            # TODO: remove the quoted examples, keep the definition simple
            if idx == 0:
                # Only the first chunk carries the title and the part of
                # speech, e.g. "あいそわらい【愛想笑い】(名)"; strip them.
                sense = sense.replace(title, u'').replace(raw_pos, u'').strip()
                # what is left of a pure header chunk is not a definition
                if len(sense) <= 2:
                    continue
            # かたをたたく【肩を叩く】: ②上役が部下に退職を勧める。 → 肩叩き②
            if sense == u'':
                continue
            if u'[句]' in sense:
                # everything after the last "[句]" is a "・"-separated idiom list
                idioms = sense[sense.rfind(u'[句]') + 3:].split(u'・')
            if u'[可能]' in sense:
                # cut the "[可能]" note (and what follows it) off the gloss
                sense = sense[:sense.rfind(u'[可能]')]
            gloss.append(sense)
            # TODO: handle u"⇒" / u"→" in a sense (synonym, not a form), e.g.
            #   間作・相作 【あいさく】 -> ⇒ かんさく(間作)
            #   あいそめつけ【藍染め付け・藍染付け】 ⇒ 染(そ)め付(つ)け② ③
            #   べつび。 → 合い火(び)
            #   「 あいちゃく(愛着) 」に同じ。

        # one sense dict per headword, appended after all its chunks are read
        sensesJa.append({
            'gloss': gloss,
            'idioms': idioms,
            'pos': raw_pos,
        })
        # keep only the reading in front of "【"
        bracket_at = title.find(u'【')
        title_return = title if bracket_at == -1 else title[:bracket_at]
        titles.append(title_return.strip())
    return sensesJa, titles
def getPage(urlPage):
    """Debug variant: trace how ONE word of an index page would be merged.

    Only the link at index 4 of ``div#listWrapper`` is processed. The senses
    are fetched with ``getWord`` and the merge decisions are printed. Every
    ``save`` call is disabled, but the ``$push`` update of an entry that
    already has ``forms`` is still executed.

    NOTE(review): the block nesting was reconstructed from a copy of this
    script whose indentation had been lost; verify it against the original.

    :param urlPage: URL of the index page,
        e.g. ``http://kotobank.jp/dictionary/daijirin/1313/``. Its last path
        segment (the page number) is used to build the ``_id`` of new entries.
    :return: None
    """
    res = requests.get(urlPage)
    urlWords = bs(res.content).select('div#listWrapper a')
    BASE_url = r'http://kotobank.jp'
    # Page number taken from the URL itself rather than from a module-level
    # loop variable, so the function also works when called on its own.
    page_no = urlPage.rstrip('/').rsplit('/', 1)[-1]

    for idx_word, uw in enumerate(urlWords):
        # see what's going on for:
        # http://kotobank.jp/dictionary/daijirin/1313/ {title:"しばしばめ"}
        if idx_word != 4:
            continue
        # titles look like:
        #   刀折れ,矢尽きる
        #   寬・漢・乾・冠 ......
        titles = uw.string.strip().split(u'・')
        flagNotOnly = True
        flagNotFound = True

        # some links have an empty href: http://kotobank.jp/word?dic=daijirin
        urlWord = BASE_url + uw['href'] + '?dic=daijirin'
        if urlWord == r'http://kotobank.jp/word?dic=daijirin':
            continue
        print(urlWord)

        # try every word; if it fails, report it and go on with the next one
        try:
            sensesJa, titles_fromWord = getWord(urlWord)
            if len(titles_fromWord) > 1:
                # several headwords: take the one with the longest sense dict
                len_sj = [len(str(sj)) for sj in sensesJa]
                idx_max = len_sj.index(max(len_sj))
                title = titles_fromWord[idx_max]
            else:
                title = titles_fromWord[0]
        except Exception as exc:
            print(u'%s %s %s' % (urlPage, u'/'.join(titles), exc))
            continue
        print(title)

        # for t in titles: 刀掛け・刀懸け
        for t in titles:
            es = list(jp.find({'title': t}))
            if not es:
                # no entry with this title, try the next title
                continue
            flagNotFound = False
            the_e = None
            if len(es) == 1:
                # a single match is accepted whatever its own title is;
                # pt == 0 marks a main entry, otherwise `pt` holds the _id of
                # the main entry this form belongs to
                e = es[0]
                the_e = e if e['pt'] == 0 else jp.find_one({'_id': e['pt']})
                flagNotOnly = False
            else:
                for e in es:
                    e_title = e['pt_title'] if 'pt_title' in e else e['title']
                    # accept the match whose (main) title is one of the
                    # headwords returned by getWord (should be hiragana)
                    if e_title in titles_fromWord:
                        title = e_title
                        the_e = e if e['pt'] == 0 else jp.find_one({'_id': e['pt']})
                        flagNotOnly = False
                        break
            e = the_e
            if not e:
                # several matches but none usable, try the next title
                continue

            # 偏照り・片照り 【かたでり】 but in jp: `title: "へんでり"`
            # New spellings = everything scraped minus what the entry already
            # knows (its forms and its own title).
            forms_to_push = list(set(titles + titles_fromWord) - set(e.get('forms', []) + [e['title']]))
            print(forms_to_push)
            # Some entries have forms == [], so test for the key itself
            # rather than for an empty list.
            len_forms = 0
            if 'forms' in e:
                len_forms = len(e['forms'])
                print('jp.update 1')
                print(e['_id'])
                # Address the document by _id: passing the whole document as
                # the query did not reliably match.
                jp.update({'_id': e['_id']},
                          {'$set': {'sensesJa': sensesJa, 'last': '#pushForms'},
                           '$push': {'forms': {'$each': forms_to_push}}})
            else:
                print('jp.update 2')
                # dry run: the `$set` of sensesJa / forms / '#setForms' is disabled
            for idx, fm in enumerate(forms_to_push):
                print('jp.save form 3')
                # dry run: saving the child entry
                #   _id = e['_id'] + '[' + str(idx + 1 + len_forms) + ']'
                # for spelling `fm` is disabled
            # update only the first usable match
            break

        # the loop over `titles` is exhausted or was left with `break`
        if flagNotFound:
            # none of the titles exists in `jp`
            _id = 'daijirin_' + page_no + '_' + str(idx_word + 1)
            # note: set([title]) and set(title) differ when title is a string
            forms_besides = list(set(titles + titles_fromWord) - set([title]))
            ne = dict(
                _id=_id,
                sensesJa=sensesJa,
                pt=0,
                freq=0,
                title=title,
                forms=forms_besides,
                last='#flagNotFound'
            )
            # jp.save(ne)  -- disabled in this dry run
            print('flagNotFound')
            for idx_fm, fm in enumerate(forms_besides):
                fm_new = dict(
                    _id=_id + '[' + str(idx_fm + 1) + ']',
                    freq=0,
                    pt=_id,
                    pt_title=title,
                    title=fm
                )
                # jp.save(fm_new)  -- disabled in this dry run
                print('flagNotFound forms')
            continue

        if flagNotOnly:
            # titles were found but no match could be singled out
            _id = 'daijirin_' + page_no + '_' + str(idx_word + 1)
            ne = dict(
                _id=_id,
                sensesJa=sensesJa,
                pt=0,
                freq=0,
                title=title,
                forms=titles,
                last='#flagNotOnly'
            )
            print('daijirin.save(ne)')
            # daijirin.save(ne)  -- disabled in this dry run
# urlWord=r'http://kotobank.jp/word/%E7%89%87%E7%A9%8D%E3%81%BF?dic=daijirin'
# print getWord(urlWord)
# Debug run over the single index page 1313 when executed as a script.
# The guard keeps a plain import of this module from triggering the requests.
# `np` stays a module-level global (an `if` block opens no new scope), so code
# that reads the current page number from it keeps working.
if __name__ == '__main__':
    for np in xrange(1313, 1313 + 1):
        urlPage = r'http://kotobank.jp/dictionary/daijirin/' + str(np) + '/'
        getPage(urlPage)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment