Last active
December 19, 2015 02:29
-
-
Save maowug/5883317 to your computer and use it in GitHub Desktop.
mongodb (pymongo) `update` didn't work (using jp.update(_id,{...}) instead) & init from cPickle
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
#encoding: utf-8 | |
#__author__ = 'actor2019' | |
# daijirin is much neater than `daijisen` | |
#conn | |
from pymongo import * | |
# Module-level MongoDB handles used by getWord/getPage below.
conn = MongoClient()
db = conn['dict3']
# Main dictionary collection.
jp = db['jp']
# Side collection for entries whose title matched several documents.
daijirin2 = db['daijirin2']
import requests | |
from bs4 import BeautifulSoup as bs | |
from bs4 import NavigableString | |
from dateutil.parser import * | |
# todo: | |
# http://kotobank.jp/dictionary/daijirin/1313/ {title:"しばしばめ"} => no sensesJa ?? | |
import re | |
import ngram | |
def getWord(urlWord, timeout=30):
    """Fetch one kotobank word page and extract its Daijirin entries.

    :param urlWord: URL of the word page (with ``?dic=daijirin`` appended
        by the caller).
    :param timeout: seconds to wait for the HTTP response; without it a
        stalled connection would block the whole crawl forever.
    :return: ``(sensesJa, titles)`` -- two parallel lists with one item per
        ``li.word`` element on the page.  Each ``sensesJa`` item is a dict
        ``{'gloss': [...], 'idioms': [...], 'pos': raw_pos}``; each
        ``titles`` item is the entry title up to (not including) ``【``.
    :raises ValueError: if the page has no ``ul.word_dic`` element.
    :raises requests.exceptions.RequestException: on timeout, connection
        failure or a non-2xx HTTP status.
    """
    # Senses are separated by a circled number followed by a space. !!!imp
    num_ptn = re.compile(u'[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮❶❷❸❹❺❻] ')

    res = requests.get(urlWord, timeout=timeout)
    # Fail with a clear HTTP error instead of parsing an error page.
    res.raise_for_status()

    # The dictionary content lives in <ul class="word_dic">.
    thisDict = bs(res.content).find('ul', class_='word_dic')
    if thisDict is None:
        raise ValueError('no ul.word_dic element found at %s' % urlWord)

    sensesJa = []
    titles = []
    for rs in thisDict.select('li.word'):
        # title looks like "べつび【別火】"; strip all whitespace.
        title = u''.join(rs.find('b').string.split())

        # Part of speech, e.g. "(名)スル", "( 動サ変 )", "( 形ク )".  It is
        # present only when the first node inside div.NetDicBody is a plain
        # text node (exact type check, as in the original).
        pos_pos = rs.find('div', class_='NetDicBody').next_element
        if type(pos_pos) == NavigableString:
            raw_pos = u''.join(pos_pos.string.split())
        else:
            raw_pos = u''

        gloss = []
        idioms = []
        ss = re.split(num_ptn, rs.get_text())
        for idx, s in enumerate(ss):
            # Drop all whitespace inside the sense text.
            # TODO: strip trailing 「...」 usage examples to keep glosses short.
            sense = u''.join(s.split())

            # Only the first chunk still carries the title and the POS tag;
            # chunks after a circled number start directly with the gloss.
            if idx == 0:
                sense = sense.replace(title, u'').replace(raw_pos, u'').strip()
                # Two characters or fewer left means there is no real gloss
                # in the header chunk.
                if len(sense) <= 2:
                    continue
            if sense == u'':
                continue

            # "[句]" introduces a "・"-separated list of idioms.
            if u'[句]' in sense:
                idioms = sense[sense.rfind(u'[句]') + 3:].split(u'・')
            # Everything from "[可能]" on is dropped from the gloss.
            if u'[可能]' in sense:
                sense = sense[:sense.rfind(u'[可能]')]
            gloss.append(sense)
            # TODO: handle cross references ("⇒", "→", 「…」に同じ) as
            # synonyms rather than as plain glosses.

        # One record per li.word, appended after all of its senses are read.
        sensesJa.append({
            'gloss': gloss,
            'idioms': idioms,
            'pos': raw_pos
        })

        # Keep only the part of the title before the bracketed spelling.
        bracket_at = title.find(u'【')
        title_return = title if bracket_at == -1 else title[:bracket_at]
        titles.append(title_return.strip())
    return sensesJa, titles
def _resolve_parent(entry):
    """Return `entry` itself when its 'pt' is 0, else the document 'pt' points to (may be None)."""
    if entry['pt'] == 0:
        return entry
    return jp.find_one({'_id': entry['pt']})


def getPage(urlPage):
    """Process every word linked from one Daijirin index page.

    For each link the word page is scraped with ``getWord`` and the result
    is merged into the ``jp`` collection:

    * words for which a matching entry already has ``sensesJa`` are skipped;
    * if exactly one entry (or one entry whose title is among the scraped
      titles) matches, ``sensesJa`` and the new written forms are stored on
      it and a pointer document is saved for each new form;
    * if no title matches at all, see the review note in the
      ``flagNotFound`` branch;
    * if titles match but no single entry can be chosen, the scraped data
      is saved to ``daijirin2`` instead.

    :param urlPage: index page URL of the form
        ``http://kotobank.jp/dictionary/daijirin/<page>/``; ``<page>`` is
        used to build the ``_id`` of newly created documents.
    :return: None
    """
    res = requests.get(urlPage)
    urlWords = bs(res.content).select('div#listWrapper a')
    BASE_url = r'http://kotobank.jp'
    # Page number taken from the URL rather than from the module-level loop
    # variable `np`, so the function no longer depends on a global.
    page_no = urlPage.rstrip('/').rsplit('/', 1)[-1]

    for idx_word, uw in enumerate(urlWords):
        # Link text looks like "刀掛け・刀懸け": alternative spellings joined by "・".
        titles = uw.string.strip().split(u'・')
        flagNotOnly = True
        flagNotFound = True

        # Skip the word when an entry for one of its titles already has sensesJa.
        ssJaFlag = False
        for t in titles:
            first = jp.find_one({'title': t})
            if first is not None:
                te = _resolve_parent(first)
                # The parent lookup can come back empty; treat that as "not done yet".
                if te is not None and 'sensesJa' in te:
                    ssJaFlag = True
                    break
        if ssJaFlag:
            continue
        print(u'/'.join(titles))

        # Some links have an empty href and yield this bare URL; skip them.
        urlWord = BASE_url + uw['href'] + '?dic=daijirin'
        if urlWord == r'http://kotobank.jp/word?dic=daijirin':
            continue

        # Try every word; on any failure report it and go on to the next word.
        try:
            sensesJa, titles_fromWord = getWord(urlWord)
            if len(titles_fromWord) > 1:
                # Several entries on the page: take the title whose sense
                # record has the longest string form.
                len_sj = [len(str(sj)) for sj in sensesJa]
                idx_max = len_sj.index(max(len_sj))
                title = titles_fromWord[idx_max]
            else:
                title = titles_fromWord[0]
        except Exception as exc:
            # %r keeps the report itself from failing on non-ASCII messages.
            print(u'%s %s %r' % (urlPage, u'/'.join(titles), exc))
            continue

        for t in titles:
            es = list(jp.find({'title': t}))
            if not es:
                # No match for this spelling; try the next one.
                continue
            flagNotFound = False

            the_e = None
            if len(es) == 1:
                # A single match is accepted whatever its title is.
                the_e = _resolve_parent(es[0])
                flagNotOnly = False
            else:
                # Several matches: accept the first whose (parent) title is
                # one of the titles scraped from the word page.
                for e in es:
                    e_title = e['pt_title'] if 'pt_title' in e else e['title']
                    if e_title in titles_fromWord:
                        title = e_title
                        the_e = _resolve_parent(e)
                        # Diagnostic output kept from the original.
                        print('---------------')
                        print(jp.find(the_e).count())
                        flagNotOnly = False
                        break

            e = the_e
            if not e:
                # Many matches but none usable; continue with the next title.
                continue

            # New forms = everything scraped minus what the entry already has.
            forms_to_push = list(set(titles + titles_fromWord)
                                 - set(e.get('forms', []) + [e['title']]))
            needs_senses = 'sensesJa' not in e

            # 'forms' may exist but be empty, so test for the key itself.
            len_forms = 0
            if 'forms' in e:
                len_forms = len(e['forms'])
                if needs_senses:
                    print(u'jp.update 1 %s %s %s'
                          % (e['title'], e['_id'], forms_to_push))
                    print(jp.find(e).count())
                    jp.update({'_id': e['_id']},
                              {'$set': {'sensesJa': sensesJa, 'last': '#pushForms'},
                               '$push': {'forms': {'$each': forms_to_push}}})
            elif needs_senses:
                # Filter by _id here too: using the whole document as the
                # filter is the form reported not to work for this script.
                jp.update({'_id': e['_id']},
                          {'$set': {'sensesJa': sensesJa,
                                    'forms': forms_to_push,
                                    'last': '#setForms'}})

            if needs_senses:
                # One pointer document per new form, numbered after the
                # forms the entry already had.
                for idx, fm in enumerate(forms_to_push):
                    jp.save(dict(
                        _id=e['_id'] + '[' + str(idx + 1 + len_forms) + ']',
                        freq=0,
                        pt=e['_id'],
                        pt_title=e['title'],
                        title=fm,
                        last='#pushedForm'
                    ))
            # Only the first usable match is updated.
            break

        if flagNotFound:
            # None of the spellings exists in jp.
            _id = 'daijirin_' + page_no + '_' + str(idx_word + 1)
            # NOTE(review): this saves only when a document with this _id
            # already exists; a brand-new _id is skipped.  Confirm this is
            # intended and not an inverted check.
            if jp.find_one({'_id': _id}) is not None:
                # set([title]), not set(title): the latter would split the
                # string into characters.
                forms_besides = list(set(titles + titles_fromWord) - set([title]))
                jp.save(dict(
                    _id=_id,
                    sensesJa=sensesJa,
                    pt=0,
                    freq=0,
                    title=title,
                    forms=forms_besides,
                    last='#flagNotFound'
                ))
                for idx_fm, fm in enumerate(forms_besides):
                    jp.save(dict(
                        _id=_id + '[' + str(idx_fm + 1) + ']',
                        freq=0,
                        pt=_id,
                        pt_title=title,
                        title=fm
                    ))
            continue

        if flagNotOnly:
            # Titles matched, but no single entry could be chosen: park the
            # scraped data in the side collection.
            daijirin2.save(dict(
                _id='daijirin_' + page_no + '_' + str(idx_word + 1),
                sensesJa=sensesJa,
                pt=0,
                freq=0,
                title=title,
                forms=titles,
                last='#flagNotOnly'
            ))
# http://kotobank.jp/dictionary/daijirin/5/ | |
# アイコラ | |
# Crawl index pages 1..3249.  The loop variable keeps the name `np`
# because it is a module-level name.
for np in xrange(1, 3250):
    urlPage = ''.join([r'http://kotobank.jp/dictionary/daijirin/', str(np), '/'])
    print(urlPage)
    getPage(urlPage)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
#encoding: utf-8 | |
#__author__ = 'actor2019' | |
# daijirin is much neater than `daijisen` | |
#conn | |
from pymongo import * | |
# Module-level MongoDB handles used by getWord/getPage below.
conn = MongoClient()
db = conn['dict3']
# Main dictionary collection.
jp = db['jp']
# Side collection named in the (disabled) save at the end of getPage.
daijirin = db['daijirin']
import requests | |
from bs4 import BeautifulSoup as bs | |
from bs4 import NavigableString | |
from dateutil.parser import * | |
# todo: | |
# http://kotobank.jp/dictionary/daijirin/1313/ {title:"しばしばめ"} => no sensesJa ?? | |
import re | |
import ngram | |
def getWord(urlWord, timeout=30):
    """Fetch one kotobank word page and extract its Daijirin entries.

    :param urlWord: URL of the word page (with ``?dic=daijirin`` appended
        by the caller).
    :param timeout: seconds to wait for the HTTP response; without it a
        stalled connection would block the run forever.
    :return: ``(sensesJa, titles)`` -- two parallel lists with one item per
        ``li.word`` element on the page.  Each ``sensesJa`` item is a dict
        ``{'gloss': [...], 'idioms': [...], 'pos': raw_pos}``; each
        ``titles`` item is the entry title up to (not including) ``【``.
    :raises ValueError: if the page has no ``ul.word_dic`` element.
    :raises requests.exceptions.RequestException: on timeout, connection
        failure or a non-2xx HTTP status.
    """
    # Senses are separated by a circled number followed by a space. !!!imp
    num_ptn = re.compile(u'[①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮❶❷❸❹❺❻] ')

    res = requests.get(urlWord, timeout=timeout)
    # Fail with a clear HTTP error instead of parsing an error page.
    res.raise_for_status()

    # The dictionary content lives in <ul class="word_dic">.
    thisDict = bs(res.content).find('ul', class_='word_dic')
    if thisDict is None:
        raise ValueError('no ul.word_dic element found at %s' % urlWord)

    sensesJa = []
    titles = []
    for rs in thisDict.select('li.word'):
        # title looks like "べつび【別火】"; strip all whitespace.
        title = u''.join(rs.find('b').string.split())

        # Part of speech, e.g. "(名)スル", "( 動サ変 )", "( 形ク )".  It is
        # present only when the first node inside div.NetDicBody is a plain
        # text node (exact type check, as in the original).
        pos_pos = rs.find('div', class_='NetDicBody').next_element
        if type(pos_pos) == NavigableString:
            raw_pos = u''.join(pos_pos.string.split())
        else:
            raw_pos = u''

        gloss = []
        idioms = []
        ss = re.split(num_ptn, rs.get_text())
        for idx, s in enumerate(ss):
            # Drop all whitespace inside the sense text.
            # TODO: strip trailing 「...」 usage examples to keep glosses short.
            sense = u''.join(s.split())

            # Only the first chunk still carries the title and the POS tag;
            # chunks after a circled number start directly with the gloss.
            if idx == 0:
                sense = sense.replace(title, u'').replace(raw_pos, u'').strip()
                # Two characters or fewer left means there is no real gloss
                # in the header chunk.
                if len(sense) <= 2:
                    continue
            if sense == u'':
                continue

            # "[句]" introduces a "・"-separated list of idioms.
            if u'[句]' in sense:
                idioms = sense[sense.rfind(u'[句]') + 3:].split(u'・')
            # Everything from "[可能]" on is dropped from the gloss.
            if u'[可能]' in sense:
                sense = sense[:sense.rfind(u'[可能]')]
            gloss.append(sense)
            # TODO: handle cross references ("⇒", "→", 「…」に同じ) as
            # synonyms rather than as plain glosses.

        # One record per li.word, appended after all of its senses are read.
        sensesJa.append({
            'gloss': gloss,
            'idioms': idioms,
            'pos': raw_pos
        })

        # Keep only the part of the title before the bracketed spelling.
        bracket_at = title.find(u'【')
        title_return = title if bracket_at == -1 else title[:bracket_at]
        titles.append(title_return.strip())
    return sensesJa, titles
def _resolve_parent(entry):
    """Return `entry` itself when its 'pt' is 0, else the document 'pt' points to (may be None)."""
    if entry['pt'] == 0:
        return entry
    return jp.find_one({'_id': entry['pt']})


def getPage(urlPage, only_idx=4):
    """Dry-run one Daijirin index page and print which branch each word takes.

    This is the debugging variant of the import: apart from the
    ``jp.update`` in the "entry already has forms" branch, all database
    writes are disabled and replaced by progress prints.

    :param urlPage: index page URL, e.g.
        ``http://kotobank.jp/dictionary/daijirin/1313/``.
    :param only_idx: zero-based position of the single link to inspect
        (default 4, the word the original script was investigating);
        pass ``None`` to walk every link on the page.
    :return: None
    """
    res = requests.get(urlPage)
    urlWords = bs(res.content).select('div#listWrapper a')
    BASE_url = r'http://kotobank.jp'

    for idx_word, uw in enumerate(urlWords):
        # Debug filter: look at one word only unless told otherwise.
        if only_idx is not None and idx_word != only_idx:
            continue

        # Link text looks like "刀掛け・刀懸け": alternative spellings joined by "・".
        titles = uw.string.strip().split(u'・')
        flagNotOnly = True
        flagNotFound = True

        # Some links have an empty href and yield this bare URL; skip them.
        urlWord = BASE_url + uw['href'] + '?dic=daijirin'
        if urlWord == r'http://kotobank.jp/word?dic=daijirin':
            continue
        print(urlWord)

        # Try every word; on any failure report it and go on to the next word.
        try:
            sensesJa, titles_fromWord = getWord(urlWord)
            if len(titles_fromWord) > 1:
                # Several entries on the page: take the title whose sense
                # record has the longest string form.
                len_sj = [len(str(sj)) for sj in sensesJa]
                idx_max = len_sj.index(max(len_sj))
                title = titles_fromWord[idx_max]
            else:
                title = titles_fromWord[0]
        except Exception as exc:
            # %r keeps the report itself from failing on non-ASCII messages.
            print(u'%s %s %r' % (urlPage, u'/'.join(titles), exc))
            continue
        print(title)

        for t in titles:
            es = list(jp.find({'title': t}))
            if not es:
                # No match for this spelling; try the next one.
                continue
            flagNotFound = False

            the_e = None
            if len(es) == 1:
                # A single match is accepted whatever its title is.
                the_e = _resolve_parent(es[0])
                flagNotOnly = False
            else:
                # Several matches: accept the first whose (parent) title is
                # one of the titles scraped from the word page.
                for e in es:
                    e_title = e['pt_title'] if 'pt_title' in e else e['title']
                    if e_title in titles_fromWord:
                        title = e_title
                        the_e = _resolve_parent(e)
                        flagNotOnly = False
                        break

            e = the_e
            if not e:
                # Many matches but none usable; continue with the next title.
                continue

            # New forms = everything scraped minus what the entry already has.
            forms_to_push = list(set(titles + titles_fromWord)
                                 - set(e.get('forms', []) + [e['title']]))
            print(forms_to_push)

            # 'forms' may exist but be empty, so test for the key itself.
            if 'forms' in e:
                print('jp.update 1')
                print(e['_id'])
                # Filter by _id: using the whole document as the filter is
                # the form reported not to work for this script.
                jp.update({'_id': e['_id']},
                          {'$set': {'sensesJa': sensesJa, 'last': '#pushForms'},
                           '$push': {'forms': {'$each': forms_to_push}}})
            else:
                # Dry run: the write version $sets sensesJa/forms here.
                print('jp.update 2')

            # Dry run: the write version saves one pointer document per form.
            for _ in forms_to_push:
                print('jp.save form 3')
            # Only the first usable match is handled.
            break

        if flagNotFound:
            # None of the spellings exists in jp.  Dry run: the write
            # version saves a new root document plus one pointer per form.
            # set([title]), not set(title): the latter would split the
            # string into characters.
            forms_besides = list(set(titles + titles_fromWord) - set([title]))
            print('flagNotFound')
            for _ in forms_besides:
                print('flagNotFound forms')
            continue

        if flagNotOnly:
            # Titles matched, but no single entry could be chosen.  Dry run:
            # the write version saves the scraped data to a side collection.
            print('daijirin.save(ne)')
# urlWord=r'http://kotobank.jp/word/%E7%89%87%E7%A9%8D%E3%81%BF?dic=daijirin' | |
# print getWord(urlWord) | |
# Debug run over the single index page 1313.  The loop variable keeps the
# name `np` because it is a module-level name.
for np in xrange(1313, 1314):
    urlPage = ''.join([r'http://kotobank.jp/dictionary/daijirin/', str(np), '/'])
    getPage(urlPage)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment