Created
April 24, 2013 06:11
-
-
Save maowug/5449996 to your computer and use it in GitHub Desktop.
tabelog
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
#encoding: utf-8
# Scrape every keyword listed on tabelog.com's kana-indexed keyword pages
# (ア, イ, ウ, ...) and pickle the result as {kana_tag: [keyword, ...]}.
import cPickle
import urllib
from bs4 import BeautifulSoup,NavigableString
import re
import math

URLTabelog='http://tabelog.com'
# Landing page of the first kana group (ア); also reused as a fallback below.
URLPage_a='http://tabelog.com/keywords/?kana_init=%E3%82%A2'
# One <li> per kana initial in the site's keyword index.
tagLis=BeautifulSoup(urllib.urlopen(URLPage_a)).find('div','keyword-index').find_all('li')
# print tagLis
NKWsPerPage=100  # keywords shown per result page (site pagination size)
kwDict={}  # kana tag (u'ア', ...) -> list of keywords scraped for that tag
for li in tagLis:
    kwList=[]
    tagName=unicode(li.get_text()) # a,i,u,...
    urlCurrentPage=u''  # e.g. http://tabelog.com/keywords?kana_init=ア
    currentPage=u''  # BeautifulSoup tree of urlCurrentPage
    try:
        # Normal case: the <li> carries a link to its keyword page.
        urlCurrentPage=URLTabelog+li.a.get('href')
        urlTagPage=BeautifulSoup(urllib.urlopen(urlCurrentPage))
        currentPage=urlTagPage
    except:
        # Fallback: the first tag (ア) is the currently selected page and has
        # no <a>; any other tag without a link is assumed to have no keywords.
        # print tagName.encode('gb2312')
        if tagLis.index(li)==0:
            urlCurrentPage=URLPage_a
            currentPage=BeautifulSoup(urllib.urlopen(URLPage_a))
        else:
            # some other tags have no keywords; log position and skip
            print tagLis.index(li),len(tagLis)
            continue
    # Total keyword count, read from the last <span class="num"> of the pager.
    nKWs=float(currentPage.find('p','page-count').find_all('span','num')[-1].string)
    nPage=math.ceil(nKWs/NKWsPerPage)
    for p in xrange(1,int(nPage)+1):
        if p==1:
            currentKWPage=currentPage  # page 1 was already fetched above
        else:
            urlKWPage=urlCurrentPage+'&page='+str(p)
            currentKWPage=BeautifulSoup(urllib.urlopen(urlKWPage))
        kwLis=currentKWPage.find('div','keyword-list').find_all('li')
        for kwli in kwLis:
            kwList.append(unicode(kwli.get_text()))
    #eof for-p in kws pages
    # print len(kwList),','.join(kwList) 1202 [飯舘牛,イイダコ,飯蛸旨煮,...]
    kwDict[tagName]=kwList
    print tagLis.index(li),len(tagLis),tagName,len(kwList)
#dump the whole dict to disk with cPickle
tabelogKWs=open(u'tabelogキーワードs'+u'.data','wb')
cPickle.dump(kwDict,tabelogKWs)
tabelogKWs.close()
#usage:
# kwDict=cPickle.load(open("tabelogキーワードs.data","rb"))
# for tag in kwDict.keys():pass
# # # load test
# for tag in kwDict.keys():
#     print ','.join(kwDict[tag][0:3]),len(kwDict[tag][0])
#
#
# #re-dump
# tabelogKWs=open(u'tabelogキーワードs_subset'+u'.data','wb')
# cPickle.dump(kwDict,tabelogKWs)
# tabelogKWs.close()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
#encoding: utf-8
# Load the scraped tabelog keywords and, for every keyword kw, find which
# other keywords contain it as a prefix / infix / suffix. The relation table
# is pickled at the end.
import cPickle
import re
import copy

kwDict=cPickle.load(open(u"tabelogキーワードs.data",u"rb"))
# (scratch code kept from a quick check of dict-mutation-while-iterating)
# kwDict={'a':1,'b':2,'c':5}
# for tag in kwDict.keys():
#     if tag=='a':
#         del kwDict[tag]
#     elif tag=='b':
#         print kwDict['a']
dictKWs={}
dictKWs['kwHasNoRelations']=[]
# make a kws dict: keyword -> [has_kw, front_in, middle_in, end_in]
for tag in kwDict:
    for kw in kwDict[tag]:#kwDict[tag] is a list of some tag(a,i,u,...)
        dictKWs[kw]=[[],[],[],[]]
# Deep-copied snapshot so the key set stays stable while dictKWs is mutated.
dictKWs2=copy.deepcopy(dictKWs)
count=1
for kw in dictKWs2.keys()[0:10000]:  # cap the outer loop at 10000 keys
    count+=1
    #for every kw in dict, compute its relations
    rkw=dictKWs[kw] #relation lists of kw
    # Single-kana あいうえお would relate to nearly everything; skip them.
    if len(kw)==1 and kw in u'あいうえお':
        print kw,rkw,'continue'
        continue
    for kw2 in dictKWs2:
        rkw2=dictKWs[kw2]
        try:
            # NOTE(review): kw is used as a raw regex pattern below, so
            # keywords containing regex metacharacters can raise or match
            # oddly — the bare except just logs those pairs and moves on.
            if kw==kw2:#same keyword: nothing to record
                pass
            elif re.match(kw,kw2)!=None: #kw2 begins with kw
                rkw2[0].append(kw)
                rkw[1].append(kw2)
            elif re.search(u'^\S+'+kw+u'\S+$',kw2): #kw sits in the middle of kw2
                rkw2[0].append(kw)
                rkw[2].append(kw2)
            elif re.search(u'^\S+'+kw+u'$',kw2): #kw2 ends with kw
                rkw2[0].append(kw)
                rkw[3].append(kw2)
            else:
                # NOTE(review): this appends kw once per UNRELATED kw2, not
                # once per kw, so 'kwHasNoRelations' accumulates duplicates —
                # confirm whether that is intended before relying on it.
                dictKWs['kwHasNoRelations'].append(kw)
        except:
            print kw,kw2,'passed'
            pass
    if count%500==100:  # coarse progress ping every 500 outer iterations
        print count,'=================='
    if len(rkw[3])>15:  # flag keywords that end many other keywords
        print '-------------------',kw
# # [has_kw, front_in, middle_in, end_in]
# for kw in dictKWs:
#re-dump the relation table
tabelogKWs2=open(u'tabelogキーワードs_Hypo1'+u'.data','wb')
cPickle.dump(dictKWs,tabelogKWs2)
tabelogKWs2.close()
# bug:
#     rkw[0][0]+=1
#     TypeError: coercing to Unicode: need string or buffer, int found
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
#encoding: utf-8
# Inspection script: load a pickled restaurant list for one ward and print
# the review comments (raw HTML, then extracted text) of the first three
# restaurants.
import cPickle
from bs4 import BeautifulSoup

rstList=cPickle.load(open("東京_154_100_東中野.data","rb"))
for rst in rstList[:3]:
    # Keys present on each rst dict:
    #['tabaco', 'reserved', 'nearSpots', 'private', 'seat', 'course', 'KWs',
    # 'chartered', 'parking', 'houdai', 'tel', 'avg', 'nearMarks', 'addr',
    # 'service', 'space', 'nearRstList', 'indexes', 'charge', 'location',
    # 'homepage', 'map', 'food', 'drink', 'traffic', 'child', 'date',
    # 'rvwList', 'yasumi', 'card', 'name', 'bgt', 'time', 'genre']
    # for k in rst.keys():
    #     print unicode(k)+u':'+unicode(rst[k])
    # for rvw in rst['rvwList']:
    #     print rvw['comment']
    #     print '__________________'
    try:
        for rvw in rst['rvwList']:
            print rvw['comment']
            # Re-parse the stored comment HTML to get its plain text.
            comment=BeautifulSoup(rvw['comment'].encode('utf-8'))
            print comment.get_text()
            print '__________________'
    except:
        # best-effort: a restaurant without a usable rvwList is just marked
        print '+++'
    print '====================='
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
#encoding: utf-8
# Driver: iterate every ward (区) listed on one prefecture's sitemap page and
# scrape each with tabelogRobotKu.getKu.
import cPickle
import urllib
from bs4 import BeautifulSoup,NavigableString
from tabelogRobotKu import getKu
import sys
# BeautifulSoup trees are deeply recursive; raise the limit so pickling
# inside getKu does not hit the default recursion cap.
sys.setrecursionlimit(1000000)

urlKen='http://tabelog.com/sitemap/aomori/' # restaurant sitemap of the prefecture (Aomori)
kuList=BeautifulSoup(urllib.urlopen(urlKen)).find(id='arealst_sitemap').select('li')
for ku in kuList:
    # if int(kuList.index(ku))%2==0:
    #     continue
    # #index=3,5,7,...+1
    kuUrl=str(ku.a['href'])
    # Name pattern: <prefecture>_<total wards>_<1-based index>_<ward name>
    kuName=u'青森'+u'_'+unicode(len(kuList))+u'_'+unicode(kuList.index(ku)+1)+u'_'+unicode(ku.a.string)
    getKu(kuUrl,kuName)
#urlKen='http://tabelog.com/sitemap/tokyo/' # restaurant sitemap of Tokyo
#kuList=BeautifulSoup(urllib.urlopen(urlKen)).find(id='arealst_sitemap').select('li')
#KuNames=[u'東京'+u'_'+unicode(len(kuList))+u'_'+unicode(kuList.index(ku)+1)+u'_'+unicode(ku.a.string) for ku in kuList]
#for name in KuNames[0:3]:
#    print name
# ------------ # ----------- # -----------
# #save
#top250={'keywords':keywordsTop,'genre':genreTop}
#f=open('top250.data','w')
#cPickle.dump(top250,f)
#f.close()
# #load
#top250 = cPickle.load(open("top250.data","rb"))
# ------------ # ----------- # -----------
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| #encoding: utf-8 | |
| import cPickle | |
| import urllib | |
| from bs4 import BeautifulSoup,NavigableString | |
| import re | |
| import sys | |
| import math | |
| sys.setrecursionlimit(1000000) | |
def getKu(url,name):#todo: for every ku
    """Scrape every restaurant of one ward (区) on tabelog.com and pickle it.

    url  -- the ward's shop-list URL (from the prefecture sitemap).
    name -- unicode label for the ward; the output pickle is <name>.data.

    Writes a list of rst dicts (one per shop: basic info table, relation
    boxes, and the full review list) via cPickle.
    """
    #macro / constants
    URLTabelog='http://tabelog.com'
    NRvwPerPage=20   # reviews per review-list page
    NPerPage=200     # shops per shop-list page; class='pagenation'
    #initalization
    urlKu=url
    nameKu=unicode(name)
    ku=BeautifulSoup(urllib.urlopen(urlKu).read())
    #aiueo tag: one link per kana initial on the ward page
    taglist=ku.find('div','taglist').find_all('a')
    rstList=[] #accumulates all shops of the whole ku; stored as one data file
    #for tag in taglist[0:1]:
    for tag in taglist: #todo: for every tag (AIUEO)
        urlShopList=tag['href']
        # Replace parentheses and spaces (half- and full-width) with '_'
        # so the tag name is safe to use inside a file name.
        trans_table = dict([[ord(char), u"_"] for char in u"()() "])
        nameAIUEO=nameKu+u'_'+unicode(len(taglist))+u'_'+unicode(taglist.index(tag)+1)+u'_'+unicode(tag.string).translate(trans_table)
        print nameAIUEO.encode('utf_8') #東京_154_1_銀座_50_1_ア_93_ #todo:print
        shopListPage=BeautifulSoup(urllib.urlopen(urlShopList).read())
        try:
            # bug - some tags have no shops at all; skip them.
            nShop=float(shopListPage.find('div','result_num').strong.string) #todo: try: float()
        except:
            continue
        shopUrlList=[]
        nPage=math.ceil(float(nShop)/NPerPage) #bug - was "nPage=nShop/NPerPage+1", wrong when nShop==200.
        #begin iterating shop-list pages 1,2,3, ...
        for pageCount in xrange(1,int(nPage)+1):
            if pageCount==1:
                shopListCurrentPage=shopListPage  # already fetched above
            else:
                urlNextPage=urlShopList+'?PG='+str(pageCount)
                print 'Appending urlShopList in:'+urlNextPage #todo:print url to next shopList page
                shopListCurrentPage=BeautifulSoup(urllib.urlopen(urlNextPage).read())
            shopListDIV=shopListCurrentPage.find_all('div','rstname')#list of shops
            for shop in shopListDIV: #add all shop urls to shopUrlList
                urlShop=URLTabelog+shop.find('a') ['href']
                shopUrlList.append(urlShop)
        #end iterating pages
        #begin iterating shops/restaurants
        for url_shop in shopUrlList:#todo: for each url_shop in tag(あ、い、)
            # url_shop='http://tabelog.com/tokyo/A1301/A130102/13000145/dtlrvwlst/' # 214 reviews
            # url_shop='http://tabelog.com/tokyo/A1301/A130101/13150400/dtlrvwlst/' # 4 reviews
            print unicode(shopUrlList.index(url_shop)+1)+u'/'+unicode(len(shopUrlList))+u': '+unicode(url_shop) #todo:print progress
            url_shop+=r'dtlrvwlst/'  # go straight to the shop's review-list page
            rst={}  # all scraped fields of this one restaurant
            shopPage=BeautifulSoup(urllib.urlopen(url_shop).read())
            # nReview=shopPage.find('rvw','page-count').find('span','num').string
            nRvw=float(shopPage.find('em',{'property':'v:count'}).string)
            nRvwPage=math.ceil(nRvw/NRvwPerPage) # 1,2,3,... # bug - was off when nRvw==20.
            nRvwPage=(1 if nRvwPage==0 else nRvwPage) #bug: if nRvwPage==0, nothing in xrange(1,1)
            rvwList=[]
            for pg in xrange(1,int(nRvwPage)+1):
            # for pg in xrange(1,2):#todo: for every rvw page
                if pg==1:# page 1 doubles as the source of the shop's basic info
                    currentPage=shopPage
                    sD=currentPage.find('div',id='rstdata-wrap')#shop data table
                    #name
                    name=unicode(sD.find('p','mname').get_text())
                    rst['name']=name
                    rst['url']=url_shop[18:]#path only, e.g. '/tokyo/A1301/A130102/13000145/dtlrvwlst/'
                    #other basic info: map the Japanese <th> labels onto rst keys
                    thList=sD.find_all('th')
                    for th in thList:
                        #genre
                        if unicode(th.string)==u'ジャンル':
                            rst['genre']=unicode(''.join(th.next_sibling.next_sibling.p.stripped_strings))#創作料理、イタリアン、居酒屋・ダイニングバー(その他)
                            continue
                        #TEL / reservation
                        elif unicode(th.string)==u'TEL・予約':
                            rst['tel']=unicode(sD.find('p',class_=['tel-main', 'ppc-main']).get_text())
                            #http://tabelog.com/tokyo/A1301/A130101/13149591/dtlrvwlst/ : 050-5819-3632 (予約専用番号)
                            try:
                                rst['reserved']=unicode(sD.find('span','reserve-status').string)
                            except:
                                rst['reserved']=u'不可'  # default: reservations not accepted
                            continue
                        elif unicode(th.string)==u'住所':
                            rst['addr']=unicode(sD.find('p',rel='v:addr').get_text()) #rel=['v:addr']
                            try:
                                # bug - some shops have only an addr, no map: http://tabelog.com/tokyo/A1301/A130101/13030881/
                                mapaddr=sD.find('div','rst-map').find('img')['src']
                                rst['map']=unicode(mapaddr[mapaddr.find('center')+7:mapaddr.find('&markers')]) #lat,lng e.g. 35.324535345,42.212121212
                            except:
                                pass
                            continue
                        elif unicode(th.string)==u'交通手段':  # access / transport
                            rst['traffic']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'営業時間':  # opening hours
                            rst['time']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'定休日':  # regular holidays
                            rst['yasumi']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'平均予算':  # average budget
                            rst['bgt']=unicode(''.join(th.next_sibling.next_sibling.p.stripped_strings))
                            continue
                        elif unicode(th.string)==u'平均利用金額':  # average spend
                            rst['avg']=unicode(''.join(th.next_sibling.next_sibling.p.stripped_strings))
                            continue
                        elif unicode(th.string)==u'カード':  # credit cards
                            rst['card']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'サービス料・チャージ':  # service charge
                            rst['charge']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'席数':  # seat count
                            rst['seat']=unicode(''.join(th.next_sibling.next_sibling.p.stripped_strings))
                            continue
                        elif unicode(th.string)==u'個室':  # private rooms
                            rst['private']=unicode(''.join(th.next_sibling.next_sibling.p.stripped_strings))
                            continue
                        elif unicode(th.string)==u'貸切':  # chartered use
                            rst['chartered']=unicode(''.join(th.next_sibling.next_sibling.p.stripped_strings))
                            continue
                        elif unicode(th.string)==u'禁煙・喫煙':  # smoking policy
                            rst['tabaco']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'駐車場':  # parking
                            rst['parking']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'空間・設備':  # space / facilities
                            rst['space']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'飲み放題コース':  # all-you-can-drink courses
                            rst['houdai']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'コース':  # courses
                            rst['course']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'ドリンク':  # drinks
                            rst['drink']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'料理':  # food
                            rst['food']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        # 'recommended occasions' label spans child tags, so get_text() not .string
                        elif unicode(th.get_text())==u'こんな時にオススメ':
                            rst['cases']=unicode(''.join(th.next_sibling.next_sibling.p.stripped_strings))
                            # print rst['name'],rst['food'] # e.g. 灯とともに 友人・同僚と|デート|宴会
                            continue
                        elif unicode(th.string)==u'ロケーション':#u'隠れ家' -- location
                            rst['location']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'サービス':  # service
                            rst['service']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'お子様同伴':  # children allowed
                            rst['child']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'ホームページ':  # homepage
                            rst['homepage']=unicode(th.next_sibling.next_sibling.p.a.get_text())
                            # print rst['homepage'] #http://www.hotpepper.jp/strJ000999523/
                            continue
                        elif unicode(th.string)==u'オープン日':  # opening date
                            rst['date']=unicode(''.join(th.next_sibling.next_sibling.stripped_strings)) #no <p> wrapper here
                            continue
                        elif unicode(th.string)==u'備考':  # remarks
                            rst['notes']=unicode(''.join(th.next_sibling.next_sibling.stripped_strings)) #no <p> wrapper here
                            continue
                        # remarks can be e.g. 紹介制 via: http://tabelog.com/tokyo/A1301/A130101/13030881/dtlrvwlst/
                        else:
                            # print unicode(th.string)
                            # # unhandled labels: shop name, mobile phone, first reviewer, ...?
                            pass
                    #end for th
                    nearRstList=[] # nearby shops (近所のお店)
                    indexes=[] # feature indexes (こだわりインデックス)
                    nearSpots=[] # nearby sightseeing spots (周辺の観光スポット)
                    nearMarks=[] # nearby facilities (近くにある施設)
                    KWs=[] # related keywords ('関連のキーワード')
                    rBs=currentPage.find_all('div',class_='relation-box')
                    for rb in rBs:
                        # first 5 chars only: the heading continues with the area, e.g. 近所のお店(銀座)
                        if unicode(rb.h4.string)[0:5]==u'近所のお店':
                            # print unicode(rb.h4.string)
                            nrList=rb.find_all('li',class_='restbox')
                            for nr in nrList:
                                nearRst=[] #[ [name, url, distance, area & cat],[] ]
                                nearRst.append(unicode(nr.find('p','mname').a.string))
                                nearRst.append(unicode(nr.find('p','mname').a['href']))
                                nearRst.append(unicode(nr.find('p','mname').span.string))
                                nearRst.append(unicode(nr.find('p','area-catg').string))
                                nearRstList.append(nearRst)
                            continue
                        elif unicode(rb.h4.string)==u'こだわりインデックス':
                            nrList=rb.find_all('li',class_='restbox')
                            for nr in nrList:
                                index=[]#[ [courseName, url],[] ]
                                index.append(unicode(nr.find('p','mname').a.string))
                                index.append(unicode(nr.find('p','mname').a['href']))
                                indexes.append(index)
                            continue
                        elif unicode(rb.h4.string)==u'周辺の観光スポット':
                            nrList=rb.find_all('li',class_='restbox')
                            for nr in nrList:
                                spot=[]#[ [spotName, url, distance],[] ]
                                spot.append(unicode(nr.find('p','mname').a.string))
                                spot.append(unicode(nr.find('p','mname').a['href']))
                                spot.append(unicode(nr.find('p','mname').span.string))
                                nearSpots.append(spot)
                            continue
                        elif unicode(rb.h4.string)==u'近くにある施設':
                            nrList=rb.find_all('li',class_='restbox')
                            for nr in nrList:
                                mark=[]#[ [mark1, url],[] ]
                                mark.append(unicode(nr.find('p','mname').a.string))
                                mark.append(unicode(nr.find('p','mname').a['href']))
                                nearMarks.append(mark)
                            continue
                        elif unicode(rb.h4.string)==u'関連のキーワード':
                            nrList=rb.find_all('li') #no class_ on these <li>
                            for nr in nrList:
                                KWs.append(unicode(nr.a.string)) #[kw1,kw2,...]
                            continue
                        else:
                            pass
                            # other relation boxes deliberately ignored, e.g.:
                            #このお店を訪れた人はこんなレストランも訪れています
                            #周辺のお店ランキング
                            #条件の似たお店を探す (銀座・新橋・有楽町):創作料理 × ¥2,000~¥2,999|創作料理 × 友人・同僚と
                            #関連リンク : 東京ランチランキング
                            #関連路線: 銀座線| 日比谷線
                            # print rb.h4.string #see above
                    rst['nearRstList']=nearRstList
                    rst['indexes']=indexes
                    rst['nearSpots']=nearSpots
                    rst['nearMarks']=nearMarks
                    rst['KWs']=KWs #print u','.join(rst['KWs']) # kw1,kw2,...
                    # print '--------------'
                #subsequent review pages: fetch page pg explicitly
                #menu page: not scraped yet (see todo at file end)
                else:
                    # url_shop='http://tabelog.com/tokyo/A1301/A130101/13150400/dtlrvwlst/' # 4 reviews
                    # urlNextRvwPage='http://tabelog.com/tokyo/A1301/A130102/13000145/dtlrvwlst/COND-0/smp2/?lc=0&PG=%s&rvw_part=all' % str(pg)
                    urlNextRvwPage= url_shop+'COND-0/smp2/?lc=0&PG=%s&rvw_part=all' % str(pg)
                    print urlNextRvwPage #todo:print url to next review page
                    currentPage=BeautifulSoup(urllib.urlopen(urlNextRvwPage).read())
                # scrape every review box on the current page (page 1 included)
                rvwboxesCurrentPage=currentPage.find_all('div','review-box')
                for rvwbox in rvwboxesCurrentPage:
                    rvw={} # make each rvw a dict
                    # title('時代に流されない完成の極み') , url('http://....')
                    title=rvwbox.find('p','title').find('a')
                    rvw['url']=URLTabelog+str(title['href'])
                    rvw['title']=unicode(title.string)
                    #user('だいこんまん '), profile('30代後半・男性・愛知県')
                    rvwer=rvwbox.find('div','reviewer-name')
                    rvw['user']=unicode(rvwer.find('span').next_element) #<span class="lev1">だいこんまん <span class="count">(70)</span></span>
                    profile=rvwer.find('p','area')
                    try:
                        rvw['profile']=unicode(profile.string) #'div','area'
                    except AttributeError:
                        rvw['profile']=u''  # reviewer without a profile line
                    #score: tScore, scores=[料理,サービス,雰囲気, CP, ドリンク]
                    try:
                        # bug - some reviews have no score: http://tabelog.com/tokyo/A1303/A130301/13059175/dtlrvwlst/2358064/?use_type=0&smp=2&PG=5&lc=0&sby=&srt=
                        # bug - some reviews carry two scores (day + night).
                        _scores=rvwbox.find('ul','score-ex').find_all('strong') #[ 料理・味 4.0| サービス 2.5| 雰囲気 4.0| CP 4.0| 酒・ドリンク 3.0 ]
                        total_score=rvwbox.find('p','total').strong.string
                        time_=rvwbox.find('p','total').find('span','subject').string #<span class="subject">昼の点数:</span>
                        rvw['time']=unicode(time_[0]) #'昼' (day) / '夜' (night)
                        rvw['totalScore']=(0 if total_score == u'-' else float(total_score) )
                        rvw['scores']=[0 if sc.string == u'-' else float(sc.string) for sc in _scores]
                    except :
                        # fall back to neutral defaults when the score block is absent/odd
                        rvw['time']=u'夜'
                        rvw['totalScore']=0
                        rvw['scores']=[0,0,0,0,0]
                        pass
                    #price strings as displayed (one per strong tag)
                    _price=rvwbox.find('p','price').find_all('strong')
                    prices=[unicode(x.string) for x in _price]
                    rvw['prices']=prices
                    #situation icons: grey suffix '_g' in the img src means inactive
                    cases=rvwbox.find('p','situation').find_all('img')
                    situation=[ 0 if c['src'][-6:-4]=='_g' else 1 for c in cases] #[friends,date,settai,party,family,alone]
                    rvw['situation']=situation
                    #comment: stored as raw HTML markup of the <p>
                    #(better would be comment_clean, since cPickle can't pickle HTMLParser objects??)
                    comment=rvwbox.find('div','comment')
                    rvw['comment']=unicode(comment.p)
                    # # todo:prettify, clean comment
                    # comment=rvwbox.find('div','comment')
                    # comment_clean=u''
                    # BR=BeautifulSoup('<html><br/></html>').find('br')
                    # for c in comment.p.children: # def fn_reserve_br(p): isinstance(tag.next_element, NavigableString)
                    #     if isinstance(c,NavigableString):
                    #         comment_clean=comment_clean+unicode(c)
                    #     elif c==BR and c.next_sibling!=BR:
                    #         comment_clean=comment_clean+u' *br* '
                    #     elif c==BR:
                    #         pass
                    #     elif c.name=='span':#span,a,... -> *br*, *wiki-bold*
                    #         comment_clean=comment_clean+u' *'+unicode(c['class'][0])+u'* '+unicode(c.string)+u' *'+unicode(c['class'][0])+u'* '
                    # rvw['comment']=unicode(comment_clean)
                    #vote: "agree" count from other users
                    vote=int(rvwbox.find('span','agree-vote').find('em').string)
                    rvw['vote']=vote
                    # append rvw to rvwList
                    rvwList.append(rvw)
                #end of current rvw page, all rvw in current page appended to rvwList
            #end of rvw pages, all rvw in all pages appended to rvwList
            rst['rvwList']=rvwList
            rstList.append(rst)
        #eof url_shop
    #eof every tag(AIUEO)
    dataKu=open(nameKu.encode('utf_8')+u'.data'.encode('utf_8'),'wb')# binary mode for portability: http://stackoverflow.com/questions/283766/pickled-file-wont-load-on-mac-linux
    cPickle.dump(rstList,dataKu)
    cPickle.dump(rstList,dataKu) if False else None  # NOTE(review): line kept inert; see below
    dataKu.close()
#eof ku
# ------------ # ----------- # -----------
# WARNING:root:Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
# todo: get menu page.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| #encoding: utf-8 | |
| import cPickle | |
| import urllib | |
| from bs4 import BeautifulSoup,NavigableString | |
| import re | |
| import sys | |
| sys.setrecursionlimit(1000000) | |
def getKu(url,name):#todo: for every ku
    """Scrape restaurants of one ward (earlier draft of the scraper).

    url  -- ward shop-list URL; name -- unicode label for output files.
    Writes one pickle per kana tag as <nameAIUEO>.data.

    NOTE(review): this is an older revision of the getKu in the sibling file:
    it only walks taglist[0:1] and shopUrlList[0:5], hard-codes url_shop to a
    debug URL, uses integer page arithmetic, and pickles in text mode.
    """
    #macro / constants
    URLTabelog='http://tabelog.com'
    NRvwPerPage=20   # reviews per review-list page
    NPerPage=200     # shops per shop-list page; class='pagenation'
    #initalization
    urlKu=url
    nameKu=name
    ku=BeautifulSoup(urllib.urlopen(urlKu).read())
    #aiueo tag: one link per kana initial on the ward page
    taglist=ku.find('div','taglist').find_all('a')
    for tag in taglist[0:1]:  # debug: first tag only
    # for tag in taglist: #todo: for every tag (AIUEO)
        urlShopList=tag['href']
        # Replace parentheses/spaces (half- and full-width) with '_' for file names.
        trans_table = dict([[ord(char), u"_"] for char in u"()() "])
        nameAIUEO=nameKu+u'_'+unicode(len(taglist))+u'_'+unicode(taglist.index(tag)+1)+u'_'+unicode(tag.string).translate(trans_table)
        print nameAIUEO #東京_154_1_銀座_50_1_ア_93_ #todo:print
        shopListPage=BeautifulSoup(urllib.urlopen(urlShopList).read())
        nShop=int(shopListPage.find('div','result_num').strong.string) #todo: try: int()
        shopListDIV=shopListPage.find_all('div','rstname')#list of shops
        shopUrlList=[]
        nPage=nShop/NPerPage+1 # NOTE(review): off by one when nShop is a multiple of NPerPage
        #begin iterating shop-list pages 1,2,3, ...
        for pageCount in xrange(1,nPage+1):
            if pageCount==1:
                shopListCurrentPage=shopListPage  # already fetched above
            else:
                urlNextPage=urlShopList+'?PG='+str(pageCount)
                shopListCurrentPage=BeautifulSoup(urllib.urlopen(urlNextPage).read())
            shopListDIV=shopListCurrentPage.find_all('div','rstname')#list of shops
            for shop in shopListDIV: #add all shop urls to shopUrlList
                urlShop=URLTabelog+shop.find('a') ['href']
                shopUrlList.append(urlShop)
        #end iterating pages
        #begin iterating shops/restaurants
        rstList=[]
        for url_shop in shopUrlList[0:5]:#todo: for each url_shop in tag(あ、い、)
            # url_shop='http://tabelog.com/tokyo/A1301/A130102/13000145/dtlrvwlst/' # 214 reviews
            # url_shop='http://tabelog.com/tokyo/A1301/A130101/13150400/dtlrvwlst/' # 4 reviews
            url_shop='http://tabelog.com/tokyo/A1301/A130101/13030881/dtlrvwlst/'  # NOTE(review): debug override — every iteration scrapes this same shop
            print url_shop #todo:print
            rst={}  # all scraped fields of this one restaurant
            shopPage=BeautifulSoup(urllib.urlopen(url_shop).read())
            # nReview=shopPage.find('rvw','page-count').find('span','num').string
            nRvw=int(shopPage.find('em',{'property':'v:count'}).string)
            nRvwPage=nRvw/NRvwPerPage+1 # 1,2,3,... NOTE(review): off by one when nRvw is a multiple of 20
            rvwList=[]
            for pg in xrange(1,nRvwPage+1):
            # for pg in xrange(1,2):#todo: for every rvw page
                if pg==1:# page 1 doubles as the source of the shop's basic info
                    currentPage=shopPage
                    sD=currentPage.find('div',id='rstdata-wrap')#shop data table
                    #name
                    name=unicode(sD.find('p','mname').get_text())
                    rst['name']=name
                    #other basic info: map the Japanese <th> labels onto rst keys
                    thList=sD.find_all('th')
                    for th in thList:
                        #genre
                        if unicode(th.string)==u'ジャンル':
                            rst['genre']=unicode(th.next_sibling.next_sibling.get_text())#創作料理、イタリアン、居酒屋・ダイニングバー(その他)
                            continue
                        #TEL / reservation
                        elif unicode(th.string)==u'TEL・予約':
                            rst['tel']=unicode(sD.find('p',class_=['tel-main', 'ppc-main']).get_text())
                            #http://tabelog.com/tokyo/A1301/A130101/13149591/dtlrvwlst/ : 050-5819-3632 (予約専用番号)
                            try:
                                rst['reserved']=unicode(sD.find('span','reserve-status').string)
                            except:
                                rst['reserved']=u'不可'  # default: reservations not accepted
                            continue
                        elif unicode(th.string)==u'住所':
                            rst['addr']=unicode(sD.find('p',rel='v:addr').get_text()) #rel=['v:addr']
                            try:
                                mapaddr=sD.find('div','rst-map').find('img')['src']
                                rst['map']=unicode(mapaddr[mapaddr.find('center')+7:mapaddr.find('&markers')]) #lat,lng e.g. 35.324535345,42.212121212
                            except:
                                pass
                            continue
                        elif unicode(th.string)==u'交通手段':  # access / transport
                            rst['traffic']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'営業時間':  # opening hours
                            rst['time']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'定休日':  # regular holidays
                            rst['yasumi']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'平均予算':  # average budget
                            rst['bgt']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'平均利用金額':  # average spend
                            rst['avg']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'カード':  # credit cards
                            rst['card']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'サービス料・チャージ':  # service charge
                            rst['charge']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'席数':  # seat count
                            rst['seat']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'個室':  # private rooms
                            # print '-------------'
                            # print unicode(th.next_sibling.next_sibling.p.get_text())
                            # print '-------------'
                            # print unicode(''.join(th.next_sibling.next_sibling.p.stripped_strings))
                            # print '-------------'
                            rst['private']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'貸切':  # chartered use
                            rst['chartered']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'禁煙・喫煙':  # smoking policy
                            rst['tabaco']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'駐車場':  # parking
                            rst['parking']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        # NOTE(review): duplicate 駐車場 branch below is unreachable
                        elif unicode(th.string)==u'駐車場':
                            rst['parking']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'空間・設備':  # space / facilities
                            rst['space']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'飲み放題コース':  # all-you-can-drink courses
                            rst['houdai']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'コース':  # courses
                            rst['course']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'ドリンク':  # drinks
                            rst['drink']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'料理':  # food
                            rst['food']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        # NOTE(review): this branch OVERWRITES rst['food'] with the
                        # 'recommended occasions' text (the newer revision stores it
                        # under 'cases' instead)
                        elif unicode(th.get_text())==u'こんな時にオススメ':
                            rst['food']=unicode(''.join(th.next_sibling.next_sibling.p.stripped_strings))
                            # print rst['name'],rst['food'] # e.g. 灯とともに 友人・同僚と|デート|宴会
                            continue
                        elif unicode(th.string)==u'ロケーション':  # location
                            rst['location']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'サービス':  # service
                            rst['service']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'お子様同伴':  # children allowed
                            rst['child']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'ホームページ':  # homepage
                            rst['homepage']=unicode(th.next_sibling.next_sibling.p.a.get_text())
                            # print rst['homepage'] #http://www.hotpepper.jp/strJ000999523/
                            continue
                        elif unicode(th.string)==u'オープン日':  # opening date
                            rst['date']=unicode(th.next_sibling.next_sibling.get_text()) #no <p> wrapper here
                            continue
                        elif unicode(th.string)==u'備考':  # remarks
                            rst['notes']=unicode(''.join(th.next_sibling.next_sibling.stripped_strings)) #no <p> wrapper here
                            print rst['notes']
                            continue
                        else:
                            # print unicode(th.string)
                            # # unhandled labels: shop name, mobile phone, first reviewer, ...?
                            pass
                    #end for th
                    nearRstList=[] # nearby shops (近所のお店)
                    indexes=[] # feature indexes (こだわりインデックス)
                    nearSpots=[] # nearby sightseeing spots (周辺の観光スポット)
                    nearMarks=[] # nearby facilities (近くにある施設)
                    KWs=[] # related keywords ('関連のキーワード')
                    rBs=currentPage.find_all('div',class_='relation-box')
                    for rb in rBs:
                        # first 5 chars only: heading continues with area, e.g. 近所のお店(銀座)
                        if unicode(rb.h4.string)[0:5]==u'近所のお店':
                            # print unicode(rb.h4.string)
                            nrList=rb.find_all('li',class_='restbox')
                            for nr in nrList:
                                nearRst=[] #[ [name, url, distance, area & cat],[] ]
                                nearRst.append(unicode(nr.find('p','mname').a.string))
                                nearRst.append(unicode(nr.find('p','mname').a['href']))
                                nearRst.append(unicode(nr.find('p','mname').span.string))
                                nearRst.append(unicode(nr.find('p','area-catg').string))
                                nearRstList.append(nearRst)
                            continue
                        elif unicode(rb.h4.string)==u'こだわりインデックス':
                            nrList=rb.find_all('li',class_='restbox')
                            for nr in nrList:
                                index=[]#[ [courseName, url],[] ]
                                index.append(unicode(nr.find('p','mname').a.string))
                                index.append(unicode(nr.find('p','mname').a['href']))
                                indexes.append(index)
                            continue
                        elif unicode(rb.h4.string)==u'周辺の観光スポット':
                            nrList=rb.find_all('li',class_='restbox')
                            for nr in nrList:
                                spot=[]#[ [spotName, url, distance],[] ]
                                spot.append(unicode(nr.find('p','mname').a.string))
                                spot.append(unicode(nr.find('p','mname').a['href']))
                                spot.append(unicode(nr.find('p','mname').span.string))
                                nearSpots.append(spot)
                            continue
                        elif unicode(rb.h4.string)==u'近くにある施設':
                            nrList=rb.find_all('li',class_='restbox')
                            for nr in nrList:
                                mark=[]#[ [mark1, url],[] ]
                                mark.append(unicode(nr.find('p','mname').a.string))
                                mark.append(unicode(nr.find('p','mname').a['href']))
                                nearMarks.append(mark)
                            continue
                        elif unicode(rb.h4.string)==u'関連のキーワード':
                            nrList=rb.find_all('li') #no class_ on these <li>
                            for nr in nrList:
                                KWs.append(unicode(nr.a.string)) #[kw1,kw2,...]
                            continue
                        else:
                            pass
                            # other relation boxes deliberately ignored, e.g.:
                            #このお店を訪れた人はこんなレストランも訪れています
                            #周辺のお店ランキング
                            #条件の似たお店を探す (銀座・新橋・有楽町):創作料理 × ¥2,000~¥2,999|創作料理 × 友人・同僚と
                            #関連リンク : 東京ランチランキング
                            #関連路線: 銀座線| 日比谷線
                            # print rb.h4.string #see above
                    rst['nearRstList']=nearRstList
                    rst['indexes']=indexes
                    rst['nearSpots']=nearSpots
                    rst['nearMarks']=nearMarks
                    rst['KWs']=KWs #print u','.join(rst['KWs']) # kw1,kw2,...
                    # print '--------------'
                #subsequent review pages: fetch page pg explicitly
                else:
                    # url_shop='http://tabelog.com/tokyo/A1301/A130101/13150400/dtlrvwlst/' # 4 reviews
                    # urlNextRvwPage='http://tabelog.com/tokyo/A1301/A130102/13000145/dtlrvwlst/COND-0/smp2/?lc=0&PG=%s&rvw_part=all' % str(pg)
                    urlNextRvwPage= url_shop+'COND-0/smp2/?lc=0&PG=%s&rvw_part=all' % str(pg) #todo: url to next review page
                    currentPage=BeautifulSoup(urllib.urlopen(urlNextRvwPage).read())
                # scrape every review box on the current page (page 1 included)
                rvwboxesCurrentPage=currentPage.find_all('div','review-box')
                for rvwbox in rvwboxesCurrentPage:
                    rvw={} # make each rvw a dict
                    # title('時代に流されない完成の極み') , url('http://....')
                    title=rvwbox.find('p','title').find('a')
                    rvw['url']=URLTabelog+str(title['href'])
                    rvw['title']=unicode(title.string)
                    #user('だいこんまん '), profile('30代後半・男性・愛知県')
                    rvwer=rvwbox.find('div','reviewer-name')
                    rvw['user']=unicode(rvwer.find('span').next_element) #<span class="lev1">だいこんまん <span class="count">(70)</span></span>
                    profile=rvwer.find('p','area')
                    try:
                        rvw['profile']=unicode(profile.string) #'div','area'
                    except AttributeError:
                        rvw['profile']=u''  # reviewer without a profile line
                    #score: tScore, scores=[料理,サービス,雰囲気, CP, ドリンク]
                    # NOTE(review): unlike the newer revision, no try/except here —
                    # a review without a score block will raise.
                    _scores=rvwbox.find('ul','score-ex').find_all('strong') #[ 料理・味 4.0| サービス 2.5| 雰囲気 4.0| CP 4.0| 酒・ドリンク 3.0 ]
                    total_score=rvwbox.find('p','total').strong.string
                    time_=rvwbox.find('p','total').find('span','subject').string #<span class="subject">昼の点数:</span>
                    rvw['time']=unicode(time_[0]) #'昼' (day) / '夜' (night)
                    rvw['totalScore']=(0 if total_score == u'-' else float(total_score) )
                    rvw['scores']=[0 if sc.string == u'-' else float(sc.string) for sc in _scores]
                    #price strings as displayed (one per strong tag)
                    _price=rvwbox.find('p','price').find_all('strong')
                    prices=[unicode(x.string) for x in _price]
                    rvw['prices']=prices
                    #situation icons: grey suffix '_g' in the img src means inactive
                    cases=rvwbox.find('p','situation').find_all('img')
                    situation=[ 0 if c['src'][-6:-4]=='_g' else 1 for c in cases] #[friends,date,settai,party,family,alone]
                    rvw['situation']=situation
                    #comment: stored as raw HTML markup of the <p>
                    #(better would be comment_clean, since cPickle can't pickle HTMLParser objects??)
                    comment=rvwbox.find('div','comment')
                    rvw['comment']=unicode(comment.p)
                    # # todo:prettify, clean comment
                    # comment=rvwbox.find('div','comment')
                    # comment_clean=u''
                    # BR=BeautifulSoup('<html><br/></html>').find('br')
                    # for c in comment.p.children: # def fn_reserve_br(p): isinstance(tag.next_element, NavigableString)
                    #     if isinstance(c,NavigableString):
                    #         comment_clean=comment_clean+unicode(c)
                    #     elif c==BR and c.next_sibling!=BR:
                    #         comment_clean=comment_clean+u' *br* '
                    #     elif c==BR:
                    #         pass
                    #     elif c.name=='span':#span,a,... -> *br*, *wiki-bold*
                    #         comment_clean=comment_clean+u' *'+unicode(c['class'][0])+u'* '+unicode(c.string)+u' *'+unicode(c['class'][0])+u'* '
                    # rvw['comment']=unicode(comment_clean)
                    #vote: "agree" count from other users
                    vote=int(rvwbox.find('span','agree-vote').find('em').string)
                    rvw['vote']=vote
                    # append rvw to rvwList
                    rvwList.append(rvw)
                #end of current rvw page, all rvw in current page appended to rvwList
            #end of rvw pages, all rvw in all pages appended to rvwList
            rst['rvwList']=rvwList
            rstList.append(rst)
        #end of url_shop
        # NOTE(review): one pickle per kana tag, text mode 'w' (the newer
        # revision writes one binary file per ward instead).
        dataKu=open(nameAIUEO+u'.data','w')
        cPickle.dump(rstList,dataKu)
        dataKu.close()
    #end of every tag(AIUEO)
# ------------ # ----------- # -----------
# Ad-hoc entry point: scrape a single ward (Ginza) as a test run.
kuURL='http://tabelog.com/sitemap/tokyo/A1301-A130101/'
nameKu=u'東京_154_1_銀座'
getKu(kuURL,nameKu)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment