Skip to content

Instantly share code, notes, and snippets.

@maowug
Created April 24, 2013 06:11
Show Gist options
  • Select an option

  • Save maowug/5449996 to your computer and use it in GitHub Desktop.

Select an option

Save maowug/5449996 to your computer and use it in GitHub Desktop.
tabelog
#!/usr/bin/env python
#encoding: utf-8
import cPickle
import urllib
from bs4 import BeautifulSoup,NavigableString
import re
import math
URLTabelog='http://tabelog.com'
URLPage_a='http://tabelog.com/keywords/?kana_init=%E3%82%A2'
tagLis=BeautifulSoup(urllib.urlopen(URLPage_a)).find('div','keyword-index').find_all('li')
# print tagLis
NKWsPerPage=100
kwDict={}
for li in tagLis:
kwList=[]
tagName=unicode(li.get_text()) # a,i,u,...
urlCurrentPage=u''#http://tabelog.com/keywords?kana_init=ア
currentPage=u''#bs of urlCurrentPage
try:
#do the normal
urlCurrentPage=URLTabelog+li.a.get('href')
urlTagPage=BeautifulSoup(urllib.urlopen(urlCurrentPage))
currentPage=urlTagPage
except:
#do the _a or others
# print tagName.encode('gb2312')
if tagLis.index(li)==0:
urlCurrentPage=URLPage_a
currentPage=BeautifulSoup(urllib.urlopen(URLPage_a))
else:
#some other tags has no KWs
print tagLis.index(li),len(tagLis)
continue
nKWs=float(currentPage.find('p','page-count').find_all('span','num')[-1].string)
nPage=math.ceil(nKWs/NKWsPerPage)
for p in xrange(1,int(nPage)+1):
if p==1:
currentKWPage=currentPage
else:
urlKWPage=urlCurrentPage+'&page='+str(p)
currentKWPage=BeautifulSoup(urllib.urlopen(urlKWPage))
kwLis=currentKWPage.find('div','keyword-list').find_all('li')
for kwli in kwLis:
kwList.append(unicode(kwli.get_text()))
#eof for-p in kws pages
# print len(kwList),','.join(kwList) 1202 [飯舘牛,イイダコ,飯蛸旨煮,...]
kwDict[tagName]=kwList
print tagLis.index(li),len(tagLis),tagName,len(kwList)
#dump
tabelogKWs=open(u'tabelogキーワードs'+u'.data','wb')
cPickle.dump(kwDict,tabelogKWs)
tabelogKWs.close()
#usage:
# kwDict=cPickle.load(open("tabelogキーワードs.data","rb"))
# for tag in kwDict.keys():pass
# # # load test
# for tag in kwDict.keys():
# print ','.join(kwDict[tag][0:3]),len(kwDict[tag][0])
#
#
# #re-dump
# tabelogKWs=open(u'tabelogキーワードs_subset'+u'.data','wb')
# cPickle.dump(kwDict,tabelogKWs)
# tabelogKWs.close()
#!/usr/bin/env python
#encoding: utf-8
import cPickle
import re
import copy
kwDict=cPickle.load(open(u"tabelogキーワードs.data",u"rb"))
# kwDict={'a':1,'b':2,'c':5}
# for tag in kwDict.keys():
# if tag=='a':
# del kwDict[tag]
# elif tag=='b':
# print kwDict['a']
dictKWs={}
dictKWs['kwHasNoRelations']=[]
#make a kws dict
for tag in kwDict:
for kw in kwDict[tag]:#kwDict[tag] is a list of some tag(a,i,u,...)
dictKWs[kw]=[[],[],[],[]]
dictKWs2=copy.deepcopy(dictKWs)
count=1
for kw in dictKWs2.keys()[0:10000]:
count+=1
#for evert kw in dict, cal its Relations
rkw=dictKWs[kw] #relation of kw
if len(kw)==1 and kw in u'あいうえお':
print kw,rkw,'continue'
continue
for kw2 in dictKWs2:
rkw2=dictKWs[kw2]
try:
if kw==kw2:#same?
pass
#same? do nothing
elif re.match(kw,kw2)!=None: #begin with
rkw2[0].append(kw)
rkw[1].append(kw2)
elif re.search(u'^\S+'+kw+u'\S+$',kw2): #in the middle of it
rkw2[0].append(kw)
rkw[2].append(kw2)
elif re.search(u'^\S+'+kw+u'$',kw2): #end with
rkw2[0].append(kw)
rkw[3].append(kw2)
else:
#kw who has no relations: [[1,0,0,0],[],[],[]]
dictKWs['kwHasNoRelations'].append(kw)
except:
print kw,kw2,'passed'
pass
if count%500==100:
print count,'=================='
if len(rkw[3])>15:
print '-------------------',kw
# # [has_kw, front_in, middle_in, end_in]
# for kw in dictKWs:
#re-dump
tabelogKWs2=open(u'tabelogキーワードs_Hypo1'+u'.data','wb')
cPickle.dump(dictKWs,tabelogKWs2)
tabelogKWs2.close()
# bug:
# rkw[0][0]+=1
# TypeError: coercing to Unicode: need string or buffer, int found
#!/usr/bin/env python
#encoding: utf-8
import cPickle
from bs4 import BeautifulSoup
rstList=cPickle.load(open("東京_154_100_東中野.data","rb"))
for rst in rstList[:3]:
#['tabaco', 'reserved', 'nearSpots', 'private', 'seat', 'course', 'KWs',
# 'chartered', 'parking', 'houdai', 'tel', 'avg', 'nearMarks', 'addr',
# 'service', 'space', 'nearRstList', 'indexes', 'charge', 'location',
# 'homepage', 'map', 'food', 'drink', 'traffic', 'child', 'date',
# 'rvwList', 'yasumi', 'card', 'name', 'bgt', 'time', 'genre']
# for k in rst.keys():
# print unicode(k)+u':'+unicode(rst[k])
# for rvw in rst['rvwList']:
# print rvw['comment']
# print '__________________'
try:
for rvw in rst['rvwList']:
print rvw['comment']
comment=BeautifulSoup(rvw['comment'].encode('utf-8'))
print comment.get_text()
print '__________________'
except:
print '+++'
print '====================='
#!/usr/bin/env python
#encoding: utf-8
# Driver: crawl every ward ("ku") of Aomori prefecture via getKu().
import cPickle
import urllib
from bs4 import BeautifulSoup,NavigableString
from tabelogRobotKu import getKu
import sys
sys.setrecursionlimit(1000000)  # pickling BeautifulSoup-derived data recurses deeply

urlKen='http://tabelog.com/sitemap/aomori/'  # restaurant index of the prefecture
kuList=BeautifulSoup(urllib.urlopen(urlKen)).find(id='arealst_sitemap').select('li')
# enumerate() instead of repeated kuList.index(ku): O(n) instead of O(n^2),
# and immune to duplicate-looking tags matching the wrong index.
for i,ku in enumerate(kuList):
    kuUrl=str(ku.a['href'])
    # name format: prefecture_total_index_wardname, e.g. 青森_40_1_青森市
    kuName=u'青森'+u'_'+unicode(len(kuList))+u'_'+unicode(i+1)+u'_'+unicode(ku.a.string)
    getKu(kuUrl,kuName)

# urlKen='http://tabelog.com/sitemap/tokyo/'  # Tokyo restaurant index
# kuList=BeautifulSoup(urllib.urlopen(urlKen)).find(id='arealst_sitemap').select('li')
# KuNames=[u'東京'+u'_'+unicode(len(kuList))+u'_'+unicode(kuList.index(ku)+1)+u'_'+unicode(ku.a.string) for ku in kuList]
# for name in KuNames[0:3]:
#     print name
# ------------ # ----------- # -----------
# # save
# top250={'keywords':keywordsTop,'genre':genreTop}
# f=open('top250.data','w')
# cPickle.dump(top250,f)
# f.close()
# # load
# top250 = cPickle.load(open("top250.data","rb"))
# ------------ # ----------- # -----------
#!/usr/bin/env python
#encoding: utf-8
import cPickle
import urllib
from bs4 import BeautifulSoup,NavigableString
import re
import sys
import math
sys.setrecursionlimit(1000000)
def getKu(url,name):#todo: for every ku
#macro
URLTabelog='http://tabelog.com'
NRvwPerPage=20
NPerPage=200 #class='pagenation'
#initalization
urlKu=url
nameKu=unicode(name)
ku=BeautifulSoup(urllib.urlopen(urlKu).read())
#aiueo tag
taglist=ku.find('div','taglist').find_all('a')
rstList=[] #store ku as a data file
#for tag in taglist[0:1]:
for tag in taglist: #todo: for every tag (AIUEO)
urlShopList=tag['href']
trans_table = dict([[ord(char), u"_"] for char in u"()() "])
nameAIUEO=nameKu+u'_'+unicode(len(taglist))+u'_'+unicode(taglist.index(tag)+1)+u'_'+unicode(tag.string).translate(trans_table)
print nameAIUEO.encode('utf_8') #東京_154_1_銀座_50_1_ア_93_ #todo:print
shopListPage=BeautifulSoup(urllib.urlopen(urlShopList).read())
try:
# bug - no shops.
nShop=float(shopListPage.find('div','result_num').strong.string) #todo: try: float()
except:
continue
shopUrlList=[]
nPage=math.ceil(float(nShop)/NPerPage) #bug - "nPage=nShop/NPerPage+1", when nShop==200.
#begin iterating page 1,2,3, ...
for pageCount in xrange(1,int(nPage)+1):
if pageCount==1:
shopListCurrentPage=shopListPage
else:
urlNextPage=urlShopList+'?PG='+str(pageCount)
print 'Appending urlShopList in:'+urlNextPage #todo:print url to next shopList page
shopListCurrentPage=BeautifulSoup(urllib.urlopen(urlNextPage).read())
shopListDIV=shopListCurrentPage.find_all('div','rstname')#list of shops
for shop in shopListDIV: #add all shop urls to shopUrlList
urlShop=URLTabelog+shop.find('a') ['href']
shopUrlList.append(urlShop)
#end iterating page
#begin iterating shops/restrants
for url_shop in shopUrlList:#todo: for each url_shop in tag(あ、い、)
# url_shop='http://tabelog.com/tokyo/A1301/A130102/13000145/dtlrvwlst/' #口コミ 214件
# url_shop='http://tabelog.com/tokyo/A1301/A130101/13150400/dtlrvwlst/' #口コミ 4件
print unicode(shopUrlList.index(url_shop)+1)+u'/'+unicode(len(shopUrlList))+u': '+unicode(url_shop) #todo:print
url_shop+=r'dtlrvwlst/'
rst={}
shopPage=BeautifulSoup(urllib.urlopen(url_shop).read())
# nReview=shopPage.find('rvw','page-count').find('span','num').string
nRvw=float(shopPage.find('em',{'property':'v:count'}).string)
nRvwPage=math.ceil(nRvw/NRvwPerPage) # 1,2,3,... # bug - when nRvw=20.
nRvwPage=(1 if nRvwPage==0 else nRvwPage) #bug: if nRvwPage==0, nothing in xrange(1,1)
rvwList=[]
for pg in xrange(1,int(nRvwPage)+1):
# for pg in xrange(1,2):#todo: for every rvw page
if pg==1:# get basic info of the shop
currentPage=shopPage
sD=currentPage.find('div',id='rstdata-wrap')#shop data table
#name
name=unicode(sD.find('p','mname').get_text())
rst['name']=name
rst['url']=url_shop[18:]#'/tokyo/A1301/A130102/13000145/dtlrvwlst/'
#other basic info
thList=sD.find_all('th')
for th in thList:
#genre
if unicode(th.string)==u'ジャンル':
rst['genre']=unicode(''.join(th.next_sibling.next_sibling.p.stripped_strings))#創作料理、イタリアン、居酒屋・ダイニングバー(その他)
continue
#TEL
elif unicode(th.string)==u'TEL・予約':
rst['tel']=unicode(sD.find('p',class_=['tel-main', 'ppc-main']).get_text())
#http://tabelog.com/tokyo/A1301/A130101/13149591/dtlrvwlst/ : 050-5819-3632 (予約専用番号)
try:
rst['reserved']=unicode(sD.find('span','reserve-status').string)
except:
rst['reserved']=u'不可'
continue
elif unicode(th.string)==u'住所':
rst['addr']=unicode(sD.find('p',rel='v:addr').get_text()) #rel=['v:addr']
try:
# bug - no map, only addr http://tabelog.com/tokyo/A1301/A130101/13030881/
mapaddr=sD.find('div','rst-map').find('img')['src']
rst['map']=unicode(mapaddr[mapaddr.find('center')+7:mapaddr.find('&markers')]) #35.324535345,42.212121212
except:
pass
continue
elif unicode(th.string)==u'交通手段':
rst['traffic']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'営業時間':
rst['time']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'定休日':
rst['yasumi']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'平均予算':
rst['bgt']=unicode(''.join(th.next_sibling.next_sibling.p.stripped_strings))
continue
elif unicode(th.string)==u'平均利用金額':
rst['avg']=unicode(''.join(th.next_sibling.next_sibling.p.stripped_strings))
continue
elif unicode(th.string)==u'カード':
rst['card']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'サービス料・チャージ':
rst['charge']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'席数':
rst['seat']=unicode(''.join(th.next_sibling.next_sibling.p.stripped_strings))
continue
elif unicode(th.string)==u'個室':
rst['private']=unicode(''.join(th.next_sibling.next_sibling.p.stripped_strings))
continue
elif unicode(th.string)==u'貸切':
rst['chartered']=unicode(''.join(th.next_sibling.next_sibling.p.stripped_strings))
continue
elif unicode(th.string)==u'禁煙・喫煙':
rst['tabaco']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'駐車場':
rst['parking']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'空間・設備':
rst['space']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'飲み放題コース':
rst['houdai']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'コース':
rst['course']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'ドリンク':
rst['drink']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'料理':
rst['food']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.get_text())==u'こんな時にオススメ':
rst['cases']=unicode(''.join(th.next_sibling.next_sibling.p.stripped_strings))
# print rst['name'],rst['food'] #灯とともに 友人・同僚と|デート|宴会
continue
elif unicode(th.string)==u'ロケーション':#u'隠れ家'
rst['location']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'サービス':
rst['service']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'お子様同伴':
rst['child']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'ホームページ':
rst['homepage']=unicode(th.next_sibling.next_sibling.p.a.get_text())
# print rst['homepage'] #http://www.hotpepper.jp/strJ000999523/
continue
elif unicode(th.string)==u'オープン日':
rst['date']=unicode(''.join(th.next_sibling.next_sibling.stripped_strings)) #no <p>
continue
elif unicode(th.string)==u'備考':
rst['notes']=unicode(''.join(th.next_sibling.next_sibling.stripped_strings)) #no <p>
continue
#備考: 紹介制 via: http://tabelog.com/tokyo/A1301/A130101/13030881/dtlrvwlst/
else:
# print unicode(th.string)
# #店名、携帯電話、初投稿者、...?
pass
#end for th
nearRstList=[] #近所のお店
indexes=[] #こだわりインデックス
nearSpots=[]#周辺の観光スポット
nearMarks=[] #近くにある施設
KWs=[] #'関連のキーワード'
rBs=currentPage.find_all('div',class_='relation-box')
for rb in rBs:
if unicode(rb.h4.string)[0:5]==u'近所のお店':
# print unicode(rb.h4.string) #近所のお店(銀座)
nrList=rb.find_all('li',class_='restbox')
for nr in nrList:
nearRst=[] #[ [name, url, distance, area & cat],[] ]
nearRst.append(unicode(nr.find('p','mname').a.string))
nearRst.append(unicode(nr.find('p','mname').a['href']))
nearRst.append(unicode(nr.find('p','mname').span.string))
nearRst.append(unicode(nr.find('p','area-catg').string))
nearRstList.append(nearRst)
continue
elif unicode(rb.h4.string)==u'こだわりインデックス':
nrList=rb.find_all('li',class_='restbox')
for nr in nrList:
index=[]#[ [courseName, url],[] ]
index.append(unicode(nr.find('p','mname').a.string))
index.append(unicode(nr.find('p','mname').a['href']))
indexes.append(index)
continue
elif unicode(rb.h4.string)==u'周辺の観光スポット':
nrList=rb.find_all('li',class_='restbox')
for nr in nrList:
spot=[]#[ [spotName, url, distance],[] ]
spot.append(unicode(nr.find('p','mname').a.string))
spot.append(unicode(nr.find('p','mname').a['href']))
spot.append(unicode(nr.find('p','mname').span.string))
nearSpots.append(spot)
continue
elif unicode(rb.h4.string)==u'近くにある施設':
nrList=rb.find_all('li',class_='restbox')
for nr in nrList:
mark=[]#[ [mark1, url],[] ]
mark.append(unicode(nr.find('p','mname').a.string))
mark.append(unicode(nr.find('p','mname').a['href']))
nearMarks.append(mark)
continue
elif unicode(rb.h4.string)==u'関連のキーワード':
nrList=rb.find_all('li') #no class_
for nr in nrList:
KWs.append(unicode(nr.a.string)) #[kw1,kw2,...]
continue
else:
pass
#このお店を訪れた人はこんなレストランも訪れています
#周辺のお店ランキング
#条件の似たお店を探す (銀座・新橋・有楽町):創作料理 × ¥2,000~¥2,999|創作料理 × 友人・同僚と
#関連リンク : 東京ランチランキング
#関連路線: 銀座線| 日比谷線
# print rb.h4.string #see above
rst['nearRstList']=nearRstList
rst['indexes']=indexes
rst['nearSpots']=nearSpots
rst['nearMarks']=nearMarks
rst['KWs']=KWs #print u','.join(rst['KWs']) # kw1,kw2,...
# print '--------------'
#current rvw page
#menu
else:
# url_shop='http://tabelog.com/tokyo/A1301/A130101/13150400/dtlrvwlst/' #口コミ 4件
# urlNextRvwPage='http://tabelog.com/tokyo/A1301/A130102/13000145/dtlrvwlst/COND-0/smp2/?lc=0&PG=%s&rvw_part=all' % str(pg)
urlNextRvwPage= url_shop+'COND-0/smp2/?lc=0&PG=%s&rvw_part=all' % str(pg)
print urlNextRvwPage #todo:print url to next review page
currentPage=BeautifulSoup(urllib.urlopen(urlNextRvwPage).read())
rvwboxesCurrentPage=currentPage.find_all('div','review-box')
for rvwbox in rvwboxesCurrentPage:
rvw={} # make each rvw a dict
# title('時代に流されない完成の極み') , url('http://....')
title=rvwbox.find('p','title').find('a')
rvw['url']=URLTabelog+str(title['href'])
rvw['title']=unicode(title.string)
#user('だいこんまん '), profile('30代後半・男性・愛知県')
rvwer=rvwbox.find('div','reviewer-name')
rvw['user']=unicode(rvwer.find('span').next_element) #<span class="lev1">だいこんまん <span class="count">(70)</span></span>
profile=rvwer.find('p','area')
try:
rvw['profile']=unicode(profile.string) #'div','area'
except AttributeError:
rvw['profile']=u''
#score: tScore, scores=[料理,サービス,雰囲気, CP, ドリンク]
try:
# bug - no score: http://tabelog.com/tokyo/A1303/A130301/13059175/dtlrvwlst/2358064/?use_type=0&smp=2&PG=5&lc=0&sby=&srt=
# bug - two scores.
_scores=rvwbox.find('ul','score-ex').find_all('strong') #[ 料理・味 4.0| サービス 2.5| 雰囲気 4.0| CP 4.0| 酒・ドリンク 3.0 ]
total_score=rvwbox.find('p','total').strong.string
time_=rvwbox.find('p','total').find('span','subject').string #<span class="subject">昼の点数:</span>
rvw['time']=unicode(time_[0]) #'昼','夜'
rvw['totalScore']=(0 if total_score == u'-' else float(total_score) )
rvw['scores']=[0 if sc.string == u'-' else float(sc.string) for sc in _scores]
except :
rvw['time']=u'夜'
rvw['totalScore']=0
rvw['scores']=[0,0,0,0,0]
pass
#price
_price=rvwbox.find('p','price').find_all('strong')
prices=[unicode(x.string) for x in _price]
rvw['prices']=prices
#situation
cases=rvwbox.find('p','situation').find_all('img')
situation=[ 0 if c['src'][-6:-4]=='_g' else 1 for c in cases] #[friends,date,settai,party,family,alone]
rvw['situation']=situation
#comment :better use comment_clean ,since cPickle can't pickle HTMLParser objects??
comment=rvwbox.find('div','comment')
rvw['comment']=unicode(comment.p)
# # todo:prettify, clean comment
# comment=rvwbox.find('div','comment')
# comment_clean=u''
# BR=BeautifulSoup('<html><br/></html>').find('br')
# for c in comment.p.children: # def fn_reserve_br(p): isinstance(tag.next_element, NavigableString)
# if isinstance(c,NavigableString):
# comment_clean=comment_clean+unicode(c)
# elif c==BR and c.next_sibling!=BR:
# comment_clean=comment_clean+u' *br* '
# elif c==BR:
# pass
# elif c.name=='span':#span,a,... -> *br*, *wiki-bold*
# comment_clean=comment_clean+u' *'+unicode(c['class'][0])+u'* '+unicode(c.string)+u' *'+unicode(c['class'][0])+u'* '
# rvw['comment']=unicode(comment_clean)
#vote
vote=int(rvwbox.find('span','agree-vote').find('em').string)
rvw['vote']=vote
# append rvw to rvwList
rvwList.append(rvw)
#end of current rvw page, all rvw in current page is appended to rvwList
#end of rvw pages, all rvw in all pages is appended to rvwList
rst['rvwList']=rvwList
rstList.append(rst)
#eof url_shop
#eof every tag(AIUEO)
dataKu=open(nameKu.encode('utf_8')+u'.data'.encode('utf_8'),'wb')# http://stackoverflow.com/questions/283766/pickled-file-wont-load-on-mac-linux
cPickle.dump(rstList,dataKu)
dataKu.close()
#eof ku
# ------------ # ----------- # -----------
# WARNING:root:Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
# todo: get menu page.
#!/usr/bin/env python
#encoding: utf-8
import cPickle
import urllib
from bs4 import BeautifulSoup,NavigableString
import re
import sys
sys.setrecursionlimit(1000000)
def getKu(url,name):#todo: for every ku
#macro
URLTabelog='http://tabelog.com'
NRvwPerPage=20
NPerPage=200 #class='pagenation'
#initalization
urlKu=url
nameKu=name
ku=BeautifulSoup(urllib.urlopen(urlKu).read())
#aiueo tag
taglist=ku.find('div','taglist').find_all('a')
for tag in taglist[0:1]:
# for tag in taglist: #todo: for every tag (AIUEO)
urlShopList=tag['href']
trans_table = dict([[ord(char), u"_"] for char in u"()() "])
nameAIUEO=nameKu+u'_'+unicode(len(taglist))+u'_'+unicode(taglist.index(tag)+1)+u'_'+unicode(tag.string).translate(trans_table)
print nameAIUEO #東京_154_1_銀座_50_1_ア_93_ #todo:print
shopListPage=BeautifulSoup(urllib.urlopen(urlShopList).read())
nShop=int(shopListPage.find('div','result_num').strong.string) #todo: try: int()
shopListDIV=shopListPage.find_all('div','rstname')#list of shops
shopUrlList=[]
nPage=nShop/NPerPage+1 #
#begin iterating page 1,2,3, ...
for pageCount in xrange(1,nPage+1):
if pageCount==1:
shopListCurrentPage=shopListPage
else:
urlNextPage=urlShopList+'?PG='+str(pageCount)
shopListCurrentPage=BeautifulSoup(urllib.urlopen(urlNextPage).read())
shopListDIV=shopListCurrentPage.find_all('div','rstname')#list of shops
for shop in shopListDIV: #add all shop urls to shopUrlList
urlShop=URLTabelog+shop.find('a') ['href']
shopUrlList.append(urlShop)
#end iterating page
#begin iterating shops/restrants
rstList=[]
for url_shop in shopUrlList[0:5]:#todo: for each url_shop in tag(あ、い、)
# url_shop='http://tabelog.com/tokyo/A1301/A130102/13000145/dtlrvwlst/' #口コミ 214件
# url_shop='http://tabelog.com/tokyo/A1301/A130101/13150400/dtlrvwlst/' #口コミ 4件
url_shop='http://tabelog.com/tokyo/A1301/A130101/13030881/dtlrvwlst/'
print url_shop #todo:print
rst={}
shopPage=BeautifulSoup(urllib.urlopen(url_shop).read())
# nReview=shopPage.find('rvw','page-count').find('span','num').string
nRvw=int(shopPage.find('em',{'property':'v:count'}).string)
nRvwPage=nRvw/NRvwPerPage+1 # 1,2,3,...
rvwList=[]
for pg in xrange(1,nRvwPage+1):
# for pg in xrange(1,2):#todo: for every rvw page
if pg==1:# get basic info of the shop
currentPage=shopPage
sD=currentPage.find('div',id='rstdata-wrap')#shop data table
#name
name=unicode(sD.find('p','mname').get_text())
rst['name']=name
#other basic info
thList=sD.find_all('th')
for th in thList:
#genre
if unicode(th.string)==u'ジャンル':
rst['genre']=unicode(th.next_sibling.next_sibling.get_text())#創作料理、イタリアン、居酒屋・ダイニングバー(その他)
continue
#TEL
elif unicode(th.string)==u'TEL・予約':
rst['tel']=unicode(sD.find('p',class_=['tel-main', 'ppc-main']).get_text())
#http://tabelog.com/tokyo/A1301/A130101/13149591/dtlrvwlst/ : 050-5819-3632 (予約専用番号)
try:
rst['reserved']=unicode(sD.find('span','reserve-status').string)
except:
rst['reserved']=u'不可'
continue
elif unicode(th.string)==u'住所':
rst['addr']=unicode(sD.find('p',rel='v:addr').get_text()) #rel=['v:addr']
try:
mapaddr=sD.find('div','rst-map').find('img')['src']
rst['map']=unicode(mapaddr[mapaddr.find('center')+7:mapaddr.find('&markers')]) #35.324535345,42.212121212
except:
pass
continue
elif unicode(th.string)==u'交通手段':
rst['traffic']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'営業時間':
rst['time']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'定休日':
rst['yasumi']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'平均予算':
rst['bgt']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'平均利用金額':
rst['avg']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'カード':
rst['card']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'サービス料・チャージ':
rst['charge']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'席数':
rst['seat']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'個室':
# print '-------------'
# print unicode(th.next_sibling.next_sibling.p.get_text())
# print '-------------'
# print unicode(''.join(th.next_sibling.next_sibling.p.stripped_strings))
# print '-------------'
rst['private']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'貸切':
rst['chartered']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'禁煙・喫煙':
rst['tabaco']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'駐車場':
rst['parking']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'駐車場':
rst['parking']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'空間・設備':
rst['space']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'飲み放題コース':
rst['houdai']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'コース':
rst['course']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'ドリンク':
rst['drink']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'料理':
rst['food']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.get_text())==u'こんな時にオススメ':
rst['food']=unicode(''.join(th.next_sibling.next_sibling.p.stripped_strings))
# print rst['name'],rst['food'] #灯とともに 友人・同僚と|デート|宴会
continue
elif unicode(th.string)==u'ロケーション':
rst['location']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'サービス':
rst['service']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'お子様同伴':
rst['child']=unicode(th.next_sibling.next_sibling.p.get_text())
continue
elif unicode(th.string)==u'ホームページ':
rst['homepage']=unicode(th.next_sibling.next_sibling.p.a.get_text())
# print rst['homepage'] #http://www.hotpepper.jp/strJ000999523/
continue
elif unicode(th.string)==u'オープン日':
rst['date']=unicode(th.next_sibling.next_sibling.get_text()) #no <p>
continue
elif unicode(th.string)==u'備考':
rst['notes']=unicode(''.join(th.next_sibling.next_sibling.stripped_strings)) #no <p>
print rst['notes']
continue
else:
# print unicode(th.string)
# #店名、携帯電話、初投稿者、...?
pass
#end for th
nearRstList=[] #近所のお店
indexes=[] #こだわりインデックス
nearSpots=[]#周辺の観光スポット
nearMarks=[] #近くにある施設
KWs=[] #'関連のキーワード'
rBs=currentPage.find_all('div',class_='relation-box')
for rb in rBs:
if unicode(rb.h4.string)[0:5]==u'近所のお店':
# print unicode(rb.h4.string) #近所のお店(銀座)
nrList=rb.find_all('li',class_='restbox')
for nr in nrList:
nearRst=[] #[ [name, url, distance, area & cat],[] ]
nearRst.append(unicode(nr.find('p','mname').a.string))
nearRst.append(unicode(nr.find('p','mname').a['href']))
nearRst.append(unicode(nr.find('p','mname').span.string))
nearRst.append(unicode(nr.find('p','area-catg').string))
nearRstList.append(nearRst)
continue
elif unicode(rb.h4.string)==u'こだわりインデックス':
nrList=rb.find_all('li',class_='restbox')
for nr in nrList:
index=[]#[ [courseName, url],[] ]
index.append(unicode(nr.find('p','mname').a.string))
index.append(unicode(nr.find('p','mname').a['href']))
indexes.append(index)
continue
elif unicode(rb.h4.string)==u'周辺の観光スポット':
nrList=rb.find_all('li',class_='restbox')
for nr in nrList:
spot=[]#[ [spotName, url, distance],[] ]
spot.append(unicode(nr.find('p','mname').a.string))
spot.append(unicode(nr.find('p','mname').a['href']))
spot.append(unicode(nr.find('p','mname').span.string))
nearSpots.append(spot)
continue
elif unicode(rb.h4.string)==u'近くにある施設':
nrList=rb.find_all('li',class_='restbox')
for nr in nrList:
mark=[]#[ [mark1, url],[] ]
mark.append(unicode(nr.find('p','mname').a.string))
mark.append(unicode(nr.find('p','mname').a['href']))
nearMarks.append(mark)
continue
elif unicode(rb.h4.string)==u'関連のキーワード':
nrList=rb.find_all('li') #no class_
for nr in nrList:
KWs.append(unicode(nr.a.string)) #[kw1,kw2,...]
continue
else:
pass
#このお店を訪れた人はこんなレストランも訪れています
#周辺のお店ランキング
#条件の似たお店を探す (銀座・新橋・有楽町):創作料理 × ¥2,000~¥2,999|創作料理 × 友人・同僚と
#関連リンク : 東京ランチランキング
#関連路線: 銀座線| 日比谷線
# print rb.h4.string #see above
rst['nearRstList']=nearRstList
rst['indexes']=indexes
rst['nearSpots']=nearSpots
rst['nearMarks']=nearMarks
rst['KWs']=KWs #print u','.join(rst['KWs']) # kw1,kw2,...
# print '--------------'
#current rvw page
else:
# url_shop='http://tabelog.com/tokyo/A1301/A130101/13150400/dtlrvwlst/' #口コミ 4件
# urlNextRvwPage='http://tabelog.com/tokyo/A1301/A130102/13000145/dtlrvwlst/COND-0/smp2/?lc=0&PG=%s&rvw_part=all' % str(pg)
urlNextRvwPage= url_shop+'COND-0/smp2/?lc=0&PG=%s&rvw_part=all' % str(pg) #todo: url to next review page
currentPage=BeautifulSoup(urllib.urlopen(urlNextRvwPage).read())
rvwboxesCurrentPage=currentPage.find_all('div','review-box')
for rvwbox in rvwboxesCurrentPage:
rvw={} # make each rvw a dict
# title('時代に流されない完成の極み') , url('http://....')
title=rvwbox.find('p','title').find('a')
rvw['url']=URLTabelog+str(title['href'])
rvw['title']=unicode(title.string)
#user('だいこんまん '), profile('30代後半・男性・愛知県')
rvwer=rvwbox.find('div','reviewer-name')
rvw['user']=unicode(rvwer.find('span').next_element) #<span class="lev1">だいこんまん <span class="count">(70)</span></span>
profile=rvwer.find('p','area')
try:
rvw['profile']=unicode(profile.string) #'div','area'
except AttributeError:
rvw['profile']=u''
#score: tScore, scores=[料理,サービス,雰囲気, CP, ドリンク]
_scores=rvwbox.find('ul','score-ex').find_all('strong') #[ 料理・味 4.0| サービス 2.5| 雰囲気 4.0| CP 4.0| 酒・ドリンク 3.0 ]
total_score=rvwbox.find('p','total').strong.string
time_=rvwbox.find('p','total').find('span','subject').string #<span class="subject">昼の点数:</span>
rvw['time']=unicode(time_[0]) #'昼','夜'
rvw['totalScore']=(0 if total_score == u'-' else float(total_score) )
rvw['scores']=[0 if sc.string == u'-' else float(sc.string) for sc in _scores]
#price
_price=rvwbox.find('p','price').find_all('strong')
prices=[unicode(x.string) for x in _price]
rvw['prices']=prices
#situation
cases=rvwbox.find('p','situation').find_all('img')
situation=[ 0 if c['src'][-6:-4]=='_g' else 1 for c in cases] #[friends,date,settai,party,family,alone]
rvw['situation']=situation
#comment :better use comment_clean ,since cPickle can't pickle HTMLParser objects??
comment=rvwbox.find('div','comment')
rvw['comment']=unicode(comment.p)
# # todo:prettify, clean comment
# comment=rvwbox.find('div','comment')
# comment_clean=u''
# BR=BeautifulSoup('<html><br/></html>').find('br')
# for c in comment.p.children: # def fn_reserve_br(p): isinstance(tag.next_element, NavigableString)
# if isinstance(c,NavigableString):
# comment_clean=comment_clean+unicode(c)
# elif c==BR and c.next_sibling!=BR:
# comment_clean=comment_clean+u' *br* '
# elif c==BR:
# pass
# elif c.name=='span':#span,a,... -> *br*, *wiki-bold*
# comment_clean=comment_clean+u' *'+unicode(c['class'][0])+u'* '+unicode(c.string)+u' *'+unicode(c['class'][0])+u'* '
# rvw['comment']=unicode(comment_clean)
#vote
vote=int(rvwbox.find('span','agree-vote').find('em').string)
rvw['vote']=vote
# append rvw to rvwList
rvwList.append(rvw)
#end of current rvw page, all rvw in current page is appended to rvwList
#end of rvw pages, all rvw in all pages is appended to rvwList
rst['rvwList']=rvwList
rstList.append(rst)
#end of url_shop
dataKu=open(nameAIUEO+u'.data','w')
cPickle.dump(rstList,dataKu)
dataKu.close()
#end of every tag(AIUEO)
# ------------ # ----------- # -----------
kuURL='http://tabelog.com/sitemap/tokyo/A1301-A130101/'
nameKu=u'東京_154_1_銀座'
getKu(kuURL,nameKu)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment