Created
April 24, 2013 06:11
-
-
Save maowug/5449996 to your computer and use it in GitHub Desktop.
tabelog
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
#encoding: utf-8
# Scrape every keyword listed on tabelog.com's kana-indexed keyword pages
# (ア, イ, ウ, ...) and pickle the result as {kana_tag: [keyword, ...]}.
import cPickle
import urllib
from bs4 import BeautifulSoup,NavigableString
import re
import math

URLTabelog='http://tabelog.com'
# Landing page of the first kana group (ア); also reused as a fallback below.
URLPage_a='http://tabelog.com/keywords/?kana_init=%E3%82%A2'
# One <li> per kana initial in the site's keyword index.
tagLis=BeautifulSoup(urllib.urlopen(URLPage_a)).find('div','keyword-index').find_all('li')
# print tagLis
NKWsPerPage=100  # keywords shown per result page (site pagination size)
kwDict={}  # kana tag (u'ア', ...) -> list of keywords scraped for that tag
for li in tagLis:
    kwList=[]
    tagName=unicode(li.get_text()) # a,i,u,...
    urlCurrentPage=u''  # e.g. http://tabelog.com/keywords?kana_init=ア
    currentPage=u''  # BeautifulSoup tree of urlCurrentPage
    try:
        # Normal case: the <li> carries a link to its keyword page.
        urlCurrentPage=URLTabelog+li.a.get('href')
        urlTagPage=BeautifulSoup(urllib.urlopen(urlCurrentPage))
        currentPage=urlTagPage
    except:
        # Fallback: the first tag (ア) is the currently selected page and has
        # no <a>; any other tag without a link is assumed to have no keywords.
        # print tagName.encode('gb2312')
        if tagLis.index(li)==0:
            urlCurrentPage=URLPage_a
            currentPage=BeautifulSoup(urllib.urlopen(URLPage_a))
        else:
            # some other tags have no keywords; log position and skip
            print tagLis.index(li),len(tagLis)
            continue
    # Total keyword count, read from the last <span class="num"> of the pager.
    nKWs=float(currentPage.find('p','page-count').find_all('span','num')[-1].string)
    nPage=math.ceil(nKWs/NKWsPerPage)
    for p in xrange(1,int(nPage)+1):
        if p==1:
            currentKWPage=currentPage  # page 1 was already fetched above
        else:
            urlKWPage=urlCurrentPage+'&page='+str(p)
            currentKWPage=BeautifulSoup(urllib.urlopen(urlKWPage))
        kwLis=currentKWPage.find('div','keyword-list').find_all('li')
        for kwli in kwLis:
            kwList.append(unicode(kwli.get_text()))
    #eof for-p in kws pages
    # print len(kwList),','.join(kwList) 1202 [飯舘牛,イイダコ,飯蛸旨煮,...]
    kwDict[tagName]=kwList
    print tagLis.index(li),len(tagLis),tagName,len(kwList)
#dump the whole dict to disk with cPickle
tabelogKWs=open(u'tabelogキーワードs'+u'.data','wb')
cPickle.dump(kwDict,tabelogKWs)
tabelogKWs.close()
#usage:
# kwDict=cPickle.load(open("tabelogキーワードs.data","rb"))
# for tag in kwDict.keys():pass
# # # load test
# for tag in kwDict.keys():
#     print ','.join(kwDict[tag][0:3]),len(kwDict[tag][0])
#
#
# #re-dump
# tabelogKWs=open(u'tabelogキーワードs_subset'+u'.data','wb')
# cPickle.dump(kwDict,tabelogKWs)
# tabelogKWs.close()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
#encoding: utf-8
# Load the scraped tabelog keywords and, for every keyword kw, find which
# other keywords contain it as a prefix / infix / suffix. The relation table
# is pickled at the end.
import cPickle
import re
import copy

kwDict=cPickle.load(open(u"tabelogキーワードs.data",u"rb"))
# (scratch code kept from a quick check of dict-mutation-while-iterating)
# kwDict={'a':1,'b':2,'c':5}
# for tag in kwDict.keys():
#     if tag=='a':
#         del kwDict[tag]
#     elif tag=='b':
#         print kwDict['a']
dictKWs={}
dictKWs['kwHasNoRelations']=[]
# make a kws dict: keyword -> [has_kw, front_in, middle_in, end_in]
for tag in kwDict:
    for kw in kwDict[tag]:#kwDict[tag] is a list of some tag(a,i,u,...)
        dictKWs[kw]=[[],[],[],[]]
# Deep-copied snapshot so the key set stays stable while dictKWs is mutated.
dictKWs2=copy.deepcopy(dictKWs)
count=1
for kw in dictKWs2.keys()[0:10000]:  # cap the outer loop at 10000 keys
    count+=1
    #for every kw in dict, compute its relations
    rkw=dictKWs[kw] #relation lists of kw
    # Single-kana あいうえお would relate to nearly everything; skip them.
    if len(kw)==1 and kw in u'あいうえお':
        print kw,rkw,'continue'
        continue
    for kw2 in dictKWs2:
        rkw2=dictKWs[kw2]
        try:
            # NOTE(review): kw is used as a raw regex pattern below, so
            # keywords containing regex metacharacters can raise or match
            # oddly — the bare except just logs those pairs and moves on.
            if kw==kw2:#same keyword: nothing to record
                pass
            elif re.match(kw,kw2)!=None: #kw2 begins with kw
                rkw2[0].append(kw)
                rkw[1].append(kw2)
            elif re.search(u'^\S+'+kw+u'\S+$',kw2): #kw sits in the middle of kw2
                rkw2[0].append(kw)
                rkw[2].append(kw2)
            elif re.search(u'^\S+'+kw+u'$',kw2): #kw2 ends with kw
                rkw2[0].append(kw)
                rkw[3].append(kw2)
            else:
                # NOTE(review): this appends kw once per UNRELATED kw2, not
                # once per kw, so 'kwHasNoRelations' accumulates duplicates —
                # confirm whether that is intended before relying on it.
                dictKWs['kwHasNoRelations'].append(kw)
        except:
            print kw,kw2,'passed'
            pass
    if count%500==100:  # coarse progress ping every 500 outer iterations
        print count,'=================='
    if len(rkw[3])>15:  # flag keywords that end many other keywords
        print '-------------------',kw
# # [has_kw, front_in, middle_in, end_in]
# for kw in dictKWs:
#re-dump the relation table
tabelogKWs2=open(u'tabelogキーワードs_Hypo1'+u'.data','wb')
cPickle.dump(dictKWs,tabelogKWs2)
tabelogKWs2.close()
# bug:
#     rkw[0][0]+=1
#     TypeError: coercing to Unicode: need string or buffer, int found
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
#encoding: utf-8
# Inspection script: load a pickled restaurant list for one ward and print
# the review comments (raw HTML, then extracted text) of the first three
# restaurants.
import cPickle
from bs4 import BeautifulSoup

rstList=cPickle.load(open("東京_154_100_東中野.data","rb"))
for rst in rstList[:3]:
    # Keys present on each rst dict:
    #['tabaco', 'reserved', 'nearSpots', 'private', 'seat', 'course', 'KWs',
    # 'chartered', 'parking', 'houdai', 'tel', 'avg', 'nearMarks', 'addr',
    # 'service', 'space', 'nearRstList', 'indexes', 'charge', 'location',
    # 'homepage', 'map', 'food', 'drink', 'traffic', 'child', 'date',
    # 'rvwList', 'yasumi', 'card', 'name', 'bgt', 'time', 'genre']
    # for k in rst.keys():
    #     print unicode(k)+u':'+unicode(rst[k])
    # for rvw in rst['rvwList']:
    #     print rvw['comment']
    #     print '__________________'
    try:
        for rvw in rst['rvwList']:
            print rvw['comment']
            # Re-parse the stored comment HTML to get its plain text.
            comment=BeautifulSoup(rvw['comment'].encode('utf-8'))
            print comment.get_text()
            print '__________________'
    except:
        # best-effort: a restaurant without a usable rvwList is just marked
        print '+++'
    print '====================='
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
#encoding: utf-8
# Driver: iterate every ward (区) listed on one prefecture's sitemap page and
# scrape each with tabelogRobotKu.getKu.
import cPickle
import urllib
from bs4 import BeautifulSoup,NavigableString
from tabelogRobotKu import getKu
import sys
# BeautifulSoup trees are deeply recursive; raise the limit so pickling
# inside getKu does not hit the default recursion cap.
sys.setrecursionlimit(1000000)

urlKen='http://tabelog.com/sitemap/aomori/' # restaurant sitemap of the prefecture (Aomori)
kuList=BeautifulSoup(urllib.urlopen(urlKen)).find(id='arealst_sitemap').select('li')
for ku in kuList:
    # if int(kuList.index(ku))%2==0:
    #     continue
    # #index=3,5,7,...+1
    kuUrl=str(ku.a['href'])
    # Name pattern: <prefecture>_<total wards>_<1-based index>_<ward name>
    kuName=u'青森'+u'_'+unicode(len(kuList))+u'_'+unicode(kuList.index(ku)+1)+u'_'+unicode(ku.a.string)
    getKu(kuUrl,kuName)
#urlKen='http://tabelog.com/sitemap/tokyo/' # restaurant sitemap of Tokyo
#kuList=BeautifulSoup(urllib.urlopen(urlKen)).find(id='arealst_sitemap').select('li')
#KuNames=[u'東京'+u'_'+unicode(len(kuList))+u'_'+unicode(kuList.index(ku)+1)+u'_'+unicode(ku.a.string) for ku in kuList]
#for name in KuNames[0:3]:
#    print name
# ------------ # ----------- # -----------
# #save
#top250={'keywords':keywordsTop,'genre':genreTop}
#f=open('top250.data','w')
#cPickle.dump(top250,f)
#f.close()
# #load
#top250 = cPickle.load(open("top250.data","rb"))
# ------------ # ----------- # -----------
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| #encoding: utf-8 | |
| import cPickle | |
| import urllib | |
| from bs4 import BeautifulSoup,NavigableString | |
| import re | |
| import sys | |
| import math | |
| sys.setrecursionlimit(1000000) | |
def getKu(url,name):#todo: for every ku
    """Scrape every restaurant of one ward (区) on tabelog.com and pickle it.

    url  -- the ward's shop-list URL (from the prefecture sitemap).
    name -- unicode label for the ward; the output pickle is <name>.data.

    Writes a list of rst dicts (one per shop: basic info table, relation
    boxes, and the full review list) via cPickle.
    """
    #macro / constants
    URLTabelog='http://tabelog.com'
    NRvwPerPage=20   # reviews per review-list page
    NPerPage=200     # shops per shop-list page; class='pagenation'
    #initalization
    urlKu=url
    nameKu=unicode(name)
    ku=BeautifulSoup(urllib.urlopen(urlKu).read())
    #aiueo tag: one link per kana initial on the ward page
    taglist=ku.find('div','taglist').find_all('a')
    rstList=[] #accumulates all shops of the whole ku; stored as one data file
    #for tag in taglist[0:1]:
    for tag in taglist: #todo: for every tag (AIUEO)
        urlShopList=tag['href']
        # Replace parentheses and spaces (half- and full-width) with '_'
        # so the tag name is safe to use inside a file name.
        trans_table = dict([[ord(char), u"_"] for char in u"()() "])
        nameAIUEO=nameKu+u'_'+unicode(len(taglist))+u'_'+unicode(taglist.index(tag)+1)+u'_'+unicode(tag.string).translate(trans_table)
        print nameAIUEO.encode('utf_8') #東京_154_1_銀座_50_1_ア_93_ #todo:print
        shopListPage=BeautifulSoup(urllib.urlopen(urlShopList).read())
        try:
            # bug - some tags have no shops at all; skip them.
            nShop=float(shopListPage.find('div','result_num').strong.string) #todo: try: float()
        except:
            continue
        shopUrlList=[]
        nPage=math.ceil(float(nShop)/NPerPage) #bug - was "nPage=nShop/NPerPage+1", wrong when nShop==200.
        #begin iterating shop-list pages 1,2,3, ...
        for pageCount in xrange(1,int(nPage)+1):
            if pageCount==1:
                shopListCurrentPage=shopListPage  # already fetched above
            else:
                urlNextPage=urlShopList+'?PG='+str(pageCount)
                print 'Appending urlShopList in:'+urlNextPage #todo:print url to next shopList page
                shopListCurrentPage=BeautifulSoup(urllib.urlopen(urlNextPage).read())
            shopListDIV=shopListCurrentPage.find_all('div','rstname')#list of shops
            for shop in shopListDIV: #add all shop urls to shopUrlList
                urlShop=URLTabelog+shop.find('a') ['href']
                shopUrlList.append(urlShop)
        #end iterating pages
        #begin iterating shops/restaurants
        for url_shop in shopUrlList:#todo: for each url_shop in tag(あ、い、)
            # url_shop='http://tabelog.com/tokyo/A1301/A130102/13000145/dtlrvwlst/' # 214 reviews
            # url_shop='http://tabelog.com/tokyo/A1301/A130101/13150400/dtlrvwlst/' # 4 reviews
            print unicode(shopUrlList.index(url_shop)+1)+u'/'+unicode(len(shopUrlList))+u': '+unicode(url_shop) #todo:print progress
            url_shop+=r'dtlrvwlst/'  # go straight to the shop's review-list page
            rst={}  # all scraped fields of this one restaurant
            shopPage=BeautifulSoup(urllib.urlopen(url_shop).read())
            # nReview=shopPage.find('rvw','page-count').find('span','num').string
            nRvw=float(shopPage.find('em',{'property':'v:count'}).string)
            nRvwPage=math.ceil(nRvw/NRvwPerPage) # 1,2,3,... # bug - was off when nRvw==20.
            nRvwPage=(1 if nRvwPage==0 else nRvwPage) #bug: if nRvwPage==0, nothing in xrange(1,1)
            rvwList=[]
            for pg in xrange(1,int(nRvwPage)+1):
            # for pg in xrange(1,2):#todo: for every rvw page
                if pg==1:# page 1 doubles as the source of the shop's basic info
                    currentPage=shopPage
                    sD=currentPage.find('div',id='rstdata-wrap')#shop data table
                    #name
                    name=unicode(sD.find('p','mname').get_text())
                    rst['name']=name
                    rst['url']=url_shop[18:]#path only, e.g. '/tokyo/A1301/A130102/13000145/dtlrvwlst/'
                    #other basic info: map the Japanese <th> labels onto rst keys
                    thList=sD.find_all('th')
                    for th in thList:
                        #genre
                        if unicode(th.string)==u'ジャンル':
                            rst['genre']=unicode(''.join(th.next_sibling.next_sibling.p.stripped_strings))#創作料理、イタリアン、居酒屋・ダイニングバー(その他)
                            continue
                        #TEL / reservation
                        elif unicode(th.string)==u'TEL・予約':
                            rst['tel']=unicode(sD.find('p',class_=['tel-main', 'ppc-main']).get_text())
                            #http://tabelog.com/tokyo/A1301/A130101/13149591/dtlrvwlst/ : 050-5819-3632 (予約専用番号)
                            try:
                                rst['reserved']=unicode(sD.find('span','reserve-status').string)
                            except:
                                rst['reserved']=u'不可'  # default: reservations not accepted
                            continue
                        elif unicode(th.string)==u'住所':
                            rst['addr']=unicode(sD.find('p',rel='v:addr').get_text()) #rel=['v:addr']
                            try:
                                # bug - some shops have only an addr, no map: http://tabelog.com/tokyo/A1301/A130101/13030881/
                                mapaddr=sD.find('div','rst-map').find('img')['src']
                                rst['map']=unicode(mapaddr[mapaddr.find('center')+7:mapaddr.find('&markers')]) #lat,lng e.g. 35.324535345,42.212121212
                            except:
                                pass
                            continue
                        elif unicode(th.string)==u'交通手段':  # access / transport
                            rst['traffic']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'営業時間':  # opening hours
                            rst['time']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'定休日':  # regular holidays
                            rst['yasumi']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'平均予算':  # average budget
                            rst['bgt']=unicode(''.join(th.next_sibling.next_sibling.p.stripped_strings))
                            continue
                        elif unicode(th.string)==u'平均利用金額':  # average spend
                            rst['avg']=unicode(''.join(th.next_sibling.next_sibling.p.stripped_strings))
                            continue
                        elif unicode(th.string)==u'カード':  # credit cards
                            rst['card']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'サービス料・チャージ':  # service charge
                            rst['charge']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'席数':  # seat count
                            rst['seat']=unicode(''.join(th.next_sibling.next_sibling.p.stripped_strings))
                            continue
                        elif unicode(th.string)==u'個室':  # private rooms
                            rst['private']=unicode(''.join(th.next_sibling.next_sibling.p.stripped_strings))
                            continue
                        elif unicode(th.string)==u'貸切':  # chartered use
                            rst['chartered']=unicode(''.join(th.next_sibling.next_sibling.p.stripped_strings))
                            continue
                        elif unicode(th.string)==u'禁煙・喫煙':  # smoking policy
                            rst['tabaco']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'駐車場':  # parking
                            rst['parking']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'空間・設備':  # space / facilities
                            rst['space']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'飲み放題コース':  # all-you-can-drink courses
                            rst['houdai']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'コース':  # courses
                            rst['course']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'ドリンク':  # drinks
                            rst['drink']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'料理':  # food
                            rst['food']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        # 'recommended occasions' label spans child tags, so get_text() not .string
                        elif unicode(th.get_text())==u'こんな時にオススメ':
                            rst['cases']=unicode(''.join(th.next_sibling.next_sibling.p.stripped_strings))
                            # print rst['name'],rst['food'] # e.g. 灯とともに 友人・同僚と|デート|宴会
                            continue
                        elif unicode(th.string)==u'ロケーション':#u'隠れ家' -- location
                            rst['location']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'サービス':  # service
                            rst['service']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'お子様同伴':  # children allowed
                            rst['child']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'ホームページ':  # homepage
                            rst['homepage']=unicode(th.next_sibling.next_sibling.p.a.get_text())
                            # print rst['homepage'] #http://www.hotpepper.jp/strJ000999523/
                            continue
                        elif unicode(th.string)==u'オープン日':  # opening date
                            rst['date']=unicode(''.join(th.next_sibling.next_sibling.stripped_strings)) #no <p> wrapper here
                            continue
                        elif unicode(th.string)==u'備考':  # remarks
                            rst['notes']=unicode(''.join(th.next_sibling.next_sibling.stripped_strings)) #no <p> wrapper here
                            continue
                        # remarks can be e.g. 紹介制 via: http://tabelog.com/tokyo/A1301/A130101/13030881/dtlrvwlst/
                        else:
                            # print unicode(th.string)
                            # # unhandled labels: shop name, mobile phone, first reviewer, ...?
                            pass
                    #end for th
                    nearRstList=[] # nearby shops (近所のお店)
                    indexes=[] # feature indexes (こだわりインデックス)
                    nearSpots=[] # nearby sightseeing spots (周辺の観光スポット)
                    nearMarks=[] # nearby facilities (近くにある施設)
                    KWs=[] # related keywords ('関連のキーワード')
                    rBs=currentPage.find_all('div',class_='relation-box')
                    for rb in rBs:
                        # first 5 chars only: the heading continues with the area, e.g. 近所のお店(銀座)
                        if unicode(rb.h4.string)[0:5]==u'近所のお店':
                            # print unicode(rb.h4.string)
                            nrList=rb.find_all('li',class_='restbox')
                            for nr in nrList:
                                nearRst=[] #[ [name, url, distance, area & cat],[] ]
                                nearRst.append(unicode(nr.find('p','mname').a.string))
                                nearRst.append(unicode(nr.find('p','mname').a['href']))
                                nearRst.append(unicode(nr.find('p','mname').span.string))
                                nearRst.append(unicode(nr.find('p','area-catg').string))
                                nearRstList.append(nearRst)
                            continue
                        elif unicode(rb.h4.string)==u'こだわりインデックス':
                            nrList=rb.find_all('li',class_='restbox')
                            for nr in nrList:
                                index=[]#[ [courseName, url],[] ]
                                index.append(unicode(nr.find('p','mname').a.string))
                                index.append(unicode(nr.find('p','mname').a['href']))
                                indexes.append(index)
                            continue
                        elif unicode(rb.h4.string)==u'周辺の観光スポット':
                            nrList=rb.find_all('li',class_='restbox')
                            for nr in nrList:
                                spot=[]#[ [spotName, url, distance],[] ]
                                spot.append(unicode(nr.find('p','mname').a.string))
                                spot.append(unicode(nr.find('p','mname').a['href']))
                                spot.append(unicode(nr.find('p','mname').span.string))
                                nearSpots.append(spot)
                            continue
                        elif unicode(rb.h4.string)==u'近くにある施設':
                            nrList=rb.find_all('li',class_='restbox')
                            for nr in nrList:
                                mark=[]#[ [mark1, url],[] ]
                                mark.append(unicode(nr.find('p','mname').a.string))
                                mark.append(unicode(nr.find('p','mname').a['href']))
                                nearMarks.append(mark)
                            continue
                        elif unicode(rb.h4.string)==u'関連のキーワード':
                            nrList=rb.find_all('li') #no class_ on these <li>
                            for nr in nrList:
                                KWs.append(unicode(nr.a.string)) #[kw1,kw2,...]
                            continue
                        else:
                            pass
                            # other relation boxes deliberately ignored, e.g.:
                            #このお店を訪れた人はこんなレストランも訪れています
                            #周辺のお店ランキング
                            #条件の似たお店を探す (銀座・新橋・有楽町):創作料理 × ¥2,000~¥2,999|創作料理 × 友人・同僚と
                            #関連リンク : 東京ランチランキング
                            #関連路線: 銀座線| 日比谷線
                            # print rb.h4.string #see above
                    rst['nearRstList']=nearRstList
                    rst['indexes']=indexes
                    rst['nearSpots']=nearSpots
                    rst['nearMarks']=nearMarks
                    rst['KWs']=KWs #print u','.join(rst['KWs']) # kw1,kw2,...
                    # print '--------------'
                #subsequent review pages: fetch page pg explicitly
                #menu page: not scraped yet (see todo at file end)
                else:
                    # url_shop='http://tabelog.com/tokyo/A1301/A130101/13150400/dtlrvwlst/' # 4 reviews
                    # urlNextRvwPage='http://tabelog.com/tokyo/A1301/A130102/13000145/dtlrvwlst/COND-0/smp2/?lc=0&PG=%s&rvw_part=all' % str(pg)
                    urlNextRvwPage= url_shop+'COND-0/smp2/?lc=0&PG=%s&rvw_part=all' % str(pg)
                    print urlNextRvwPage #todo:print url to next review page
                    currentPage=BeautifulSoup(urllib.urlopen(urlNextRvwPage).read())
                # scrape every review box on the current page (page 1 included)
                rvwboxesCurrentPage=currentPage.find_all('div','review-box')
                for rvwbox in rvwboxesCurrentPage:
                    rvw={} # make each rvw a dict
                    # title('時代に流されない完成の極み') , url('http://....')
                    title=rvwbox.find('p','title').find('a')
                    rvw['url']=URLTabelog+str(title['href'])
                    rvw['title']=unicode(title.string)
                    #user('だいこんまん '), profile('30代後半・男性・愛知県')
                    rvwer=rvwbox.find('div','reviewer-name')
                    rvw['user']=unicode(rvwer.find('span').next_element) #<span class="lev1">だいこんまん <span class="count">(70)</span></span>
                    profile=rvwer.find('p','area')
                    try:
                        rvw['profile']=unicode(profile.string) #'div','area'
                    except AttributeError:
                        rvw['profile']=u''  # reviewer without a profile line
                    #score: tScore, scores=[料理,サービス,雰囲気, CP, ドリンク]
                    try:
                        # bug - some reviews have no score: http://tabelog.com/tokyo/A1303/A130301/13059175/dtlrvwlst/2358064/?use_type=0&smp=2&PG=5&lc=0&sby=&srt=
                        # bug - some reviews carry two scores (day + night).
                        _scores=rvwbox.find('ul','score-ex').find_all('strong') #[ 料理・味 4.0| サービス 2.5| 雰囲気 4.0| CP 4.0| 酒・ドリンク 3.0 ]
                        total_score=rvwbox.find('p','total').strong.string
                        time_=rvwbox.find('p','total').find('span','subject').string #<span class="subject">昼の点数:</span>
                        rvw['time']=unicode(time_[0]) #'昼' (day) / '夜' (night)
                        rvw['totalScore']=(0 if total_score == u'-' else float(total_score) )
                        rvw['scores']=[0 if sc.string == u'-' else float(sc.string) for sc in _scores]
                    except :
                        # fall back to neutral defaults when the score block is absent/odd
                        rvw['time']=u'夜'
                        rvw['totalScore']=0
                        rvw['scores']=[0,0,0,0,0]
                        pass
                    #price strings as displayed (one per strong tag)
                    _price=rvwbox.find('p','price').find_all('strong')
                    prices=[unicode(x.string) for x in _price]
                    rvw['prices']=prices
                    #situation icons: grey suffix '_g' in the img src means inactive
                    cases=rvwbox.find('p','situation').find_all('img')
                    situation=[ 0 if c['src'][-6:-4]=='_g' else 1 for c in cases] #[friends,date,settai,party,family,alone]
                    rvw['situation']=situation
                    #comment: stored as raw HTML markup of the <p>
                    #(better would be comment_clean, since cPickle can't pickle HTMLParser objects??)
                    comment=rvwbox.find('div','comment')
                    rvw['comment']=unicode(comment.p)
                    # # todo:prettify, clean comment
                    # comment=rvwbox.find('div','comment')
                    # comment_clean=u''
                    # BR=BeautifulSoup('<html><br/></html>').find('br')
                    # for c in comment.p.children: # def fn_reserve_br(p): isinstance(tag.next_element, NavigableString)
                    #     if isinstance(c,NavigableString):
                    #         comment_clean=comment_clean+unicode(c)
                    #     elif c==BR and c.next_sibling!=BR:
                    #         comment_clean=comment_clean+u' *br* '
                    #     elif c==BR:
                    #         pass
                    #     elif c.name=='span':#span,a,... -> *br*, *wiki-bold*
                    #         comment_clean=comment_clean+u' *'+unicode(c['class'][0])+u'* '+unicode(c.string)+u' *'+unicode(c['class'][0])+u'* '
                    # rvw['comment']=unicode(comment_clean)
                    #vote: "agree" count from other users
                    vote=int(rvwbox.find('span','agree-vote').find('em').string)
                    rvw['vote']=vote
                    # append rvw to rvwList
                    rvwList.append(rvw)
                #end of current rvw page, all rvw in current page appended to rvwList
            #end of rvw pages, all rvw in all pages appended to rvwList
            rst['rvwList']=rvwList
            rstList.append(rst)
        #eof url_shop
    #eof every tag(AIUEO)
    dataKu=open(nameKu.encode('utf_8')+u'.data'.encode('utf_8'),'wb')# binary mode for portability: http://stackoverflow.com/questions/283766/pickled-file-wont-load-on-mac-linux
    cPickle.dump(rstList,dataKu)
    cPickle.dump(rstList,dataKu) if False else None  # NOTE(review): line kept inert; see below
    dataKu.close()
#eof ku
# ------------ # ----------- # -----------
# WARNING:root:Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
# todo: get menu page.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| #encoding: utf-8 | |
| import cPickle | |
| import urllib | |
| from bs4 import BeautifulSoup,NavigableString | |
| import re | |
| import sys | |
| sys.setrecursionlimit(1000000) | |
def getKu(url,name):#todo: for every ku
    """Scrape restaurants of one ward (earlier draft of the scraper).

    url  -- ward shop-list URL; name -- unicode label for output files.
    Writes one pickle per kana tag as <nameAIUEO>.data.

    NOTE(review): this is an older revision of the getKu in the sibling file:
    it only walks taglist[0:1] and shopUrlList[0:5], hard-codes url_shop to a
    debug URL, uses integer page arithmetic, and pickles in text mode.
    """
    #macro / constants
    URLTabelog='http://tabelog.com'
    NRvwPerPage=20   # reviews per review-list page
    NPerPage=200     # shops per shop-list page; class='pagenation'
    #initalization
    urlKu=url
    nameKu=name
    ku=BeautifulSoup(urllib.urlopen(urlKu).read())
    #aiueo tag: one link per kana initial on the ward page
    taglist=ku.find('div','taglist').find_all('a')
    for tag in taglist[0:1]:  # debug: first tag only
    # for tag in taglist: #todo: for every tag (AIUEO)
        urlShopList=tag['href']
        # Replace parentheses/spaces (half- and full-width) with '_' for file names.
        trans_table = dict([[ord(char), u"_"] for char in u"()() "])
        nameAIUEO=nameKu+u'_'+unicode(len(taglist))+u'_'+unicode(taglist.index(tag)+1)+u'_'+unicode(tag.string).translate(trans_table)
        print nameAIUEO #東京_154_1_銀座_50_1_ア_93_ #todo:print
        shopListPage=BeautifulSoup(urllib.urlopen(urlShopList).read())
        nShop=int(shopListPage.find('div','result_num').strong.string) #todo: try: int()
        shopListDIV=shopListPage.find_all('div','rstname')#list of shops
        shopUrlList=[]
        nPage=nShop/NPerPage+1 # NOTE(review): off by one when nShop is a multiple of NPerPage
        #begin iterating shop-list pages 1,2,3, ...
        for pageCount in xrange(1,nPage+1):
            if pageCount==1:
                shopListCurrentPage=shopListPage  # already fetched above
            else:
                urlNextPage=urlShopList+'?PG='+str(pageCount)
                shopListCurrentPage=BeautifulSoup(urllib.urlopen(urlNextPage).read())
            shopListDIV=shopListCurrentPage.find_all('div','rstname')#list of shops
            for shop in shopListDIV: #add all shop urls to shopUrlList
                urlShop=URLTabelog+shop.find('a') ['href']
                shopUrlList.append(urlShop)
        #end iterating pages
        #begin iterating shops/restaurants
        rstList=[]
        for url_shop in shopUrlList[0:5]:#todo: for each url_shop in tag(あ、い、)
            # url_shop='http://tabelog.com/tokyo/A1301/A130102/13000145/dtlrvwlst/' # 214 reviews
            # url_shop='http://tabelog.com/tokyo/A1301/A130101/13150400/dtlrvwlst/' # 4 reviews
            url_shop='http://tabelog.com/tokyo/A1301/A130101/13030881/dtlrvwlst/'  # NOTE(review): debug override — every iteration scrapes this same shop
            print url_shop #todo:print
            rst={}  # all scraped fields of this one restaurant
            shopPage=BeautifulSoup(urllib.urlopen(url_shop).read())
            # nReview=shopPage.find('rvw','page-count').find('span','num').string
            nRvw=int(shopPage.find('em',{'property':'v:count'}).string)
            nRvwPage=nRvw/NRvwPerPage+1 # 1,2,3,... NOTE(review): off by one when nRvw is a multiple of 20
            rvwList=[]
            for pg in xrange(1,nRvwPage+1):
            # for pg in xrange(1,2):#todo: for every rvw page
                if pg==1:# page 1 doubles as the source of the shop's basic info
                    currentPage=shopPage
                    sD=currentPage.find('div',id='rstdata-wrap')#shop data table
                    #name
                    name=unicode(sD.find('p','mname').get_text())
                    rst['name']=name
                    #other basic info: map the Japanese <th> labels onto rst keys
                    thList=sD.find_all('th')
                    for th in thList:
                        #genre
                        if unicode(th.string)==u'ジャンル':
                            rst['genre']=unicode(th.next_sibling.next_sibling.get_text())#創作料理、イタリアン、居酒屋・ダイニングバー(その他)
                            continue
                        #TEL / reservation
                        elif unicode(th.string)==u'TEL・予約':
                            rst['tel']=unicode(sD.find('p',class_=['tel-main', 'ppc-main']).get_text())
                            #http://tabelog.com/tokyo/A1301/A130101/13149591/dtlrvwlst/ : 050-5819-3632 (予約専用番号)
                            try:
                                rst['reserved']=unicode(sD.find('span','reserve-status').string)
                            except:
                                rst['reserved']=u'不可'  # default: reservations not accepted
                            continue
                        elif unicode(th.string)==u'住所':
                            rst['addr']=unicode(sD.find('p',rel='v:addr').get_text()) #rel=['v:addr']
                            try:
                                mapaddr=sD.find('div','rst-map').find('img')['src']
                                rst['map']=unicode(mapaddr[mapaddr.find('center')+7:mapaddr.find('&markers')]) #lat,lng e.g. 35.324535345,42.212121212
                            except:
                                pass
                            continue
                        elif unicode(th.string)==u'交通手段':  # access / transport
                            rst['traffic']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'営業時間':  # opening hours
                            rst['time']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'定休日':  # regular holidays
                            rst['yasumi']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'平均予算':  # average budget
                            rst['bgt']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'平均利用金額':  # average spend
                            rst['avg']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'カード':  # credit cards
                            rst['card']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'サービス料・チャージ':  # service charge
                            rst['charge']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'席数':  # seat count
                            rst['seat']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'個室':  # private rooms
                            # print '-------------'
                            # print unicode(th.next_sibling.next_sibling.p.get_text())
                            # print '-------------'
                            # print unicode(''.join(th.next_sibling.next_sibling.p.stripped_strings))
                            # print '-------------'
                            rst['private']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'貸切':  # chartered use
                            rst['chartered']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'禁煙・喫煙':  # smoking policy
                            rst['tabaco']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'駐車場':  # parking
                            rst['parking']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        # NOTE(review): duplicate 駐車場 branch below is unreachable
                        elif unicode(th.string)==u'駐車場':
                            rst['parking']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'空間・設備':  # space / facilities
                            rst['space']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'飲み放題コース':  # all-you-can-drink courses
                            rst['houdai']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'コース':  # courses
                            rst['course']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'ドリンク':  # drinks
                            rst['drink']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'料理':  # food
                            rst['food']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        # NOTE(review): this branch OVERWRITES rst['food'] with the
                        # 'recommended occasions' text (the newer revision stores it
                        # under 'cases' instead)
                        elif unicode(th.get_text())==u'こんな時にオススメ':
                            rst['food']=unicode(''.join(th.next_sibling.next_sibling.p.stripped_strings))
                            # print rst['name'],rst['food'] # e.g. 灯とともに 友人・同僚と|デート|宴会
                            continue
                        elif unicode(th.string)==u'ロケーション':  # location
                            rst['location']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'サービス':  # service
                            rst['service']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'お子様同伴':  # children allowed
                            rst['child']=unicode(th.next_sibling.next_sibling.p.get_text())
                            continue
                        elif unicode(th.string)==u'ホームページ':  # homepage
                            rst['homepage']=unicode(th.next_sibling.next_sibling.p.a.get_text())
                            # print rst['homepage'] #http://www.hotpepper.jp/strJ000999523/
                            continue
                        elif unicode(th.string)==u'オープン日':  # opening date
                            rst['date']=unicode(th.next_sibling.next_sibling.get_text()) #no <p> wrapper here
                            continue
                        elif unicode(th.string)==u'備考':  # remarks
                            rst['notes']=unicode(''.join(th.next_sibling.next_sibling.stripped_strings)) #no <p> wrapper here
                            print rst['notes']
                            continue
                        else:
                            # print unicode(th.string)
                            # # unhandled labels: shop name, mobile phone, first reviewer, ...?
                            pass
                    #end for th
                    nearRstList=[] # nearby shops (近所のお店)
                    indexes=[] # feature indexes (こだわりインデックス)
                    nearSpots=[] # nearby sightseeing spots (周辺の観光スポット)
                    nearMarks=[] # nearby facilities (近くにある施設)
                    KWs=[] # related keywords ('関連のキーワード')
                    rBs=currentPage.find_all('div',class_='relation-box')
                    for rb in rBs:
                        # first 5 chars only: heading continues with area, e.g. 近所のお店(銀座)
                        if unicode(rb.h4.string)[0:5]==u'近所のお店':
                            # print unicode(rb.h4.string)
                            nrList=rb.find_all('li',class_='restbox')
                            for nr in nrList:
                                nearRst=[] #[ [name, url, distance, area & cat],[] ]
                                nearRst.append(unicode(nr.find('p','mname').a.string))
                                nearRst.append(unicode(nr.find('p','mname').a['href']))
                                nearRst.append(unicode(nr.find('p','mname').span.string))
                                nearRst.append(unicode(nr.find('p','area-catg').string))
                                nearRstList.append(nearRst)
                            continue
                        elif unicode(rb.h4.string)==u'こだわりインデックス':
                            nrList=rb.find_all('li',class_='restbox')
                            for nr in nrList:
                                index=[]#[ [courseName, url],[] ]
                                index.append(unicode(nr.find('p','mname').a.string))
                                index.append(unicode(nr.find('p','mname').a['href']))
                                indexes.append(index)
                            continue
                        elif unicode(rb.h4.string)==u'周辺の観光スポット':
                            nrList=rb.find_all('li',class_='restbox')
                            for nr in nrList:
                                spot=[]#[ [spotName, url, distance],[] ]
                                spot.append(unicode(nr.find('p','mname').a.string))
                                spot.append(unicode(nr.find('p','mname').a['href']))
                                spot.append(unicode(nr.find('p','mname').span.string))
                                nearSpots.append(spot)
                            continue
                        elif unicode(rb.h4.string)==u'近くにある施設':
                            nrList=rb.find_all('li',class_='restbox')
                            for nr in nrList:
                                mark=[]#[ [mark1, url],[] ]
                                mark.append(unicode(nr.find('p','mname').a.string))
                                mark.append(unicode(nr.find('p','mname').a['href']))
                                nearMarks.append(mark)
                            continue
                        elif unicode(rb.h4.string)==u'関連のキーワード':
                            nrList=rb.find_all('li') #no class_ on these <li>
                            for nr in nrList:
                                KWs.append(unicode(nr.a.string)) #[kw1,kw2,...]
                            continue
                        else:
                            pass
                            # other relation boxes deliberately ignored, e.g.:
                            #このお店を訪れた人はこんなレストランも訪れています
                            #周辺のお店ランキング
                            #条件の似たお店を探す (銀座・新橋・有楽町):創作料理 × ¥2,000~¥2,999|創作料理 × 友人・同僚と
                            #関連リンク : 東京ランチランキング
                            #関連路線: 銀座線| 日比谷線
                            # print rb.h4.string #see above
                    rst['nearRstList']=nearRstList
                    rst['indexes']=indexes
                    rst['nearSpots']=nearSpots
                    rst['nearMarks']=nearMarks
                    rst['KWs']=KWs #print u','.join(rst['KWs']) # kw1,kw2,...
                    # print '--------------'
                #subsequent review pages: fetch page pg explicitly
                else:
                    # url_shop='http://tabelog.com/tokyo/A1301/A130101/13150400/dtlrvwlst/' # 4 reviews
                    # urlNextRvwPage='http://tabelog.com/tokyo/A1301/A130102/13000145/dtlrvwlst/COND-0/smp2/?lc=0&PG=%s&rvw_part=all' % str(pg)
                    urlNextRvwPage= url_shop+'COND-0/smp2/?lc=0&PG=%s&rvw_part=all' % str(pg) #todo: url to next review page
                    currentPage=BeautifulSoup(urllib.urlopen(urlNextRvwPage).read())
                # scrape every review box on the current page (page 1 included)
                rvwboxesCurrentPage=currentPage.find_all('div','review-box')
                for rvwbox in rvwboxesCurrentPage:
                    rvw={} # make each rvw a dict
                    # title('時代に流されない完成の極み') , url('http://....')
                    title=rvwbox.find('p','title').find('a')
                    rvw['url']=URLTabelog+str(title['href'])
                    rvw['title']=unicode(title.string)
                    #user('だいこんまん '), profile('30代後半・男性・愛知県')
                    rvwer=rvwbox.find('div','reviewer-name')
                    rvw['user']=unicode(rvwer.find('span').next_element) #<span class="lev1">だいこんまん <span class="count">(70)</span></span>
                    profile=rvwer.find('p','area')
                    try:
                        rvw['profile']=unicode(profile.string) #'div','area'
                    except AttributeError:
                        rvw['profile']=u''  # reviewer without a profile line
                    #score: tScore, scores=[料理,サービス,雰囲気, CP, ドリンク]
                    # NOTE(review): unlike the newer revision, no try/except here —
                    # a review without a score block will raise.
                    _scores=rvwbox.find('ul','score-ex').find_all('strong') #[ 料理・味 4.0| サービス 2.5| 雰囲気 4.0| CP 4.0| 酒・ドリンク 3.0 ]
                    total_score=rvwbox.find('p','total').strong.string
                    time_=rvwbox.find('p','total').find('span','subject').string #<span class="subject">昼の点数:</span>
                    rvw['time']=unicode(time_[0]) #'昼' (day) / '夜' (night)
                    rvw['totalScore']=(0 if total_score == u'-' else float(total_score) )
                    rvw['scores']=[0 if sc.string == u'-' else float(sc.string) for sc in _scores]
                    #price strings as displayed (one per strong tag)
                    _price=rvwbox.find('p','price').find_all('strong')
                    prices=[unicode(x.string) for x in _price]
                    rvw['prices']=prices
                    #situation icons: grey suffix '_g' in the img src means inactive
                    cases=rvwbox.find('p','situation').find_all('img')
                    situation=[ 0 if c['src'][-6:-4]=='_g' else 1 for c in cases] #[friends,date,settai,party,family,alone]
                    rvw['situation']=situation
                    #comment: stored as raw HTML markup of the <p>
                    #(better would be comment_clean, since cPickle can't pickle HTMLParser objects??)
                    comment=rvwbox.find('div','comment')
                    rvw['comment']=unicode(comment.p)
                    # # todo:prettify, clean comment
                    # comment=rvwbox.find('div','comment')
                    # comment_clean=u''
                    # BR=BeautifulSoup('<html><br/></html>').find('br')
                    # for c in comment.p.children: # def fn_reserve_br(p): isinstance(tag.next_element, NavigableString)
                    #     if isinstance(c,NavigableString):
                    #         comment_clean=comment_clean+unicode(c)
                    #     elif c==BR and c.next_sibling!=BR:
                    #         comment_clean=comment_clean+u' *br* '
                    #     elif c==BR:
                    #         pass
                    #     elif c.name=='span':#span,a,... -> *br*, *wiki-bold*
                    #         comment_clean=comment_clean+u' *'+unicode(c['class'][0])+u'* '+unicode(c.string)+u' *'+unicode(c['class'][0])+u'* '
                    # rvw['comment']=unicode(comment_clean)
                    #vote: "agree" count from other users
                    vote=int(rvwbox.find('span','agree-vote').find('em').string)
                    rvw['vote']=vote
                    # append rvw to rvwList
                    rvwList.append(rvw)
                #end of current rvw page, all rvw in current page appended to rvwList
            #end of rvw pages, all rvw in all pages appended to rvwList
            rst['rvwList']=rvwList
            rstList.append(rst)
        #end of url_shop
        # NOTE(review): one pickle per kana tag, text mode 'w' (the newer
        # revision writes one binary file per ward instead).
        dataKu=open(nameAIUEO+u'.data','w')
        cPickle.dump(rstList,dataKu)
        dataKu.close()
    #end of every tag(AIUEO)
# ------------ # ----------- # -----------
# Ad-hoc entry point: scrape a single ward (Ginza) as a test run.
kuURL='http://tabelog.com/sitemap/tokyo/A1301-A130101/'
nameKu=u'東京_154_1_銀座'
getKu(kuURL,nameKu)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment