Gist thekindlyone/e888c72b5b86380b45f34c9e840620e0 (created May 13, 2016): shared scraping helpers and a Zomato Brazil restaurant scraper.
# coding: utf-8
# Shared scraping helpers (Python 2): HTTP fetching with retries, text cleanup,
# and a queue-fed CSV writer.
from __future__ import unicode_literals
from unidecode import unidecode
import requests
from bs4 import BeautifulSoup as bs
from time import sleep
import re
from kitchen.text.converters import to_bytes
import itertools
import csv
from multiprocessing import Process, Queue
def cleanse(data, transliteration=True):
    # Sanitize every value of a scraped row dict. With transliteration the values are
    # folded to ASCII via unidecode; otherwise they are encoded to bytes with kitchen.
    try:
        if transliteration:
            return {key: unidecode(sanitize(value)) for key, value in data.iteritems()}
        else:
            return {key: to_bytes(sanitize(value)) for key, value in data.iteritems()}
    except Exception as e:
        print data
        print str(e)
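# Illustrative example (not part of the original gist): cleanse() is applied to each row
# dict just before it is written to CSV, e.g.
#   cleanse({'Vendor Name': u'Café Réal', 'City': u'Rio'})
#   -> {'Vendor Name': 'Cafe Real', 'City': 'Rio'}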
class Browser(object):
    # Thin wrapper around a persistent requests.Session so cookies survive between requests.
    def __init__(self, url):
        self.s = requests.Session()
        self.s.head(url)  # warm up the session / pick up initial cookies
    def soup(self, url):
        r = self.s.get(url)
        return bs(r.content)
    def makeRequest(self, url, headers=None, maxattempts=15):
        # Retry the GET up to maxattempts times, sleeping 10s after each failure;
        # returns False if every attempt raised.
        attempts = 0
        while attempts < maxattempts:
            attempts += 1
            try:
                r = self.s.get(url, headers=headers)
                return r
            except Exception as e:
                sleep(10)
                continue
        return False
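# Illustrative example (assumed usage): the same session is reused across calls, e.g.
#   b = Browser('https://www.zomato.com')
#   listing = b.soup('https://www.zomato.com/rio/restaurants')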
def grouper(iterable, n, fillvalue=None):
    args = [iter(iterable)] * n
    return itertools.izip_longest(*args, fillvalue=fillvalue)
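# Illustrative example: grouper() chunks an iterable into fixed-size tuples, padding the
# last one, e.g. list(grouper('ABCDE', 2, '-')) -> [('A', 'B'), ('C', 'D'), ('E', '-')].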
def get_soup(url, max_attempts=5, agent={'User-agent': 'Mozilla/5.0'}, num=None):
    # Fetch url and return a BeautifulSoup tree, retrying up to max_attempts times.
    # num, if given, is a shared multiprocessing counter of requests made.
    for i in xrange(max_attempts):
        try:
            if num:
                with num.get_lock():
                    num.value += 1
            r = requests.get(url, headers=agent, timeout=10)
            if r.status_code == 200:
                return bs(r.content)
            else:
                print 'status at request', url, r.status_code
                sleep(3)
        except Exception as e:
            print '{} at {}'.format(str(e), url)
            sleep(3)
    return False
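# Illustrative example: soup = get_soup('https://www.zomato.com/rio/restaurants') returns
# a BeautifulSoup tree on HTTP 200 and False once the retries are exhausted, so callers
# should check the return value before calling .find() on it.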
def sanitize(text):
    # Collapse repeated spaces and newlines, drop double quotes, and turn semicolons into
    # commas so the value sits cleanly in a CSV cell; empty input becomes 'N/A'.
    if text:
        text = re.sub(' +', ' ', text)
        text = re.sub('\n+', '\n', text)
        text = text.replace('"', '')
        text = text.replace(';', ',')
        if not text:
            text = 'N/A'
        return '\n'.join([line.strip() for line in text.strip().split('\n')])
    else:
        return 'N/A'
def handle(func, default='N/A'):
    # Call func() and swallow any exception, returning default instead; used to make
    # optional page elements non-fatal.
    try:
        rv = func()
        return rv
    except Exception as e:
        return default
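# Illustrative example: handle(lambda: soup.find('span', class_='tel-icon').text) gives
# the phone number when the element exists and 'N/A' (instead of an AttributeError)
# when it does not.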
def extract_number(text):
    num = re.search('\d+', text).group()
    return num
def flatten(l):
    return [item for sublist in l for item in sublist]
def extract_float(text):
    fl = re.search('[0-9.]+', text).group()
    return fl
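# Illustrative examples: extract_number('42 reviews') -> '42' and
# extract_float('Rated 4.5 of 5') -> '4.5' (both return strings and raise AttributeError
# when nothing matches); flatten([[1, 2], [3]]) -> [1, 2, 3].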
def scribe(q, headers, filename, mode='w', transliteration=True):
    # Writer loop: drain row dicts from the queue q and write them to a CSV file until
    # the 'STOP' sentinel arrives. Missing fields are filled with 'N/A' via restval.
    with open(filename, mode) as csvfile:
        writer = csv.DictWriter(csvfile, dialect='excel', fieldnames=headers, restval="N/A")
        if mode == 'w':
            writer.writeheader()
        count = 0
        while True:
            row = q.get()
            if row == 'STOP':
                break
            else:
                writer.writerow(cleanse(row, transliteration=transliteration))
                count += 1
                print count, 'rows written'
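# Illustrative sketch (assumed usage, not from the original gist): scribe() is meant to run
# as a dedicated writer process or thread fed through a queue, roughly:
#   q = Queue()
#   writer = Process(target=scribe, args=(q, ['name', 'city'], 'out.csv'))
#   writer.start()
#   q.put({'name': 'Example Cafe', 'city': 'Rio'})
#   q.put('STOP')
#   writer.join()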
# ---- second gist file: Zomato Brazil scraper (uses the helpers above as utils) ----
# coding: utf-8
# Zomato Brazil scraper: walks the restaurant listings for a handful of cities, scrapes
# each outlet page, and streams the rows to a CSV through a writer thread.
from utils import *
from urlparse import urljoin
import re
from threading import Thread
import Queue
from time import sleep
import csv
def process(url, city, q, other=False):
    # Scrape one restaurant page and push a row dict onto q. If the page is a chain with an
    # "All outlets" link, recurse into each outlet instead (other=True keeps the recursion
    # one level deep).
    en = '?lang=en'
    soup = get_soup(url + en)
    all_outlets_link = handle(lambda: soup.find('a', title=re.compile('All outlets')).get('href', False), False)
    if not all_outlets_link or other:
        name = soup.find('span', itemprop='name').text
        contact = handle(lambda: soup.find('span', class_='tel-icon').text)
        address = handle(lambda: soup.find('div', class_='res-main-address-text').text)
        area = handle(lambda: soup.find('span', itemprop='addressLocality').text)
        pricerange = handle(lambda: soup.find('span', {'itemprop': 'priceRange'}).text.strip())
        payment = handle(lambda: ','.join([item.text.strip() for item in soup.findAll('span', {'itemprop': 'paymentAccepted'})]))
        delivery = 'No' if 'No Home Delivery' in soup.text else 'Yes'
        cuisine = handle(lambda: soup.find('a', itemprop='servesCuisine').text)
        ophours = handle(lambda: '\n'.join([div.text for div in soup.select('div.res-week-timetable > div')]))
        description = handle(lambda: soup.find('a', itemprop='typeEstablishment').text)
        coords = handle(lambda: re.search('center=(.+?)&', str(soup)).group(1))
        reviews = handle(lambda: soup.select('#selectors > li > a > span')[0].text)
        ratings = handle(lambda: soup.find('div', {'itemprop': 'ratingValue'}).text.strip())
        q.put({'Vendor Name': name,
               'Country': 'Brazil',
               'City': city,
               'Address': address,
               'Area': area,
               'Coordinates': coords,
               'URL': url,
               'Delivery': delivery,
               'Contact no.': contact,
               'Operating Hours': ophours,
               'Description': description,
               'Cuisines': cuisine,
               'Price Range': pricerange,
               'Payment Options': payment,
               'Ratings': ratings,
               'Reviews': reviews})
    else:
        for vlink in fetch_vendorlinks(all_outlets_link):
            process(vlink, city, q, other=True)
def fetch_vendorlinks(url):
    # Collect the restaurant page links from one listing page.
    soup = get_soup(url)
    vlinks = [a.get('href') for a in soup.select('a.result-title')]
    return vlinks
def paginate(firstpage):
    # Read the pagination block on the first listing page and yield the ?page=1..N URLs;
    # listings without a pagination block get a single page.
    soup = get_soup(firstpage)
    pages = soup.find('div', class_='pagination-number')
    if not pages:
        pages = 1
    else:
        match = re.search('\d+$', pages.text.strip())
        if match:
            pages = int(match.group())
        else:
            pages = 1
    return ('{}?page={}'.format(firstpage, page) for page in range(1, pages + 1))
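# Illustrative example: for a city whose pagination block ends in "of 42",
# paginate('https://www.zomato.com/rio/restaurants') yields
# '.../restaurants?page=1' through '.../restaurants?page=42'.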
def scribe(q):
    # Writer thread: same role as the scribe() helper in utils (which it shadows), but
    # hard-wired to zomato_brazil.csv and the headers defined below.
    with open('zomato_brazil.csv', 'w') as csvfile:
        writer = csv.DictWriter(csvfile, dialect='excel', fieldnames=headers)
        writer.writeheader()
        count = 0
        while True:
            row = q.get()
            if row == 'STOP':
                break
            else:
                writer.writerow(cleanse(row))
                count += 1
                print count, 'rows written to sheet'
cities = [('Rio', 'https://www.zomato.com/rio/restaurants'),
          ('Sao Paulo', 'https://www.zomato.com/sao-paulo-sp/restaurants'),
          ('Brasilia', 'https://www.zomato.com/brasilia/restaurants'),
          ('Porto Alegre', 'https://www.zomato.com/portoalegre/restaurants'),
          ('Salvador', 'https://www.zomato.com/salvador/restaurants')]
headers = ['Vendor Name', 'Country', 'City', 'Address', 'Area', 'Coordinates', 'URL', 'Delivery', 'Contact no.', 'Operating Hours',
           'Description', 'Cuisines', 'Price Range', 'Payment Options', 'Ratings', 'Reviews']
q = Queue.Queue()
threads = []
# One writer thread drains the queue while the scraper threads fill it.
scribethread = Thread(target=scribe, args=(q,))
scribethread.daemon = True
scribethread.start()
# One thread per restaurant page; rows go onto the shared queue.
for cityname, citylink in cities:
    for pno, pagelink in enumerate(paginate(citylink), start=1):
        print 'processing page', pno, 'of', cityname
        for vlink in fetch_vendorlinks(pagelink):
            t = Thread(target=process, args=(vlink, cityname, q))
            t.daemon = True
            t.start()
            threads.append(t)
# Wait for every scraper thread, then tell the writer to stop.
while [thread for thread in threads if thread.isAlive()]:
    sleep(1)
for thread in threads:
    thread.join()
q.put('STOP')
print 'All Done'