Python 2 code to extract Yelp business details
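The script relies on the third-party requests and lxml packages; everything else comes from the Python 2 standard library. If they are not already available, they can be installed with pip, for example:

    pip install requests lxml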
from lxml import html
import json
import requests
from exceptions import ValueError
from time import sleep
import re,urllib
import argparse
def parse(url):
    # url = "https://www.yelp.com/biz/frances-san-francisco"
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36'}
    # Fetch the page with a browser-like User-Agent and parse the HTML with lxml
    response = requests.get(url, headers=headers, verify=False).text
    parser = html.fromstring(response)
    print "Parsing the page"

    # Raw XPath extractions from the business page
    raw_name = parser.xpath("//h1[contains(@class,'page-title')]//text()")
    raw_claimed = parser.xpath("//span[contains(@class,'claim-status_icon--claimed')]/parent::div/text()")
    raw_reviews = parser.xpath("//div[contains(@class,'biz-main-info')]//span[contains(@class,'review-count rating-qualifier')]//text()")
    raw_category = parser.xpath('//div[contains(@class,"biz-page-header")]//span[@class="category-str-list"]//a/text()')
    hours_table = parser.xpath("//table[contains(@class,'hours-table')]//tr")
    details_table = parser.xpath("//div[@class='short-def-list']//dl")
    raw_map_link = parser.xpath("//a[@class='biz-map-directions']/img/@src")
    raw_phone = parser.xpath(".//span[@class='biz-phone']//text()")
    raw_address = parser.xpath('//div[@class="mapbox-text"]//div[contains(@class,"map-box-address")]//text()')
    raw_website_link = parser.xpath("//span[contains(@class,'biz-website')]/a/@href")
    raw_price_range = parser.xpath("//dd[contains(@class,'price-description')]//text()")
    raw_health_rating = parser.xpath("//dd[contains(@class,'health-score-description')]//text()")
    rating_histogram = parser.xpath("//table[contains(@class,'histogram')]//tr[contains(@class,'histogram_row')]")
    raw_ratings = parser.xpath("//div[contains(@class,'biz-page-header')]//div[contains(@class,'rating')]/@title")

    # Opening hours table -> list of {day: timing} dicts
    working_hours = []
    for hours in hours_table:
        raw_day = hours.xpath(".//th//text()")
        raw_timing = hours.xpath("./td//text()")
        day = ''.join(raw_day).strip()
        timing = ''.join(raw_timing).strip()
        working_hours.append({day: timing})

    # Additional business details (dt/dd definition list) -> list of {key: value} dicts
    info = []
    for details in details_table:
        raw_description_key = details.xpath('.//dt//text()')
        raw_description_value = details.xpath('.//dd//text()')
        description_key = ''.join(raw_description_key).strip()
        description_value = ''.join(raw_description_value).strip()
        info.append({description_key: description_value})

    # Ratings histogram rows -> list of {star level: count} dicts
    ratings_histogram = []
    for ratings in rating_histogram:
        raw_rating_key = ratings.xpath(".//th//text()")
        raw_rating_value = ratings.xpath(".//td[@class='histogram_count']//text()")
        rating_key = ''.join(raw_rating_key).strip()
        rating_value = ''.join(raw_rating_value).strip()
        ratings_histogram.append({rating_key: rating_value})

    # Clean up the scalar fields
    name = ''.join(raw_name).strip()
    phone = ''.join(raw_phone).strip()
    address = ' '.join(' '.join(raw_address).split())
    health_rating = ''.join(raw_health_rating).strip()
    price_range = ''.join(raw_price_range).strip()
    claimed_status = ''.join(raw_claimed).strip()
    reviews = ''.join(raw_reviews).strip()
    category = ','.join(raw_category)
    cleaned_ratings = ''.join(raw_ratings).strip()

    # The website link is wrapped in a Yelp biz_redir redirect; decode it and pull out the target URL
    if raw_website_link:
        decoded_raw_website_link = urllib.unquote(raw_website_link[0])
        website = re.findall("biz_redir\?url=(.*)&website_link", decoded_raw_website_link)[0]
    else:
        website = ''

    # Latitude and longitude are embedded in the static map image URL
    if raw_map_link:
        decoded_map_url = urllib.unquote(raw_map_link[0])
        map_coordinates = re.findall("center=([+-]?\d+\.\d+,[+-]?\d+\.\d+)", decoded_map_url)[0].split(',')
        latitude = map_coordinates[0]
        longitude = map_coordinates[1]
    else:
        latitude = ''
        longitude = ''

    # Numeric rating, e.g. "4.5 star rating" -> "4.5"
    if raw_ratings:
        ratings = re.findall("\d+[.,]?\d+", cleaned_ratings)[0]
    else:
        ratings = 0

    data = {
        'working_hours': working_hours,
        'info': info,
        'ratings_histogram': ratings_histogram,
        'name': name,
        'phone': phone,
        'ratings': ratings,
        'address': address,
        'health_rating': health_rating,
        'price_range': price_range,
        'claimed_status': claimed_status,
        'reviews': reviews,
        'category': category,
        'website': website,
        'latitude': latitude,
        'longitude': longitude,
        'url': url,
    }
    return data
if __name__ == "__main__":
    argparser = argparse.ArgumentParser()
    argparser.add_argument('url', help='yelp business url')
    args = argparser.parse_args()
    url = args.url
    scraped_data = parse(url)
    # Use the last path segment of the URL to name the output file
    yelp_id = url.split('/')[-1]
    with open("scraped_data-%s.json" % (yelp_id), 'w') as fp:
        json.dump(scraped_data, fp, indent=4)
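Assuming the script is saved as yelp_scraper.py (the filename here is arbitrary), a typical run under Python 2 looks like this; the output file is named after the last segment of the business URL:

    python yelp_scraper.py https://www.yelp.com/biz/frances-san-francisco

This writes scraped_data-frances-san-francisco.json containing the fields assembled in the data dict (name, phone, working_hours, ratings_histogram, and so on).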