Skip to content

Instantly share code, notes, and snippets.

@scrapehero-code
Created June 24, 2019 11:48
Show Gist options
  • Save scrapehero-code/4e82feacd1dbe8dc5124478a5f9fa0ba to your computer and use it in GitHub Desktop.
Save scrapehero-code/4e82feacd1dbe8dc5124478a5f9fa0ba to your computer and use it in GitHub Desktop.
Python code to extract restaurant details from Tripadvisor.com using Scrapy
# -*- coding: utf-8 -*-
import scrapy
from csv import DictReader
from os import path
from tripadvisor_restaurants.items import TripadvisorRestaurantsItem
from urllib.parse import urljoin
class TripadvisorRestaurantsSpiderSpider(scrapy.Spider):
    """Spider that extracts restaurant summaries from Tripadvisor listing pages.

    Listing-page URLs are read from ``../resources/urls.csv`` (resolved
    relative to this module, column ``url``).  Every restaurant block found
    on a listing page is yielded as a ``TripadvisorRestaurantsItem``.
    """

    name = 'tripadvisor_restaurants_spider'
    allowed_domains = ['tripadvisor.com']
    base_url = 'http://tripadvisor.com'
    start_urls = ['http://tripadvisor.com/']

    # XPath of each restaurant block on a listing page.
    XPATH_RESULTS = "//div[@id='EATERY_SEARCH_RESULTS']/div[contains(@class,'listing')]"
    # XPaths evaluated relative to a single restaurant block.  They are
    # constants, so they live here instead of being rebuilt per restaurant.
    XPATH_NAME = './/a[@class="property_title"]/text()'
    XPATH_LINK = './/a[@class="property_title"]/@href'
    XPATH_REVIEW_COUNT = './/span[@class="reviewCount"]//text()'
    XPATH_RANK = './/div[@class="popIndexBlock"]//text()'
    XPATH_PRICING = './/span[@class="item price"]//text()'
    XPATH_CATEGORY = './/a[@class="item cuisine"]//text()'
    XPATH_RATING = './/div[contains(@class,"rating")]//span[contains(@class,"rating")]/@alt'

    def start_requests(self):
        """Yield one request per listing-page URL found in the CSV file.

        Raises:
            KeyError: if the CSV has no ``url`` column.
        """
        urls_file = path.join(path.dirname(__file__), "../resources/urls.csv")
        # newline="" is what the csv module expects for files it parses.
        with open(urls_file, newline="", encoding="utf-8") as urls:
            for row in DictReader(urls):
                # A short row yields None for the missing cell; treat it
                # like an empty one and skip it instead of building a
                # request from an invalid URL.
                listing_page_url = (row["url"] or "").strip()
                if not listing_page_url:
                    continue
                yield scrapy.Request(listing_page_url, callback=self.parse)

    def clean(self, text):
        """Collapse ``text`` into a single whitespace-normalised string.

        Args:
            text: a string or an iterable of strings; the pieces are
                concatenated without a separator.

        Returns:
            The text with every run of whitespace (including ``\\n``,
            ``\\r`` and ``\\t``) replaced by one space and the ends
            trimmed, or ``None`` when ``text`` is empty or ``None``.
        """
        if text:
            return ' '.join(''.join(text).split())
        return None

    def get_rating(self, raw_rating):
        """Return the rating with the "of 5 bubbles" suffix removed.

        Args:
            raw_rating: a string or an iterable of strings (``parse``
                passes the extracted ``alt`` attribute values).

        Returns:
            The joined text without the suffix and without surrounding
            whitespace, or ``None`` when ``raw_rating`` is empty.
        """
        if raw_rating:
            # strip() drops the space that preceded the removed suffix.
            return ''.join(raw_rating).replace("of 5 bubbles", "").strip()
        return None

    def get_category(self, raw_category):
        """Return the categories as one comma-separated string.

        Returns ``None`` when ``raw_category`` is empty.
        """
        if raw_category:
            return ','.join(raw_category)
        return None

    def get_review(self, raw_review_count):
        """Return the review count text without the "review(s)" label.

        Args:
            raw_review_count: a string or an iterable of strings.

        Returns:
            The cleaned text with "reviews" / "review" removed and the
            surrounding whitespace trimmed, or ``None`` when there is no
            text to clean.
        """
        if raw_review_count:
            cleaned_review_count = self.clean(raw_review_count)
            if cleaned_review_count:
                # Remove the plural first so the singular pass cannot
                # leave a dangling "s" behind.
                return (
                    cleaned_review_count
                    .replace('reviews', '')
                    .replace('review', '')
                    .strip()
                )
        return None

    def get_absolute_url(self, relative_url):
        """Resolve ``relative_url`` against ``self.base_url``.

        Returns:
            The absolute URL, or ``None`` when ``relative_url`` is empty
            or ``None`` (urljoin would otherwise hand back the bare base
            URL, which is not the restaurant's page).
        """
        if not relative_url:
            return None
        return urljoin(self.base_url, relative_url)

    def parse(self, response):
        """Parse a listing page and yield one item per restaurant.

        Args:
            response: the downloaded listing page.

        Yields:
            ``TripadvisorRestaurantsItem`` populated with ``name``,
            ``url``, ``reviews``, ``rank``, ``price_range``, ``category``,
            ``rating`` and ``listing_page`` (the URL of ``response``).
        """
        for restaurant in response.xpath(self.XPATH_RESULTS):
            # Getting data from XPath.
            raw_name = restaurant.xpath(self.XPATH_NAME).extract()
            raw_link = restaurant.xpath(self.XPATH_LINK).extract()
            raw_review_count = restaurant.xpath(self.XPATH_REVIEW_COUNT).extract()
            raw_rank = restaurant.xpath(self.XPATH_RANK).extract()
            raw_pricing = restaurant.xpath(self.XPATH_PRICING).extract()
            raw_category = restaurant.xpath(self.XPATH_CATEGORY).extract()
            raw_rating = restaurant.xpath(self.XPATH_RATING).extract()

            # Cleaning data.
            restaurant_data = {
                'name': self.clean(raw_name),
                'url': self.get_absolute_url(self.clean(raw_link)),
                'reviews': self.get_review(raw_review_count),
                'rank': self.clean(raw_rank),
                'price_range': self.clean(raw_pricing),
                'category': self.get_category(raw_category),
                'rating': self.get_rating(raw_rating),
                'listing_page': response.url,
            }
            yield TripadvisorRestaurantsItem(**restaurant_data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment