Created
June 24, 2019 11:48
-
-
Save scrapehero-code/4e82feacd1dbe8dc5124478a5f9fa0ba to your computer and use it in GitHub Desktop.
Python code to extract restaurant details from Tripadvisor.com using Scrapy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import scrapy | |
from csv import DictReader | |
from os import path | |
from tripadvisor_restaurants.items import TripadvisorRestaurantsItem | |
from urllib.parse import urljoin | |
class TripadvisorRestaurantsSpiderSpider(scrapy.Spider):
    """Scrape restaurant summaries from Tripadvisor listing pages.

    Listing-page URLs are read from ``../resources/urls.csv`` (resolved
    relative to this module); the file must have a ``url`` column. For every
    restaurant entry found on a listing page, one
    ``TripadvisorRestaurantsItem`` is yielded.
    """

    name = 'tripadvisor_restaurants_spider'
    allowed_domains = ['tripadvisor.com']
    base_url = 'http://tripadvisor.com'
    # Kept for compatibility; start_requests() below supplies the requests.
    start_urls = ['http://tripadvisor.com/']

    # XPath of each restaurant entry on a listing page.
    XPATH_RESULTS = "//div[@id='EATERY_SEARCH_RESULTS']/div[contains(@class,'listing')]"
    # XPaths evaluated relative to a single restaurant entry.
    XPATH_NAME = './/a[@class="property_title"]/text()'
    XPATH_LINK = './/a[@class="property_title"]/@href'
    XPATH_REVIEW_COUNT = './/span[@class="reviewCount"]//text()'
    XPATH_RANK = './/div[@class="popIndexBlock"]//text()'
    XPATH_PRICING = './/span[@class="item price"]//text()'
    XPATH_CATEGORY = './/a[@class="item cuisine"]//text()'
    XPATH_RATING = './/div[contains(@class,"rating")]//span[contains(@class,"rating")]/@alt'

    def start_requests(self):
        """Yield one request per listing-page URL read from ``urls.csv``.

        Rows whose ``url`` cell is missing or blank are skipped with a
        warning instead of being handed to ``scrapy.Request``.

        Raises:
            KeyError: if the CSV has no ``url`` column.
        """
        urls_path = path.join(path.dirname(__file__), "../resources/urls.csv")
        # newline='' is what the csv module expects from its input file;
        # utf-8-sig reads plain UTF-8 too but drops a leading BOM, which would
        # otherwise become part of the first header name.
        with open(urls_path, newline='', encoding='utf-8-sig') as urls:
            for row in DictReader(urls):
                # A short row yields None for the missing cell.
                listing_page_url = (row["url"] or "").strip()
                if not listing_page_url:
                    self.logger.warning("Skipping row without a url: %r", row)
                    continue
                yield scrapy.Request(listing_page_url, callback=self.parse)

    def clean(self, text):
        """Concatenate text fragments and collapse whitespace runs.

        Newlines, carriage returns and tabs are treated like any other
        whitespace: every run becomes a single space and the ends are trimmed.

        Args:
            text: a string or a list of string fragments.

        Returns:
            The cleaned string, or None when there is nothing left.
        """
        if not text:
            return None
        return ' '.join(''.join(text).split()) or None

    def get_rating(self, raw_rating):
        """Strip the "of 5 bubbles" label from the extracted rating text.

        Returns:
            The remaining text with surrounding whitespace removed (for
            example "4.5" from "4.5 of 5 bubbles"), or None when empty.
        """
        if not raw_rating:
            return None
        return ''.join(raw_rating).replace("of 5 bubbles", "").strip() or None

    def get_category(self, raw_category):
        """Convert the list of category fragments to comma separated values.

        Whitespace-only fragments are dropped and the rest are trimmed.

        Returns:
            The comma-joined string, or None when no fragment remains.
        """
        if not raw_category:
            return None
        categories = [entry.strip() for entry in raw_category if entry and entry.strip()]
        return ','.join(categories) or None

    def get_review(self, raw_review_count):
        """Return the review count text without its "review(s)" label.

        Returns:
            The trimmed remainder (for example "1,234" from "1,234 reviews"),
            or None when nothing was extracted.
        """
        cleaned_review_count = self.clean(raw_review_count)
        if not cleaned_review_count:
            return None
        # Remove the plural first so the singular pass cannot leave a stray "s".
        without_label = cleaned_review_count.replace('reviews', '').replace('review', '')
        return without_label.strip() or None

    def get_absolute_url(self, relative_url):
        """Resolve *relative_url* against ``base_url``.

        Returns:
            The absolute URL, or None when *relative_url* is empty. Without
            this guard ``urljoin`` would return the bare base URL, which would
            look like a valid restaurant link.
        """
        if not relative_url:
            return None
        return urljoin(self.base_url, relative_url)

    def parse(self, response):
        """Parse a Tripadvisor listing page and yield one item per restaurant."""
        for restaurant in response.xpath(self.XPATH_RESULTS):
            # Getting data from XPath.
            raw_name = restaurant.xpath(self.XPATH_NAME).extract()
            raw_link = restaurant.xpath(self.XPATH_LINK).extract()
            raw_review_count = restaurant.xpath(self.XPATH_REVIEW_COUNT).extract()
            raw_rank = restaurant.xpath(self.XPATH_RANK).extract()
            raw_pricing = restaurant.xpath(self.XPATH_PRICING).extract()
            raw_category = restaurant.xpath(self.XPATH_CATEGORY).extract()
            raw_rating = restaurant.xpath(self.XPATH_RATING).extract()

            # Cleaning data.
            restaurant_data = {
                'name': self.clean(raw_name),
                'url': self.get_absolute_url(self.clean(raw_link)),
                'reviews': self.get_review(raw_review_count),
                'rank': self.clean(raw_rank),
                'price_range': self.clean(raw_pricing),
                'category': self.get_category(raw_category),
                'rating': self.get_rating(raw_rating),
                'listing_page': response.url,
            }
            yield TripadvisorRestaurantsItem(**restaurant_data)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment