Created
May 19, 2016 20:26
-
-
Save clemfromspace/51f8b153d27480debf41a094c6b67431 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import re | |
import dateparser | |
from scrapy import Request | |
from scrapy.spider import CrawlSpider | |
from urlparse import urljoin | |
from ..items import PlaceItem, ReviewItem | |
class TripAdvisorSpider(CrawlSpider):
    """Spider for the TripAdvisor (fr) website.

    Crawls the Paris restaurant listing, follows pagination links, and
    yields one ``PlaceItem`` per restaurant plus one ``ReviewItem`` per
    review (fetching the expanded review page when the text is cropped).
    """
    name = 'tripadvisor'
    allowed_domains = ['www.tripadvisor.fr']
    start_urls = (
        'https://www.tripadvisor.fr/Restaurants-g187147-Paris_Ile_de_France.html',
    )

    # TripAdvisor encodes a numeric place/request id as "-d<digits>" in its
    # URLs. Compiled once at class level instead of on every method call.
    # (r'' instead of the Python-2-only ur'' prefix: the pattern is ASCII.)
    _ID_PATTERN = re.compile(r'-d(\d+)')

    @staticmethod
    def build_review_full_link(review_id, response):
        """Build the link to the full (non-cropped) text of a single review.

        :param review_id: numeric id of the review, as a string
        :param response: response whose URL carries the "-d<id>" request id
        :returns: absolute URL of the ExpandedUserReviews endpoint
        """
        request_id = TripAdvisorSpider._ID_PATTERN.search(response.url).group(1)
        review_url = 'https://www.tripadvisor.fr/ExpandedUserReviews-d%(request_id)s' \
            '?target=%(review_id)s&reviews=%(review_id)s&servlet=Attraction_Review&expand=0' % {
                'review_id': review_id,
                'request_id': request_id
            }
        return review_url

    def parse(self, response):
        """Crawl a listing page of places.

        Follows every pagination link back into this callback and every
        place link into :meth:`parse_place`.
        """
        # For each pagination link, yield a new request
        for page_link in response.xpath('//a[contains(@class, "pageNum")]/@href').extract():
            yield Request(
                urljoin(response.url, page_link),
                self.parse
            )

        # For each place item, yield a new request
        for place_link in response.xpath('//h3[@class="title"]/a/@href').extract():
            yield Request(
                urljoin(response.url, place_link),
                self.parse_place
            )

    def parse_place(self, response):
        """Crawl a single place page: yield its PlaceItem, then its reviews."""
        # Extract the place id from the page URL ("-d<digits>").
        place_id = TripAdvisorSpider._ID_PATTERN.search(response.url).group(1)
        place_name = response.xpath('//h1[@id="HEADING"]/text()').extract()[1].strip()

        # Address and rating are optional on the page: fall back to
        # None / 0 when the node is missing.
        try:
            address = response.xpath('//span[@class="format_address"][descendant-or-self]/text()').extract()[0]
        except IndexError:
            address = None
        try:
            rating = response.xpath('//img[contains(@class,"rating_rr_fill")]/@content').extract()[0]
        except IndexError:
            rating = 0

        yield PlaceItem(
            id=place_id,
            name=place_name,
            address=address,
            rating=rating
        )

        # Yield a new request for each extra page of reviews
        for review_page_link in response.xpath('//div[@class="pageNumbers"]/a/@href').extract():
            yield Request(
                urljoin(response.url, review_page_link),
                callback=self.parse_review_list,
                meta={
                    'place_id': place_id
                }
            )

        # BUG FIX: the original called parse_review_list() as a statement and
        # discarded the generator it returned, so reviews on the first page
        # were never yielded. Re-yield its items here (Python 2 has no
        # "yield from").
        for item_or_request in self.parse_review_list(response, place_id):
            yield item_or_request

    def parse_review_list(self, response, place_id=None):
        """Crawl one page of reviews for a place.

        Yields a ``ReviewItem`` per review, or a request to the expanded
        review page when the body is cropped on the listing.

        :param place_id: id of the reviewed place; when None it is read
            from ``response.meta`` (set by the paginated requests).
        """
        place_id = place_id or response.meta['place_id']

        # Extract the reviews.
        # If the review is cropped, yield a new request to fetch the full content.
        for review_item in response.xpath('//div[contains(@class,"reviewSelector")]'):
            review_id = review_item.xpath('./@id').extract()[0].replace('review_', '')

            # The review date lives in the @title attribute; when absent,
            # fall back to the node text (a localized relative date).
            try:
                review_date = review_item.xpath('.//span[contains(@class,"ratingDate")]/@title').extract()[0]
                review_date = dateparser.parse(review_date).date()
            except IndexError:
                review_date = review_item.xpath('.//span[contains(@class,"ratingDate")]/text()').extract()[0]
                # TODO: Make the line below work
                review_date = dateparser.parse(review_date)

            review_title = review_item.xpath('.//span[contains(@class,"noQuotes")]/text()').extract()[0]

            try:
                review_body = review_item.xpath('.//p[@class="entry"]').extract()[0].strip()
                yield ReviewItem(
                    id=review_id,
                    place_id=place_id,
                    title=review_title,
                    body=review_body,
                    date=review_date
                )
            except IndexError:
                # Cropped review: fetch the expanded version, carrying the
                # already-extracted fields in the request meta.
                review_url = self.build_review_full_link(review_id, response)
                yield Request(
                    review_url,
                    callback=self.parse_review,
                    meta={
                        'place_id': place_id,
                        'review_id': review_id,
                        'review_title': review_title,
                        'review_date': review_date
                    }
                )

    def parse_review(self, response):
        """Crawl a single expanded review page and yield its ReviewItem."""
        review_body = response.xpath('//div[@class="entry"]/p/text()').extract()[0].strip()
        yield ReviewItem(
            id=response.meta['review_id'],
            place_id=response.meta['place_id'],
            title=response.meta['review_title'],
            date=response.meta['review_date'],
            body=review_body
        )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment