Created
May 19, 2016 20:26
-
-
Save clemfromspace/51f8b153d27480debf41a094c6b67431 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import re | |
import dateparser | |
from scrapy import Request | |
from scrapy.spider import CrawlSpider | |
from urlparse import urljoin | |
from ..items import PlaceItem, ReviewItem | |
class TripAdvisorSpider(CrawlSpider):
    """Spider for the TripAdvisor (fr) website.

    Crawls the Paris restaurant listing, follows pagination links, and
    yields one ``PlaceItem`` per restaurant plus one ``ReviewItem`` per
    review (fetching the expanded review page when the text is cropped).
    """
    name = 'tripadvisor'
    allowed_domains = ['www.tripadvisor.fr']
    start_urls = (
        'https://www.tripadvisor.fr/Restaurants-g187147-Paris_Ile_de_France.html',
    )

    # TripAdvisor encodes a numeric place/request id as "-d<digits>" in its
    # URLs. Compiled once at class level instead of on every method call.
    # (r'' instead of the Python-2-only ur'' prefix: the pattern is ASCII.)
    _ID_PATTERN = re.compile(r'-d(\d+)')

    @staticmethod
    def build_review_full_link(review_id, response):
        """Build the link to the full (non-cropped) text of a single review.

        :param review_id: numeric id of the review, as a string
        :param response: response whose URL carries the "-d<id>" request id
        :returns: absolute URL of the ExpandedUserReviews endpoint
        """
        request_id = TripAdvisorSpider._ID_PATTERN.search(response.url).group(1)
        review_url = 'https://www.tripadvisor.fr/ExpandedUserReviews-d%(request_id)s' \
            '?target=%(review_id)s&reviews=%(review_id)s&servlet=Attraction_Review&expand=0' % {
                'review_id': review_id,
                'request_id': request_id
            }
        return review_url

    def parse(self, response):
        """Crawl a listing page of places.

        Follows every pagination link back into this callback and every
        place link into :meth:`parse_place`.
        """
        # For each pagination link, yield a new request
        for page_link in response.xpath('//a[contains(@class, "pageNum")]/@href').extract():
            yield Request(
                urljoin(response.url, page_link),
                self.parse
            )

        # For each place item, yield a new request
        for place_link in response.xpath('//h3[@class="title"]/a/@href').extract():
            yield Request(
                urljoin(response.url, place_link),
                self.parse_place
            )

    def parse_place(self, response):
        """Crawl a single place page: yield its PlaceItem, then its reviews."""
        # Extract the place id from the page URL ("-d<digits>").
        place_id = TripAdvisorSpider._ID_PATTERN.search(response.url).group(1)
        place_name = response.xpath('//h1[@id="HEADING"]/text()').extract()[1].strip()

        # Address and rating are optional on the page: fall back to
        # None / 0 when the node is missing.
        try:
            address = response.xpath('//span[@class="format_address"][descendant-or-self]/text()').extract()[0]
        except IndexError:
            address = None
        try:
            rating = response.xpath('//img[contains(@class,"rating_rr_fill")]/@content').extract()[0]
        except IndexError:
            rating = 0

        yield PlaceItem(
            id=place_id,
            name=place_name,
            address=address,
            rating=rating
        )

        # Yield a new request for each extra page of reviews
        for review_page_link in response.xpath('//div[@class="pageNumbers"]/a/@href').extract():
            yield Request(
                urljoin(response.url, review_page_link),
                callback=self.parse_review_list,
                meta={
                    'place_id': place_id
                }
            )

        # BUG FIX: the original called parse_review_list() as a statement and
        # discarded the generator it returned, so reviews on the first page
        # were never yielded. Re-yield its items here (Python 2 has no
        # "yield from").
        for item_or_request in self.parse_review_list(response, place_id):
            yield item_or_request

    def parse_review_list(self, response, place_id=None):
        """Crawl one page of reviews for a place.

        Yields a ``ReviewItem`` per review, or a request to the expanded
        review page when the body is cropped on the listing.

        :param place_id: id of the reviewed place; when None it is read
            from ``response.meta`` (set by the paginated requests).
        """
        place_id = place_id or response.meta['place_id']

        # Extract the reviews.
        # If the review is cropped, yield a new request to fetch the full content.
        for review_item in response.xpath('//div[contains(@class,"reviewSelector")]'):
            review_id = review_item.xpath('./@id').extract()[0].replace('review_', '')

            # The review date lives in the @title attribute; when absent,
            # fall back to the node text (a localized relative date).
            try:
                review_date = review_item.xpath('.//span[contains(@class,"ratingDate")]/@title').extract()[0]
                review_date = dateparser.parse(review_date).date()
            except IndexError:
                review_date = review_item.xpath('.//span[contains(@class,"ratingDate")]/text()').extract()[0]
                # TODO: Make the line below work
                review_date = dateparser.parse(review_date)

            review_title = review_item.xpath('.//span[contains(@class,"noQuotes")]/text()').extract()[0]

            try:
                review_body = review_item.xpath('.//p[@class="entry"]').extract()[0].strip()
                yield ReviewItem(
                    id=review_id,
                    place_id=place_id,
                    title=review_title,
                    body=review_body,
                    date=review_date
                )
            except IndexError:
                # Cropped review: fetch the expanded version, carrying the
                # already-extracted fields in the request meta.
                review_url = self.build_review_full_link(review_id, response)
                yield Request(
                    review_url,
                    callback=self.parse_review,
                    meta={
                        'place_id': place_id,
                        'review_id': review_id,
                        'review_title': review_title,
                        'review_date': review_date
                    }
                )

    def parse_review(self, response):
        """Crawl a single expanded review page and yield its ReviewItem."""
        review_body = response.xpath('//div[@class="entry"]/p/text()').extract()[0].strip()
        yield ReviewItem(
            id=response.meta['review_id'],
            place_id=response.meta['place_id'],
            title=response.meta['review_title'],
            date=response.meta['review_date'],
            body=review_body
        )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment