Skip to content

Instantly share code, notes, and snippets.

@CodingQinghao
Last active June 2, 2018 06:08
Show Gist options
  • Save CodingQinghao/dd7be165e7b1ba31d534f727064dc96f to your computer and use it in GitHub Desktop.
Save CodingQinghao/dd7be165e7b1ba31d534f727064dc96f to your computer and use it in GitHub Desktop.
Tripadvisor Spider

# PNU 2018 graduation design (Hope group), part 1: crawl TripAdvisor and return the results in JSON format

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-06-02
# Project: hope
from pyspider.libs.base_handler import *
from pyquery import PyQuery as pq
class Handler(BaseHandler):
    """pyspider handler that crawls TripAdvisor attraction pages for Busan.

    Flow: ``on_start`` seeds the attraction listing, ``index_page`` follows
    every attraction link plus the pagination link, and ``detail_page``
    returns one result dict per attraction page.
    """

    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        """Schedule the first page of the Busan attraction listing."""
        self.crawl('https://www.tripadvisor.co.kr/Attractions-g297884-Activities-Busan.html',
                   callback=self.index_page, validate_cert=False)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue every attraction on a listing page, then the next listing page.

        Args:
            response: pyspider response for a listing page; ``response.doc``
                is queried with CSS selectors.
        """
        for link in response.doc('#ATTR_ENTRY_ > div.attraction_clarity_cell > div > div > div.listing_info > div.listing_title > a').items():
            href = link.attr.href
            # Skip anchors without an href instead of handing None to crawl().
            if href:
                self.crawl(href, callback=self.detail_page, validate_cert=False)

        # Move on to the next page: each listing page re-schedules this same
        # callback for its successor.
        next_url = response.doc('.pagination .nav.next').attr.href
        # When no "next" link with an href is found (e.g. on the last page),
        # attr.href is None and there is nothing left to schedule.
        if next_url:
            self.crawl(next_url, callback=self.index_page, validate_cert=False)

    @config(priority=2)
    def detail_page(self, response):
        """Extract attraction details and its reviews from a detail page.

        Args:
            response: pyspider response for an attraction detail page.

        Returns:
            dict with keys ``url``, ``touristspot``, ``address``, ``rating``
            and ``reviews``; ``reviews`` is a list of dicts with keys
            ``title``, ``content`` and ``link``. Reviews whose title is
            empty are left out.
        """
        reviews = []
        # The CSS selector picks the review containers; .items() yields each
        # match as its own PyQuery object so it can be queried further.
        for container in response.doc('.review-container').items():
            title = container.find('.noQuotes').text()
            if not title:
                continue
            reviews.append({
                "title": title,
                "content": container.find('.partial_entry').text(),
                # Query the existing node rather than re-parsing its inner
                # HTML: a container without a .quote element would otherwise
                # pass None to pq(), which raises TypeError.
                "link": container.find('.quote').eq(0).find('a').attr('href'),
            })

        return {
            "url": response.url,
            "touristspot": response.doc('#HEADING').text(),
            "address": response.doc('.location > .address').text(),
            "rating": response.doc('.overallRating').text(),
            "reviews": reviews,
        }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment