# PNU 2018 graduation design (Hope group), Part 1: crawl TripAdvisor and return the results in JSON format
Last active
June 2, 2018 06:08
-
-
Save CodingQinghao/dd7be165e7b1ba31d534f727064dc96f to your computer and use it in GitHub Desktop.
Tripadvisor Spider
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-06-02
# Project: hope
from pyspider.libs.base_handler import * | |
from pyquery import PyQuery as pq | |
class Handler(BaseHandler):
    """pyspider handler that crawls the TripAdvisor attraction listing for Busan.

    The crawl starts from the Busan attractions index, follows every
    attraction link and every "next page" link, and returns one dict per
    attraction page with its URL, name, address, rating and reviews.
    """

    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        """Schedule the first index page of the attraction listing."""
        self.crawl(
            'https://www.tripadvisor.co.kr/Attractions-g297884-Activities-Busan.html',
            callback=self.index_page,
            validate_cert=False,
        )

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Schedule every attraction on this index page, then the next page.

        Args:
            response: the fetched index page; ``response.doc`` is queried
                with CSS selectors.
        """
        listing_links = response.doc(
            '#ATTR_ENTRY_ > div.attraction_clarity_cell > div > div '
            '> div.listing_info > div.listing_title > a'
        )
        for each in listing_links.items():
            self.crawl(each.attr.href, callback=self.detail_page, validate_cert=False)

        # Follow the pagination link; each index page schedules the one after
        # it. The selector matches nothing on the last page, so the href is
        # None there and there is nothing left to schedule.
        next_url = response.doc('.pagination .nav.next').attr.href
        if next_url:
            self.crawl(next_url, callback=self.index_page, validate_cert=False)

    @config(priority=2)
    def detail_page(self, response):
        """Extract the attraction details and its reviews from a detail page.

        Args:
            response: the fetched attraction page.

        Returns:
            dict with keys ``url``, ``touristspot``, ``address``, ``rating``
            and ``reviews``; ``reviews`` is a list of dicts with keys
            ``title``, ``content`` and ``link``. Review containers without a
            title are skipped.
        """
        reviews = []
        # Select with CSS inside the call; .items() iterates over the matches.
        for each in response.doc('.review-container').items():
            # Query the existing tree rather than re-parsing each.html():
            # .html() returns None for an empty or missing element, which the
            # PyQuery constructor rejects.
            title = each.find('.noQuotes').text()
            if not title:
                continue
            reviews.append({
                "title": title,
                "content": each.find('.partial_entry').text(),
                # Only the first .quote element is searched for the link.
                "link": each.find('.quote').eq(0).find('a').attr('href'),
            })

        return {
            "url": response.url,
            "touristspot": response.doc('#HEADING').text(),
            "address": response.doc('.location > .address').text(),
            "rating": response.doc('.overallRating').text(),
            "reviews": reviews,
        }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment