import scrapy

from crawler.items import CommentItem
from crawler.items import PostItem


class TinhteSpider(scrapy.Spider):
    name = "tinhte"
    allowed_domains = ["tinhte.vn"]
    start_urls = [
        "https://tinhte.vn/",
    ]

    # XPath expressions for the parts of a thread page we extract.
    TITLE_PATH = "//*[@id='content']/div/div/div/div/div/div[2]/div[1]/div/p[1]/text()"
    AUTHOR_PATH = "//*[@id='content']/div/div/div/div/div/div[2]/div[1]/a[2]/span/text()"
    MAIN_CONTENT = "//*[@id='messageList']/li[1]/div/div[2]/div[1]/article/blockquote/text()"
    HEAD_CONTENT_PATH = "//*[@id='messageList']/li"
    # Relative paths, evaluated against each <li> in the message list.
    CONTENT_OFFSET = "div/div[2]/div[1]/article/blockquote/text()"
    COMMENT_AUTHOR_OFFSET = "div/div[1]/div/h3/div/a/text()"
    # Link to the next comment page ("Sau" is "Next" in Vietnamese).
    NEXT_PATH = ("//*[@id='content']/div/div/div/div/div/div[4]/div[2]/nav"
                 "/a[contains(text(), 'Sau')]/@href")

    def parse(self, response):
        # Follow at most three thread links from the homepage.
        flag = 0
        for href in response.css("div > h2 > a::attr('href')"):
            if flag < 3:
                flag += 1
                url = response.urljoin(href.extract())
                yield scrapy.Request(url, callback=self.parse_dir_contents,
                                     encoding="UTF-8")

    def parse_dir_contents(self, response):
        # The first <li> on the first page is the post itself, not a comment;
        # _parse_comment_helper uses this flag to skip it once.
        self.is_main_content = True
        body = response.xpath('//body')
        if not body:
            return
        body = body[0]

        topic = PostItem()
        topic['title'] = body.xpath(self.TITLE_PATH).extract()
        topic['author'] = body.xpath(self.AUTHOR_PATH).extract()
        topic['content'] = body.xpath(self.MAIN_CONTENT).extract()
        topic['comment'] = self._parse_comment_helper(response)

        next_page = response.xpath(self.NEXT_PATH).extract_first()
        if next_page:
            # Carry the partially built item through the pagination requests.
            meta = {
                'topic': topic,
                'page': 0,
            }
            yield scrapy.Request(response.urljoin(next_page), meta=meta,
                                 callback=self.parse_comment_page)
        else:
            self._save_data(topic)

    def parse_comment_page(self, response):
        meta = response.meta
        topic = meta['topic']
        topic['comment'] += self._parse_comment_helper(response)

        # Stop after a few comment pages to keep the crawl small.
        if meta['page'] >= 3:
            self._save_data(topic)
            return

        next_page = response.xpath(self.NEXT_PATH).extract_first()
        if next_page:
            meta['page'] += 1
            yield scrapy.Request(response.urljoin(next_page), meta=meta,
                                 callback=self.parse_comment_page)
        else:
            self._save_data(topic)

    def _parse_comment_helper(self, response):
        cmts = []
        for c in response.xpath(self.HEAD_CONTENT_PATH):
            if self.is_main_content:
                # Skip the opening post; everything after it is a comment.
                self.is_main_content = False
            else:
                comment = CommentItem()
                comment['author'] = c.xpath(self.COMMENT_AUTHOR_OFFSET).extract()
                comment['content'] = c.xpath(self.CONTENT_OFFSET).extract()
                cmts.append(comment)
        return cmts

    def _save_data(self, data):
        # TODO
        pass
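_save_data is left as a TODO in the gist. Below is a minimal sketch of one way to fill it, assuming the collected PostItem should simply be appended to a local JSON Lines file; the tinhte_posts.jsonl filename and this approach are assumptions, not part of the original. The more Scrapy-idiomatic alternative is to yield the finished item from the spider and let an item pipeline or feed export handle persistence.

import json

def _save_data(self, data):
    # Sketch only: convert the PostItem and its nested CommentItems to plain
    # dicts so the standard json module can serialize them, then append one
    # JSON object per line to a local file.
    record = dict(data)
    record['comment'] = [dict(c) for c in record.get('comment', [])]
    with open('tinhte_posts.jsonl', 'a', encoding='utf-8') as f:
        f.write(json.dumps(record, ensure_ascii=False) + '\n')

Assuming this file lives in a Scrapy project named crawler (implied by the crawler.items imports), the spider can be run with scrapy crawl tinhte from the project root.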