Skip to content

Instantly share code, notes, and snippets.

@tuanchauict
Last active August 27, 2016 16:16
Show Gist options
  • Save tuanchauict/18674d3fcdd043e932983d3c664afd04 to your computer and use it in GitHub Desktop.
import scrapy
from crawler.items import CommentItem
from crawler.items import PostItem
class TinhteSpider(scrapy.Spider):
    """Crawl tinhte.vn forum threads, collecting each thread's title,
    author, opening post, and a bounded number of comment pages.

    Flow: ``parse`` (front page) -> ``parse_dir_contents`` (thread page 1)
    -> ``parse_comment_page`` (subsequent pages, via the 'Sau' link).
    """

    name = "tinhte"
    allowed_domains = ["tinhte.vn"]
    start_urls = [
        "https://tinhte.vn/",
    ]

    # XPath constants. Class-level (instead of being assigned inside
    # ``parse`` as in the original) so every callback can use them no
    # matter which callback Scrapy schedules first; instance attribute
    # lookup still finds them, so behavior for existing callers is kept.
    TITLE_PATH = "//*[@id='content']/div/div/div/div/div/div[2]/div[1]/div/p[1]/text()"
    AUTHOR_PATH = "//*[@id='content']/div/div/div/div/div/div[2]/div[1]/a[2]/span/text()"
    MAIN_CONTENT = "//*[@id='messageList']/li[1]/div/div[2]/div[1]/article/blockquote/text()"
    HEAD_CONTENT_PATH = "//*[@id='messageList']/li"
    CONTENT_OFFSET = "div/div[2]/div[1]/article/blockquote/text()"
    COMMENT_AUTHOR_OFFSET = "div/div[1]/div/h3/div/a/text()"
    # 'Sau' is Vietnamese for 'Next' (pagination link).
    NEXT_PATH = "//*[@id='content']/div/div/div/div/div/div[4]/div[2]/nav/a[contains(text(), 'Sau')]/@href"

    # Crawl budgets (were magic number 3, hard-coded twice).
    MAX_THREADS = 3
    MAX_COMMENT_PAGES = 3

    def parse(self, response):
        """Front page: follow the first MAX_THREADS thread links.

        BUG FIX: the original resolved ``url`` from the crawled link but
        then yielded a hard-coded debug thread URL, so every request hit
        the same page and ``url`` was dead code.
        """
        followed = 0
        for href in response.css("div > h2 > a::attr('href')"):
            if followed >= self.MAX_THREADS:
                break
            followed += 1
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parse_dir_contents,
                                 encoding="UTF-8")

    def parse_dir_contents(self, response):
        """First page of a thread: build the PostItem, harvest page-1
        comments, then either follow the next page or save immediately."""
        # Marks the opening <li> so _parse_comment_helper skips it once.
        # NOTE(review): instance-level state is shared across concurrent
        # threads being crawled — works for small crawls, but a per-request
        # meta flag would be safer; confirm before scaling up.
        self.is_main_content = True
        body = response.xpath('//body')
        if not body:
            return
        body = body[0]
        topic = PostItem()
        topic['title'] = body.xpath(self.TITLE_PATH).extract()
        topic['author'] = body.xpath(self.AUTHOR_PATH).extract()
        topic['content'] = body.xpath(self.MAIN_CONTENT).extract()
        topic['comment'] = self._parse_comment_helper(response)
        next_href = response.xpath(self.NEXT_PATH)
        if next_href:
            # BUG FIX: scrapy.Request requires a URL string; the original
            # passed the Selector object itself. Extract the @href value
            # and resolve it against the current page.
            next_url = response.urljoin(next_href[0].extract())
            meta = {
                'topic': topic,
                'page': 0,
            }
            yield scrapy.Request(next_url, meta=meta,
                                 callback=self.parse_comment_page)
        else:
            # Single-page thread: nothing more to fetch.
            self._save_data(topic)

    def parse_comment_page(self, response):
        """Comment page 2+: append this page's comments and keep paging
        until MAX_COMMENT_PAGES or the last page is reached."""
        meta = response.meta
        topic = meta['topic']
        topic['comment'] += self._parse_comment_helper(response)
        if meta['page'] >= self.MAX_COMMENT_PAGES:
            # BUG FIX: the original saved here but then fell through and
            # kept following 'next' links, re-saving the topic on every
            # later page and crawling without bound. Save once and stop.
            self._save_data(topic)
            return
        next_href = response.xpath(self.NEXT_PATH)
        if next_href:
            # BUG FIX: extract the href string (see parse_dir_contents).
            next_url = response.urljoin(next_href[0].extract())
            meta['page'] += 1
            yield scrapy.Request(next_url, meta=meta,
                                 callback=self.parse_comment_page)
        else:
            # Ran out of pages before hitting the budget: save what we have.
            self._save_data(topic)

    def _parse_comment_helper(self, response):
        """Return a list of CommentItem for every comment <li> on the page,
        skipping the thread's opening post the first time it is seen
        (its text is already stored in topic['content'])."""
        comments = []
        for node in response.xpath(self.HEAD_CONTENT_PATH):
            if self.is_main_content:
                self.is_main_content = False
                continue
            comment = CommentItem()
            comment['author'] = node.xpath(self.COMMENT_AUTHOR_OFFSET).extract()
            comment['content'] = node.xpath(self.CONTENT_OFFSET).extract()
            comments.append(comment)
        return comments

    def _save_data(self, data):
        """Persist a finished PostItem. TODO: implement storage."""
        pass
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment