@tudoanh
Last active May 26, 2017 11:18
fahasa.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy_splash import SplashRequest

# Lua script for the start URL: wait 5 seconds so the page finishes rendering,
# then hand the cookies and rendered HTML back to Scrapy.
script = """
function main(splash)
    splash:init_cookies(splash.args.cookies)
    local url = splash.args.url
    assert(splash:go(url))
    assert(splash:wait(5))
    return {
        cookies = splash:get_cookies(),
        html = splash:html()
    }
end
"""

# Same script with a shorter 0.5 second wait, used for follow-up requests.
script2 = """
function main(splash)
    splash:init_cookies(splash.args.cookies)
    local url = splash.args.url
    assert(splash:go(url))
    assert(splash:wait(0.5))
    return {
        cookies = splash:get_cookies(),
        html = splash:html()
    }
end
"""


class FahasaSpider(scrapy.Spider):
    name = 'fahasa'
    allowed_domains = ['fahasa.com']
    start_urls = [
        "https://www.fahasa.com/sach-trong-nuoc/van-hoc-trong-nuoc.html"
    ]

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url, self.parse, endpoint='execute',
                                args={'lua_source': script})

    def parse(self, response):
        # Follow the pagination ("Next") link and yield a request for it;
        # the callback defaults back to parse.
        next_selector = response.xpath('//*[@title="Next"]/@href')
        for url in next_selector.extract():
            yield SplashRequest(url, endpoint='execute',
                                args={'lua_source': script2})

        # Follow every product URL on the listing page.
        url_selector = response.xpath(
            '//*[@class="product-name p-name-list"]/a/@href')
        for url in url_selector.extract():
            yield SplashRequest(url, callback=self.parse_item,
                                endpoint='execute',
                                args={'lua_source': script2})

    def parse_item(self, response):
        """
        Handle crawl logic here
        """
        pass
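
parse_item is left as a stub in the gist. A minimal sketch of what it could yield, assuming a flat dict item and illustrative XPath selectors; the real fahasa.com product-page markup may differ:

    # Sketch only: the selectors below are assumptions about the product page,
    # not taken from the original spider.
    def parse_item(self, response):
        yield {
            'url': response.url,
            'title': response.xpath(
                'normalize-space(//h1)').extract_first(),
            'price': response.xpath(
                'normalize-space(//*[contains(@class, "price")])').extract_first(),
        }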
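
Running this spider also requires scrapy-splash to be wired into the project settings. A minimal sketch, assuming a Splash instance listening on localhost:8050 and the middleware priorities suggested by the scrapy-splash README:

# settings.py
SPLASH_URL = 'http://localhost:8050'

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'

Splash itself can be started with Docker (docker run -p 8050:8050 scrapinghub/splash) before running the spider with scrapy crawl fahasa.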