Last active
September 14, 2018 16:24
-
-
Save kmike/2112c0b7c3d28ab5047be9a4e6e6487c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scrapy
from scrapy.http import TextResponse
from scrapy_splash import LuaRequest
class BooksSpider(scrapy.Spider):
    """Spider that pages through books.toscrape.com inside a single Lua script.

    The script clicks the ``.next a`` link 50 times and returns only the HTML
    captured after the loop has finished, so one response reaches ``parse``.
    """

    name = 'books'
    allowed_domains = ['books.toscrape.com']

    def start_requests(self):
        """Yield the one LuaRequest that performs the in-browser pagination."""
        # NOTE(review): the script calls :click() on the result of
        # splash:css(".next a") unconditionally -- confirm what happens when
        # the link is absent (e.g. on the last page).
        script = """
            splash:go(args.url)
            for i=1,50 do
                splash:css(".next a"):click()
                splash:wait(1.0)
            end
            return splash:html()
        """
        yield LuaRequest('http://books.toscrape.com/', lua_source=script)

    def parse(self, response):
        """Yield a ``{'title', 'url'}`` dict for every book in the response."""
        for product in response.css('article.product_pod'):
            title = product.css('h3 a::text').get()
            href = product.css('h3 a::attr(href)').get()
            yield {'title': title, 'url': href}
class BooksSpider2(scrapy.Spider):
    """Spider that collects the HTML of every visited page in one Lua script.

    The script appends the page HTML to a Lua table after each click on
    ``.next a`` and returns that table; ``parse`` then wraps each HTML string
    in its own ``TextResponse`` and extracts the books from it.
    """

    name = 'books'
    allowed_domains = ['books.toscrape.com']

    def start_requests(self):
        """Yield the one LuaRequest that performs the in-browser pagination."""
        # The result table must be created once, *before* the loop; creating
        # it inside the loop discards every page except the last one.
        yield LuaRequest(
            'http://books.toscrape.com/',
            lua_source="""
                splash:go(args.url)
                local res = {}
                for i=1,50 do
                    splash:css(".next a"):click()
                    splash:wait(1.0)
                    table.insert(res, splash:html())
                end
                return res
            """
        )

    def parse(self, response):
        """Re-wrap each returned HTML snapshot and delegate to ``parse_page``.

        Presumably ``response.data`` is the decoded Lua table returned by the
        script (one HTML string per visited page) -- verify against the
        LuaRequest implementation.
        """
        for html in response.data:
            # Build a standalone response so the normal selector API works
            # on each snapshot.
            resp = TextResponse(response.url, body=html, encoding='utf8')
            yield from self.parse_page(resp)

    def parse_page(self, response):
        """Yield a ``{'title', 'url'}`` dict for every book in ``response``."""
        for book in response.css('article.product_pod'):
            yield {
                'title': book.css('h3 a::text').get(),
                'url': book.css('h3 a::attr(href)').get(),
            }
class BooksSpider3(scrapy.Spider):
    """Spider whose Lua script hands each page's HTML over via ``splash:send``.

    After every click on ``.next a`` the script passes the current HTML to
    ``splash:send``; the script itself has no ``return`` statement.
    """

    name = 'books'
    allowed_domains = ['books.toscrape.com']

    def start_requests(self):
        """Yield the one SuperLuaRequest that performs the pagination."""
        # NOTE(review): SuperLuaRequest is not imported in the visible source
        # -- confirm where it is defined.
        script = """
            splash:go(args.url)
            for i=1,50 do
                splash:css(".next a"):click()
                splash:wait(1.0)
                splash:send(splash:html())
            end
        """
        yield SuperLuaRequest('http://books.toscrape.com/', lua_source=script)

    def parse(self, response):
        """Yield a ``{'title', 'url'}`` dict for every book in the response."""
        yield from (
            {
                'title': product.css('h3 a::text').get(),
                'url': product.css('h3 a::attr(href)').get(),
            }
            for product in response.css('article.product_pod')
        )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment