Skip to content

Instantly share code, notes, and snippets.

@stummjr
Last active May 10, 2020 06:14
Show Gist options
  • Save stummjr/7b87c18599561171b5f7a053d0efd31a to your computer and use it in GitHub Desktop.
Save stummjr/7b87c18599561171b5f7a053d0efd31a to your computer and use it in GitHub Desktop.
# -*- coding:utf-8 -*-
import scrapy
from scrapy.exceptions import CloseSpider
class LoginSpider(scrapy.Spider):
    """Spider that logs in to quotes.toscrape.com and collects Goodreads links.

    The crawl starts at the login page, submits the login form together with
    the CSRF token found on that page, and then walks the paginated listing,
    emitting one item per Goodreads link found inside a ``.quote`` element.
    """

    name = 'login-spider'
    start_urls = ['http://quotes.toscrape.com/login']

    def parse(self, response):
        """Submit the login form using the CSRF token from the login page.

        Yields:
            A ``scrapy.FormRequest`` posting the credentials; its response is
            handled by ``parse_author_links``.

        Raises:
            CloseSpider: if the login page contains no ``csrf_token`` input.
        """
        self.log('visitei a página de login: {}'.format(response.url))
        token = response.css('input[name="csrf_token"]::attr(value)').extract_first()
        if token is None:
            # extract_first() returns None when nothing matches, and a None
            # form value cannot be encoded by FormRequest. Stop with an
            # explicit reason instead of failing with an opaque encoding error.
            raise CloseSpider('token CSRF não encontrado na página de login')
        yield scrapy.FormRequest(
            url='http://quotes.toscrape.com/login',
            formdata={
                'username': 'john.doe',
                'password': 'anything',
                'csrf_token': token,
            },
            callback=self.parse_author_links,
        )

    def parse_author_links(self, response):
        """Extract Goodreads links from a listing page and follow pagination.

        This callback handles both the response to the login POST and every
        subsequent "next" page, so the logout-link check and the log message
        below run once per page.

        Yields:
            ``{'link': <href>}`` for each matching link, followed by a
            ``scrapy.Request`` for the next page when one exists.

        Raises:
            CloseSpider: if the page has no logout link, which is treated as
                an authentication failure.
        """
        has_logout_link = response.css('a[href="/logout"]').extract_first()
        if not has_logout_link:
            raise CloseSpider('falha de autenticação')
        self.log('acabei de fazer login')

        links = response.css('.quote a[href*="goodreads.com"]::attr(href)').extract()
        for link in links:
            yield {'link': link}

        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page:
            # The href is resolved against the current page URL via urljoin.
            yield scrapy.Request(
                url=response.urljoin(next_page),
                callback=self.parse_author_links,
            )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment