Skip to content

Instantly share code, notes, and snippets.

@felipebastosweb
Created December 6, 2016 13:37
Show Gist options
  • Save felipebastosweb/c6048771ee388bdff67d1d23fbd9d24f to your computer and use it in GitHub Desktop.
Save felipebastosweb/c6048771ee388bdff67d1d23fbd9d24f to your computer and use it in GitHub Desktop.
Nova tentativa de fazer crawler de raças de cachorro (incompleto)
import scrapy
class DogBreed(scrapy.Spider):
    """Spider that collects dog breed names and descriptions from dogtime.com.

    The crawl starts at the breed index page, follows every breed link found
    there, and yields one item per breed article.
    """

    name = 'dogbreed'
    start_urls = ['http://dogtime.com/dog-breeds']

    def parse(self, response):
        """Follow each breed link on the index page.

        Args:
            response: The Scrapy response for the breed index page.

        Yields:
            scrapy.Request: One request per breed article, handled by
            ``parse_breed``.
        """
        # The article data is scraped from the article page itself. A Request
        # is only a description of a fetch and has no ``.css()`` method, so
        # the fields cannot be read here; they are extracted in the callback
        # once the article response arrives.
        breed_links = response.css(
            'div.article-crumbs > div.group-list-item > h2 > a::attr(href)'
        ).extract()
        for href in breed_links:
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.parse_breed,
            )

    def parse_breed(self, response):
        """Extract the breed name and description from an article page.

        Args:
            response: The Scrapy response for a single breed article.

        Yields:
            dict: An item with the keys ``name``, ``url`` and ``description``.
            ``name`` and ``description`` are ``None`` when the corresponding
            element is not found on the page.
        """
        name = response.css(
            'div.category-article-main > header > h1::text'
        ).extract_first()

        # ``p::text`` + ``extract_first()`` returns only the first direct text
        # node, which truncates paragraphs containing inline markup (links,
        # emphasis, ...). ``*::text`` selects every descendant text node, so
        # the fragments are joined to rebuild the full paragraph text.
        fragments = response.css(
            'div.category-article-main > header > h2+p *::text'
        ).extract()
        description = ''.join(fragments).strip() or None

        yield {
            'name': name,
            'url': response.url,
            'description': description,
        }
@felipebastosweb
Copy link
Author

Para corrigir o bug do parágrafo da description, substitua a linha 30 pelo seguinte código: response.css('div.category-article-main > header > h2+p').extract_first()

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment