Created
December 6, 2016 13:37
-
-
Save felipebastosweb/c6048771ee388bdff67d1d23fbd9d24f to your computer and use it in GitHub Desktop.
Nova tentativa de fazer crawler de raças de cachorro (incompleto)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scrapy


class DogBreed(scrapy.Spider):
    """Spider that collects dog breed names and descriptions from dogtime.com.

    Starts at the breed index page, follows every breed link found there and
    yields one item per breed article page.
    """

    name = 'dogbreed'
    start_urls = ['http://dogtime.com/dog-breeds']

    def parse(self, response):
        """Follow each breed link listed on the index page.

        Yields one ``scrapy.Request`` per link; each is handled by
        ``parse_breed``.
        """
        # The item data is read directly from the article page, not from the
        # index page.
        #
        # NOTE: an earlier, abandoned attempt built the item here on the index
        # page and called ``.css()`` on a freshly created ``scrapy.Request``
        # to read the description. A Request is not a downloaded response, so
        # the description has to be extracted in the callback instead.
        breed_links = response.css(
            'div.article-crumbs > div.group-list-item > h2 > a::attr(href)'
        ).extract()
        for href in breed_links:
            yield scrapy.Request(
                response.urljoin(href),
                callback=self.parse_breed,
            )

    def parse_breed(self, response):
        """Extract the breed name and description from a breed article page.

        Yields a dict with the keys ``'name'`` and ``'description'``; a value
        is ``None`` when the corresponding element is not found on the page.
        """
        name = response.css(
            'div.category-article-main > header > h1::text'
        ).extract_first()
        # ``h2+p::text`` only returned the first direct text node, so the
        # description was cut off at the first inline element (<a>, <em>, ...).
        # ``string(.)`` yields the full text content of the paragraph.
        description = response.css(
            'div.category-article-main > header > h2+p'
        ).xpath('string(.)').extract_first()
        yield {
            'name': name,
            # 'url' is not emitted yet; ``response.url`` would provide it.
            'description': description,
        }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Para corrigir o bug do parágrafo da description, corrija a linha 30 com o seguinte código: `response.css('div.category-article-main > header > h2+p').extract_first()`