Created
October 20, 2016 08:42
-
-
Save juanriaza/e9213fc1d6d017c3b750234588638875 to your computer and use it in GitHub Desktop.
diputados scrapy
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scrapy | |
class DiputadosSpider(scrapy.Spider):
    """Crawl the congreso.es deputies section and yield one item per deputy.

    The crawl has three stages:

    1. ``parse`` follows the link found inside the ``btn_mas`` div of the
       start page.
    2. ``parse_lista_diputados`` schedules one request per link found in
       the ``listado_1`` list and follows the "Página Siguiente" link
       (presumably the "next page" pagination control) when present.
    3. ``parse_diputado`` yields a dict with the deputy's name and URL.
    """

    name = 'diputados'
    start_urls = ['http://www.congreso.es/portal/page/portal/Congreso/Congreso/Diputados']

    def parse(self, response):
        """Follow the ``btn_mas`` link on the start page.

        Yields a single request handled by ``parse_lista_diputados``, or
        nothing (after logging a warning) when the link is not found.
        """
        lista_diputados_url = response.xpath(
            '//div[@id="btn_mas"]/a/@href').extract_first()
        if not lista_diputados_url:
            # extract_first() returns None when nothing matches; building a
            # Request from None would raise, so stop here instead.
            self.logger.warning(
                'No deputies list link found on %s', response.url)
            return
        # urljoin() keeps absolute URLs untouched and resolves relative
        # ones, which scrapy.Request would otherwise reject.
        yield scrapy.Request(
            response.urljoin(lista_diputados_url),
            callback=self.parse_lista_diputados)

    def parse_lista_diputados(self, response):
        """Schedule every deputy linked from this page, then the next page.

        Yields one request per link in the ``listado_1`` list (handled by
        ``parse_diputado``) and, if a "Página Siguiente" link exists, one
        more request handled by this same method.
        """
        # List of deputies on the current page.
        diputados = response.xpath(
            '//div[@class="listado_1"]/ul/li/a/@href').extract()
        for diputado in diputados:
            yield scrapy.Request(
                response.urljoin(diputado),
                callback=self.parse_diputado)

        # Next page, if any.
        pagina_siguiente = response.xpath(
            '//a[contains(., "Página Siguiente")]/@href').extract_first()
        if pagina_siguiente:
            # Resolve against the current page, consistently with the
            # per-deputy links above.
            yield scrapy.Request(
                response.urljoin(pagina_siguiente),
                callback=self.parse_lista_diputados)

    def parse_diputado(self, response):
        """Yield a dict with the deputy's name and the page URL.

        ``nombre`` is the first text node of the ``nombre_dip`` div, or
        ``None`` when that div is not found.
        """
        nombre = response.xpath(
            '//div[@class="nombre_dip"]/text()').extract_first()
        yield {
            'nombre': nombre,
            'url': response.url,
        }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment