Skip to content

Instantly share code, notes, and snippets.

@horacioibrahim
Last active April 7, 2016 02:27
Show Gist options
  • Select an option

  • Save horacioibrahim/e0a9f9fe76174b71fa6130dbd137a959 to your computer and use it in GitHub Desktop.

Select an option

Save horacioibrahim/e0a9f9fe76174b71fa6130dbd137a959 to your computer and use it in GitHub Desktop.
"""Imprime os valores pagos pelo governo para determinado favorecido,
nos anos selecionados
:favorecido: cnpj
:years: lista de anos para consultar
"""
# built-in
import calendar
import datetime
import time
import random
# third-party
import scrapy
from bs4 import BeautifulSoup as bs
class PortalTransparencia(scrapy.Spider):
    """Spider that collects payments made by the government to one payee.

    At class-definition time one search URL is generated for every calendar
    month of each year in ``years`` (never past the current month) and
    stored in ``start_urls``. For each response, the text of the first
    ``td.valor`` cell of every table row except the first is appended, one
    value per line, to ``values.txt`` in the working directory.
    """

    name = 'transparenciaspider'
    start_urls = []
    years = [2013, 2014, 2015, 2016]  # TODO via args
    favorecido = '07175665000185'  # TODO via args (payee CNPJ)

    # Take a single timestamp so year and month cannot disagree if the
    # class happens to be defined across a month/year rollover.
    _now = datetime.datetime.now()
    current_year = _now.year
    current_month = _now.month

    URL_DESPESAS_PAGAS = "http://www.transparencia.gov.br/despesasdiarias/resultado?consulta=rapida&"

    # Throttle politely through Scrapy's scheduler instead of calling
    # time.sleep() inside the callback, which would block the reactor.
    # With randomisation the effective wait is 0.5x-1.5x DOWNLOAD_DELAY.
    custom_settings = {
        'DOWNLOAD_DELAY': 5.5,
        'RANDOMIZE_DOWNLOAD_DELAY': True,
    }

    # Sample URL-encoded dates ("/" is encoded as %2F):
    #   01%2F01%2F2012 => 01/01/2012
    #   31%2F01%2F2012 => 31/01/2012
    for year in years:
        for month in range(1, 13):
            # Stop BEFORE queueing a period that lies in the future; the
            # tuple comparison also covers years after the current one.
            if (year, month) > (current_year, current_month):
                break
            m = str(month).zfill(2)  # 1 -> '01', 2 -> '02', ...
            d_end = calendar.monthrange(year, month)[1]  # last day of month
            dt_start = "01%2F{m}%2F{y}".format(m=m, y=year)
            dt_end = "{d}%2F{m}%2F{y}".format(d=d_end, m=m, y=year)
            complement = (
                "periodoInicio={dt_start}&periodoFim={dt_end}"
                "&fase=PAG&codigoOS=TOD&codigoFavorecido={favorecido}"
            ).format(dt_start=dt_start, dt_end=dt_end, favorecido=favorecido)
            start_urls.append(''.join([URL_DESPESAS_PAGAS, complement]))

    def parse(self, response):
        """Append the value column of the result table to ``values.txt``.

        :param response: Scrapy response for one monthly search URL.
        :returns: None; the extracted values are written to disk only.
        """
        rows = response.css("table.tabela").css('tr')
        with open("values.txt", "a+") as myfile:
            # The first row is skipped (presumably the table header);
            # slicing is safe even when the table is missing or empty.
            for row in rows[1:]:
                cells = row.css('td.valor').extract()
                if not cells:
                    # Row without a value cell: nothing to record.
                    continue
                # BeautifulSoup strips the surrounding <td> markup.
                text = bs(cells[0], 'html.parser').text
                val = str(text.replace(' ', '').strip())
                myfile.write(val + '\n')
        # The file is closed by the ``with`` block; no explicit close needed.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment