Last active
April 7, 2016 02:27
-
-
Save horacioibrahim/e0a9f9fe76174b71fa6130dbd137a959 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """Imprime os valores pagos pelo governo para determinado favorecido, | |
| nos anos selecionados | |
| :favorecido: cnpj | |
| :years: lista de anos para consultar | |
| """ | |
| # built-in | |
| import calendar | |
| import datetime | |
| import time | |
| import random | |
| # third-party | |
| import scrapy | |
| from bs4 import BeautifulSoup as bs | |
def _monthly_urls(base_url, favorecido, years, last_year, last_month):
    """Build one paid-expenses query URL per calendar month.

    :param base_url: query prefix, already ending in ``&``
    :param favorecido: payee identifier placed in ``codigoFavorecido``
    :param years: iterable of years to cover
    :param last_year: year of the last month that may be queried
    :param last_month: last month (1-12) of ``last_year`` that may be queried
    :returns: list of URLs, one per month, in the order the years are given

    Months later than ``last_year``/``last_month`` are skipped, so no query
    is generated for a period that has not started yet.
    """
    urls = []
    for year in years:
        for month in range(1, 13):
            # Check BEFORE building the URL: every remaining month of this
            # year is past the cut-off, so stop here without queueing it.
            if (year, month) > (last_year, last_month):
                break
            # 1 -> '01', 2 -> '02', ...
            mm = str(month).zfill(2)
            last_day = calendar.monthrange(year, month)[1]
            # Dates are dd/mm/yyyy with '/' percent-encoded as %2F, e.g.
            #   01%2F01%2F2012 => 01/01/2012
            #   31%2F01%2F2012 => 31/01/2012
            dt_start = "01%2F{m}%2F{y}".format(m=mm, y=year)
            dt_end = "{d}%2F{m}%2F{y}".format(d=last_day, m=mm, y=year)
            complement = "periodoInicio={dt_start}&periodoFim={dt_end}&fase=PAG&codigoOS=TOD&codigoFavorecido={favorecido}".format(
                dt_start=dt_start, dt_end=dt_end, favorecido=favorecido)
            urls.append(''.join([base_url, complement]))
    return urls


class PortalTransparencia(scrapy.Spider):
    """Spider that records the amounts paid to one payee, month by month.

    ``start_urls`` holds one query URL per month of every year in ``years``,
    up to and including the current month. For each response, the text of
    every ``td.valor`` cell in ``table.tabela`` is appended to ``values.txt``,
    one value per line.
    """

    name = 'transparenciaspider'
    years = [2013, 2014, 2015, 2016]  # TODO via args
    favorecido = '07175665000185'  # TODO via args
    current_year = datetime.datetime.now().year
    current_month = datetime.datetime.now().month
    URL_DESPESAS_PAGAS = "http://www.transparencia.gov.br/despesasdiarias/resultado?consulta=rapida&"

    # Pace the requests through Scrapy's downloader instead of sleeping inside
    # the callback: time.sleep() there blocks the whole reactor and does not
    # space out requests that are already scheduled.
    custom_settings = {
        'DOWNLOAD_DELAY': 5,
        'RANDOMIZE_DOWNLOAD_DELAY': True,
    }

    start_urls = _monthly_urls(
        URL_DESPESAS_PAGAS, favorecido, years, current_year, current_month)

    def parse(self, response):
        """Append every ``td.valor`` cell text of the result table to values.txt.

        The first ``tr`` of ``table.tabela`` is skipped. Rows without a
        ``td.valor`` cell are ignored, and a response with no matching rows
        writes nothing.
        """
        # Slicing (unlike pop(0)) is safe when the page has no rows at all.
        rows = response.css("table.tabela").css('tr')[1:]
        with open("values.txt", "a+") as myfile:
            for row in rows:
                cells = row.css('td.valor').extract()
                if not cells:
                    # Not every row necessarily carries a value cell.
                    continue
                text = bs(cells[0], 'html.parser').text
                val = str(text.replace(' ', '').strip())
                myfile.write(val + '\n')
        # The with-block closes the file; no explicit close() is needed.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment