-
-
Save rochacbruno/9254786 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # coding=utf-8 | |
| """ | |
| **Implementa um Webcrawler para extracao de dados da pesquisa de media de precos realizada periodicamente pela ANP** | |
| Desenvolvido por Fabio C. Barrioneuvo da Luz. - 2013 | |
| Simple crawler to ANP site | |
| Copyright (C) 2013 Fabio C. Barrioneuvo da Luz. | |
| This program is free software; you can redistribute it and/or | |
| modify it under the terms of the GNU General Public License | |
| as published by the Free Software Foundation; either version 2 | |
| of the License, or (at your option) any later version. | |
| This program is distributed in the hope that it will be useful, | |
| but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| GNU General Public License for more details. | |
| You should have received a copy of the GNU General Public License | |
| along with this program; if not, write to the Free Software | |
| Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | |
| """ | |
| import re | |
| try: | |
| from splinter import Browser | |
| except ImportError: | |
| print('please, install splinter\npip install splinter') | |
| try: | |
| from lxml import html | |
| except ImportError: | |
| print('please, install lxml\npip install lxml') | |
| from time import sleep | |
# XPath expressions evaluated against the final result page.
# The first four address rows 2-5 (second column) of the first table inside the
# "postos_nota_fiscal" fieldset; each constant is named after the statistic the
# row is expected to hold (mean, standard deviation, minimum, maximum).
XPATH_MEDIA = '//*[@id="postos_nota_fiscal"]/fieldset/div[1]/table/tbody/tr[2]/td[2]/text()'
XPATH_DESVIO_PADRAO = '//*[@id="postos_nota_fiscal"]/fieldset/div[1]/table/tbody/tr[3]/td[2]/text()'
XPATH_VALOR_MINIMO = '//*[@id="postos_nota_fiscal"]/fieldset/div[1]/table/tbody/tr[4]/td[2]/text()'
XPATH_VALOR_MAXIMO = '//*[@id="postos_nota_fiscal"]/fieldset/div[1]/table/tbody/tr[5]/td[2]/text()'
# Text of the third <h3> under "conteudo"; used as the survey period header.
XPATH_PERIODO_APURACAO = '//*[@id="conteudo"]/div/div/h3[3]/text()'
# NOTE(review): not referenced by the code visible in this file.
XPATH_TABELA = '//*[@id="postos_nota_fiscal"]/div/table'

# (display name, value selected in the 'selCombustivel' form field) pairs.
COMBUSTIVEIS = (
    ('Gasolina', '487*Gasolina'),
    ('Diesel', '532*Diesel'),
    ('Diesel S10', '812*Diesel@S10'),
    ('Etanol', '643*Etanol'),
)

# Characters stripped from the period header: ASCII letters, spaces,
# backslashes, colons and square brackets.  Written as a raw string so the
# regex escapes are not parsed as (invalid) Python string escapes; the
# resulting pattern is identical to the previous non-raw literal.
COMPILED_RE_CARACTERES_REMOVER = re.compile(r'[A-Za-z \\:\[\]]')
def _verificar_preco(browser, nome_combustivel, cod_combustivel):
    """
    Run the crawler for a single fuel and return its price-survey figures.

    Drives the browser through the ANP price site: opens the start page,
    follows the 'Por Estado' link, selects the state 'TO*TOCANTINS' and the
    given fuel, submits the form, follows the 'Palmas' link and then reads
    the figures from the resulting page.

    :browser: Splinter Browser instance ( splinter.cobrateam.info )
    :nome_combustivel: Fuel name; used in the progress message and as the
        key of the returned dictionary
    :cod_combustivel: Fuel code in the format used by the ANP site (the value
        selected in the 'selCombustivel' field)
    :returns: ``{nome_combustivel: {...}}`` where the inner dictionary has the
        keys 'periodo_apuracao', 'media', 'desvio_padrao', 'valor_minimo'
        and 'valor_maximo', all holding strings
    :raises IndexError: if one of the statistic XPaths matches nothing on
        the final page
    """
    # print() with one argument prints the same text on Python 2 and 3,
    # unlike the Python 2-only print statement.
    print('Verificando combustivel {0}'.format(nome_combustivel))

    url = "http://www.anp.gov.br/preco/"
    # First page: site entry point.
    browser.visit(url)

    # Second page: search by state.
    browser.find_link_by_text('Por Estado').click()
    # Fixed pause after each navigation; presumably gives the page time to load.
    sleep(1)

    browser.select('selEstado', 'TO*TOCANTINS')
    browser.select('selCombustivel', cod_combustivel)
    # Third page: submit the form through the element with id 'image1'.
    browser.find_by_id('image1').click()
    sleep(1)

    # Fourth page: results for the city.
    browser.find_link_by_text('Palmas').click()
    sleep(1)

    tree = html.fromstring(browser.html)

    def primeiro_texto(xpath):
        # First node matched by `xpath`, rendered as a string.
        return '{0}'.format(tree.xpath(xpath)[0])

    # The whole xpath result (not its first item) is formatted here, so the
    # cleanup below operates on the string form of that result: every 'a'
    # becomes '-' and then letters, spaces, backslashes, colons and square
    # brackets are removed.
    periodo_apuracao = '{0}'.format(tree.xpath(XPATH_PERIODO_APURACAO))
    periodo_apuracao = periodo_apuracao.replace('a', '-')
    periodo_apuracao = COMPILED_RE_CARACTERES_REMOVER.sub("", periodo_apuracao)

    return {
        nome_combustivel: {
            'periodo_apuracao': periodo_apuracao,
            'media': primeiro_texto(XPATH_MEDIA),
            'desvio_padrao': primeiro_texto(XPATH_DESVIO_PADRAO),
            'valor_minimo': primeiro_texto(XPATH_VALOR_MINIMO),
            'valor_maximo': primeiro_texto(XPATH_VALOR_MAXIMO),
        },
    }
def verificar_precos_combustiveis():
    """
    Run the crawler once for every fuel listed in COMBUSTIVEIS.

    A single browser session is opened and reused for all fuels.

    :returns: a list with one entry per fuel, in COMBUSTIVEIS order; each
        entry is the dictionary returned by _verificar_preco for that fuel
    """
    # No driver name is passed, so splinter's default driver is used.
    # 'phantomjs' and 'zope.testbrowser' were the alternatives previously
    # noted here by the author.
    with Browser() as navegador:
        resultados = [
            _verificar_preco(navegador, nome, codigo)
            for nome, codigo in COMBUSTIVEIS
        ]
    return resultados
if __name__ == "__main__":
    # Script entry point: crawl every fuel and pretty-print the result.
    import pprint

    # A narrow width makes pprint wrap the nested dictionaries over
    # several lines.
    pprint.pprint(verificar_precos_combustiveis(), width=20)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment