Created
April 20, 2014 19:07
-
-
Save jazzido/11122358 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import sys | |
| import mechanize | |
| import logging | |
| import unicodecsv | |
| from datetime import datetime | |
| from bs4 import BeautifulSoup | |
# Surface mechanize's own log records on stderr, at INFO level and above.
logger = logging.getLogger("mechanize")
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(sys.stderr))
def log(m):
    """Write *m* to stderr, prefixed with the current local timestamp.

    The emitted line has the form ``<datetime.now()> -- <m>`` and ends with
    a newline.

    ``sys.stderr.write`` is used instead of the Python 2 only
    ``print >>sys.stderr, ...`` statement: on Python 3 that statement is
    parsed as an expression and fails at runtime, whereas ``write`` behaves
    the same on both major versions.
    """
    sys.stderr.write("%s -- %s\n" % (datetime.now(), m))
def postback(br, event):
    """Point the selected form's ``__EVENTTARGET`` field at *event*.

    Only the form state is changed; the caller is responsible for
    submitting the form afterwards.

    Args:
        br: Browser whose currently selected form (``br.form``) is edited.
        event: Value to store in the ``__EVENTTARGET`` control.
    """
    form = br.form
    # The control is made writable first; presumably it is a hidden field
    # that mechanize would otherwise refuse to modify.
    form.find_control(name='__EVENTTARGET').readonly = False
    form['__EVENTTARGET'] = event
def scrape(url, start_date, end_date, demandante_id, modalidad_id, br):
    """Run the advanced search at *url* and yield one dict per result row.

    The form named ``aspnetForm`` is filled with the date range, the
    contracting-modality item and the requesting-organisation id, then
    submitted through ``postback``.  Each result page is parsed with
    ``scrape_table`` and its rows are yielded lazily; following pages are
    requested by writing the page number into the pagination control and
    posting back again.

    Args:
        url: Address of the search page.
        start_date: Lower bound of the search, sent as ``dd-mm-YYYY``.
        end_date: Upper bound of the search, sent as ``dd-mm-YYYY``.
        demandante_id: Written (as a string) into the ``hidIdOrgC`` control.
        modalidad_id: Passed (as a string) to ``.get()`` on the
            ``lstModalidadContratacion`` control to pick the item that is
            then marked as selected.
        br: Browser used for every request (the script passes a
            ``mechanize.Browser``).  Its ``addheaders`` attribute is
            replaced.

    Yields:
        dict: One record per table row, as produced by ``scrape_table``.
    """
    br.addheaders = [('User-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36')]

    # The landing page itself is not parsed; it is opened only so that the
    # search form can be selected and filled in.
    br.open(url)
    br.select_form(name="aspnetForm")
    form = br.form

    form.find_control(id='ctl00_ContentPlaceHolder1_txtFechaDesde').value = \
        start_date.strftime('%d-%m-%Y')
    form.find_control(id='ctl00_ContentPlaceHolder1_txtFechaHasta').value = \
        end_date.strftime('%d-%m-%Y')

    modalidad = form.find_control(
        id='ctl00_ContentPlaceHolder1_lstModalidadContratacion'
    ).get(str(modalidad_id))
    modalidad.selected = True

    # The name-based fields (txtNombreDemandante / hidNombreDemandante) are
    # not filled in; only the numeric organisation id below is set.
    org_id = form.find_control(id='ctl00_ContentPlaceHolder1_hidIdOrgC')
    org_id.readonly = False
    org_id.value = str(demandante_id)

    postback(br, 'ctl00$ContentPlaceHolder1$btnBuscar')
    response = br.submit().read()

    # The result page carries the total page count in a form control.
    br.select_form(name="aspnetForm")
    total_pages = int(br.form.find_control(
        id='ctl00_ContentPlaceHolder1_ControlPaginacion_hidTotalPaginas').value)

    records = 0
    log("Scraping %d pages" % total_pages)

    current_page = 1
    while current_page <= total_pages:
        for row in scrape_table(response):
            records += 1
            yield row

        # Stop once the last page has been consumed: requesting page
        # ``total_pages + 1`` would be a wasted round trip whose response
        # is never read.
        if current_page == total_pages:
            break

        current_page += 1
        br.select_form(name="aspnetForm")
        page_number = br.form.find_control(
            id='ctl00_ContentPlaceHolder1_ControlPaginacion_hidNumeroPagina')
        page_number.readonly = False
        page_number.value = str(current_page)
        postback(br, 'ctl00$ContentPlaceHolder1$ControlPaginacion$btnCambiarPagina')
        log("Getting page: %d" % current_page)
        response = br.submit().read()

    log("Scraped %d records" % records)
# Column names, in order: used as the keys of each scraped record and as the
# header of the CSV output.
FIELDS = [
    'id',
    'descripcion',
    'entidad_unidad_de_compra',
    'dependencia',
    'fecha',
    'modalidad',
    'estado',
]
def scrape_table(html):
    """Yield one dict per data row of the results table found in *html*.

    The table is located by its ``adquisicionTablaResultados`` class.  Each
    row's ``<td>`` texts are paired positionally with ``FIELDS``; because
    ``zip`` stops at the shorter sequence, surplus cells are dropped and a
    short row simply lacks the trailing keys.

    Args:
        html: Markup of a result page, handed to ``BeautifulSoup``.

    Yields:
        dict: Mapping of field name to cell text for one row.
    """
    # NOTE(review): no parser is named, so bs4 picks one itself -- confirm
    # the parse does not depend on which parsers are installed.
    soup = BeautifulSoup(html)
    results = soup.find(attrs={'class': 'adquisicionTablaResultados'})
    rows = results.findChildren('tr')
    # The first <tr> is skipped (presumably the header row).
    for row in rows[1:]:
        cells = [cell.text for cell in row.findChildren('td')]
        yield dict(zip(FIELDS, cells))
if __name__ == '__main__':
    # Script entry point: write every scraped record to stdout as CSV,
    # preceded by a header row built from FIELDS.
    writer = unicodecsv.DictWriter(sys.stdout, FIELDS)
    writer.writeheader()

    browser = mechanize.Browser()
    search_url = 'https://www.panamacompra.gob.pa/ambientepublico/AP_BusquedaAvanzada.aspx'
    records = scrape(
        search_url,
        datetime(2009, 9, 1),
        datetime.now(),
        demandante_id=1845262,
        modalidad_id=400,
        br=browser,
    )
    for record in records:
        writer.writerow(record)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hola @jazzido, he tratado de ejecutar el código con la página actual, pero me genera un error. Pregunta: ¿tienes alguna actualización del código que funcione actualmente? Muchas gracias.