Skip to content

Instantly share code, notes, and snippets.

@jazzido
Created April 20, 2014 19:07
Show Gist options
  • Select an option

  • Save jazzido/11122358 to your computer and use it in GitHub Desktop.

Select an option

Save jazzido/11122358 to your computer and use it in GitHub Desktop.
import sys
import mechanize
import logging
import unicodecsv
from datetime import datetime
from bs4 import BeautifulSoup
# Route mechanize's own log output to stderr at INFO level, so the
# library's HTTP activity shows up alongside this script's log() messages.
logger = logging.getLogger("mechanize")
logger.addHandler(logging.StreamHandler(sys.stderr))
logger.setLevel(logging.INFO)
def log(m):
    """Write a timestamped progress message *m* to stderr.

    Uses sys.stderr.write instead of the Python-2-only ``print >>``
    statement, so the function works under both Python 2 and 3.
    """
    sys.stderr.write("%s -- %s\n" % (datetime.now(), m))
def postback(br, event):
    """Prime the current ASP.NET form for a server postback.

    Unlocks the hidden __EVENTTARGET control on the browser's selected
    form and points it at *event*, mimicking what the page's JavaScript
    __doPostBack() helper would do before submitting.
    """
    target = br.form.find_control(name='__EVENTTARGET')
    target.readonly = False
    br.form['__EVENTTARGET'] = event
def scrape(url, start_date, end_date, demandante_id, modalidad_id, br):
    """Yield one record dict per contract row from the PanamaCompra search.

    Fills in the ASP.NET advanced-search form (date range, contracting
    modality, and the hidden buying-entity id), submits it, then walks
    every result page via the site's pagination postback, yielding the
    dicts produced by scrape_table().

    url           -- the AP_BusquedaAvanzada.aspx search page URL
    start_date    -- datetime; rendered as dd-mm-YYYY into the "desde" field
    end_date      -- datetime; rendered as dd-mm-YYYY into the "hasta" field
    demandante_id -- id written into the hidden hidIdOrgC control
    modalidad_id  -- value selected in the contracting-modality list control
    br            -- a mechanize.Browser instance supplied by the caller
    """
    # Present a real-browser User-Agent; presumably the site rejects or
    # degrades for default library agents -- TODO confirm.
    br.addheaders = [('User-agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36')]
    response = br.open(url)
    br.select_form(name="aspnetForm")
    # Date-range inputs; the site expects dd-mm-YYYY.
    c = br.form.find_control(id='ctl00_ContentPlaceHolder1_txtFechaDesde')
    c.value = start_date.strftime('%d-%m-%Y')
    c = br.form.find_control(id='ctl00_ContentPlaceHolder1_txtFechaHasta')
    c.value = end_date.strftime('%d-%m-%Y')
    # Select the requested contracting modality in the list control.
    c = br.form.find_control(id='ctl00_ContentPlaceHolder1_lstModalidadContratacion') \
        .get(str(modalidad_id))
    c.selected = True
    # c = br.form.find_control(id='ctl00_ContentPlaceHolder1_txtNombreDemandante')
    # c.readonly = False
    # c.value = 'Ministerio de la Presidencia'
    # c = br.form.find_control(id='ctl00_ContentPlaceHolder1_hidNombreDemandante')
    # c.readonly = False
    # c.value = 'Ministerio de la Presidencia'
    # The buying entity is passed through a hidden, normally read-only field.
    c = br.form.find_control(id='ctl00_ContentPlaceHolder1_hidIdOrgC')
    c.readonly = False
    c.value = str(demandante_id)
    # Fire the "Buscar" button's postback and fetch the first results page.
    postback(br, 'ctl00$ContentPlaceHolder1$btnBuscar')
    response = br.submit().read()
    br.select_form(name="aspnetForm")
    # The total page count is exposed in a hidden pagination control.
    total_pages = int(br.form.find_control(id='ctl00_ContentPlaceHolder1_ControlPaginacion_hidTotalPaginas').value)
    current_page = 1
    records = 0
    log("Scraping %d pages" % total_pages)
    while current_page <= total_pages:
        for r in scrape_table(response):
            records += 1
            yield r
        current_page += 1
        # Request the next page: rewrite the hidden page-number field and
        # trigger the pagination postback.
        # NOTE(review): this runs even after the final page has been
        # scraped, so one extra request (for page total_pages + 1) is
        # issued before the loop condition stops the walk.
        br.select_form(name="aspnetForm")
        c = br.form.find_control(id='ctl00_ContentPlaceHolder1_ControlPaginacion_hidNumeroPagina')
        c.readonly = False
        c.value = str(current_page)
        postback(br, 'ctl00$ContentPlaceHolder1$ControlPaginacion$btnCambiarPagina')
        log("Getting page: %d" % current_page)
        response = br.submit().read()
    log("Scraped %d records" % records)
# Column names for one result row, in on-page <td> order.
FIELDS = ['id', 'descripcion', 'entidad_unidad_de_compra', 'dependencia', 'fecha', 'modalidad', 'estado']


def scrape_table(html):
    """Yield one dict per data row of the results table in *html*.

    html -- raw HTML (str or bytes) of a results page.

    Each yielded dict maps FIELDS to the row's cell texts. The first
    <tr> is skipped as the header row.
    """
    # Name the parser explicitly: calling BeautifulSoup(html) without one
    # is deprecated (GuessedAtParserWarning) and can silently pick a
    # different parser -- with different output -- on different machines.
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find(attrs={'class': 'adquisicionTablaResultados'})
    for tr in table.findChildren('tr')[1:]:
        yield dict(zip(FIELDS, [td.text for td in tr.findChildren('td')]))
if __name__ == '__main__':
    # Stream matching contracts for one buying entity as CSV to stdout.
    # Named "writer" (not "csv") to avoid shadowing the stdlib csv module.
    writer = unicodecsv.DictWriter(sys.stdout, FIELDS)
    writer.writeheader()
    br = mechanize.Browser()
    for record in scrape('https://www.panamacompra.gob.pa/ambientepublico/AP_BusquedaAvanzada.aspx',
                         datetime(2009,9,1),   # search window start (hard-coded)
                         datetime.now(),       # search window end: today
                         1845262,              # demandante (buying entity) id
                         400,                  # contracting modality id
                         br):
        writer.writerow(record)
@jorgehsaavedra
Copy link

Hola @jazzido, he tratado de ejecutar el código con la página actual pero me genera un error. Pregunta: ¿tienes alguna actualización del código que funcione actualmente? Muchas gracias.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment