Skip to content

Instantly share code, notes, and snippets.

@marsam
Last active December 19, 2015 17:09
Show Gist options
  • Select an option

  • Save marsam/5989056 to your computer and use it in GitHub Desktop.

Select an option

Save marsam/5989056 to your computer and use it in GitHub Desktop.
A simple onpe scraper
#!/usr/bin/env python
# -*- coding: utf-8 -*-
try:
from urllib import urlencode
from urlparse import urlparse, urljoin, parse_qsl
except ImportError:
from urllib.parse import urlparse, urljoin, urlencode, parse_qsl
from lxml.html import parse
def queryargs(url):
    """Return the query-string parameters of *url* as a dict.

    Values are returned as strings, exactly as ``parse_qsl`` yields them;
    no numeric conversion is attempted. Because the pairs are fed to
    ``dict``, a parameter that appears several times keeps only its last
    value.

    >>> queryargs('http://example.com/?a=42&t=5') == {'a': '42', 't': '5'}
    True
    """
    query = urlparse(url).query
    return dict(parse_qsl(query))
def text_content(elements):
    """Join the ``text_content()`` of every element, in iteration order.

    An empty *elements* yields the empty string.
    """
    pieces = [element.text_content() for element in elements]
    return ''.join(pieces)
def _parse_locales(filename_or_url):
    """Collect the ``value`` attribute of every ``<option>`` in a document.

    The document at *filename_or_url* is loaded with ``parse``. Options
    whose ``value`` attribute is missing or empty are skipped.
    """
    document = parse(filename_or_url)
    values = []
    for option in document.xpath('//option'):
        value = option.get('value')
        if value:
            values.append(value)
    return values
def _parse_actas(filename_or_url):
    """Return the ``cnume_acta`` values linked from the results table.

    The document at *filename_or_url* is loaded with ``parse``. Only the
    first cell matched by ``/html/body/fieldset/table/tr/td[5]`` is
    inspected: each of its child elements is expected to carry an ``href``
    whose query string contains a ``cnume_acta`` parameter.

    Returns an empty list when the document has no matching cell, so a
    page without results does not abort the caller's iteration.

    Raises ``KeyError`` if a child's ``href`` has no ``cnume_acta``
    parameter.
    """
    root = parse(filename_or_url)
    cells = root.xpath('/html/body/fieldset/table/tr/td[5]')
    if not cells:
        # Nothing matched the expected layout: report "no actas" rather
        # than failing with IndexError on the [0] lookup.
        return []
    return [queryargs(link.get('href'))['cnume_acta'] for link in cells[0]]
def _parse_actainfo(filename_or_url):
    """Extract the header fields and vote counts of one acta page.

    The document at *filename_or_url* is loaded with ``parse`` and read
    cell by cell through fixed table/row/column positions. The result is::

        {'acta': {<field>: <text>, ...},
         'votos': {'validos': {<label text>: <count text>, ...},
                   'blancos': ..., 'nulos': ...,
                   'impugnados': ..., 'total': ...}}

    Every value is the raw text of the cell; a position that matches
    nothing yields an empty string.
    """
    document = parse(filename_or_url)

    def cell(table, row, column):
        # Build the absolute XPath of one cell and return its text.
        expression = '/html/body/table[{0}]/tr[{1}]/td[{2}]'.format(
            table, row, column)
        return text_content(document.xpath(expression))

    # (field name, row, column) inside the second table of the page.
    acta_layout = (
        ('mesa', 1, 5),
        ('copia', 2, 4),
        ('departamento', 4, 4),
        ('provincia', 5, 4),
        ('distrito', 6, 4),
        ('local', 7, 4),
        ('direccion', 8, 4),
        ('electores', 10, 4),
        ('votaron', 11, 4),
        ('estado', 12, 4),
        ('historial', 13, 4),
    )
    acta = {}
    for field, row, column in acta_layout:
        acta[field] = cell(2, row, column)

    # Rows of the third table read as label (column 1) -> count (column 3).
    # Row 10 is not read.
    valid_rows = (3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14)
    validos = {}
    for row in valid_rows:
        label = cell(3, row, 1)
        validos[label] = cell(3, row, 3)

    # (field name, row) of the summary lines; the figure is in column 2.
    summary_layout = (
        ('blancos', 17),
        ('nulos', 18),
        ('impugnados', 19),
        ('total', 20),
    )
    votos = {'validos': validos}
    for field, row in summary_layout:
        votos[field] = cell(3, row, 2)

    return {'acta': acta, 'votos': votos}
class OnpeScraper(object):
    """Scraper for the ONPE result pages that live under a base URL.

    Each public method builds a URL relative to ``baseurl`` and passes it
    to one of the module-level ``_parse_*`` functions, which fetch and
    parse the page.
    """

    def __init__(self, baseurl):
        """Remember *baseurl*, the URL every request path is joined to."""
        self.baseurl = baseurl

    def _url(self, path, params=None):
        """Return *path* resolved against the base URL, plus a query string.

        *params* is anything ``urlencode`` accepts. When it is ``None`` or
        empty, the bare URL is returned without a dangling ``?``.
        """
        url = urljoin(self.baseurl, path)
        if not params:
            return url
        return '{0}?{1}'.format(url, urlencode(params))

    def get_locales(self, ubigeo):
        """Return the option values listed by ``extras/locales.php``.

        *ubigeo* is sent as the ``elegido`` query parameter.
        """
        url = self._url('extras/locales.php', {'elegido': ubigeo})
        return _parse_locales(url)

    def get_actainfo(self, acta):
        """Return the parsed detail page of the acta numbered *acta*."""
        url = self._url('rep_mesas_det_pre.php', {'cnume_acta': acta})
        return _parse_actainfo(url)

    def get_actas(self, ubigeo, local):
        """Return the acta numbers found for *local* within *ubigeo*.

        *ubigeo* must support slicing (e.g. a string): the ``prov`` and
        ``dpto`` parameters are derived from its first four and first two
        characters, padded with zeros.
        """
        params = {
            'estado': 'T',
            'estado2': 'T',
            'ambito1': 'P',
            'tipo_consulta1': 'UBIGEO',
            'local': local,
            'dist': ubigeo,
            'prov': '{0}00'.format(ubigeo[0:4]),
            'dpto': '{0}0000'.format(ubigeo[0:2]),
        }
        url = self._url('extras/buscar_ubigeo_actas.php', params)
        return _parse_actas(url)

    def get_ubigeos(self):
        """Yield every option value of ``extras/distritos.php``.

        ``extras/provincias.php`` is queried once with ``elegido=1``; each
        value it lists is then used as ``elegido`` for the districts page.
        """
        provinces_url = self._url('extras/provincias.php', {'elegido': '1'})
        for prov_ubigeo in _parse_locales(provinces_url):
            districts_url = self._url('extras/distritos.php',
                                      {'elegido': prov_ubigeo})
            for dist_ubigeo in _parse_locales(districts_url):
                yield dist_ubigeo

    def actas_from_ubigeo(self, ubigeo):
        """Yield the acta numbers of every local listed for *ubigeo*."""
        for local in self.get_locales(ubigeo):
            for acta in self.get_actas(ubigeo, local):
                yield acta
if __name__ == '__main__':
    # Demo run: print two sample actas, then walk every ubigeo and print
    # each acta found under it.
    scraper = OnpeScraper('http://www.web.onpe.gob.pe/modElecciones/elecciones/elecciones2011/1ravuelta/onpe/presidente/')
    for sample_acta in ('159649', '168621'):
        print(scraper.get_actainfo(sample_acta))
    for district in scraper.get_ubigeos():
        print('Getting actas from ubigeo: ', district)
        for acta_number in scraper.actas_from_ubigeo(district):
            print(scraper.get_actainfo(acta_number))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment