Last active
December 19, 2015 17:09
-
-
Save marsam/5989056 to your computer and use it in GitHub Desktop.
A simple onpe scraper
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python | |
| # -*- coding: utf-8 -*- | |
| try: | |
| from urllib import urlencode | |
| from urlparse import urlparse, urljoin, parse_qsl | |
| except ImportError: | |
| from urllib.parse import urlparse, urljoin, urlencode, parse_qsl | |
| from lxml.html import parse | |
def queryargs(url):
    """Return the query-string parameters of *url* as a dict.

    Names and values are the strings produced by ``parse_qsl``; no numeric
    conversion is performed.  When a parameter name occurs more than once,
    the last occurrence wins because the pairs are fed to ``dict``.

    >>> sorted(queryargs('http://example.com/?a=42&t=5').items())
    [('a', '42'), ('t', '5')]
    """
    query = urlparse(url).query
    return dict(parse_qsl(query))
def text_content(elements):
    """Concatenate the ``text_content()`` of every item in *elements*.

    The pieces are joined with no separator; an empty iterable gives ``''``.
    """
    return ''.join(element.text_content() for element in elements)
def _parse_locales(filename_or_url):
    """Return the non-empty ``value`` attributes of every ``<option>``.

    *filename_or_url* is handed straight to ``lxml.html.parse``.  Options
    whose ``value`` attribute is missing or empty are left out.
    """
    document = parse(filename_or_url)
    values = []
    for option in document.xpath('//option'):
        value = option.get('value')
        if value:
            values.append(value)
    return values
def _parse_actas(filename_or_url):
    """Return the ``cnume_acta`` values linked from a search-result page.

    *filename_or_url* is handed straight to ``lxml.html.parse``.  Only the
    first node matched by ``/html/body/fieldset/table/tr/td[5]`` is used:
    for each of its child elements the ``href`` attribute is parsed with
    ``queryargs`` and its ``cnume_acta`` parameter is collected.

    Returns an empty list when the XPath matches nothing, instead of
    failing with ``IndexError`` on the empty result.

    Raises ``KeyError`` if a child's ``href`` has no ``cnume_acta``
    parameter.
    """
    root = parse(filename_or_url)
    cells = root.xpath('/html/body/fieldset/table/tr/td[5]')
    if not cells:
        # Page without the expected table cell: there are no actas to list.
        return []
    return [queryargs(link.get('href'))['cnume_acta'] for link in cells[0]]
def _parse_actainfo(filename_or_url):
    """Parse an acta detail page into a nested dict of cell texts.

    *filename_or_url* is handed straight to ``lxml.html.parse``.  Every
    value is the concatenated text of the nodes matched by a fixed
    ``/html/body/table[T]/tr[R]/td[C]`` path (an empty string when the
    path matches nothing).

    The result has two keys:

    ``'acta'``
        Fields read from ``table[2]``.
    ``'votos'``
        Fields read from ``table[3]``: ``'validos'`` maps the text of the
        first cell of each listed row to the text of its third cell
        (presumably candidate/party name -> votes; confirm against the
        page), followed by the ``blancos``/``nulos``/``impugnados``/``total``
        cells.
    """
    root = parse(filename_or_url)

    def _cell(table, row, col):
        # Rebuilds the absolute XPath for one table cell and returns its text.
        path = '/html/body/table[{0}]/tr[{1}]/td[{2}]'.format(table, row, col)
        return text_content(root.xpath(path))

    # (key, row, column) of each field inside table[2].
    acta_cells = (
        ('mesa', 1, 5),
        ('copia', 2, 4),
        ('departamento', 4, 4),
        ('provincia', 5, 4),
        ('distrito', 6, 4),
        ('local', 7, 4),
        ('direccion', 8, 4),
        ('electores', 10, 4),
        ('votaron', 11, 4),
        ('estado', 12, 4),
        ('historial', 13, 4),
    )
    # Rows of table[3] whose td[1]/td[3] pair goes into 'validos'
    # (row 10 is not read).
    validos_rows = (3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14)
    # (key, row) of the summary cells, all taken from td[2] of table[3].
    summary_cells = (
        ('blancos', 17),
        ('nulos', 18),
        ('impugnados', 19),
        ('total', 20),
    )

    acta = {}
    for key, row, col in acta_cells:
        acta[key] = _cell(2, row, col)

    validos = {}
    for row in validos_rows:
        validos[_cell(3, row, 1)] = _cell(3, row, 3)

    votos = {'validos': validos}
    for key, row in summary_cells:
        votos[key] = _cell(3, row, 2)

    return {'acta': acta, 'votos': votos}
class OnpeScraper(object):
    """Build request URLs under a base URL and parse the pages they name."""

    def __init__(self, baseurl):
        """Store *baseurl*; every request path is resolved against it."""
        self.baseurl = baseurl

    def _url(self, path, params=None):
        """Resolve *path* against the base URL, optionally adding a query.

        When *params* is not ``None`` it is URL-encoded and appended after
        a ``?`` (the ``?`` is added even if *params* is empty).
        """
        target = urljoin(self.baseurl, path)
        if params is None:
            return target
        return '{0}?{1}'.format(target, urlencode(params))

    def get_locales(self, ubigeo):
        """Return the option values of ``extras/locales.php`` for *ubigeo*."""
        return _parse_locales(
            self._url('extras/locales.php', {'elegido': ubigeo}))

    def get_actainfo(self, acta):
        """Return the parsed detail page of the acta numbered *acta*."""
        return _parse_actainfo(
            self._url('rep_mesas_det_pre.php', {'cnume_acta': acta}))

    def get_actas(self, ubigeo, local):
        """Return the acta numbers listed for *local* within *ubigeo*.

        The ``prov`` and ``dpto`` query parameters are derived from the
        first four and first two characters of *ubigeo*, padded with
        zeros.
        """
        province = '{0}00'.format(ubigeo[0:4])
        department = '{0}0000'.format(ubigeo[0:2])
        query = {
            'estado': 'T',
            'estado2': 'T',
            'ambito1': 'P',
            'tipo_consulta1': 'UBIGEO',
            'local': local,
            'dist': ubigeo,
            'prov': province,
            'dpto': department,
        }
        return _parse_actas(
            self._url('extras/buscar_ubigeo_actas.php', query))

    def get_ubigeos(self):
        """Yield every option value of ``extras/distritos.php``.

        One distritos page is fetched per option value returned by
        ``extras/provincias.php`` (requested with ``elegido=1``).
        """
        provinces_url = self._url('extras/provincias.php', {'elegido': '1'})
        for province in _parse_locales(provinces_url):
            districts_url = self._url(
                'extras/distritos.php', {'elegido': province})
            for district in _parse_locales(districts_url):
                yield district

    def actas_from_ubigeo(self, ubigeo):
        """Yield the acta numbers of every local found for *ubigeo*."""
        for place in self.get_locales(ubigeo):
            for acta_number in self.get_actas(ubigeo, place):
                yield acta_number
if __name__ == '__main__':
    # Demo run: print two individual actas, then walk every ubigeo and
    # print the details of each acta found under it.
    onpe = OnpeScraper('http://www.web.onpe.gob.pe/modElecciones/elecciones/elecciones2011/1ravuelta/onpe/presidente/')
    print(onpe.get_actainfo('159649'))
    print(onpe.get_actainfo('168621'))
    for ubigeo in onpe.get_ubigeos():
        # Format a single string: passing two arguments to print() shows a
        # tuple under Python 2 and a doubled space under Python 3.
        print('Getting actas from ubigeo: {0}'.format(ubigeo))
        for acta in onpe.actas_from_ubigeo(ubigeo):
            print(onpe.get_actainfo(acta))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment