Created
July 31, 2024 12:44
-
-
Save pcbje/bff709cde31f2805522c1aaeef2bd2f8 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import re | |
def extract_count(line):
    """Decode a count spelled out as four Spanish digit words.

    The line is lower-cased and scanned for digit words (``cero`` through
    ``nueve``) that are preceded by a space and followed by a space, a
    comma, or the end of the line.

    Returns the integer formed by the four digits in order of appearance,
    or ``None`` when the line does not contain exactly four such words.
    """
    digit_for_word = {
        'cero': '0',
        'uno': '1',
        'dos': '2',
        'tres': '3',
        'cuatro': '4',
        'cinco': '5',
        'seis': '6',
        'siete': '7',
        'ocho': '8',
        'nueve': '9',
    }
    words = re.findall(' (cero|uno|dos|tres|cuatro|cinco|seis|siete|ocho|nueve)(?:$|,| )', line.lower())
    if len(words) != 4:
        return None
    # Every captured word is one of the ten alternatives above, so the
    # lookup cannot miss.
    return int(''.join(digit_for_word[word] for word in words))
def find_first_value(lines, prefix):
    """Return the cleaned value from the first line that mentions ``prefix``.

    The search is case-insensitive. From the first matching line, every
    occurrence of ``prefix`` (in any letter case), every ``.`` and every
    ``,`` is removed, and the result is stripped of surrounding whitespace.

    Args:
        lines: Sequence of text lines to search.
        prefix: Label to look for, e.g. ``'Estado EDO'``.

    Returns:
        The cleaned remainder of the first matching line, or ``''`` when no
        line contains ``prefix``.
    """
    needle = prefix.lower()
    # Remove the label with the same case-insensitivity used to find it;
    # a case-sensitive replace left e.g. "ESTADO EDO" in the returned value.
    label = re.compile(re.escape(prefix), re.IGNORECASE)
    for line in lines:
        if needle in line.lower():
            # Only the matching line is used. The earlier next-line check
            # required that line to be both non-empty and empty, so it never
            # appended anything, and indexing past the end raised IndexError
            # when the match was on the last line.
            value = label.sub('', line)
            return value.replace('.', '').replace(',', '').strip()
    return ''
# Header labels for the per-candidate totals, in the order the script
# appends them to the output header.
candidates_list = [
    'Nicolas Maduro', 'Luis Martinez', 'Javier Bertucci', 'Jose Brito',
    'Antonio Ecarri', 'Claudio Fermin', 'Daniel Ceballos', 'Edmundo Gonzalez',
    'Enrique Marques', 'Benjamin Rasseo',
]

# Header labels for the two voter counts.
votantes_list = [
    'En la maquina de votación',
    'En la cuaderno de votación',
]

# Header labels for the vote summary figures.
resumen_de_votos = [
    'Votos', 'Votos validos', 'Votos parciales', 'Votos nulos', 'Votos vacios',
    'Opciones validas', 'Opciones nulas', 'Opciones vacias',
]
def process_acta_text(out, lines, image_url):
    """Parse one acta's text lines and write a tab-separated row to ``out``.

    Every line is passed through ``extract_count``; lines that yield a
    number contribute to an ordered list of counts. Nothing is written when
    that list does not have exactly 48 entries, or when the per-candidate
    totals do not add up to the 'Votos validos' figure.

    Args:
        out: Writable text stream that receives the row and a newline.
        lines: The acta's text lines.
        image_url: Value written in the first column of the row.

    Returns:
        None in every case.
    """
    counts = [
        parsed
        for parsed in (extract_count(text) for text in lines)
        if parsed is not None
    ]
    # an acta is supposed to have exactly 48 counts (e.g. Cero, Cero, Cero, Cero).
    # if we found less/more, we have not been able to process the file correctly
    if len(counts) != 48:
        return

    pais = find_first_value(lines, 'Pais')
    estado = find_first_value(lines, 'Estado EDO')
    if not estado:
        estado = find_first_value(lines, 'Estado')
    municipio = find_first_value(lines, 'Municipio MP')
    parroquia = find_first_value(lines, 'Parroquia PQ')
    electores = find_first_value(lines, 'Electores')

    # counts[0] and counts[1] are the two voter figures
    # (voting machine, voting notebook).
    voter_counts = counts[0:2]

    # Candidate name and how many consecutive per-party counts belong to
    # that candidate; the candidate slices start right after the voter
    # figures and together cover counts[2:40].
    parties_per_candidate = (
        ('Nicolas Maduro', 13),
        ('Luis Martinez', 6),
        ('Javier Bertucci', 1),
        ('Jose Brito', 4),
        ('Antonio Ecarri', 6),
        ('Claudio Fermin', 1),
        ('Daniel Ceballos', 2),
        ('Edmundo Gonzalez', 3),
        ('Enrique Marques', 1),
        ('Benjamin Rasseo', 1),
    )
    candidate_totals = []
    offset = 2
    for _candidate, party_count in parties_per_candidate:
        candidate_totals.append(sum(counts[offset:offset + party_count]))
        offset += party_count

    # The last eight counts are the summary figures, in this order.
    summary_labels = (
        'Votos',
        'Votos validos',
        'Votos parciales',
        'Votos nulos',
        'Votos vacios',
        'Opciones validas',
        'Opciones nulas',
        'Opciones vacias',
    )
    summary = counts[40:48]

    row = [image_url, pais, estado, municipio, parroquia, electores]
    row.extend(voter_counts)
    row.extend(candidate_totals)
    row.extend(summary)

    if sum(candidate_totals) != summary[summary_labels.index('Votos validos')]:
        # Not a valid count
        return

    out.write('\t'.join(str(cell) for cell in row))
    out.write('\n')
# Read the known acta image URLs once, one per line, with surrounding
# whitespace removed.
with open('image-urls.txt') as url_file:
    image_urls = [entry.strip() for entry in url_file]
def get_image_url(name):
    """Return the first known image URL containing ``name`` minus '.txt'.

    Every occurrence of '.txt' is removed from ``name`` and the remainder is
    looked for as a substring of each entry in ``image_urls``, in order.
    Returns ``None`` when no entry contains it.
    """
    return next(
        (candidate for candidate in image_urls
         if name.replace('.txt', '') in candidate),
        None,
    )
if __name__ == '__main__':
    # Header: fixed metadata columns, then the voter, candidate and summary
    # labels from the module-level lists.
    header = [
        'Acta image URL',
        'Pais',
        'Estado',
        'Municipio',
        'Parroquia',
        'Electores',
    ]
    header.extend(votantes_list)
    header.extend(candidates_list)
    header.extend(resumen_de_votos)

    text_dir = 'extracted_text'
    with open('processed-actas.csv', 'w') as out:
        out.write('\t'.join(header))
        out.write('\n')
        for name in os.listdir(text_dir):
            if name.endswith('.txt'):
                with open(os.path.join(text_dir, name)) as inp:
                    lines = [raw.strip() for raw in inp.readlines()]
                image_url = get_image_url(name)
                # Text files with no matching image URL are skipped.
                if image_url is not None:
                    process_acta_text(out, lines, image_url)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment