Skip to content

Instantly share code, notes, and snippets.

@pcbje
Created July 31, 2024 12:44
Show Gist options
  • Select an option

  • Save pcbje/bff709cde31f2805522c1aaeef2bd2f8 to your computer and use it in GitHub Desktop.

Select an option

Save pcbje/bff709cde31f2805522c1aaeef2bd2f8 to your computer and use it in GitHub Desktop.
import os
import re
def extract_count(line):
    """Parse a count written as four Spanish digit words.

    The line is lower-cased and scanned for the words "cero" through
    "nueve", each preceded by a space and followed by a space, a comma or
    the end of the string. The words found are read as decimal digits,
    most significant first.

    Returns:
        The count as an int, or None when the line does not contain
        exactly four matching digit words.
    """
    digit_for_word = {
        'cero': '0',
        'uno': '1',
        'dos': '2',
        'tres': '3',
        'cuatro': '4',
        'cinco': '5',
        'seis': '6',
        'siete': '7',
        'ocho': '8',
        'nueve': '9',
    }
    # Each match consumes its trailing separator, so two words separated by
    # a single space cannot both match; ", " separated words all match.
    words = re.findall(
        ' (cero|uno|dos|tres|cuatro|cinco|seis|siete|ocho|nueve)(?:$|,| )',
        line.lower(),
    )
    if len(words) != 4:
        return None
    return int(''.join(digit_for_word[word] for word in words))
def find_first_value(lines, prefix):
    """Return the cleaned text of the first line that mentions *prefix*.

    Lines are searched in order with a case-insensitive substring match.
    From the first matching line, every exact-case occurrence of *prefix*,
    every period and every comma is removed, and surrounding whitespace is
    stripped.

    Args:
        lines: Sequence of text lines to search.
        prefix: Label to look for (e.g. 'Pais').

    Returns:
        The cleaned value as a str, or '' when no line contains the prefix.
    """
    needle = prefix.lower()
    for line in lines:
        if needle in line.lower():
            # Only the matching line itself is used. Peeking at the
            # following line is deliberately avoided: it never contributed
            # to the result and raised IndexError when the match was on the
            # last line of the file.
            return (
                line.replace(prefix, '')
                .replace('.', '')
                .replace(',', '')
                .strip()
            )
    return ''
# Candidate column titles, in the order they are appended to the header row.
candidates_list = [
    'Nicolas Maduro',
    'Luis Martinez',
    'Javier Bertucci',
    'Jose Brito',
    'Antonio Ecarri',
    'Claudio Fermin',
    'Daniel Ceballos',
    'Edmundo Gonzalez',
    'Enrique Marques',
    'Benjamin Rasseo',
]

# Titles of the two voter-count columns.
votantes_list = [
    'En la maquina de votación',
    'En la cuaderno de votación',
]

# Titles of the eight vote-summary columns.
resumen_de_votos = [
    'Votos',
    'Votos validos',
    'Votos parciales',
    'Votos nulos',
    'Votos vacios',
    'Opciones validas',
    'Opciones nulas',
    'Opciones vacias',
]
def process_acta_text(out, lines, image_url):
    """Turn the extracted text of one acta into a tab-separated row.

    Every line is passed through extract_count(); the counts found are
    interpreted by position: 2 voter counts, then the per-party counts of
    each candidate, then 8 summary counts. Nothing is written when the
    number of counts is not exactly 48, or when the candidate totals do not
    add up to the 'Votos validos' summary count.

    Args:
        out: Writable text stream that receives the row and a newline.
        lines: Text lines of the acta.
        image_url: Value written in the first column of the row.
    """
    counts = [n for n in map(extract_count, lines) if n is not None]

    # An acta is expected to yield exactly 48 counts; any other number means
    # the text was not extracted correctly.
    if len(counts) != 48:
        return

    pais = find_first_value(lines, 'Pais')
    estado = find_first_value(lines, 'Estado EDO') or find_first_value(lines, 'Estado')
    municipio = find_first_value(lines, 'Municipio MP')
    parroquia = find_first_value(lines, 'Parroquia PQ')
    electores = find_first_value(lines, 'Electores')

    # Number of consecutive per-party counts belonging to each candidate,
    # in the order they appear after the two voter counts.
    parties_per_candidate = (
        ('Nicolas Maduro', 13),
        ('Luis Martinez', 6),
        ('Javier Bertucci', 1),
        ('Jose Brito', 4),
        ('Antonio Ecarri', 6),
        ('Claudio Fermin', 1),
        ('Daniel Ceballos', 2),
        ('Edmundo Gonzalez', 3),
        ('Enrique Marques', 1),
        ('Benjamin Rasseo', 1),
    )

    # counts[0]: 'En la maquina de votación', counts[1]: 'En la cuaderno de votación'
    voter_counts = counts[0:2]

    candidate_totals = []
    offset = 2
    for _candidate, party_count in parties_per_candidate:
        candidate_totals.append(sum(counts[offset:offset + party_count]))
        offset += party_count

    # offset is now 40; the remaining eight counts are, in order: Votos,
    # Votos validos, Votos parciales, Votos nulos, Votos vacios,
    # Opciones validas, Opciones nulas, Opciones vacias.
    summary = counts[offset:offset + 8]
    votos_validos = summary[1]

    # The candidate totals must add up to the valid-vote count.
    if sum(candidate_totals) != votos_validos:
        return

    row = [image_url, pais, estado, municipio, parroquia, electores]
    row += voter_counts
    row += candidate_totals
    row += summary

    out.write('\t'.join(map(str, row)))
    out.write('\n')
# Load the image URLs once at import time, one whitespace-stripped entry per
# line of image-urls.txt.
with open('image-urls.txt') as inp:
    image_urls = [raw_line.strip() for raw_line in inp]
def get_image_url(name):
    """Return the first loaded image URL that contains the file's stem.

    The stem is *name* with every '.txt' occurrence removed. URLs are
    checked in the order they appear in the module-level image_urls list.

    Returns:
        The matching URL, or None when no URL contains the stem.
    """
    stem = name.replace('.txt', '')
    return next((url for url in image_urls if stem in url), None)
if __name__ == '__main__':
    # Header row: fixed location columns followed by the voter-count,
    # candidate and vote-summary column titles.
    header = [
        'Acta image URL',
        'Pais',
        'Estado',
        'Municipio',
        'Parroquia',
        'Electores',
    ]
    header.extend(votantes_list)
    header.extend(candidates_list)
    header.extend(resumen_de_votos)

    # The output is tab-separated despite the .csv extension.
    with open('processed-actas.csv', 'w') as out:
        out.write('\t'.join(header))
        out.write('\n')

        for name in os.listdir('extracted_text'):
            if name.endswith('.txt'):
                text_path = os.path.join('extracted_text', name)
                with open(text_path) as inp:
                    lines = [raw_line.strip() for raw_line in inp]

                # Text files without a known image URL are skipped.
                image_url = get_image_url(name)
                if image_url is not None:
                    process_acta_text(out, lines, image_url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment