Skip to content

Instantly share code, notes, and snippets.

@rinze
Last active January 15, 2019 10:51
Show Gist options
  • Save rinze/0af2b05754e8df568c04 to your computer and use it in GitHub Desktop.
Save rinze/0af2b05754e8df568c04 to your computer and use it in GitHub Desktop.
Parser para los archivos .DAT del Ministerio del Interior y el archivo de códigos de municipios del INE y código en R para gráficas simples.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import codecs
import csv
import cStringIO
import io
import os
from collections import namedtuple
def getParties(parties_file):
    """
    Return the parties along with their numerical code as a
    hashtable (key => code, value => party).

    The file is read as fixed-width ISO-8859-1 text: characters 8-13 of
    each line hold the party code and characters 64-213 hold the party
    name, which is returned stripped of surrounding whitespace.  Blank
    lines are ignored.  Keys and values are text (unicode) strings.
    """
    res = {}
    # io.open decodes while reading and behaves the same on Python 2 and 3.
    # ISO-8859-1 is a single-byte encoding, so character offsets match the
    # byte offsets of the fixed-width record layout.  newline="\n" keeps
    # the line splitting restricted to "\n", without newline translation.
    with io.open(parties_file, encoding="iso-8859-1", newline="\n") as f:
        for line in f:
            if not line.strip():
                continue  # e.g. a trailing empty line
            # Assign fields of fixed length according to the documentation.
            code = line[8:14]
            party = line[64:214].strip()
            res[code] = party
    return res
def getTowns(towns_file):
    """
    Return a dictionary with the codes for each town along with their names
    (key => code, value => town).

    The code is a tuple containing (Province, Town) codes, taken from the
    first two CSV columns; the name is taken from the fourth column and is
    returned as text decoded from UTF-8.  The first row is treated as a
    header and skipped.  When a code appears more than once, the first
    name seen is kept.  Empty rows are ignored.
    """
    # The csv module wants a byte stream on Python 2 and a text stream
    # (opened with newline="") on Python 3.
    if str is bytes:  # Python 2
        f = open(towns_file, "rb")
    else:
        f = io.open(towns_file, encoding="utf-8", newline="")
    res = {}
    with f:
        reader = csv.reader(f)
        next(reader, None)  # Skip first line, header
        for entry in reader:
            if not entry:
                continue  # blank line in the CSV
            code = (entry[0], entry[1])  # (Province, Town)
            town = entry[3]
            if isinstance(town, bytes):
                # Python 2 csv yields byte strings; decode them to text.
                town = town.decode("utf-8")
            if code not in res:
                res[code] = town
    return res
def getElectionResults(election_file, parties_dict, towns_dict):
    """
    Return election results at the level of the individual polling table.

    Each non-blank line of the fixed-width file becomes one ElectionEntry
    namedtuple with the fields prov_code, town_code, dist_code,
    section_code, table_code, party_code, town_name, party_name and votes.
    All code fields and votes are kept as the strings found in the file.

    town_name is looked up in towns_dict by (prov_code, town_code) and
    party_name in parties_dict by party_code; a missing key raises
    KeyError.  The result is a list.
    """
    ElectionEntry = namedtuple('ElectionEntry',
                               ['prov_code', 'town_code',
                                'dist_code', 'section_code',
                                'table_code', 'party_code',
                                'town_name', 'party_name', 'votes'])

    def parseLine(line):
        # As in getParties(), parse according to official field description.
        prov_code = line[11:13]
        town_code = line[13:16]
        dist_code = line[16:18]
        section_code = line[18:22]
        table_code = line[22:23]
        party_code = line[23:29]
        votes = line[29:36]
        return ElectionEntry(prov_code, town_code, dist_code,
                             section_code, table_code, party_code,
                             towns_dict[(prov_code, town_code)],
                             parties_dict[party_code], votes)

    # ISO-8859-1 maps every byte to exactly one character, so decoding never
    # fails and the fixed-width offsets are unchanged.
    # NOTE(review): assumes this file uses the same encoding as the parties
    # file -- confirm against the official format description.
    with io.open(election_file, encoding="iso-8859-1", newline="\n") as f:
        # Build the list eagerly so the file can be closed before returning.
        return [parseLine(line) for line in f if line.strip()]
# From https://docs.python.org/2/library/csv.html#examples
class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.

    Rows are first formatted by a regular csv.writer into an in-memory
    cStringIO buffer as UTF-8, then decoded, re-encoded with the target
    encoding and written to "f".  Extra keyword arguments are passed on
    to csv.writer.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        # One incremental encoder instance is reused for every row.
        self.encoder = codecs.getincrementalencoder(encoding)()

    def writerow(self, row):
        """Write one row; every cell must support .encode("utf-8")."""
        self.writer.writerow([s.encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        """Write every row of the iterable "rows" using writerow()."""
        for row in rows:
            self.writerow(row)
#### Main ####
if __name__ == "__main__":
    # Hardcoded file paths. Change it to use your own data. See README.md
    # for info on each file.
    # From: http://www.infoelectoral.interior.es/docxl/apliext/04201105_MESA.zip
    parties_file = "/home/chema/Dropbox/data/elecciones20112015/04201105_MESA/03041105.DAT"
    elections_file = "/home/chema/Dropbox/data/elecciones20112015/04201105_MESA/10041105.DAT"
    # From: http://www.ine.es/daco/daco42/codmun/codmun11/11codmunmapa.htm
    towns_file = "/home/chema/Dropbox/data/elecciones20112015/11codmun.csv"

    parties = getParties(parties_file)
    towns = getTowns(towns_file)
    results = getElectionResults(elections_file, parties, towns)

    # The header row is taken from the first entry, so fail with a clear
    # message instead of an IndexError when nothing was parsed.
    if not results:
        raise SystemExit("No election entries found in %s" % elections_file)

    # Save as CSV for analysis with R.
    if not os.path.isdir('data'):
        os.mkdir('data')
    # UnicodeWriter writes already-encoded bytes, so open the file in binary
    # mode; the context manager closes it even if writing fails.
    with open('data/elections2011.csv', 'wb') as f:
        csv_writer = UnicodeWriter(f)
        csv_writer.writerow(results[0]._fields)
        csv_writer.writerows(results)
# Analysis of the per-table ("mesa") results written by the parser:
# one PDF per large town, showing each party's vote share at every mesa.
library(ggplot2)
theme_set(theme_bw())

mesas <- read.csv('data/elections2011.csv', stringsAsFactors = FALSE)

# Unique "mesa" ID. read.csv converts all-digit columns to numbers, which
# drops the zero padding of the codes; without a separator two different
# mesas could then paste to the same ID (e.g. 1 + 12 vs 11 + 2).
mesas$id <- with(mesas, paste(prov_code, town_code, dist_code,
                              section_code, table_code, sep = "-"))

# Compute total votes per district per section per table
totalv <- aggregate(votes ~ id, mesas, sum)
names(totalv) <- c("id", "total_votes")
mesas <- merge(mesas, totalv)
mesas$vote_p <- mesas$votes / mesas$total_votes

# Don't plot cities with less than 100000 votes
nvot <- aggregate(votes ~ town_name, mesas, sum)
nvot <- nvot[nvot$votes > 100000, ]
mesas <- mesas[mesas$town_name %in% nvot$town_name, ]

# ggsave does not create missing directories, so make sure it exists.
out_dir <- "/tmp/mesas2011"
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)

for (t in unique(mesas$town_name)) {
    cat(t, "...")
    town <- mesas[mesas$town_name == t, ]
    town <- town[order(town$dist_code), ]
    town$x <- seq_len(nrow(town)) # Not very correct, but only for plotting
    town$party_name <- factor(town$party_name)
    # Point size and transparency are set on geom_point.
    p1 <- ggplot(town, aes(y = vote_p * 100, x = x,
                           color = factor(dist_code))) +
        geom_point(size = 3, alpha = 0.6) +
        xlab("\nNúmero de mesa (arbitrario)") + ylab("Porcentaje de voto\n") +
        ggtitle(sprintf("Voto por mesa en %s\n", t)) +
        facet_wrap(~ party_name)
    # "/" in a town name would be read as a path separator.
    ggsave(file.path(out_dir, sprintf("%s.pdf", gsub("/", "-", t))), p1,
           width = 20, height = 15)
    cat("OK\n")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment