Skip to content

Instantly share code, notes, and snippets.

@natbusa
Last active December 27, 2015 09:19
Show Gist options
  • Select an option

  • Save natbusa/7303557 to your computer and use it in GitHub Desktop.

Select an option

Save natbusa/7303557 to your computer and use it in GitHub Desktop.
Basic statistics on the Dutch election of 2010
# Read in the Dutch election file as provided by http://www.engagedata.eu/dataset/14399
# https://engagefp7.s3.amazonaws.com/resources/dataset_14399/TK2010.csv
# The first 31 lines of the file are skipped and no header row is read, so the
# columns are named by hand below.
dat <- read.csv2('http://engagefp7.s3.amazonaws.com/resources/dataset_14399/TK2010.csv', skip=31, header=FALSE)

# Column names, one per line and separated by ';', in the order in which the
# columns appear in the file.
columns <- '
Gemeente;
geldige stemmen;
ongeldige stemmen;
blanco stemmen;
Percentage blanco stemmen;
kiesgerechtigden;
Opkomst;
Opkomstpercentage;
VVD;
Partij van de Arbeid (P.v.d.A.);
PVV (Partij voor de Vrijheid);
Christen Democratisch Appel (CDA);
SP (Socialistische Partij);
Democraten 66 (D66);
GROENLINKS;
ChristenUnie;
Staatkundig Gereformeerde Partij (SGP);
Partij voor de Dieren;
TROTS OP NEDERLAND LIJST RITA VERDONK;
Partij voor Mens en Spirit (MenS);
Piratenpartij;
Lijst 17;
Partij een;
Nieuw Nederland;
Heel NL;
Evangelische Partij Nederland
'

# Split the list on ';' and strip the surrounding newlines. This yields a plain
# character vector directly, instead of relying on read.csv2() to turn the
# trailing ';' into a throw-away column (and possibly a factor).
column.names <- trimws(strsplit(columns, ';', fixed=TRUE)[[1]])

# Rename the columns of the data frame.
colnames(dat) <- column.names

# Remove the last four lines. head(dat, -4) returns an empty data frame when
# there are four rows or fewer, whereas 1:(nrow(dat)-4) would count backwards.
dat <- head(dat, -4)
# Number of municipalities: row count, length of the name column, and number
# of distinct names (all three are printed for comparison).
nrow(dat)
length(dat[['Gemeente']])
length(unique(dat[['Gemeente']]))

# The party columns are everything to the right of 'Opkomstpercentage'.
n <- names(dat)
to <- length(n)
from <- which(n == 'Opkomstpercentage') + 1
parties <- n[seq(from, to)]

# Names of the parties, and how many there are.
parties
length(parties)
# Total number of votes per party, summed over all rows of dat.
total <- colSums(dat[parties])
total

# Bar charts: linear scale, then two ways of getting a logarithmic scale.
barplot(total)
barplot(log10(total))
barplot(total, log='y')

# Smallest and largest total in one go (range() returns c(min, max)).
total.range <- range(total)

# Party (or parties, when tied) with the most votes.
total.max.value <- total.range[2]
total.max <- total[total == total.max.value]
total.max

# Party (or parties, when tied) with the fewest votes.
total.min.value <- total.range[1]
total.min <- total[total == total.min.value]
total.min
# Coverage of the party with the fewest votes, municipality by municipality.
loser <- names(total.min)
loser.columns <- c('Gemeente', loser)

# Row masks: at least one vote for the loser, and exactly 12 votes for the loser.
I <- dat[loser] > 0
J <- dat[loser] == 12

# Municipalities where the losing party received at least one vote.
party.loosing <- dat[I, loser.columns]
nrow(party.loosing)

# Municipalities where the losing party received exactly 12 votes.
party.loosing <- dat[J, loser.columns]
nrow(party.loosing)
# Municipalities where CDA < VVD < PVV.
vvd.votes <- dat['VVD']
pvv.votes <- dat['PVV (Partij voor de Vrijheid)']
cda.votes <- dat['Christen Democratisch Appel (CDA)']

# I: VVD got fewer votes than PVV; J: VVD got more votes than CDA.
I <- vvd.votes < pvv.votes
J <- vvd.votes > cda.votes

# Keep the rows satisfying both conditions, with only the relevant columns.
d <- dat[I & J, c('Gemeente', 'VVD', 'PVV (Partij voor de Vrijheid)', 'Christen Democratisch Appel (CDA)')]

# Which municipalities are they, and how many?
d$Gemeente
nrow(d)
# Time for pies! First the raw totals, one slice per party.
pie(total)

# Keep the n largest entries of a named numeric vector and fold all remaining
# entries into a single entry named 'others'.
#   x: named numeric vector (e.g. votes per party)
#   n: number of entries to keep as separate slices
# Returns the n largest values in decreasing order followed by 'others'. When
# x has n or fewer entries there is nothing to fold, so the sorted vector is
# returned as is (the original (n+1):length(x) index would count backwards).
top.with.others <- function(x, n) {
  sorted <- sort(x, decreasing=TRUE)
  if (length(sorted) <= n) {
    return(sorted)
  }
  keep <- seq_along(sorted) <= n
  c(sorted[keep], others=sum(sorted[!keep]))
}

# Number of parties shown individually; the rest go into 'others'.
n <- 7

# A better pie: the n largest parties plus an 'others' slice.
votes <- top.with.others(total, n)
pie(votes, main='dutch elections 2010')

# As above, but for a specific municipality.
municipality <- 'Almere'
municipality.selection <- dat$Gemeente == municipality
municipalities.votes <- dat[municipality.selection, parties]

# as.numeric() below only works on a single row; fail early with a clear
# condition if the municipality is missing or appears more than once.
stopifnot(nrow(municipalities.votes) == 1)

# Note: this overwrites the national totals in 'total' with the votes of the
# selected municipality, as the original script did.
total <- as.numeric(municipalities.votes)
names(total) <- parties

votes <- top.with.others(total, n)
pie(votes, main=municipality)
# Express the votes of every party as a fraction of the valid votes
# ('geldige stemmen') of the same row.
relative <- dat[parties] / dat[['geldige stemmen']]

# Correlation between the relative results of every pair of parties.
cross.corr <- cor(relative)

# Blank out the diagonal: a party's correlation with itself is not of interest.
cross.corr[row(cross.corr) == col(cross.corr)] <- NA

# Correlation of VVD with every other party.
vvd.corr <- cross.corr['VVD', ]
vvd.corr

# Largest and smallest correlation for VVD (ignoring the blanked diagonal).
vvd.corr.range <- range(vvd.corr, na.rm=TRUE)
vvd.corr.max <- vvd.corr.range[2]
vvd.corr.max
vvd.corr.min <- vvd.corr.range[1]
vvd.corr.min

# Which party has the largest correlation with VVD?
vvd.corr[which.max(vvd.corr)]

# Which party has the smallest (most negative) correlation with VVD?
vvd.corr[which.min(vvd.corr)]
# Names of the parties (rows and columns of cross.corr are the same).
cross.corr.parties <- rownames(cross.corr)

# --- Largest correlation per party (all columns) ---
# For every column: the largest correlation value and the party it belongs to.
corr.max.value <- apply(cross.corr, 2, FUN=function(x) {max(x, na.rm=TRUE)} )
corr.max.value
corr.max.party <- apply(cross.corr, 2, FUN=function(x) {cross.corr.parties[which.max(x)]} )
corr.max.party

# Combine into a data frame. The columns are passed to data.frame() directly:
# going through cbind() would coerce the numeric correlations to character,
# and order() below would then sort them as text instead of as numbers.
cross.corr.dataframe <- data.frame(corr.max.value=corr.max.value,
                                   corr.max.party=corr.max.party)
cross.corr.dataframe

# Sort by highest correlation.
I <- order(cross.corr.dataframe[,1], decreasing=TRUE)
cross.corr.dataframe[I,]

# Highest correlation.
cross.corr.dataframe[I[1],]

# --- Smallest (most negative) correlation per party (all columns) ---
corr.min.value <- apply(cross.corr, 2, FUN=function(x) {min(x, na.rm=TRUE)} )
corr.min.value
corr.min.party <- apply(cross.corr, 2, FUN=function(x) {cross.corr.parties[which.min(x)]} )
corr.min.party

# Combine into a data frame (again without cbind(), to keep the values numeric).
cross.corr.dataframe <- data.frame(corr.min.value=corr.min.value,
                                   corr.min.party=corr.min.party)
cross.corr.dataframe

# Sort by strongest negative correlation: with numeric values this means
# ascending order, so the most negative value comes first.
I <- order(cross.corr.dataframe[,1], decreasing=FALSE)
cross.corr.dataframe[I,]

# Strongest negative correlation.
cross.corr.dataframe[I[1],]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment