-
-
Save ulfelder/5724731 to your computer and use it in GitHub Desktop.
# This script converts Freedom House's annual Freedom in the World data from | |
# the ugly, table-like Excel format in which it's posted into a data frame in R. | |
# It is set up to work on future updates, too, and should automatically adjust | |
# as years or countries are added. | |
# | |
# Before running this script: | |
# | |
# 1. Open a browser & go to http://www.freedomhouse.org/report-types/freedom-world | |
# | |
# 2. Download the file "Country ratings and status, FIW 1973-2013(EXCEL)" | |
# | |
# 3. Open that file. | |
# | |
# 4. Under the File menu, click on "Save As." | |
# | |
# 5. Select your R working directory as the destination, name the file "fiw2013", | |
# and set "Save as type" to "Text (Tab delimited)". Click "Yes" in response | |
# to the warning that follows. | |
# | |
# Then run the following in R. | |
library(reshape) | |
fiw <- read.delim("fiw2013.txt", header = FALSE) # Assumes file is in working directory | |
fiw2 <- fiw[9:dim(fiw)[1],] # Lops off junk rows at the top | |
row.names(fiw2) <- NULL # Removes bogus variable names | |
today <- Sys.Date() | |
thisyear <- as.numeric(substr(today,1,4)) | |
ncountries <- nrow(fiw2[nchar(as.character(fiw2$V1)) > 1,]) # Counts countries | |
nyears <- thisyear - 1972 - 1 # Counts years, adjusts for no data for 1981 | |
fiw3 <- fiw2[1:ncountries,1:((3 * nyears) + 1)] # Trims the rest of the fat | |
fiwvars <- c("name", rep(c("civlibs", "polrights", "status"), | |
times = thisyear - 1972 - 1) ) # Again, -1 bc no data for 1981 | |
names(fiw3) <- fiwvars | |
fiw4 <- t(fiw3) | |
countries <- fiw4[1,] | |
fiw5 <- fiw4[-1,] | |
fiw6 <- melt(fiw5) | |
fiwyears <- c(rep(1972:1980, each = 3), rep(1982:(thisyear - 1), each = 3)) | |
name <- rep(countries, each = length(fiwyears)) | |
year <- rep(fiwyears, times = length(countries)) | |
fiw7 <- cbind(name, year, fiw6) | |
fiw7[,4] <- NULL | |
names(fiw7) <- c("name", "year", "var", "value") | |
civlibs <- subset(fiw7, var=="civlibs") | |
civlibs[,3] <- NULL | |
names(civlibs) <- c("name", "year", "civlibs") | |
polrights <- subset(fiw7, var=="polrights") | |
polrights[,3] <- NULL | |
names(polrights) <- c("name", "year", "polrights") | |
status <- subset(fiw7, var=="status") | |
status[,3] <- NULL | |
names(status) <- c("name", "year", "status") | |
fiw8 <- merge(civlibs, polrights) | |
fiw9 <- merge(fiw8, status) | |
fiw9$name <- as.character(fiw9$name) | |
fiw9$civlibs <- as.numeric(as.character(fiw9$civlibs)) | |
fiw9$polrights <- as.numeric(as.character(fiw9$polrights)) | |
fiw9$status <- as.character(fiw9$status) | |
fiw10 <- subset(fiw9, is.na(civlibs)==FALSE) | |
fiw <- fiw10 | |
rm(fiw2,fiw3,fiw4,fiw5,fiw6,fiw7,fiw8,fiw9,fiw10) | |
# After running this script, you'll probably want to add ID codes | |
# for countries to facilitate merging with other TSCS data. The | |
# 'countrycode' package is useful here. For example, to get | |
# Correlates of War numeric codes, you could run: | |
library(countrycode) | |
fiw$ccode <- countrycode(fiw$name, "country.name", "cown", warn = TRUE) | |
# Note that this process usually requires some massaging to get | |
# matches for all cases. One way to do this is to open countrycode_data, | |
# find the names it's using for the cases that don't match, replace | |
# the relevant country names in 'fiw' with the ones countrycode wants, | |
# and iterate this until all names match properly. |
save the file as .R so that we get syntax highlighting.
Done, and thanks, smach.
Here is more concise code using xlsx and reshape2 package
# download report in xls format
require(downloader)
download('http://www.freedomhouse.org/sites/default/files/Country%20Status%20and%20Ratings%2C%201973-2013%20%28FINAL%29_0.xls', destfile = 'datafile.xls')
# read data using the xlsx package
require(xlsx)
dat <- read.xlsx2('datafile.xls', sheetIndex = 1, startRow = 7)
# format column names as variable_year
nYears = (NCOL(dat) - 1)/3
var_years = expand.grid( x= c('PR', 'CL', 'Status'), y = 1972:(1972 + nYears - 1))
names(dat) = c('country', paste(var_years$x, var_years$y, sep = "_"))
# melt the data, split the variable_year column and voila!
require(reshape2)
dat_m <- melt(dat, id = 'country')
dat_m <- cbind(dat_m, colsplit(dat_m$variable, "_", names = c('indicator', 'year')))
dat_m$variable = NULL
Awesome, ramnathv, thank you. I hadn't seen either 'downloader' or 'xlsx' and I have to do this task a lot, so those will be a big help.
Jay, thanks a lot for this. Saved me a couple of hours of work today. One quick comment: line 25 reads as
fiw2 <- fiw[9:dim(fiw)[1],] # Lops off junk rows at the top
You should change it to
fiw2 <- fiw[8:dim(fiw)[1],] # Lops off junk rows at the top
Note the 8
as beginning of the row numbers to work with. Starting from row 9 leaves out Afghanistan which is somewhat unfortunate if you do work on conflict... ;) The row numbers have probably changed in the recent Freedom House release of the Excel sheets which caused the problem.
Other than that, the script works like a charm and produces a very neat data frame (although you're right about the massaging-part with countrycode
).
Instead of hard-coding number of countries, possibly change to
ncountries <- nrow(fiw2[nchar(as.character(fiw2$V1)) > 1,])
fiw3 <- fiw2[1:ncountries,1:((3 * (thisyear - 1973)) + 1)]