Skip to content

Instantly share code, notes, and snippets.

@briatte
Last active December 16, 2015 20:10
Show Gist options
  • Save briatte/5490761 to your computer and use it in GitHub Desktop.
Save briatte/5490761 to your computer and use it in GitHub Desktop.
Basic summary statistics from R (see example use at bottom); the plain-text file it produces can be copy-pasted into Google Docs and formatted from there with styles and tabs.
require(countrycode)
require(devtools)
require(downloader)
require(pastecs)
## GET DATA
# Download the Quality of Government Standard dataset once and keep a local
# CSV copy, so later runs do not need network access.
file <- "data/qog-cs.txt"
if (!file.exists(file)) {
  # the target folder must exist before anything is downloaded or written
  dir.create(dirname(file), showWarnings = FALSE, recursive = TRUE)
  dta <- "data/qog-cs.dta"
  if (!file.exists(dta)) {
    url <- "http://www.qogdata.pol.gu.se/data/qog_std_cs.dta"
    download(url, dta, mode = "wb")
  }
  # read.dta() comes from the 'foreign' package, which is not attached by the
  # require() calls of this script, so it is called through its namespace
  write.csv(foreign::read.dta(dta), file)
}
# open local copy
data <- read.csv(file, stringsAsFactors = FALSE, header = TRUE)

## PREPARE DATA
# extract variables (column order matters: stats.table() below selects the
# variables by column position)
data <- with(data, data.frame(
  country = cname, # country name
  ccode = ccodealp, # ISO-3C codes
  continent = countrycode(ccodealp, "iso3c", "continent"), # UN continent
  gdpc = wdi_gdpc / 10^3, # in 1,000 USD
  wrights = ciri_wosoc,
  regime = gol_polreg
))
# drop missing cases
data <- na.omit(data)

# Get stats.table() function.
source_gist("5490761")

## EXPORT SUMMARY STATS
# Example usage:
# - continuous variables are in columns 4 and 5
# - categorical variables are in columns 3 and 6
# - remember to delete missing values first!
stats.table(data,
  file = "summary.stats.txt",
  continuous = 4:5, categorical = c(3, 6)
)
#' Write a plain-text table of summary statistics
#'
#' Builds one row per continuous variable (N, mean, SD, min, max) followed by
#' one section per categorical variable (a header row, then one row per level
#' with its count and percentage), and writes the result to a tab-separated
#' text file with the header "", "N", "Mean/%", "SD", "Min", "Max".
#'
#' Percentages are computed over `nrow(data)`, so rows with missing values
#' should be removed beforehand.
#'
#' @param data A data frame.
#' @param continuous Columns of `data` (positions or names) holding the
#'   continuous variables; one or more columns.
#' @param categorical Columns of `data` (positions or names) holding the
#'   categorical variables; may be empty, in which case only the continuous
#'   rows are written.
#' @param file Path of the text file to write.
#' @param digits Number of decimal places used for rounding.
#' @return Called for its side effects: writes `file` and prints a
#'   confirmation message.
stats.table <- function(data, continuous, categorical,
                        file = "stats.txt", digits = 1) {
  # check for the package without attaching it to the caller's search path
  if (!requireNamespace("pastecs", quietly = TRUE)) {
    stop("Install.packages('pastecs') first.")
  }

  # continuous summary (n, mean, sd, min, max) of selected variable columns;
  # after t(), the statistics of stat.desc() are in columns, picked by position
  stat_cols <- c(1, 9, 13, 4:5)
  # drop = FALSE keeps a single selected column as a data frame / matrix, so
  # variable names survive and the code below always sees two dimensions
  con.stats <- t(pastecs::stat.desc(data[, continuous, drop = FALSE]))
  con.stats <- round(con.stats[, stat_cols, drop = FALSE], digits)

  # categorical summary (n, frequencies) of selected variable columns
  cat.data <- data[, categorical, drop = FALSE]
  n_obs <- nrow(data)
  cat.blocks <- lapply(names(cat.data), function(v) {
    counts <- table(cat.data[[v]])
    freq <- data.frame(
      level = names(counts),
      n = as.vector(counts),
      percent = round(100 * as.vector(counts) / n_obs, digits)
    )
    # header row for the variable, then one character row per level
    rbind(c(v, "N", "%"), as.matrix(freq))
  })
  # NULL when there is no categorical variable
  cat.rows <- do.call(rbind, cat.blocks)

  # stack both parts; categorical rows are padded with NA up to the width of
  # the continuous part (NA is written as an empty cell below)
  out <- cbind(rownames(con.stats), con.stats)
  if (!is.null(cat.rows)) {
    pad <- matrix(NA, nrow = nrow(cat.rows), ncol = ncol(out) - ncol(cat.rows))
    out <- rbind(out, cbind(cat.rows, pad))
  }

  # simplistic formatting
  dimnames(out) <- NULL
  write.table(data.frame(out),
    file = file,
    na = "", sep = "\t", quote = FALSE, row.names = FALSE,
    col.names = c("", "N", "Mean/%", "SD", "Min", "Max")
  )
  cat("Summary statistics written to", file, "\n")
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment