Created
December 23, 2014 07:19
-
-
Save briatte/d997da906fecd6b2765e to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ## IDA Exercise 5 | |
| ## -------------- | |
| # This exercise replicates PCA plots using World Values Survey data on | |
| # aggregate confidence in public institutions. | |
| # The data is taken from the Quality of Government dataset. The functions | |
| # contain a mix of base functions and custom packages. | |
| # Packages | |
| # -------- | |
| # Load/install packages. | |
| packages <- c("ape", "ClustOfVar", "cluster", "countrycode", "devtools", "FactoMineR", "foreign", "ggdendro", "ggplot2", "reshape") | |
| packages <- lapply(packages, FUN = function(x) { | |
| if (!require(x, character.only = TRUE)) { | |
| install.packages(x) | |
| library(x, character.only = TRUE) | |
| } | |
| }) | |
| # install ggbiplot | |
| if(!"ggbiplot" %in% installed.packages()[, 1]) install_github("ggbiplot", "vqv") | |
| require(ggbiplot) | |
| # Worldwide fertility | |
| # ------------------- | |
| # downloaded during last session: | |
| qog.data <- read.csv("data/qog.cs.csv", stringsAsFactors = FALSE) | |
| # drop duplicate observation: | |
| qog.data = subset(qog.data, cname != "Congo, Democratic Republic") | |
| # subset: | |
| qog <- with(qog.data, data.frame( | |
| fertility.rate = wdi_fr, # fertility rate | |
| gdp.per.capita = wdi_gdpc, # GDP per capita | |
| gov.expenditure = wdi_ge, # government expenditures as % of GDP | |
| healthcare.exp = wdi_hec, # health expenditure per capita | |
| gini.inequality = wdi_gini, # Gini coefficient of inequality | |
| female.school.years = bl_asy25f, # average female years of schooling | |
| female.school.ratio = wdi_gris, # female to male ratio in schools | |
| women.in.parliament = gid_wip # % of women in parliament | |
| )) | |
| # row names: | |
| rownames(qog) = gsub("\\(.*", "", qog.data$cname) | |
| # inspect: | |
| head(qog) | |
| # sample reduction: | |
| qog <- na.omit(qog) | |
| # Heatmap | |
| # ------- | |
| # 1. refactor | |
| qog_hm = data.frame(qog, cname = row.names(qog)) | |
| qog_hm$cname = with(qog_hm, reorder(cname, fertility.rate)) | |
| # 2. melt | |
| qog_hm <- melt(qog_hm) | |
| qog_hm <- ddply(qog_hm, .(variable), transform, rescale = scale(value)) | |
| # 3. plot | |
| p <- ggplot(qog_hm, aes(variable, cname)) + | |
| geom_tile(aes(fill = rescale), colour="white") + | |
| scale_fill_gradient(low = "white", high = "steelblue") | |
| # 4. theme | |
| p + theme_grey(base_size = 12) + | |
| labs(x = NULL, y = NULL) + | |
| scale_x_discrete(expand = c(0, 0)) + | |
| scale_y_discrete(expand = c(0, 0)) + | |
| theme(legend.position = "none", | |
| axis.text.x = element_text(size = 16, angle = 60, hjust = 1, colour = "grey50")) | |
| # Principal components | |
| # -------------------- | |
| fit <- princomp(qog, cor=TRUE) | |
| # screeplot: | |
| plot(fit, type = "l") | |
| # loadings: | |
| loadings(fit) | |
| # biplot: | |
| biplot(fit) | |
| # Hierarchical Clustering | |
| # ----------------------- | |
| # create Ward distance matrix | |
| d <- dist(qog, method = "euclidean") | |
| h <- hclust(d, method = "ward") | |
| # dendogram: | |
| plot(h) | |
| # cut 5 clusters: | |
| clus5 = cutree(h, 5) | |
| # plot with red borders: | |
| rect.hclust(h, k = 5, border = "red") | |
| # phylogenic plots with ape: | |
| h.phylo = as.phylo(h) | |
| h.color = c("#556270", "#4ECDC4", "#1B676B", "#FF6B6B", "#C44D58")[clus5] | |
| # basic fan plot: | |
| plot(h.phylo, type = "fan", tip.color = h.color, label.offset = 1) | |
| # size of labels set to (log) fertility rate: | |
| plot(h.phylo, type = "fan", tip.color = h.color, label.offset = 1, | |
| cex = log(qog$fertility.rate, 2.5)) | |
| # k-means | |
| # ------- | |
| # k-means for 5 clusters: | |
| k <- kmeans(qog, 5) | |
| # describe the matrix | |
| k | |
| # plot with the cluster package | |
| clusplot(qog, k$cluster, color = TRUE, shade = TRUE, labels = 2, lines = 0) | |
| # k-means with ggplot2: | |
| # --------------------- | |
| # 2-dimensional subset: | |
| k = qog[, c("fertility.rate", "female.school.years")] | |
| # k-means: | |
| k = kmeans(k, 3) | |
| # extract centers: | |
| centers = data.frame(k$centers) | |
| # stick clusters to data: | |
| qog$cluster = factor(k$cluster) | |
| # reintroduce country codes: | |
| qog$iso3c = countrycode(row.names(qog), "country.name", "iso3c") | |
| # plot with ggplot2: | |
| ggplot(data = qog, aes(y = fertility.rate, x = female.school.years, color = cluster)) + | |
| geom_text(aes(label = iso3c)) + | |
| geom_point(data = centers, | |
| aes(color = 'Center'), | |
| size = 52, | |
| alpha = .3, | |
| show_guide = FALSE) | |
| # Confidence in public institutions | |
| # --------------------------------- | |
| # Quality of Government, Stata version. | |
| # Cheers to the OQG team and to the Swedish taxpayer. | |
| qog <- read.dta("data/qog.cs.dta") | |
| # Copy ISO-3C country code to row names. | |
| row.names(qog) <- qog$ccodealp | |
| # The next block takes advantage of the Stata format, which includes variable labels (short text descriptions). By default, the labels are stored as dataset attributes: the next code block copies them to the variable names and then subsets the data to variables featuring the keyword "confidence" in their description. | |
| # Copy Stata label over variable names. | |
| names(qog) <- attr(qog, "var.labels") | |
| # Subset to variables with "confidence" in their name. | |
| qog <- qog[, grepl("confidence", names(qog), ignore.case = TRUE)] | |
| # Clean up variable names. | |
| names(qog) <- gsub("Confidence: |the ", "", names(qog)) | |
| # Remove a few columns with less observations. | |
| qog <- qog[, !names(qog) %in% c("European Union", "NATO")] | |
| # Subset to nonmissing data. | |
| qog <- na.omit(qog) | |
| # Finalized dataset. | |
| str(qog) | |
| # Countries considered. | |
| rownames(qog) | |
| # Extract regions. | |
| table(countrycode(rownames(qog), "iso3c", "region")) | |
| # PCA with FactoMineR | |
| # ------------------- | |
| # The two-lines approach. | |
| qog.PCA <- PCA(qog, graph = FALSE) | |
| plot(qog.PCA) | |
| # with ggplot, you need the standard prcomp object: | |
| qog.prcomp <- prcomp(qog, scale = TRUE) | |
| # ggbiplot | |
| ggbiplot(qog.prcomp, | |
| labels = rownames(qog), | |
| groups = countrycode(rownames(qog), "iso3c", "continent"), | |
| ellipse = TRUE) + | |
| coord_equal(xlim = c(-3, 3), ylim = c(-3, 3)) | |
| # Hierarchical clustering with ClustOfVar | |
| # --------------------------------------- | |
| # The two-lines approach. | |
| qog.tree <- hclustvar(qog) | |
| plot(qog.tree) | |
| # ggplot2 requires the standard hclust object: | |
| qog.hclust <- hclust(dist(qog)) | |
| # plot countries by similarity with ggdendro: | |
| ggdendrogram(qog.hclust, rotate = TRUE, size=4) | |
| # We'll let you build your examples in groups for the first class exercise. | |
| # 2013-06-21 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ## IDA Session 6 | |
| ## ------------- | |
| packages <- c("gplots", "ggplot2", "Hmisc", "car", "gmodels") | |
| packages <- lapply(packages, FUN = function(x) { | |
| if (!require(x, character.only = TRUE)) { | |
| install.packages(x) | |
| library(x, character.only = TRUE) | |
| } | |
| }) | |
| qog <- read.csv("data/qog_basic_cs.csv") | |
| # The list of variables is quasi-identical to the one used previously. | |
| # Let's simply add an extra variable coding for geographical regions. | |
| qog_b <- with(qog, data.frame( | |
| ccode = ccodealp, # country three letter code | |
| cname = cname, # country name | |
| births = wdi_fr, # fertility rate | |
| gdpc = wdi_gdpc, # GDP per capita | |
| gexp = wdi_ge, # government expenditures as percentage of GDP | |
| hexpc = wdi_hec, # health care expenditure per capita | |
| gini = uw_gini, # GINI coefficient of inequality | |
| edu = bl_asyt25,# average years of schooling | |
| gris = wdi_gris, # female to male ratio in schools | |
| winpar = m_wominpar, # percentage of women in parliament | |
| region = ht_region # region of country | |
| )) | |
| # Clean up missing data. | |
| qog_b <- na.omit(qog_b) | |
| # Clean up row names. | |
| rownames(qog_b) <- NULL | |
| # Reorder country factor levels by their respective fertility rates. | |
| qog_b$cname <- with(qog_b, reorder(cname, births)) | |
| # Recodings | |
| # --------- | |
| #Let's give value labels to the region var | |
| qog_b$region<-factor(qog_b$region, levels=c(1,2,3,4,5,6,7,8,9,10), labels=c("EEur","LatAm","NAfr&MEast","SubSAfr","WEur&NAm","EAsia","SEAsia","SAsia","Pacif","Carib")) | |
| #Let's recode some vars into categorical variables | |
| #This will be useful for presenting some of the stats and graphs from this session | |
| #Recode all continuous vars into two equally sized categories for low and high levels | |
| qog_b$births2<-(cut(qog_b$births, breaks=quantile(qog_b$births,probs=seq(0,1, by=.5)), labels=c("l","h"), ordered=T, include.lowest=T)) | |
| qog_b$gdpc2<-(cut(qog_b$gdpc, breaks=quantile(qog_b$gdpc,probs=seq(0,1, by=.5)), labels=c("l","h"), ordered=T, include.lowest=T)) | |
| qog_b$gexp2<-(cut(qog_b$gexp, breaks=quantile(qog_b$gexp,probs=seq(0,1, by=.5)), labels=c("l","h"), ordered=T, include.lowest=T)) | |
| qog_b$hexpc2<-(cut(qog_b$hexpc, breaks=quantile(qog_b$hexpc,probs=seq(0,1, by=.5)), labels=c("l","h"), ordered=T, include.lowest=T)) | |
| qog_b$gini2<-(cut(qog_b$gini, breaks=quantile(qog_b$gini,probs=seq(0,1, by=.5)), labels=c("l","h"), ordered=T, include.lowest=T)) | |
| qog_b$edu2<-(cut(qog_b$edu, breaks=quantile(qog_b$edu,probs=seq(0,1, by=.5)), labels=c("l","h"), ordered=T, include.lowest=T)) | |
| qog_b$gris2<-(cut(qog_b$gris, breaks=quantile(qog_b$gris,probs=seq(0,1, by=.5)), labels=c("l","h"), ordered=T, include.lowest=T)) | |
| qog_b$winpar2<-(cut(qog_b$winpar, breaks=quantile(qog_b$winpar,probs=seq(0,1, by=.5)), labels=c("l","h"), ordered=T, include.lowest=T)) | |
| #Finally, let's tell R to look in the qog_b subset for any variable we mention subsequently | |
| #The command saves us time and frees us from the need to type "qog_b$" in front of every variable call | |
| #Note that the command is canceled as follows: detach(qog_b) | |
| attach(qog_b) | |
| #DESCRIPTIVE UNIVARIATE STATS | |
| #Basic commands for continuous (and ordinal) measures | |
| describe(qog_b) | |
| summary(qog_b) | |
| #You can obtain specific stats with the following commands | |
| max(births) | |
| min(births) | |
| mean(births) | |
| median(births) | |
| sd(births) | |
| quantile(births) | |
| quantile(births, probs=c(0.1, 0.9)) #for specific cut points | |
| #Basic commands for categorical (and ordinal) measures | |
| table(region) #frequencies by category | |
| prop.table(table(region)) #in proportions | |
| round(prop.table(table(region)), 2) #rounds to two decimals points | |
| round(100*(prop.table(table(region)))) #in % | |
| #GRAPHICS OF DISTRIBUTIONS | |
| #For continuous vars | |
| hist(births, main="Historgram of Fertility in 81 countries") | |
| hist(births, br=20) | |
| stripchart(births, method="jitter", main="Stripchart with Jitter") | |
| boxplot(births, main="Boxplot of Fertility in 81 countries") | |
| qqnorm(births) | |
| qqline(births) | |
| #For categorical vars | |
| barplot(table(region)) | |
| barplot(table(region), horiz=T) | |
| dotchart(table(region), cex=1) | |
| pie(table(region)) | |
| #DESCRIPTIVE BIVARIATE STATS | |
| #For a pair of categorical vars | |
| table(region, births2) #frequencies | |
| round(100*(prop.table(table(region, births2))),2) #in total % | |
| round(100*(prop.table(table(region, births2),1)),2) #in row % | |
| round(100*(prop.table(table(region, births2),2)),2) #in column % | |
| #GRAPHICS OF BIVARIATE ASSOCIATIONS | |
| #For a pair of categorical vars | |
| mosaicplot(table(region, births2), col=T) | |
| mosaicplot(table(gini2, births2), col=T) | |
| #For a continuous and a categorical vars | |
| boxplot(births~region) | |
| qplot(region, births, geom=c("boxplot","point")) | |
| boxplot(births~gini2) | |
| qplot(gini2, births, geom=c("boxplot","point")) | |
| qplot(region, births, geom="boxplot", fill=gini2) | |
| #ASSOCIATION TESTS | |
| #Test of independence of a pair of categorical variables | |
| chisq.test(table(region, births2)) #chi-squared test of independence | |
| fisher.test(table(region, births2)) #for vars with categories<5 observations, this test is more appropriate | |
| chisq.test(table(gini2, births2)) | |
| chisq.test(table(gdpc2, births2)) | |
| chisq.test(table(gexp2, births2)) #etc. | |
| #Test of independence of a binary and a continuous variables | |
| t.test(births~gini2) | |
| t.test(births~gdpc2) | |
| t.test(births~gexp2) #etc. | |
| #CONFIDENCE INTERVALS | |
| t.test(births) | |
| #The barplots below don't work for region b/c there are categories with too few cases to estimate the conf. intervals | |
| b_gini_mean<-tapply(births, gini2, mean) | |
| lower<-tapply(births, gini2, function(v) t.test(v)$conf.int[1]) | |
| upper<-tapply(births, gini2, function(v) t.test(v)$conf.int[2]) | |
| barplot2(b_gini_mean, plot.ci=T, ci.l=lower, ci.u=upper) | |
| b_gdpc_mean<-tapply(births, gdpc2, mean) | |
| lower<-tapply(births, gdpc2, function(v) t.test(v)$conf.int[1]) | |
| upper<-tapply(births, gdpc2, function(v) t.test(v)$conf.int[2]) | |
| barplot2(b_gdpc_mean, plot.ci=T, ci.l=lower, ci.u=upper) | |
| b_gexp_mean<-tapply(births, gexp2, mean) | |
| lower<-tapply(births, gexp2, function(v) t.test(v)$conf.int[1]) | |
| upper<-tapply(births, gexp2, function(v) t.test(v)$conf.int[2]) | |
| barplot2(b_gexp_mean, plot.ci=T, ci.l=lower, ci.u=upper) | |
| b_gris_mean<-tapply(births, gris2, mean) | |
| lower<-tapply(births, gris2, function(v) t.test(v)$conf.int[1]) | |
| upper<-tapply(births, gris2, function(v) t.test(v)$conf.int[2]) | |
| barplot2(b_gris_mean, plot.ci=T, ci.l=lower, ci.u=upper) | |
| b_winpar_mean<-tapply(births, winpar2, mean) | |
| lower<-tapply(births, winpar2, function(v) t.test(v)$conf.int[1]) | |
| upper<-tapply(births, winpar2, function(v) t.test(v)$conf.int[2]) | |
| barplot2(b_winpar_mean, plot.ci=T, ci.l=lower, ci.u=upper) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| ## IDA Session 7 | |
| ## ------------- | |
| # Package loading | |
| # --------------- | |
| # Load/install packages. | |
| packages <- c("countrycode", "ggplot2", "Hmisc", "plyr", "scales") | |
| packages <- lapply(packages, FUN = function(x) { | |
| if (!require(x, character.only = TRUE)) { | |
| install.packages(x) | |
| library(x, character.only = TRUE) | |
| } | |
| }) | |
| # DATA PREPARATION | |
| # ---------------- | |
| # Let's first review some of the things that we now know how to do with data. | |
| # Load dataset. | |
| qog_cs <- read.csv("data/qog_basic_cs.csv") | |
| # Create an extract of the QOG cross-sectional dataset. The list of variables is | |
| # quasi-identical to the one used previously, with an extra variable for regions. | |
| qog <- with(qog_cs, data.frame( | |
| row.names = cname, # Country name (set as row name in the data extract) | |
| ccode = ccodealp, # Country 3-letter code | |
| region = ht_region, # Geographical region of the country | |
| births = wdi_fr, # Fertility rate | |
| gdpc = wdi_gdpc, # GDP per capita | |
| gexp = wdi_ge, # Government expenditures as percentage of GDP | |
| hexpc = wdi_hec, # Health care expenditure per capita | |
| gini = uw_gini, # GINI coefficient of economic inequality | |
| edu = bl_asyt25, # Average years of schooling | |
| gris = wdi_gris, # Female to male ratio in schools | |
| winpar = m_wominpar # Percentage of women in parliament | |
| )) | |
| # Geographic indicators | |
| # --------------------- | |
| # Let's now see what kind of geographic descriptors we have in the dataset. | |
| # The first one is a list of ISO-3 country codes, an international standard. | |
| levels(qog$ccode) | |
| # This list has been supplemented, in the QOG dataset, by a geographic variable | |
| # combining spatial proximity and expert decisions on regional democratization. | |
| str(qog$region) | |
| # An issue here is that we imported the data without its labels, so the regions | |
| # are uninformative. But with ISO-3 country codes, we can recreate UN regions. | |
| table(countrycode(qog$ccode, "iso3c", "region")) | |
| # We save the UN continent instead to create groups with many countries each. | |
| qog$region <- with(qog, countrycode(ccode, "iso3c", "continent")) | |
| # Check result. | |
| qplot(qog$region, geom = "bar") | |
| # Missing data | |
| # ------------ | |
| # Some entries in the QOG dataset are not recognized as UN countries. | |
| rownames(qog)[is.na(qog$region)] | |
| # Drop these cases, that have been counted in other country indicators. | |
| qog <- subset(qog, !is.na(region)) | |
| # The analysis that we will run today also runs faster without missing data. | |
| # Let's identify the rows with missing data. | |
| qog$fulldata <- complete.cases(qog) | |
| # Check result. The complete.cases() function is a utility function that will | |
| # return TRUE if there is absolutely no missing data on a row of the dataset. | |
| table(qog$fulldata) | |
| # Identify where the missing data lies. There is a lot, so we will remember that | |
| # when we look at plots and reading the results of statistical tests. | |
| qplot(data = qog, x = region, fill = fulldata, geom = "bar") | |
| # Subset to full data rows. | |
| qog <- na.omit(qog) | |
| # Reordering factors | |
| # ------------------ | |
| # In R, sorting the data is different from ordering levels in a variable. | |
| # Let's see how to sort the data from highest to lowest fertility rates. | |
| by_births <- order(qog$births, decreasing = TRUE) | |
| # Show the first three columns in that sorting order. | |
| head(qog[by_births, 1:3]) | |
| # For ascending sort order, you would simply not specify the decreasing option. | |
| head(qog[order(qog$births), 1:3]) | |
| # Countries for which fertility rates are missing are always sorted last. | |
| tail(qog[order(qog$births), 1:3]) | |
| # But this is different from the order of a string variable like country codes. | |
| # The country code variable holds text, coded as factors and ordered by levels. | |
| str(qog$ccode) | |
| # An issue here is that R naturally orders factors by alphabetical order, which | |
| # is not optimal for ordering categorical variables like countries in plots. | |
| # Here's an example where countries are alphabetically ordered. | |
| qplot(data = qog[1:25, ], y = ccode, x = births, color = region, geom = "point") | |
| # Now, reorder country code factor levels by their respective fertility rates. | |
| qog$ccode <- with(qog, reorder(ccode, births, mean)) | |
| # The example now shows countries ordered by fertility rates. | |
| qplot(data = qog[1:25, ], y = ccode, x = births, color = region, geom = "point") | |
| # Let's now visualize the distribution of fertility rates in each region. | |
| qplot(data = qog, x = births, fill = region, color = region, geom = "density") + | |
| facet_grid(region ~ .) | |
| # The regions are shown in alphabetical order. Let's fix that. | |
| qog$region <- with(qog, reorder(region, -births, mean)) | |
| # Rerun the plot to see regions ordered by average fertility rates. | |
| qplot(data = qog, x = births, fill = region, color = region, geom = "density") + | |
| facet_grid(region ~ .) | |
| # These manipulations show how to plot distributions as densities and how to use | |
| # the mean value of a distribution to summarize it. More after our last recodes. | |
| # Log transformation | |
| # ------------------ | |
| # Let's turn to a different variable: GDP per capita. | |
| qplot(data = qog, x = region, y = gdpc, color = region, geom = "boxplot") | |
| # For such a variable, a log-transformation helps. | |
| qplot(data = qog, x = region, y = log(gdpc), color = region, geom = "boxplot") | |
| # What exactly are we doing here? We are basically compressing the distribution | |
| # of the variable to get outliers in closer range to other observations. | |
| # Let's look at the distribution of GDP per capita more closely to get that. | |
| summary(qog$gdpc) | |
| # The maximum value is pretty far away from the median value. | |
| with(qog, max(gdpc) / median(gdpc)) | |
| # Order the data by GDP per capita. | |
| qog <- qog[order(qog$gdpc, decreasing = TRUE), ] | |
| # Get the five leader countries. | |
| head(qog)[1:4] | |
| # Now reorder the levels of the country code factor variable by GDP per capita. | |
| qog$ccode <- reorder(qog$ccode, qog$gdpc, mean) | |
| # In fact, the top 5-10% observations are just on a completely different level. | |
| qplot(data = qog, x = gdpc, stat = "ecdf", geom = "step") | |
| # This curve is the empirical cumulative distribution function (ECDF) of the | |
| # variable: it shows how its values change throughout its quantiles. | |
| # Here's the ECDF for each region: see which one creates most variation. | |
| qplot(data = qog, x = gdpc, stat = "ecdf", color = region, geom = "step") | |
| # Now see what happens to the ECDFs when you log the variable. | |
| qplot(data = qog, x = log(gdpc), stat = "ecdf", color = region, geom = "step") | |
| # The overall distribution of log-GDP per capita is more linear. | |
| qplot(data = qog, x = log(gdpc), stat = "ecdf", geom = "step") | |
| # Log-transform GDP per capita. | |
| qog$gdpc <- log(qog$gdpc) | |
| # Interval recodes | |
| # ---------------- | |
| # Let's turn to yet another variable: government expenditure as % of GDP. | |
| qplot(data = qog, x = region, y = gexp, color = region, geom = "boxplot") | |
| # The plots show great variability of government expenditure within regions. | |
| qplot(data = qog, x = gexp, color = region, fill = region, geom = "density") + | |
| facet_grid(region ~ .) | |
| # Let's try a 'Hans Rosling' genius plot: a stacked density plot by region. | |
| qplot(data = qog, x = gexp, ..density.., geom = "density", | |
| color = region, fill = region, alpha = I(.75), position = "stack") | |
| # However, this plot does not preserve the count of each region. This one does. | |
| qplot(data = qog, x = gexp, ..count.., geom = "density", | |
| color = region, fill = region, alpha = I(.75), position = "fill") | |
| # One option now is to recode this continuous variable to categories of low, | |
| # medium and high government expenditure, for instance. How do we get there? | |
| # Let's recode some continuous variables into categorical ones by dividing them | |
| # into two equally sized groups defined by their median (50th percentile) value. | |
| # Summarize the variable. | |
| summary(qog$gexp) | |
| # The quartiles and median are the 25th, 50th and 75th percentiles. They can be | |
| # used to "cut" the variable to intervals. First, compute the quartile values. | |
| q <- quantile(qog$gexp) | |
| # Check the quartile values. | |
| q | |
| # Plot regional distributions of government expenditures with sample quartiles. | |
| qplot(data = qog, x = gexp, color = region, fill = region, geom = "density") + | |
| geom_vline(x = q, linetype = "dashed") + facet_grid(region ~ .) | |
| # Now create a new variable by using these cutoff points as interval categories. | |
| qog$gexp.4 <- cut(qog$gexp, breaks = q) | |
| # Check the interval categories. | |
| table(qog$gexp.4) | |
| # Check them with boxplots. | |
| qplot(data = qog, y = gexp, x = gexp.4, geom = "boxplot") | |
| # Stack them by region. | |
| g <- qplot(data = qog, x = region, fill = gexp.4, geom = "bar") | |
| # Check result, which is unsatisfactory with default colors and missing values. | |
| g | |
| # Plot with blue sequential gradient. | |
| g + scale_fill_brewer() | |
| # Plot with red-blue diverging gradient. | |
| g + scale_fill_brewer(palette = "RdBu") | |
| # Back to the data. We are going to produce interval recodes for all variables, | |
| # using a very simple cutoff point for the values: above or below the median. | |
| # Define a function to cut the data at median and label segments "lo" and "hi". | |
| hilo <- function(x) { | |
| cut(x, breaks = quantile(x, probs = 0:2/2), labels = c("lo", "hi"), | |
| ordered = TRUE, include.lowest = TRUE) | |
| } | |
| # Apply to all continuous variables. | |
| x <- lapply(qog[, 3:10], FUN = hilo) | |
| # Rename variables with original names, followed by ".2" to discriminate them. | |
| names(x) <- paste0(names(x), ".2") | |
| # Convert to data frame and check result. | |
| str(x) | |
| # Add these variables to the original QOG data. | |
| qog <- cbind(qog, as.data.frame(x)) | |
| # Check final result. | |
| str(qog) | |
| # Let's tell R to look in the QOG data for any variable we mention subsequently. | |
| # The command saves us time and frees us from the need to call the qog object in | |
| # in front of every variable call (e.g. qog$births). To cancel the attachment, | |
| # we will type detach(qog) when were are done. | |
| attach(qog) | |
| # DESCRIPTIVE STATISTICS | |
| # ---------------------- | |
| # What's on the menu here? Take a variable like log-GDP per capita. This is the | |
| # distribution, a.k.a the probability density function (PDF) of the variable. | |
| qplot(gdpc, geom = "density") | |
| # It is easier to read it in natural units (constant USD), so exponentiate it. | |
| qplot(exp(gdpc), geom = "density") | |
| # And now have a look again at which countries compose the distribution. | |
| qplot(exp(gdpc), ..count.., geom = "density", | |
| color = region, fill = region, alpha = I(.75), position = "stack") | |
| # For such a variable, the mean and the median can describe the distribution. | |
| # Now turn again to some previously studied examples of categorical variables. | |
| # Frequencies of government expenditure in each region. | |
| qplot(x = region, fill = gexp.4, geom = "bar") | |
| # Plot stacked (relative) frequencies of government expenditure in each region. | |
| qplot(x = region, fill = gexp.4, geom = "bar", position = "fill") + | |
| scale_y_continuous(labels = percent) + scale_fill_brewer() | |
| # Here, frequencies and relative frequencies (percentages) make sense. | |
| # The next sections show to produce these summary statistics. | |
| # Continuous (and ordinal) measures | |
| # --------------------------------- | |
| # Standard summary statistics (the 'five-number summary' figures). | |
| summary(qog[, 3:9]) | |
| # Obtain summary statistics for the fertility rate: | |
| # (1) Range | |
| max(births) | |
| min(births) | |
| # (2) Arithmetic mean | |
| mean(births) | |
| sum(births) / length(births) # formula | |
| # (3) Standard deviation | |
| sd(births) | |
| sqrt(sum((births - mean(births))^2) / length(births)) # formula: sqrt(var(x)) | |
| # (4) Median | |
| median(births) | |
| quantile(births, probs = .5) # 50th percentile | |
| # (5) Percentiles | |
| quantile(births) # quartiles | |
| quantile(births, probs = c(0.1, 0.9)) # specific cutoff points | |
| # Plots for continuous variables | |
| # ------------------------------ | |
| # Histograms. | |
| hist(births, main="Histogram of Fertility in 81 countries") | |
| hist(births, br = 20) | |
| # For normality assessment, you first want to visualize the normal distribution. | |
| curve(dnorm, from = -3, to = 3, col = "red", lwd = 2) | |
| # Then you want to visualize the quantiles of your variable against normal ones. | |
| # Say hello to the normal quantile-quantile ('QQ') plot. | |
| qqnorm(births) | |
| # And finally you want to add a line that would correspond to perfect normality. | |
| # Note that this command requires that you have just plotted a normal QQ-plot. | |
| qqline(births, col = "red") | |
| # More default plots. | |
| stripchart(births, method = "jitter", main = "Stripchart with jitter") | |
| boxplot(births, main = "Boxplot") | |
| # For a continuous and a categorical variable, boxplots have an easy syntax. | |
| boxplot(births ~ region) | |
| boxplot(births ~ gini.2) | |
| # The next plots go further than the default R syntax. You need to learn ggplot2 | |
| # to get the syntax. An awesome handbook is Winston Chang's R Graphics Cookbook. | |
| # Histograms, using ggplot2 syntax. | |
| qplot(births) | |
| qplot(births, binwidth = 1) | |
| ggplot(qog, aes(x = births)) + | |
| geom_histogram(binwidth = 1, fill = "white", color = "black") | |
| # Density plots, using ggplot2 syntax. | |
| qplot(births, geom = "density") | |
| qplot(births, stat = "density", geom = "line") | |
| # Combined histogram and density plots. | |
| ggplot(qog, aes(x = births, y = ..density..)) + | |
| geom_histogram(binwidth = 1, fill = "cornsilk", color = "grey50") + | |
| geom_line(stat = "density", size = 2) | |
| # Normal QQ-plot. | |
| qplot(sample = births) | |
| qplot(sample = births, color = region) | |
| # Boxplots. | |
| qplot(y = births, x = region, geom = "boxplot") | |
| qplot(y = births, x = region, geom = "boxplot") + geom_point(alpha = .5) | |
| qplot(y = births, x = region, geom = "boxplot") + aes(fill = gris.2) | |
| qplot(y = births, x = region, geom = "boxplot") + aes(fill = gris.2) + coord_flip() | |
| # Violin plots. | |
| qplot(y = births, x = region, geom = "violin") + geom_point() | |
| qplot(y = births, x = region, geom = "violin") + geom_point(alpha = .5) | |
| # Categorical (and ordinal) measures | |
| # ---------------------------------- | |
| # Obtain frequencies (count and relative) for geographic regions: | |
| table(region) # frequencies by category | |
| prop.table(table(region)) # in proportions | |
| round(prop.table(table(region)), 1) # rounded to one decimal point | |
| round(100 * (prop.table(table(region)))) # shown in percentages | |
| # R was clearly not designed by people who use percentages a lot. | |
| # Let's write a function to do this in as less code as possible. | |
| percents <- function(r, c, which = 0, digits = 1) { | |
| if(which == "rows") which = 1 | |
| if(which == "cols") which = 2 | |
| if(which == "cell" | which == 0) which = NULL | |
| round(100 * prop.table(table(r, c), which), digits) | |
| } | |
| # Cell percentages. | |
| percents(region, births.2) | |
| # Check that the function works. | |
| sum(percents(region, births.2, "cell")) # this is the default | |
| # Row percentages. | |
| percents(region, births.2, "rows") # you can use 1 instead of "rows" | |
| # Column percentages. | |
| percents(region, births.2, "cols") # you can use 2 instead of "cols" | |
| # Plots for categorical variables | |
| # ------------------------------- | |
| # Default bar/dot plots, for categorical variables. | |
| barplot(table(region)) | |
| barplot(table(region), horiz = TRUE) | |
| dotchart(table(region), cex = 1) | |
| # For a pair of categorical variables, use mosaic plots. | |
| mosaicplot(table(region, births.2), col = TRUE) | |
| mosaicplot(table(gini.2, births.2), col = TRUE) | |
| # And finally, the infamous pie chart that makes R look like Excel 97. | |
| pie(table(region)) | |
| # Using ggplot2 syntax, we can show how to transform bars into polar coordinates | |
| # (which is, again, a perceptual nightmare for the reader). | |
| # Bar plots. | |
| qplot(region, geom = "bar") | |
| qplot(region, geom = "bar", fill = gexp.4) + coord_flip() | |
| # "This is a pie of my favourite bars" (Barney Stinson). | |
| qplot(region, geom = "bar", fill = gexp.4) + coord_polar() + | |
| scale_fill_brewer(palette = "RdBu") | |
| # Dot plots. | |
| qplot(region, geom = "point") | |
| # ASSOCIATION TESTS | |
| # ----------------- | |
| # Significance testing revolves around the idea that a statistic, whatever it | |
| # might stand for, can be estimated to be significantly different from zero. | |
| # One of the simplest forms of test, the t-test, is shown below. | |
| t.test(births) | |
| # The bounds of the test form a confidence interval: the range in which the | |
| # test estimates that the mean of the fertility rate is located. The width of | |
| # the interval depends on both the sample size and the level of confidence. | |
| c(t.test(births)$conf.int[1], mean(births), t.test(births)$conf.int[2]) | |
| # Confidence intervals | |
| # -------------------- | |
| # Here's the manual way to obtain a confidence interval, based on the assumption | |
| # that the data follows some approximation of the normal distribution. We first | |
| # compute the mean, standard deviation and N of fertility rates in each region. | |
| ci <- ddply(qog, .(region), summarise, | |
| mu = mean(births), | |
| sd = sd(births), | |
| n = length(births)) | |
| # We now turn to the standard error, which is an approximation of the standard | |
| # deviation of the data in the true population (each region), assuming that it | |
| # can be approximated from the sample standard deviation divided by sqrt(N). | |
| ci$se <- ci$sd / sqrt(ci$n) | |
| # We finally decide for a 95% level of confidence that corresponds roughly to | |
| # two standard errors around the mean. In a normal distribution, roughly 95% | |
| # of all observations fall within this range around the mean. | |
| ci$lo <- ci$mu - 1.96 * ci$se | |
| ci$hi <- ci$mu + 1.96 * ci$se | |
| # Plot 95% CIs. | |
| qplot(data = ci, x = region, y = mu, fill = region, stat = "identity", geom = "bar") + | |
| geom_errorbar(aes(ymax = hi, ymin = lo), width = .25) | |
| # Plot 99% CIs. | |
| qplot(data = ci, x = region, y = mu, fill = region, stat = "identity", geom = "bar") + | |
| geom_errorbar(aes(ymax = mu + 2.58 * se, ymin = mu - 2.58 * se), width = .25) | |
| # Throw all this code into a convenience function. | |
| getCI <- function(data, x, group, z = 1.96) { | |
| require(plyr) | |
| df <- with(data, data.frame(x, group, z, n = 1)) | |
| df <- na.omit(df) | |
| print(head(df)) | |
| ci <- ddply(df, .(group), summarise, | |
| mu = mean(x), | |
| sd = sd(x), | |
| n = sum(n), | |
| se = sd(x) / sqrt(sum(n)), | |
| lo = mean(x) - mean(z) * sd(x) / sqrt(sum(n)), | |
| hi = mean(x) + mean(z) * sd(x) / sqrt(sum(n))) | |
| return(ci) | |
| } | |
| # Test by getting 95% CIs for fertility in the high and low GINI country groups. | |
| ci <- getCI(qog, births, gini.2) | |
| # Plot the result. | |
| qplot(data = ci, x = group, y = mu, fill = group, stat = "identity", geom = "bar") + | |
| geom_errorbar(aes(ymax = hi, ymin = lo), width = .25) + | |
| labs(y = "Mean fertility rate", x = "Level of GINI coefficient") | |
| # Test by getting 95% CIs for fertility in the high and low GINI country groups. | |
| ci <- getCI(qog, births, gdpc.2) | |
| # Plot the result. | |
| qplot(data = ci, x = group, y = mu, fill = group, stat = "identity", geom = "bar") + | |
| geom_errorbar(aes(ymax = hi, ymin = lo), width = .25) + | |
| labs(y = "Mean fertility rate", x = "Level of GDP per capita") | |
| # In truth, this is typically the case where we would be graphing scatterplots. | |
| qplot(births, gdpc) + geom_smooth() | |
| # ... but that will have to wait one more week :) | |
| # With two categorical variables | |
| # ------------------------------ | |
| chisq.test(table(region, births.2)) # Chi-squared test of independence | |
| fisher.test(table(region, births.2)) # if cell counts < 5, Fisher's test is recommended | |
| chisq.test(table(gini.2, births.2)) | |
| chisq.test(table(gdpc.2, births.2)) | |
| chisq.test(table(gexp.2, births.2)) # etc. | |
| # With one continuous variable and a binary variable | |
| # -------------------------------------------------- | |
| t.test(births ~ gini.2) | |
| t.test(births ~ gdpc.2) | |
| t.test(births ~ gexp.2) # etc. | |
| detach(qog) | |
| # That's all for now! Enjoy your day. | |
| # 2013-02-18 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment