briatte · December 23, 2014 07:19
diff --git a/5_trust.r b/5_trust.r

 ## IDA Exercise 5
 ## --------------


 # This exercise replicates PCA plots using World Values Survey data on
 # aggregate confidence in public institutions.

 # The data is taken from the Quality of Government dataset. The functions
 # contain a mix of base functions and custom packages.


 # Packages
 # --------

 # Load/install packages.
 packages <- c("ape", "ClustOfVar", "cluster", "countrycode", "devtools", "FactoMineR", "foreign", "ggdendro", "ggplot2", "reshape")
 packages <- lapply(packages, FUN = function(x) {
  if (!require(x, character.only = TRUE)) {
    install.packages(x)
    library(x, character.only = TRUE)
  }
 })

 # install ggbiplot
 if(!"ggbiplot" %in% installed.packages()[, 1]) install_github("ggbiplot", "vqv")
 require(ggbiplot)


 # Worldwide fertility
 # -------------------

 # downloaded during last session:
 qog.data <- read.csv("data/qog.cs.csv", stringsAsFactors = FALSE)

 # drop duplicate observation:
 qog.data = subset(qog.data, cname != "Congo, Democratic Republic")

 # subset:
 qog <- with(qog.data, data.frame(
  fertility.rate  = wdi_fr,    # fertility rate
  gdp.per.capita  = wdi_gdpc,  # GDP per capita
  gov.expenditure = wdi_ge,    # government expenditures as % of GDP
  healthcare.exp  = wdi_hec,   # health expenditure per capita
  gini.inequality = wdi_gini,  # Gini coefficient of inequality
  female.school.years = bl_asy25f, # average female years of schooling
  female.school.ratio = wdi_gris,  # female to male ratio in schools
  women.in.parliament = gid_wip    # % of women in parliament
  ))

 # row names:
 rownames(qog) = gsub("\\(.*", "", qog.data$cname)

 # inspect:
 head(qog)

 # sample reduction:
 qog <- na.omit(qog)


 # Heatmap
 # -------

 # 1. refactor
 qog_hm = data.frame(qog, cname = row.names(qog))
 qog_hm$cname = with(qog_hm, reorder(cname, fertility.rate))

 # 2. melt
 qog_hm <- melt(qog_hm)
 qog_hm <- ddply(qog_hm, .(variable), transform, rescale = scale(value))

 # 3. plot
 p <- ggplot(qog_hm, aes(variable, cname)) +
  geom_tile(aes(fill = rescale), colour="white") +
  scale_fill_gradient(low = "white", high = "steelblue")

 # 4. theme
 p + theme_grey(base_size = 12) + 
  labs(x = NULL, y = NULL) + 
  scale_x_discrete(expand = c(0, 0)) + 
  scale_y_discrete(expand = c(0, 0)) + 
  theme(legend.position = "none",
        axis.text.x = element_text(size = 16, angle = 60, hjust = 1, colour = "grey50"))


 # Principal components
 # --------------------

 fit <- princomp(qog, cor=TRUE)

 # screeplot:
 plot(fit, type = "l")

 # loadings:
 loadings(fit)

 # biplot:
 biplot(fit)


 # Hierarchical Clustering
 # -----------------------

 # create Ward distance matrix
 d <- dist(qog, method = "euclidean")
 h <- hclust(d, method = "ward")

 # dendogram:
 plot(h)

 # cut 5 clusters:
 clus5 = cutree(h, 5)

 # plot with red borders:
 rect.hclust(h, k = 5, border = "red")

 # phylogenic plots with ape:
 h.phylo = as.phylo(h)
 h.color = c("#556270", "#4ECDC4", "#1B676B", "#FF6B6B", "#C44D58")[clus5]

 # basic fan plot:
 plot(h.phylo, type = "fan", tip.color = h.color, label.offset = 1)

 # size of labels set to (log) fertility rate:
 plot(h.phylo, type = "fan", tip.color = h.color, label.offset = 1,
     cex = log(qog$fertility.rate, 2.5))


 # k-means
 # -------

 # k-means for 5 clusters:
 k <- kmeans(qog, 5)

 # describe the matrix
 k

 # plot with the cluster package
 clusplot(qog, k$cluster, color = TRUE, shade = TRUE, labels = 2, lines = 0)


 # k-means with ggplot2:
 # ---------------------

 # 2-dimensional subset:
 k = qog[, c("fertility.rate", "female.school.years")]

 # k-means:
 k = kmeans(k, 3)

 # extract centers:
 centers = data.frame(k$centers)

 # stick clusters to data:
 qog$cluster = factor(k$cluster)

 # reintroduce country codes:
 qog$iso3c = countrycode(row.names(qog), "country.name", "iso3c")

 # plot with ggplot2:
 ggplot(data = qog, aes(y = fertility.rate, x = female.school.years, color = cluster)) + 
  geom_text(aes(label = iso3c)) +
  geom_point(data = centers, 
             aes(color = 'Center'), 
             size = 52, 
             alpha = .3, 
             show_guide = FALSE)


 # Confidence in public institutions
 # ---------------------------------

 # Quality of Government, Stata version.
 # Cheers to the OQG team and to the Swedish taxpayer.
 qog <- read.dta("data/qog.cs.dta")

 # Copy ISO-3C country code to row names.
 row.names(qog) <- qog$ccodealp

 # The next block takes advantage of the Stata format, which includes variable labels (short text descriptions). By default, the labels are stored as dataset attributes: the next code block copies them to the variable names and then subsets the data to variables featuring the keyword "confidence" in their description.

 # Copy Stata label over variable names.
 names(qog) <- attr(qog, "var.labels")

 # Subset to variables with "confidence" in their name.
 qog <- qog[, grepl("confidence", names(qog), ignore.case = TRUE)]

 # Clean up variable names.
 names(qog) <- gsub("Confidence: |the ", "", names(qog))

 # Remove a few columns with less observations.
 qog <- qog[, !names(qog) %in% c("European Union", "NATO")]

 # Subset to nonmissing data.
 qog <- na.omit(qog)

 # Finalized dataset.
 str(qog)

 # Countries considered.
 rownames(qog)

 # Extract regions.
 table(countrycode(rownames(qog), "iso3c", "region"))


 # PCA with FactoMineR
 # -------------------

 # The two-lines approach.
 qog.PCA <- PCA(qog, graph = FALSE)
 plot(qog.PCA)

 # with ggplot, you need the standard prcomp object:
 qog.prcomp <- prcomp(qog, scale = TRUE)

 # ggbiplot
 ggbiplot(qog.prcomp,
         labels = rownames(qog),
         groups = countrycode(rownames(qog), "iso3c", "continent"),
         ellipse = TRUE) + 
  coord_equal(xlim = c(-3, 3), ylim = c(-3, 3))


 # Hierarchical clustering with ClustOfVar
 # ---------------------------------------

 # The two-lines approach.
 qog.tree <- hclustvar(qog)
 plot(qog.tree)

 # ggplot2 requires the standard hclust object:
 qog.hclust <- hclust(dist(qog))

 # plot countries by similarity with ggdendro:
 ggdendrogram(qog.hclust, rotate = TRUE, size=4)


 # We'll let you build your examples in groups for the first class exercise.
 # 2013-06-21
diff --git a/6_chisq.r b/6_chisq.r
 ## IDA Session 6
 ## -------------

 packages <- c("gplots", "ggplot2", "Hmisc", "car", "gmodels")
 packages <- lapply(packages, FUN = function(x) {
  if (!require(x, character.only = TRUE)) {
    install.packages(x)
    library(x, character.only = TRUE)
  }
 })
 qog <- read.csv("data/qog_basic_cs.csv")

 # The list of variables is quasi-identical to the one used previously.
 # Let's simply add an extra variable coding for geographical regions.
 qog_b <- with(qog, data.frame(
  ccode  = ccodealp, # country three letter code
  cname  = cname,    # country name
  births = wdi_fr,   # fertility rate
  gdpc   = wdi_gdpc, # GDP per capita
  gexp   = wdi_ge,   # government expenditures as percentage of GDP
  hexpc  = wdi_hec,  # health care expenditure per capita
  gini   = uw_gini,  # GINI coefficient of inequality 
  edu    = bl_asyt25,# average years of schooling
  gris   = wdi_gris, # female to male ratio in schools
  winpar = m_wominpar, # percentage of women in parliament
  region = ht_region # region of country
 ))

 # Clean up missing data.
 qog_b <- na.omit(qog_b)
 # Clean up row names.
 rownames(qog_b) <- NULL
 # Reorder country factor levels by their respective fertility rates.
 qog_b$cname <- with(qog_b, reorder(cname, births))


 # Recodings
 # ---------

 #Let's give value labels to the region var
 qog_b$region<-factor(qog_b$region, levels=c(1,2,3,4,5,6,7,8,9,10), labels=c("EEur","LatAm","NAfr&MEast","SubSAfr","WEur&NAm","EAsia","SEAsia","SAsia","Pacif","Carib"))

 #Let's recode some vars into categorical variables
 #This will be useful for presenting some of the stats and graphs from this session
 #Recode all continuous vars into two equally sized categories for low and high levels
 qog_b$births2<-(cut(qog_b$births, breaks=quantile(qog_b$births,probs=seq(0,1, by=.5)), labels=c("l","h"), ordered=T, include.lowest=T))
 qog_b$gdpc2<-(cut(qog_b$gdpc, breaks=quantile(qog_b$gdpc,probs=seq(0,1, by=.5)), labels=c("l","h"), ordered=T, include.lowest=T))
 qog_b$gexp2<-(cut(qog_b$gexp, breaks=quantile(qog_b$gexp,probs=seq(0,1, by=.5)), labels=c("l","h"), ordered=T, include.lowest=T))
 qog_b$hexpc2<-(cut(qog_b$hexpc, breaks=quantile(qog_b$hexpc,probs=seq(0,1, by=.5)), labels=c("l","h"), ordered=T, include.lowest=T))
 qog_b$gini2<-(cut(qog_b$gini, breaks=quantile(qog_b$gini,probs=seq(0,1, by=.5)), labels=c("l","h"), ordered=T, include.lowest=T))
 qog_b$edu2<-(cut(qog_b$edu, breaks=quantile(qog_b$edu,probs=seq(0,1, by=.5)), labels=c("l","h"), ordered=T, include.lowest=T))
 qog_b$gris2<-(cut(qog_b$gris, breaks=quantile(qog_b$gris,probs=seq(0,1, by=.5)), labels=c("l","h"), ordered=T, include.lowest=T))
 qog_b$winpar2<-(cut(qog_b$winpar, breaks=quantile(qog_b$winpar,probs=seq(0,1, by=.5)), labels=c("l","h"), ordered=T, include.lowest=T))

 #Finally, let's tell R to look in the qog_b subset for any variable we mention subsequently
 #The command saves us time and frees us from the need to type "qog_b$" in front of every variable call
 #Note that the command is canceled as follows: detach(qog_b)
 attach(qog_b)

 #DESCRIPTIVE UNIVARIATE STATS
 #Basic commands for continuous (and ordinal) measures
 describe(qog_b)
 summary(qog_b)
 #You can obtain specific stats with the following commands
 max(births)
 min(births)
 mean(births)
 median(births)
 sd(births)
 quantile(births)
 quantile(births, probs=c(0.1, 0.9)) #for specific cut points

 #Basic commands for categorical (and ordinal) measures
 table(region) #frequencies by category
 prop.table(table(region)) #in proportions
 round(prop.table(table(region)), 2) #rounds to two decimals points
 round(100*(prop.table(table(region)))) #in %

 #GRAPHICS OF DISTRIBUTIONS
 #For continuous vars
 hist(births, main="Historgram of Fertility in 81 countries")
 hist(births, br=20)
 stripchart(births, method="jitter", main="Stripchart with Jitter")
 boxplot(births, main="Boxplot of Fertility in 81 countries")
 qqnorm(births)
 qqline(births)
 #For categorical vars
 barplot(table(region))
 barplot(table(region), horiz=T)
 dotchart(table(region), cex=1)
 pie(table(region))

 #DESCRIPTIVE BIVARIATE STATS
 #For a pair of categorical vars
 table(region, births2) #frequencies
 round(100*(prop.table(table(region, births2))),2) #in total %
 round(100*(prop.table(table(region, births2),1)),2) #in row %
 round(100*(prop.table(table(region, births2),2)),2) #in column %

 #GRAPHICS OF BIVARIATE ASSOCIATIONS
 #For a pair of categorical vars
 mosaicplot(table(region, births2), col=T)
 mosaicplot(table(gini2, births2), col=T)
 #For a continuous and a categorical vars
 boxplot(births~region)
 qplot(region, births, geom=c("boxplot","point"))
 boxplot(births~gini2)
 qplot(gini2, births, geom=c("boxplot","point"))
 qplot(region, births, geom="boxplot", fill=gini2)

 #ASSOCIATION TESTS
 #Test of independence of a pair of categorical variables
 chisq.test(table(region, births2)) #chi-squared test of independence
 fisher.test(table(region, births2)) #for vars with categories<5 observations, this test is more appropriate
 chisq.test(table(gini2, births2))
 chisq.test(table(gdpc2, births2))
 chisq.test(table(gexp2, births2)) #etc.

 #Test of independence of a binary and a continuous variables
 t.test(births~gini2)
 t.test(births~gdpc2)
 t.test(births~gexp2) #etc.

 #CONFIDENCE INTERVALS
 t.test(births)
 #The barplots below don't work for region b/c there are categories with too few cases to estimate the conf. intervals
 b_gini_mean<-tapply(births, gini2, mean)
 lower<-tapply(births, gini2, function(v) t.test(v)$conf.int[1])
 upper<-tapply(births, gini2, function(v) t.test(v)$conf.int[2])
 barplot2(b_gini_mean, plot.ci=T, ci.l=lower, ci.u=upper)

 b_gdpc_mean<-tapply(births, gdpc2, mean)
 lower<-tapply(births, gdpc2, function(v) t.test(v)$conf.int[1])
 upper<-tapply(births, gdpc2, function(v) t.test(v)$conf.int[2])
 barplot2(b_gdpc_mean, plot.ci=T, ci.l=lower, ci.u=upper)

 b_gexp_mean<-tapply(births, gexp2, mean)
 lower<-tapply(births, gexp2, function(v) t.test(v)$conf.int[1])
 upper<-tapply(births, gexp2, function(v) t.test(v)$conf.int[2])
 barplot2(b_gexp_mean, plot.ci=T, ci.l=lower, ci.u=upper)

 b_gris_mean<-tapply(births, gris2, mean)
 lower<-tapply(births, gris2, function(v) t.test(v)$conf.int[1])
 upper<-tapply(births, gris2, function(v) t.test(v)$conf.int[2])
 barplot2(b_gris_mean, plot.ci=T, ci.l=lower, ci.u=upper)

 b_winpar_mean<-tapply(births, winpar2, mean)
 lower<-tapply(births, winpar2, function(v) t.test(v)$conf.int[1])
 upper<-tapply(births, winpar2, function(v) t.test(v)$conf.int[2])
 barplot2(b_winpar_mean, plot.ci=T, ci.l=lower, ci.u=upper)
diff --git a/7_ttest.r b/7_ttest.r

 ## IDA Session 7
 ## -------------


 # Package loading
 # ---------------

 # Load/install packages.
 packages <- c("countrycode", "ggplot2", "Hmisc", "plyr", "scales")
 packages <- lapply(packages, FUN = function(x) {
  if (!require(x, character.only = TRUE)) {
    install.packages(x)
    library(x, character.only = TRUE)
  }
 })


 # DATA PREPARATION
 # ----------------

 # Let's first review some of the things that we now know how to do with data.

 # Load dataset.
 qog_cs <- read.csv("data/qog_basic_cs.csv")

 # Create an extract of the QOG cross-sectional dataset. The list of variables is
 # quasi-identical to the one used previously, with an extra variable for regions.
 qog <- with(qog_cs, data.frame(
  row.names  = cname, # Country name (set as row name in the data extract)
  ccode  = ccodealp,  # Country 3-letter code
  region = ht_region, # Geographical region of the country
  births = wdi_fr,    # Fertility rate
  gdpc   = wdi_gdpc,  # GDP per capita
  gexp   = wdi_ge,    # Government expenditures as percentage of GDP
  hexpc  = wdi_hec,   # Health care expenditure per capita
  gini   = uw_gini,   # GINI coefficient of economic inequality 
  edu    = bl_asyt25, # Average years of schooling
  gris   = wdi_gris,  # Female to male ratio in schools
  winpar = m_wominpar # Percentage of women in parliament
 ))


 # Geographic indicators
 # ---------------------

 # Let's now see what kind of geographic descriptors we have in the dataset.
 # The first one is a list of ISO-3 country codes, an international standard.
 levels(qog$ccode)

 # This list has been supplemented, in the QOG dataset, by a geographic variable
 # combining spatial proximity and expert decisions on regional democratization.
 str(qog$region)

 # An issue here is that we imported the data without its labels, so the regions
 # are uninformative. But with ISO-3 country codes, we can recreate UN regions.
 table(countrycode(qog$ccode, "iso3c", "region"))

 # We save the UN continent instead to create groups with many countries each.
 qog$region <- with(qog, countrycode(ccode, "iso3c", "continent"))

 # Check result.
 qplot(qog$region, geom = "bar")


 # Missing data
 # ------------

 # Some entries in the QOG dataset are not recognized as UN countries.
 rownames(qog)[is.na(qog$region)]

 # Drop these cases, that have been counted in other country indicators.
 qog <- subset(qog, !is.na(region))

 # The analysis that we will run today also runs faster without missing data.
 # Let's identify the rows with missing data.
 qog$fulldata <- complete.cases(qog)

 # Check result. The complete.cases() function is a utility function that will
 # return TRUE if there is absolutely no missing data on a row of the dataset.
 table(qog$fulldata)

 # Identify where the missing data lies. There is a lot, so we will remember that
 # when we look at plots and reading the results of statistical tests.
 qplot(data = qog, x = region, fill = fulldata, geom = "bar")

 # Subset to full data rows.
 qog <- na.omit(qog)


 # Reordering factors
 # ------------------

 # In R, sorting the data is different from ordering levels in a variable.
 # Let's see how to sort the data from highest to lowest fertility rates.
 by_births <- order(qog$births, decreasing = TRUE)

 # Show the first three columns in that sorting order.
 head(qog[by_births, 1:3])

 # For ascending sort order, you would simply not specify the decreasing option.
 head(qog[order(qog$births), 1:3])

 # Countries for which fertility rates are missing are always sorted last.
 tail(qog[order(qog$births), 1:3])

 # But this is different from the order of a string variable like country codes.
 # The country code variable holds text, coded as factors and ordered by levels.
 str(qog$ccode)

 # An issue here is that R naturally orders factors by alphabetical order, which
 # is not optimal for ordering categorical variables like countries in plots.

 # Here's an example where countries are alphabetically ordered.
 qplot(data = qog[1:25, ], y = ccode, x = births, color = region, geom = "point")

 # Now, reorder country code factor levels by their respective fertility rates.
 qog$ccode <- with(qog, reorder(ccode, births, mean))

 # The example now shows countries ordered by fertility rates.
 qplot(data = qog[1:25, ], y = ccode, x = births, color = region, geom = "point")

 # Let's now visualize the distribution of fertility rates in each region.
 qplot(data = qog, x = births, fill = region, color = region, geom = "density") +
  facet_grid(region ~ .)

 # The regions are shown in alphabetical order. Let's fix that.
 qog$region <- with(qog, reorder(region, -births, mean))

 # Rerun the plot to see regions ordered by average fertility rates.
 qplot(data = qog, x = births, fill = region, color = region, geom = "density") +
  facet_grid(region ~ .)

 # These manipulations show how to plot distributions as densities and how to use
 # the mean value of a distribution to summarize it. More after our last recodes.


 # Log transformation
 # ------------------

 # Let's turn to a different variable: GDP per capita.
 qplot(data = qog, x = region, y = gdpc, color = region, geom = "boxplot")

 # For such a variable, a log-transformation helps.
 qplot(data = qog, x = region, y = log(gdpc), color = region, geom = "boxplot")

 # What exactly are we doing here? We are basically compressing the distribution
 # of the variable to get outliers in closer range to other observations.

 # Let's look at the distribution of GDP per capita more closely to get that.
 summary(qog$gdpc)

 # The maximum value is pretty far away from the median value.
 with(qog, max(gdpc) / median(gdpc))

 # Order the data by GDP per capita.
 qog <- qog[order(qog$gdpc, decreasing = TRUE), ]

 # Get the five leader countries.
 head(qog)[1:4]

 # Now reorder the levels of the country code factor variable by GDP per capita.
 qog$ccode <- reorder(qog$ccode, qog$gdpc, mean)

 # In fact, the top 5-10% observations are just on a completely different level.
 qplot(data = qog, x = gdpc, stat = "ecdf", geom = "step")

 # This curve is the empirical cumulative distribution function (ECDF) of the
 # variable: it shows how its values change throughout its quantiles.

 # Here's the ECDF for each region: see which one creates most variation.
 qplot(data = qog, x = gdpc, stat = "ecdf", color = region, geom = "step")

 # Now see what happens to the ECDFs when you log the variable.
 qplot(data = qog, x = log(gdpc), stat = "ecdf", color = region, geom = "step")

 # The overall distribution of log-GDP per capita is more linear.
 qplot(data = qog, x = log(gdpc), stat = "ecdf", geom = "step")

 # Log-transform GDP per capita.
 qog$gdpc <- log(qog$gdpc)


 # Interval recodes
 # ----------------

 # Let's turn to yet another variable: government expenditure as % of GDP.
 qplot(data = qog, x = region, y = gexp, color = region, geom = "boxplot")

 # The plots show great variability of government expenditure within regions.
 qplot(data = qog, x = gexp, color = region, fill = region, geom = "density") + 
  facet_grid(region ~ .)

 # Let's try a 'Hans Rosling' genius plot: a stacked density plot by region.
 qplot(data = qog, x = gexp, ..density.., geom = "density", 
      color = region, fill = region, alpha = I(.75), position = "stack")

 # However, this plot does not preserve the count of each region. This one does.
 qplot(data = qog, x = gexp, ..count.., geom = "density", 
      color = region, fill = region, alpha = I(.75), position = "fill")

 # One option now is to recode this continuous variable to categories of low,
 # medium and high government expenditure, for instance. How do we get there?

 # Let's recode some continuous variables into categorical ones by dividing them
 # into two equally sized groups defined by their median (50th percentile) value.

 # Summarize the variable.
 summary(qog$gexp)

 # The quartiles and median are the 25th, 50th and 75th percentiles. They can be
 # used to "cut" the variable to intervals. First, compute the quartile values.
 q <- quantile(qog$gexp)

 # Check the quartile values.
 q

 # Plot regional distributions of government expenditures with sample quartiles.
 qplot(data = qog, x = gexp, color = region, fill = region, geom = "density") + 
  geom_vline(x = q, linetype = "dashed") + facet_grid(region ~ .)

 # Now create a new variable by using these cutoff points as interval categories.
 qog$gexp.4 <- cut(qog$gexp, breaks = q)

 # Check the interval categories. 
 table(qog$gexp.4)

 # Check them with boxplots.
 qplot(data = qog, y = gexp, x = gexp.4, geom = "boxplot")

 # Stack them by region.
 g <- qplot(data = qog, x = region, fill = gexp.4, geom = "bar")

 # Check result, which is unsatisfactory with default colors and missing values.
 g

 # Plot with blue sequential gradient. 
 g + scale_fill_brewer()

 # Plot with red-blue diverging gradient.
 g + scale_fill_brewer(palette = "RdBu")

 # Back to the data. We are going to produce interval recodes for all variables,
 # using a very simple cutoff point for the values: above or below the median.

 # Define a function to cut the data at median and label segments "lo" and "hi".
 hilo <- function(x) { 
  cut(x, breaks = quantile(x, probs = 0:2/2), labels = c("lo", "hi"), 
      ordered = TRUE, include.lowest = TRUE)
 }

 # Apply to all continuous variables.
 x <- lapply(qog[, 3:10], FUN = hilo)

 # Rename variables with original names, followed by ".2" to discriminate them.
 names(x) <- paste0(names(x), ".2")

 # Convert to data frame and check result.
 str(x)

 # Add these variables to the original QOG data.
 qog <- cbind(qog, as.data.frame(x))

 # Check final result.
 str(qog)

 # Let's tell R to look in the QOG data for any variable we mention subsequently.
 # The command saves us time and frees us from the need to call the qog object in
 # in front of every variable call (e.g. qog$births). To cancel the attachment,
 # we will type detach(qog) when were are done.

 attach(qog)


 # DESCRIPTIVE STATISTICS
 # ----------------------

 # What's on the menu here? Take a variable like log-GDP per capita. This is the
 # distribution, a.k.a the probability density function (PDF) of the variable.
 qplot(gdpc, geom = "density")

 # It is easier to read it in natural units (constant USD), so exponentiate it.
 qplot(exp(gdpc), geom = "density")

 # And now have a look again at which countries compose the distribution.
 qplot(exp(gdpc), ..count.., geom = "density", 
      color = region, fill = region, alpha = I(.75), position = "stack")

 # For such a variable, the mean and the median can describe the distribution.
 # Now turn again to some previously studied examples of categorical variables.

 # Frequencies of government expenditure in each region.
 qplot(x = region, fill = gexp.4, geom = "bar")

 # Plot stacked (relative) frequencies of government expenditure in each region.
 qplot(x = region, fill = gexp.4, geom = "bar", position = "fill") + 
  scale_y_continuous(labels = percent) + scale_fill_brewer()

 # Here, frequencies and relative frequencies (percentages) make sense.
 # The next sections show to produce these summary statistics.


 # Continuous (and ordinal) measures
 # ---------------------------------

 # Standard summary statistics (the 'five-number summary' figures).
 summary(qog[, 3:9])

 # Obtain summary statistics for the fertility rate:

 # (1) Range
 max(births)
 min(births)

 # (2) Arithmetic mean
 mean(births)
 sum(births) / length(births) # formula

 # (3) Standard deviation
 sd(births)
 sqrt(sum((births - mean(births))^2) / length(births)) # formula: sqrt(var(x))

 # (4) Median
 median(births)
 quantile(births, probs = .5) # 50th percentile

 # (5) Percentiles
 quantile(births) # quartiles
 quantile(births, probs = c(0.1, 0.9)) # specific cutoff points


 # Plots for continuous variables
 # ------------------------------

 # Histograms.
 hist(births, main="Histogram of Fertility in 81 countries")
 hist(births, br = 20)

 # For normality assessment, you first want to visualize the normal distribution.
 curve(dnorm, from = -3, to = 3, col = "red", lwd = 2)

 # Then you want to visualize the quantiles of your variable against normal ones.
 # Say hello to the normal quantile-quantile ('QQ') plot.
 qqnorm(births)

 # And finally you want to add a line that would correspond to perfect normality.
 # Note that this command requires that you have just plotted a normal QQ-plot.
 qqline(births, col = "red")

 # More default plots.
 stripchart(births, method = "jitter", main = "Stripchart with jitter")
 boxplot(births, main = "Boxplot")

 # For a continuous and a categorical variable, boxplots have an easy syntax.
 boxplot(births ~ region)
 boxplot(births ~ gini.2)

 # The next plots go further than the default R syntax. You need to learn ggplot2
 # to get the syntax. An awesome handbook is Winston Chang's R Graphics Cookbook.

 # Histograms, using ggplot2 syntax.
 qplot(births)
 qplot(births, binwidth = 1)
 ggplot(qog, aes(x = births)) +
  geom_histogram(binwidth = 1, fill = "white", color = "black")

 # Density plots, using ggplot2 syntax.
 qplot(births, geom = "density")
 qplot(births, stat = "density", geom = "line")

 # Combined histogram and density plots.
 ggplot(qog, aes(x = births, y = ..density..)) +
  geom_histogram(binwidth = 1, fill = "cornsilk", color = "grey50") +
  geom_line(stat = "density", size = 2)

 # Normal QQ-plot.
 qplot(sample = births)
 qplot(sample = births, color = region)

 # Boxplots.
 qplot(y = births, x = region, geom = "boxplot")
 qplot(y = births, x = region, geom = "boxplot") + geom_point(alpha = .5)
 qplot(y = births, x = region, geom = "boxplot") + aes(fill = gris.2)
 qplot(y = births, x = region, geom = "boxplot") + aes(fill = gris.2) + coord_flip()

 # Violin plots.
 qplot(y = births, x = region, geom = "violin") + geom_point()
 qplot(y = births, x = region, geom = "violin") + geom_point(alpha = .5)


 # Categorical (and ordinal) measures
 # ----------------------------------

 # Obtain frequencies (count and relative) for geographic regions:

 table(region)                            # frequencies by category
 prop.table(table(region))                # in proportions
 round(prop.table(table(region)), 1)      # rounded to one decimal point
 round(100 * (prop.table(table(region)))) # shown in percentages

 # R was clearly not designed by people who use percentages a lot.
 # Let's write a function to do this in as less code as possible.

 percents <- function(r, c, which = 0, digits = 1) {
  if(which == "rows") which = 1
  if(which == "cols") which = 2
  if(which == "cell" | which == 0) which = NULL
  round(100 * prop.table(table(r, c), which), digits)
 }

 # Cell percentages.
 percents(region, births.2)

 # Check that the function works.
 sum(percents(region, births.2, "cell")) # this is the default

 # Row percentages.
 percents(region, births.2, "rows") # you can use 1 instead of "rows"

 # Column percentages.
 percents(region, births.2, "cols") # you can use 2 instead of "cols"


 # Plots for categorical variables
 # -------------------------------

 # Default bar/dot plots, for categorical variables.
 barplot(table(region))
 barplot(table(region), horiz = TRUE)
 dotchart(table(region), cex = 1)

 # For a pair of categorical variables, use mosaic plots.
 mosaicplot(table(region, births.2), col = TRUE)
 mosaicplot(table(gini.2, births.2), col = TRUE)

 # And finally, the infamous pie chart that makes R look like Excel 97.
 pie(table(region))

 # Using ggplot2 syntax, we can show how to transform bars into polar coordinates
 # (which is, again, a perceptual nightmare for the reader).

 # Bar plots.
 qplot(region, geom = "bar")
 qplot(region, geom = "bar", fill = gexp.4) + coord_flip()

 # "This is a pie of my favourite bars" (Barney Stinson).
 qplot(region, geom = "bar", fill = gexp.4) + coord_polar() + 
  scale_fill_brewer(palette = "RdBu")

 # Dot plots.
 qplot(region, geom = "point")


 # ASSOCIATION TESTS
 # -----------------


 # Significance testing revolves around the idea that a statistic, whatever it
 # might stand for, can be estimated to be significantly different from zero.
 # One of the simplest forms of test, the t-test, is shown below.
 t.test(births)

 # The bounds of the test form a confidence interval: the range in which the
 # test estimates that the mean of the fertility rate is located. The width of
 # the interval depends on both the sample size and the level of confidence.
 c(t.test(births)$conf.int[1], mean(births), t.test(births)$conf.int[2])


 # Confidence intervals
 # --------------------

 # Here's the manual way to obtain a confidence interval, based on the assumption
 # that the data follows some approximation of the normal distribution. We first
 # compute the mean, standard deviation and N of fertility rates in each region.
 ci <- ddply(qog, .(region), summarise, 
      mu = mean(births), 
      sd = sd(births), 
      n  = length(births))

 # We now turn to the standard error, which is an approximation of the standard
 # deviation of the data in the true population (each region), assuming that it
 # can be approximated from the sample standard deviation divided by sqrt(N).
 ci$se <- ci$sd / sqrt(ci$n) 

 # We finally decide for a 95% level of confidence that corresponds roughly to
 # two standard errors around the mean. In a normal distribution, roughly 95%
 # of all observations fall within this range around the mean.
 ci$lo <- ci$mu - 1.96 * ci$se
 ci$hi <- ci$mu + 1.96 * ci$se

 # Plot 95% CIs.
 qplot(data = ci, x = region, y = mu, fill = region, stat = "identity", geom = "bar") + 
  geom_errorbar(aes(ymax = hi, ymin = lo), width = .25)

 # Plot 99% CIs.
 qplot(data = ci, x = region, y = mu, fill = region, stat = "identity", geom = "bar") + 
  geom_errorbar(aes(ymax = mu + 2.58 * se, ymin = mu - 2.58 * se), width = .25)

 # Throw all this code into a convenience function.
 getCI <- function(data, x, group, z = 1.96) {
  require(plyr)
  df <- with(data, data.frame(x, group, z, n = 1))
  df <- na.omit(df)
  print(head(df))
  ci <- ddply(df, .(group), summarise,
              mu = mean(x),
              sd = sd(x),
              n  = sum(n),
              se = sd(x) / sqrt(sum(n)),
              lo = mean(x) - mean(z) * sd(x) / sqrt(sum(n)),
              hi = mean(x) + mean(z) * sd(x) / sqrt(sum(n)))
  return(ci)
 }

 # Test by getting 95% CIs for fertility in the high and low GINI country groups.
 ci <- getCI(qog, births, gini.2)

 # Plot the result.
 qplot(data = ci, x = group, y = mu, fill = group, stat = "identity", geom = "bar") + 
  geom_errorbar(aes(ymax = hi, ymin = lo), width = .25) +
  labs(y = "Mean fertility rate", x = "Level of GINI coefficient")

 # Test by getting 95% CIs for fertility in the high and low GINI country groups.
 ci <- getCI(qog, births, gdpc.2)

 # Plot the result.
 qplot(data = ci, x = group, y = mu, fill = group, stat = "identity", geom = "bar") + 
  geom_errorbar(aes(ymax = hi, ymin = lo), width = .25) +
  labs(y = "Mean fertility rate", x = "Level of GDP per capita")

 # In truth, this is typically the case where we would be graphing scatterplots.
 qplot(births, gdpc) + geom_smooth()

 # ... but that will have to wait one more week :)


 # With two categorical variables
 # ------------------------------

 chisq.test(table(region, births.2))  # Chi-squared test of independence
 fisher.test(table(region, births.2)) # if cell counts < 5, Fisher's test is recommended

 chisq.test(table(gini.2, births.2))
 chisq.test(table(gdpc.2, births.2))
 chisq.test(table(gexp.2, births.2)) # etc.

 # With one continuous variable and a binary variable
 # --------------------------------------------------

 t.test(births ~ gini.2)
 t.test(births ~ gdpc.2)
 t.test(births ~ gexp.2) # etc.

 detach(qog)


 # That's all for now! Enjoy your day.
 # 2013-02-18

	## IDA Exercise 5
	## --------------


	# This exercise replicates PCA plots using World Values Survey data on
	# aggregate confidence in public institutions.

	# The data is taken from the Quality of Government dataset. The functions
	# contain a mix of base functions and custom packages.


	# Packages
	# --------

	# Load/install packages.
	packages <- c("ape", "ClustOfVar", "cluster", "countrycode", "devtools", "FactoMineR", "foreign", "ggdendro", "ggplot2", "reshape")
	packages <- lapply(packages, FUN = function(x) {
	if (!require(x, character.only = TRUE)) {
	install.packages(x)
	library(x, character.only = TRUE)
	}
	})

	# install ggbiplot
	if(!"ggbiplot" %in% installed.packages()[, 1]) install_github("ggbiplot", "vqv")
	require(ggbiplot)


	# Worldwide fertility
	# -------------------

	# downloaded during last session:
	qog.data <- read.csv("data/qog.cs.csv", stringsAsFactors = FALSE)

	# drop duplicate observation:
	qog.data = subset(qog.data, cname != "Congo, Democratic Republic")

	# subset:
	qog <- with(qog.data, data.frame(
	fertility.rate = wdi_fr, # fertility rate
	gdp.per.capita = wdi_gdpc, # GDP per capita
	gov.expenditure = wdi_ge, # government expenditures as % of GDP
	healthcare.exp = wdi_hec, # health expenditure per capita
	gini.inequality = wdi_gini, # Gini coefficient of inequality
	female.school.years = bl_asy25f, # average female years of schooling
	female.school.ratio = wdi_gris, # female to male ratio in schools
	women.in.parliament = gid_wip # % of women in parliament
	))

	# row names:
	rownames(qog) = gsub("\\(.*", "", qog.data$cname)

	# inspect:
	head(qog)

	# sample reduction:
	qog <- na.omit(qog)


	# Heatmap
	# -------

	# 1. refactor
	qog_hm = data.frame(qog, cname = row.names(qog))
	qog_hm$cname = with(qog_hm, reorder(cname, fertility.rate))

	# 2. melt
	qog_hm <- melt(qog_hm)
	qog_hm <- ddply(qog_hm, .(variable), transform, rescale = scale(value))

	# 3. plot
	p <- ggplot(qog_hm, aes(variable, cname)) +
	geom_tile(aes(fill = rescale), colour="white") +
	scale_fill_gradient(low = "white", high = "steelblue")

	# 4. theme
	p + theme_grey(base_size = 12) +
	labs(x = NULL, y = NULL) +
	scale_x_discrete(expand = c(0, 0)) +
	scale_y_discrete(expand = c(0, 0)) +
	theme(legend.position = "none",
	axis.text.x = element_text(size = 16, angle = 60, hjust = 1, colour = "grey50"))


	# Principal components
	# --------------------

	fit <- princomp(qog, cor=TRUE)

	# screeplot:
	plot(fit, type = "l")

	# loadings:
	loadings(fit)

	# biplot:
	biplot(fit)


	# Hierarchical Clustering
	# -----------------------

	# create Ward distance matrix
	d <- dist(qog, method = "euclidean")
	h <- hclust(d, method = "ward")

	# dendogram:
	plot(h)

	# cut 5 clusters:
	clus5 = cutree(h, 5)

	# plot with red borders:
	rect.hclust(h, k = 5, border = "red")

	# phylogenic plots with ape:
	h.phylo = as.phylo(h)
	h.color = c("#556270", "#4ECDC4", "#1B676B", "#FF6B6B", "#C44D58")[clus5]

	# basic fan plot:
	plot(h.phylo, type = "fan", tip.color = h.color, label.offset = 1)

	# size of labels set to (log) fertility rate:
	plot(h.phylo, type = "fan", tip.color = h.color, label.offset = 1,
	cex = log(qog$fertility.rate, 2.5))


	# k-means
	# -------

	# k-means for 5 clusters:
	k <- kmeans(qog, 5)

	# describe the matrix
	k

	# plot with the cluster package
	clusplot(qog, k$cluster, color = TRUE, shade = TRUE, labels = 2, lines = 0)


	# k-means with ggplot2:
	# ---------------------

	# 2-dimensional subset:
	k = qog[, c("fertility.rate", "female.school.years")]

	# k-means:
	k = kmeans(k, 3)

	# extract centers:
	centers = data.frame(k$centers)

	# stick clusters to data:
	qog$cluster = factor(k$cluster)

	# reintroduce country codes:
	qog$iso3c = countrycode(row.names(qog), "country.name", "iso3c")

	# plot with ggplot2:
	ggplot(data = qog, aes(y = fertility.rate, x = female.school.years, color = cluster)) +
	geom_text(aes(label = iso3c)) +
	geom_point(data = centers,
	aes(color = 'Center'),
	size = 52,
	alpha = .3,
	show_guide = FALSE)


	# Confidence in public institutions
	# ---------------------------------

	# Quality of Government, Stata version.
	# Cheers to the OQG team and to the Swedish taxpayer.
	qog <- read.dta("data/qog.cs.dta")

	# Copy ISO-3C country code to row names.
	row.names(qog) <- qog$ccodealp

	# The next block takes advantage of the Stata format, which includes variable labels (short text descriptions). By default, the labels are stored as dataset attributes: the next code block copies them to the variable names and then subsets the data to variables featuring the keyword "confidence" in their description.

	# Copy Stata label over variable names.
	names(qog) <- attr(qog, "var.labels")

	# Subset to variables with "confidence" in their name.
	qog <- qog[, grepl("confidence", names(qog), ignore.case = TRUE)]

	# Clean up variable names.
	names(qog) <- gsub("Confidence: \|the ", "", names(qog))

	# Remove a few columns with less observations.
	qog <- qog[, !names(qog) %in% c("European Union", "NATO")]

	# Subset to nonmissing data.
	qog <- na.omit(qog)

	# Finalized dataset.
	str(qog)

	# Countries considered.
	rownames(qog)

	# Extract regions.
	table(countrycode(rownames(qog), "iso3c", "region"))


	# PCA with FactoMineR
	# -------------------

	# The two-lines approach.
	qog.PCA <- PCA(qog, graph = FALSE)
	plot(qog.PCA)

	# with ggplot, you need the standard prcomp object:
	qog.prcomp <- prcomp(qog, scale = TRUE)

	# ggbiplot
	ggbiplot(qog.prcomp,
	labels = rownames(qog),
	groups = countrycode(rownames(qog), "iso3c", "continent"),
	ellipse = TRUE) +
	coord_equal(xlim = c(-3, 3), ylim = c(-3, 3))


	# Hierarchical clustering with ClustOfVar
	# ---------------------------------------

	# The two-lines approach.
	qog.tree <- hclustvar(qog)
	plot(qog.tree)

	# ggplot2 requires the standard hclust object:
	qog.hclust <- hclust(dist(qog))

	# plot countries by similarity with ggdendro:
	ggdendrogram(qog.hclust, rotate = TRUE, size=4)


	# We'll let you build your examples in groups for the first class exercise.
	# 2013-06-21
	## IDA Session 6
	## -------------

	packages <- c("gplots", "ggplot2", "Hmisc", "car", "gmodels")
	packages <- lapply(packages, FUN = function(x) {
	if (!require(x, character.only = TRUE)) {
	install.packages(x)
	library(x, character.only = TRUE)
	}
	})
	qog <- read.csv("data/qog_basic_cs.csv")

	# The list of variables is quasi-identical to the one used previously.
	# Let's simply add an extra variable coding for geographical regions.
	qog_b <- with(qog, data.frame(
	ccode = ccodealp, # country three letter code
	cname = cname, # country name
	births = wdi_fr, # fertility rate
	gdpc = wdi_gdpc, # GDP per capita
	gexp = wdi_ge, # government expenditures as percentage of GDP
	hexpc = wdi_hec, # health care expenditure per capita
	gini = uw_gini, # GINI coefficient of inequality
	edu = bl_asyt25,# average years of schooling
	gris = wdi_gris, # female to male ratio in schools
	winpar = m_wominpar, # percentage of women in parliament
	region = ht_region # region of country
	))

	# Clean up missing data.
	qog_b <- na.omit(qog_b)
	# Clean up row names.
	rownames(qog_b) <- NULL
	# Reorder country factor levels by their respective fertility rates.
	qog_b$cname <- with(qog_b, reorder(cname, births))


	# Recodings
	# ---------

	#Let's give value labels to the region var
	qog_b$region<-factor(qog_b$region, levels=c(1,2,3,4,5,6,7,8,9,10), labels=c("EEur","LatAm","NAfr&MEast","SubSAfr","WEur&NAm","EAsia","SEAsia","SAsia","Pacif","Carib"))

	#Let's recode some vars into categorical variables
	#This will be useful for presenting some of the stats and graphs from this session
	#Recode all continuous vars into two equally sized categories for low and high levels
	qog_b$births2<-(cut(qog_b$births, breaks=quantile(qog_b$births,probs=seq(0,1, by=.5)), labels=c("l","h"), ordered=T, include.lowest=T))
	qog_b$gdpc2<-(cut(qog_b$gdpc, breaks=quantile(qog_b$gdpc,probs=seq(0,1, by=.5)), labels=c("l","h"), ordered=T, include.lowest=T))
	qog_b$gexp2<-(cut(qog_b$gexp, breaks=quantile(qog_b$gexp,probs=seq(0,1, by=.5)), labels=c("l","h"), ordered=T, include.lowest=T))
	qog_b$hexpc2<-(cut(qog_b$hexpc, breaks=quantile(qog_b$hexpc,probs=seq(0,1, by=.5)), labels=c("l","h"), ordered=T, include.lowest=T))
	qog_b$gini2<-(cut(qog_b$gini, breaks=quantile(qog_b$gini,probs=seq(0,1, by=.5)), labels=c("l","h"), ordered=T, include.lowest=T))
	qog_b$edu2<-(cut(qog_b$edu, breaks=quantile(qog_b$edu,probs=seq(0,1, by=.5)), labels=c("l","h"), ordered=T, include.lowest=T))
	qog_b$gris2<-(cut(qog_b$gris, breaks=quantile(qog_b$gris,probs=seq(0,1, by=.5)), labels=c("l","h"), ordered=T, include.lowest=T))
	qog_b$winpar2<-(cut(qog_b$winpar, breaks=quantile(qog_b$winpar,probs=seq(0,1, by=.5)), labels=c("l","h"), ordered=T, include.lowest=T))

	#Finally, let's tell R to look in the qog_b subset for any variable we mention subsequently
	#The command saves us time and frees us from the need to type "qog_b$" in front of every variable call
	#Note that the command is canceled as follows: detach(qog_b)
	attach(qog_b)

	#DESCRIPTIVE UNIVARIATE STATS
	#Basic commands for continuous (and ordinal) measures
	describe(qog_b)
	summary(qog_b)
	#You can obtain specific stats with the following commands
	max(births)
	min(births)
	mean(births)
	median(births)
	sd(births)
	quantile(births)
	quantile(births, probs=c(0.1, 0.9)) #for specific cut points

	#Basic commands for categorical (and ordinal) measures
	table(region) #frequencies by category
	prop.table(table(region)) #in proportions
	round(prop.table(table(region)), 2) #rounds to two decimals points
	round(100*(prop.table(table(region)))) #in %

	#GRAPHICS OF DISTRIBUTIONS
	#For continuous vars
	hist(births, main="Historgram of Fertility in 81 countries")
	hist(births, br=20)
	stripchart(births, method="jitter", main="Stripchart with Jitter")
	boxplot(births, main="Boxplot of Fertility in 81 countries")
	qqnorm(births)
	qqline(births)
	#For categorical vars
	barplot(table(region))
	barplot(table(region), horiz=T)
	dotchart(table(region), cex=1)
	pie(table(region))

	#DESCRIPTIVE BIVARIATE STATS
	#For a pair of categorical vars
	table(region, births2) #frequencies
	round(100*(prop.table(table(region, births2))),2) #in total %
	round(100*(prop.table(table(region, births2),1)),2) #in row %
	round(100*(prop.table(table(region, births2),2)),2) #in column %

	#GRAPHICS OF BIVARIATE ASSOCIATIONS
	#For a pair of categorical vars
	mosaicplot(table(region, births2), col=T)
	mosaicplot(table(gini2, births2), col=T)
	#For a continuous and a categorical vars
	boxplot(births~region)
	qplot(region, births, geom=c("boxplot","point"))
	boxplot(births~gini2)
	qplot(gini2, births, geom=c("boxplot","point"))
	qplot(region, births, geom="boxplot", fill=gini2)

	#ASSOCIATION TESTS
	#Test of independence of a pair of categorical variables
	chisq.test(table(region, births2)) #chi-squared test of independence
	fisher.test(table(region, births2)) #for vars with categories<5 observations, this test is more appropriate
	chisq.test(table(gini2, births2))
	chisq.test(table(gdpc2, births2))
	chisq.test(table(gexp2, births2)) #etc.

	#Test of independence of a binary and a continuous variables
	t.test(births~gini2)
	t.test(births~gdpc2)
	t.test(births~gexp2) #etc.

	#CONFIDENCE INTERVALS
	t.test(births)
	#The barplots below don't work for region b/c there are categories with too few cases to estimate the conf. intervals
	b_gini_mean<-tapply(births, gini2, mean)
	lower<-tapply(births, gini2, function(v) t.test(v)$conf.int[1])
	upper<-tapply(births, gini2, function(v) t.test(v)$conf.int[2])
	barplot2(b_gini_mean, plot.ci=T, ci.l=lower, ci.u=upper)

	b_gdpc_mean<-tapply(births, gdpc2, mean)
	lower<-tapply(births, gdpc2, function(v) t.test(v)$conf.int[1])
	upper<-tapply(births, gdpc2, function(v) t.test(v)$conf.int[2])
	barplot2(b_gdpc_mean, plot.ci=T, ci.l=lower, ci.u=upper)

	b_gexp_mean<-tapply(births, gexp2, mean)
	lower<-tapply(births, gexp2, function(v) t.test(v)$conf.int[1])
	upper<-tapply(births, gexp2, function(v) t.test(v)$conf.int[2])
	barplot2(b_gexp_mean, plot.ci=T, ci.l=lower, ci.u=upper)

	b_gris_mean<-tapply(births, gris2, mean)
	lower<-tapply(births, gris2, function(v) t.test(v)$conf.int[1])
	upper<-tapply(births, gris2, function(v) t.test(v)$conf.int[2])
	barplot2(b_gris_mean, plot.ci=T, ci.l=lower, ci.u=upper)

	b_winpar_mean<-tapply(births, winpar2, mean)
	lower<-tapply(births, winpar2, function(v) t.test(v)$conf.int[1])
	upper<-tapply(births, winpar2, function(v) t.test(v)$conf.int[2])
	barplot2(b_winpar_mean, plot.ci=T, ci.l=lower, ci.u=upper)

	## IDA Session 7
	## -------------


	# Package loading
	# ---------------

	# Load/install packages.
	packages <- c("countrycode", "ggplot2", "Hmisc", "plyr", "scales")
	packages <- lapply(packages, FUN = function(x) {
	if (!require(x, character.only = TRUE)) {
	install.packages(x)
	library(x, character.only = TRUE)
	}
	})


	# DATA PREPARATION
	# ----------------

	# Let's first review some of the things that we now know how to do with data.

	# Load dataset.
	qog_cs <- read.csv("data/qog_basic_cs.csv")

	# Create an extract of the QOG cross-sectional dataset. The list of variables is
	# quasi-identical to the one used previously, with an extra variable for regions.
	qog <- with(qog_cs, data.frame(
	row.names = cname, # Country name (set as row name in the data extract)
	ccode = ccodealp, # Country 3-letter code
	region = ht_region, # Geographical region of the country
	births = wdi_fr, # Fertility rate
	gdpc = wdi_gdpc, # GDP per capita
	gexp = wdi_ge, # Government expenditures as percentage of GDP
	hexpc = wdi_hec, # Health care expenditure per capita
	gini = uw_gini, # GINI coefficient of economic inequality
	edu = bl_asyt25, # Average years of schooling
	gris = wdi_gris, # Female to male ratio in schools
	winpar = m_wominpar # Percentage of women in parliament
	))


	# Geographic indicators
	# ---------------------

	# Let's now see what kind of geographic descriptors we have in the dataset.
	# The first one is a list of ISO-3 country codes, an international standard.
	levels(qog$ccode)

	# This list has been supplemented, in the QOG dataset, by a geographic variable
	# combining spatial proximity and expert decisions on regional democratization.
	str(qog$region)

	# An issue here is that we imported the data without its labels, so the regions
	# are uninformative. But with ISO-3 country codes, we can recreate UN regions.
	table(countrycode(qog$ccode, "iso3c", "region"))

	# We save the UN continent instead to create groups with many countries each.
	qog$region <- with(qog, countrycode(ccode, "iso3c", "continent"))

	# Check result.
	qplot(qog$region, geom = "bar")


	# Missing data
	# ------------

	# Some entries in the QOG dataset are not recognized as UN countries.
	rownames(qog)[is.na(qog$region)]

	# Drop these cases, that have been counted in other country indicators.
	qog <- subset(qog, !is.na(region))

	# The analysis that we will run today also runs faster without missing data.
	# Let's identify the rows with missing data.
	qog$fulldata <- complete.cases(qog)

	# Check result. The complete.cases() function is a utility function that will
	# return TRUE if there is absolutely no missing data on a row of the dataset.
	table(qog$fulldata)

	# Identify where the missing data lies. There is a lot, so we will remember that
	# when we look at plots and reading the results of statistical tests.
	qplot(data = qog, x = region, fill = fulldata, geom = "bar")

	# Subset to full data rows.
	qog <- na.omit(qog)


	# Reordering factors
	# ------------------

	# In R, sorting the data is different from ordering levels in a variable.
	# Let's see how to sort the data from highest to lowest fertility rates.
	by_births <- order(qog$births, decreasing = TRUE)

	# Show the first three columns in that sorting order.
	head(qog[by_births, 1:3])

	# For ascending sort order, you would simply not specify the decreasing option.
	head(qog[order(qog$births), 1:3])

	# Countries for which fertility rates are missing are always sorted last.
	tail(qog[order(qog$births), 1:3])

	# But this is different from the order of a string variable like country codes.
	# The country code variable holds text, coded as factors and ordered by levels.
	str(qog$ccode)

	# An issue here is that R naturally orders factors by alphabetical order, which
	# is not optimal for ordering categorical variables like countries in plots.

	# Here's an example where countries are alphabetically ordered.
	qplot(data = qog[1:25, ], y = ccode, x = births, color = region, geom = "point")

	# Now, reorder country code factor levels by their respective fertility rates.
	qog$ccode <- with(qog, reorder(ccode, births, mean))

	# The example now shows countries ordered by fertility rates.
	qplot(data = qog[1:25, ], y = ccode, x = births, color = region, geom = "point")

	# Let's now visualize the distribution of fertility rates in each region.
	qplot(data = qog, x = births, fill = region, color = region, geom = "density") +
	facet_grid(region ~ .)

	# The regions are shown in alphabetical order. Let's fix that.
	qog$region <- with(qog, reorder(region, -births, mean))

	# Rerun the plot to see regions ordered by average fertility rates.
	qplot(data = qog, x = births, fill = region, color = region, geom = "density") +
	facet_grid(region ~ .)

	# These manipulations show how to plot distributions as densities and how to use
	# the mean value of a distribution to summarize it. More after our last recodes.


	# Log transformation
	# ------------------

	# Let's turn to a different variable: GDP per capita.
	qplot(data = qog, x = region, y = gdpc, color = region, geom = "boxplot")

	# For such a variable, a log-transformation helps.
	qplot(data = qog, x = region, y = log(gdpc), color = region, geom = "boxplot")

	# What exactly are we doing here? We are basically compressing the distribution
	# of the variable to get outliers in closer range to other observations.

	# Let's look at the distribution of GDP per capita more closely to get that.
	summary(qog$gdpc)

	# The maximum value is pretty far away from the median value.
	with(qog, max(gdpc) / median(gdpc))

	# Order the data by GDP per capita.
	qog <- qog[order(qog$gdpc, decreasing = TRUE), ]

	# Get the five leader countries.
	head(qog)[1:4]

	# Now reorder the levels of the country code factor variable by GDP per capita.
	qog$ccode <- reorder(qog$ccode, qog$gdpc, mean)

	# In fact, the top 5-10% observations are just on a completely different level.
	qplot(data = qog, x = gdpc, stat = "ecdf", geom = "step")

	# This curve is the empirical cumulative distribution function (ECDF) of the
	# variable: it shows how its values change throughout its quantiles.

	# Here's the ECDF for each region: see which one creates most variation.
	qplot(data = qog, x = gdpc, stat = "ecdf", color = region, geom = "step")

	# Now see what happens to the ECDFs when you log the variable.
	qplot(data = qog, x = log(gdpc), stat = "ecdf", color = region, geom = "step")

	# The overall distribution of log-GDP per capita is more linear.
	qplot(data = qog, x = log(gdpc), stat = "ecdf", geom = "step")

	# Log-transform GDP per capita.
	qog$gdpc <- log(qog$gdpc)


	# Interval recodes
	# ----------------

	# Let's turn to yet another variable: government expenditure as % of GDP.
	qplot(data = qog, x = region, y = gexp, color = region, geom = "boxplot")

	# The plots show great variability of government expenditure within regions.
	qplot(data = qog, x = gexp, color = region, fill = region, geom = "density") +
	facet_grid(region ~ .)

	# Let's try a 'Hans Rosling' genius plot: a stacked density plot by region.
	qplot(data = qog, x = gexp, ..density.., geom = "density",
	color = region, fill = region, alpha = I(.75), position = "stack")

	# However, this plot does not preserve the count of each region. This one does.
	qplot(data = qog, x = gexp, ..count.., geom = "density",
	color = region, fill = region, alpha = I(.75), position = "fill")

	# One option now is to recode this continuous variable to categories of low,
	# medium and high government expenditure, for instance. How do we get there?

	# Let's recode some continuous variables into categorical ones by dividing them
	# into two equally sized groups defined by their median (50th percentile) value.

	# Summarize the variable.
	summary(qog$gexp)

	# The quartiles and median are the 25th, 50th and 75th percentiles. They can be
	# used to "cut" the variable to intervals. First, compute the quartile values.
	q <- quantile(qog$gexp)

	# Check the quartile values.
	q

	# Plot regional distributions of government expenditures with sample quartiles.
	qplot(data = qog, x = gexp, color = region, fill = region, geom = "density") +
	geom_vline(x = q, linetype = "dashed") + facet_grid(region ~ .)

	# Now create a new variable by using these cutoff points as interval categories.
	qog$gexp.4 <- cut(qog$gexp, breaks = q)

	# Check the interval categories.
	table(qog$gexp.4)

	# Check them with boxplots.
	qplot(data = qog, y = gexp, x = gexp.4, geom = "boxplot")

	# Stack them by region.
	g <- qplot(data = qog, x = region, fill = gexp.4, geom = "bar")

	# Check result, which is unsatisfactory with default colors and missing values.
	g

	# Plot with blue sequential gradient.
	g + scale_fill_brewer()

	# Plot with red-blue diverging gradient.
	g + scale_fill_brewer(palette = "RdBu")

	# Back to the data. We are going to produce interval recodes for all variables,
	# using a very simple cutoff point for the values: above or below the median.

	# Define a function to cut the data at median and label segments "lo" and "hi".
	hilo <- function(x) {
	cut(x, breaks = quantile(x, probs = 0:2/2), labels = c("lo", "hi"),
	ordered = TRUE, include.lowest = TRUE)
	}

	# Apply to all continuous variables.
	x <- lapply(qog[, 3:10], FUN = hilo)

	# Rename variables with original names, followed by ".2" to discriminate them.
	names(x) <- paste0(names(x), ".2")

	# Convert to data frame and check result.
	str(x)

	# Add these variables to the original QOG data.
	qog <- cbind(qog, as.data.frame(x))

	# Check final result.
	str(qog)

	# Let's tell R to look in the QOG data for any variable we mention subsequently.
	# The command saves us time and frees us from the need to call the qog object in
	# in front of every variable call (e.g. qog$births). To cancel the attachment,
	# we will type detach(qog) when were are done.

	attach(qog)


	# DESCRIPTIVE STATISTICS
	# ----------------------

	# What's on the menu here? Take a variable like log-GDP per capita. This is the
	# distribution, a.k.a the probability density function (PDF) of the variable.
	qplot(gdpc, geom = "density")

	# It is easier to read it in natural units (constant USD), so exponentiate it.
	qplot(exp(gdpc), geom = "density")

	# And now have a look again at which countries compose the distribution.
	qplot(exp(gdpc), ..count.., geom = "density",
	color = region, fill = region, alpha = I(.75), position = "stack")

	# For such a variable, the mean and the median can describe the distribution.
	# Now turn again to some previously studied examples of categorical variables.

	# Frequencies of government expenditure in each region.
	qplot(x = region, fill = gexp.4, geom = "bar")

	# Plot stacked (relative) frequencies of government expenditure in each region.
	qplot(x = region, fill = gexp.4, geom = "bar", position = "fill") +
	scale_y_continuous(labels = percent) + scale_fill_brewer()

	# Here, frequencies and relative frequencies (percentages) make sense.
	# The next sections show to produce these summary statistics.


	# Continuous (and ordinal) measures
	# ---------------------------------

	# Standard summary statistics (the 'five-number summary' figures).
	summary(qog[, 3:9])

	# Obtain summary statistics for the fertility rate:

	# (1) Range
	max(births)
	min(births)

	# (2) Arithmetic mean
	mean(births)
	sum(births) / length(births) # formula

	# (3) Standard deviation
	sd(births)
	sqrt(sum((births - mean(births))^2) / length(births)) # formula: sqrt(var(x))

	# (4) Median
	median(births)
	quantile(births, probs = .5) # 50th percentile

	# (5) Percentiles
	quantile(births) # quartiles
	quantile(births, probs = c(0.1, 0.9)) # specific cutoff points


	# Plots for continuous variables
	# ------------------------------

	# Histograms.
	hist(births, main="Histogram of Fertility in 81 countries")
	hist(births, br = 20)

	# For normality assessment, you first want to visualize the normal distribution.
	curve(dnorm, from = -3, to = 3, col = "red", lwd = 2)

	# Then you want to visualize the quantiles of your variable against normal ones.
	# Say hello to the normal quantile-quantile ('QQ') plot.
	qqnorm(births)

	# And finally you want to add a line that would correspond to perfect normality.
	# Note that this command requires that you have just plotted a normal QQ-plot.
	qqline(births, col = "red")

	# More default plots.
	stripchart(births, method = "jitter", main = "Stripchart with jitter")
	boxplot(births, main = "Boxplot")

	# For a continuous and a categorical variable, boxplots have an easy syntax.
	boxplot(births ~ region)
	boxplot(births ~ gini.2)

	# The next plots go further than the default R syntax. You need to learn ggplot2
	# to get the syntax. An awesome handbook is Winston Chang's R Graphics Cookbook.

	# Histograms, using ggplot2 syntax.
	qplot(births)
	qplot(births, binwidth = 1)
	ggplot(qog, aes(x = births)) +
	geom_histogram(binwidth = 1, fill = "white", color = "black")

	# Density plots, using ggplot2 syntax.
	qplot(births, geom = "density")
	qplot(births, stat = "density", geom = "line")

	# Combined histogram and density plots.
	ggplot(qog, aes(x = births, y = ..density..)) +
	geom_histogram(binwidth = 1, fill = "cornsilk", color = "grey50") +
	geom_line(stat = "density", size = 2)

	# Normal QQ-plot.
	qplot(sample = births)
	qplot(sample = births, color = region)

	# Boxplots.
	qplot(y = births, x = region, geom = "boxplot")
	qplot(y = births, x = region, geom = "boxplot") + geom_point(alpha = .5)
	qplot(y = births, x = region, geom = "boxplot") + aes(fill = gris.2)
	qplot(y = births, x = region, geom = "boxplot") + aes(fill = gris.2) + coord_flip()

	# Violin plots.
	qplot(y = births, x = region, geom = "violin") + geom_point()
	qplot(y = births, x = region, geom = "violin") + geom_point(alpha = .5)


	# Categorical (and ordinal) measures
	# ----------------------------------

	# Obtain frequencies (count and relative) for geographic regions:

	table(region) # frequencies by category
	prop.table(table(region)) # in proportions
	round(prop.table(table(region)), 1) # rounded to one decimal point
	round(100 * (prop.table(table(region)))) # shown in percentages

	# R was clearly not designed by people who use percentages a lot.
	# Let's write a function to do this in as less code as possible.

	percents <- function(r, c, which = 0, digits = 1) {
	if(which == "rows") which = 1
	if(which == "cols") which = 2
	if(which == "cell" \| which == 0) which = NULL
	round(100 * prop.table(table(r, c), which), digits)
	}

	# Cell percentages.
	percents(region, births.2)

	# Check that the function works.
	sum(percents(region, births.2, "cell")) # this is the default

	# Row percentages.
	percents(region, births.2, "rows") # you can use 1 instead of "rows"

	# Column percentages.
	percents(region, births.2, "cols") # you can use 2 instead of "cols"


	# Plots for categorical variables
	# -------------------------------

	# Default bar/dot plots, for categorical variables.
	barplot(table(region))
	barplot(table(region), horiz = TRUE)
	dotchart(table(region), cex = 1)

	# For a pair of categorical variables, use mosaic plots.
	mosaicplot(table(region, births.2), col = TRUE)
	mosaicplot(table(gini.2, births.2), col = TRUE)

	# And finally, the infamous pie chart that makes R look like Excel 97.
	pie(table(region))

	# Using ggplot2 syntax, we can show how to transform bars into polar coordinates
	# (which is, again, a perceptual nightmare for the reader).

	# Bar plots.
	qplot(region, geom = "bar")
	qplot(region, geom = "bar", fill = gexp.4) + coord_flip()

	# "This is a pie of my favourite bars" (Barney Stinson).
	qplot(region, geom = "bar", fill = gexp.4) + coord_polar() +
	scale_fill_brewer(palette = "RdBu")

	# Dot plots.
	qplot(region, geom = "point")


	# ASSOCIATION TESTS
	# -----------------


	# Significance testing revolves around the idea that a statistic, whatever it
	# might stand for, can be estimated to be significantly different from zero.
	# One of the simplest forms of test, the t-test, is shown below.
	t.test(births)

	# The bounds of the test form a confidence interval: the range in which the
	# test estimates that the mean of the fertility rate is located. The width of
	# the interval depends on both the sample size and the level of confidence.
	c(t.test(births)$conf.int[1], mean(births), t.test(births)$conf.int[2])


	# Confidence intervals
	# --------------------

	# Here's the manual way to obtain a confidence interval, based on the assumption
	# that the data follows some approximation of the normal distribution. We first
	# compute the mean, standard deviation and N of fertility rates in each region.
	ci <- ddply(qog, .(region), summarise,
	mu = mean(births),
	sd = sd(births),
	n = length(births))

	# We now turn to the standard error, which is an approximation of the standard
	# deviation of the data in the true population (each region), assuming that it
	# can be approximated from the sample standard deviation divided by sqrt(N).
	ci$se <- ci$sd / sqrt(ci$n)

	# We finally decide for a 95% level of confidence that corresponds roughly to
	# two standard errors around the mean. In a normal distribution, roughly 95%
	# of all observations fall within this range around the mean.
	ci$lo <- ci$mu - 1.96 * ci$se
	ci$hi <- ci$mu + 1.96 * ci$se

	# Plot 95% CIs.
	qplot(data = ci, x = region, y = mu, fill = region, stat = "identity", geom = "bar") +
	geom_errorbar(aes(ymax = hi, ymin = lo), width = .25)

	# Plot 99% CIs.
	qplot(data = ci, x = region, y = mu, fill = region, stat = "identity", geom = "bar") +
	geom_errorbar(aes(ymax = mu + 2.58 * se, ymin = mu - 2.58 * se), width = .25)

	# Throw all this code into a convenience function.
	getCI <- function(data, x, group, z = 1.96) {
	require(plyr)
	df <- with(data, data.frame(x, group, z, n = 1))
	df <- na.omit(df)
	print(head(df))
	ci <- ddply(df, .(group), summarise,
	mu = mean(x),
	sd = sd(x),
	n = sum(n),
	se = sd(x) / sqrt(sum(n)),
	lo = mean(x) - mean(z) * sd(x) / sqrt(sum(n)),
	hi = mean(x) + mean(z) * sd(x) / sqrt(sum(n)))
	return(ci)
	}

	# Test by getting 95% CIs for fertility in the high and low GINI country groups.
	ci <- getCI(qog, births, gini.2)

	# Plot the result.
	qplot(data = ci, x = group, y = mu, fill = group, stat = "identity", geom = "bar") +
	geom_errorbar(aes(ymax = hi, ymin = lo), width = .25) +
	labs(y = "Mean fertility rate", x = "Level of GINI coefficient")

	# Test by getting 95% CIs for fertility in the high and low GINI country groups.
	ci <- getCI(qog, births, gdpc.2)

	# Plot the result.
	qplot(data = ci, x = group, y = mu, fill = group, stat = "identity", geom = "bar") +
	geom_errorbar(aes(ymax = hi, ymin = lo), width = .25) +
	labs(y = "Mean fertility rate", x = "Level of GDP per capita")

	# In truth, this is typically the case where we would be graphing scatterplots.
	qplot(births, gdpc) + geom_smooth()

	# ... but that will have to wait one more week :)


	# With two categorical variables
	# ------------------------------

	chisq.test(table(region, births.2)) # Chi-squared test of independence
	fisher.test(table(region, births.2)) # if cell counts < 5, Fisher's test is recommended

	chisq.test(table(gini.2, births.2))
	chisq.test(table(gdpc.2, births.2))
	chisq.test(table(gexp.2, births.2)) # etc.

	# With one continuous variable and a binary variable
	# --------------------------------------------------

	t.test(births ~ gini.2)
	t.test(births ~ gdpc.2)
	t.test(births ~ gexp.2) # etc.

	detach(qog)


	# That's all for now! Enjoy your day.
	# 2013-02-18