ptoche · August 29, 2015 14:00 · briatte · May 4, 2014
diff --git a/ps-income-shares.csv b/ps-income-shares.csv
diff --git a/ps.R b/ps.R
 # 3 May 2014, Patrick Toche
 # Very minor edits: experiment with other styles, use of ggplot() instead of qplot().
 # First version: François Briatte
 # http://f.briatte.org/teaching/ida/093_practice.html
 # Data from Thomas Piketty & Emmanuel Saez
 # http://elsa.berkeley.edu/users/saez/TabFig2012prel.xls

 # Set directory:
 if(.Platform$OS.type == "windows"){
  setwd("c:/R/capital21c")
 } else { 
  setwd("~/R/capital21c") 
 }

 # Plot Aesthetics
 # http://jkunst.com/blog/post/ggplot-highcharts-taste/
 # Set Colors:
 colors_hc <- c("#7CB5EC", "#313131", "#F7A35C",
               "#90EE7E", "#7798BF", "#AAEEEE",
               "#FF0066", "#EEAAEE", "#55BF3B",
               "#DF5353", "#7798BF", "#AAEEEE")
 # Load theme font:
 library("extrafont")
 # font_import(open-sans)
 loadfonts()
 # Define theme:
 theme_hc <- function(){
  theme(
    text                = element_text(family = "Open Sans", size = 12), # raised size
    title               = element_text(hjust = 0), 
    axis.title.x        = element_text(hjust = .5),
    axis.title.y        = element_text(hjust = .5),
    panel.grid.major.y  = element_line(color = 'gray', size = .3),
    panel.grid.minor.y  = element_blank(),
    panel.grid.major.x  = element_blank(),
    panel.grid.minor.x  = element_blank(),
    panel.border        = element_blank(),
    panel.background    = element_blank(),
    legend.position     = "bottom",
    legend.title        = element_blank()
  )
 }


 # Data on Income Shares:
 # Table A1: income share of the top 1% income earners (excluding capital gains)
 library("xlsx")
 data <- "TabFig2012prel.xls"
 df <- read.xlsx(data, sheetName = "Table A1", startRow = 4, endRow = 105, colIndex = 1:7)
 df <- df[-1, ] # Remove empty row
 # Fix Dates:
 colnames(df)[1] <- "Year"
 df[,1] <- as.Date(paste0(df$Year, "-01-01"), format = "%Y-%m-%d")

 # Reorder Variables:
 names(df) <- c("Year", c(10, 5, 1, 0.5, 0.1, 0.01))
 # order(names(df[,-1]), decreasing = FALSE) # not the desired order
 ## [1] 6 5 4 3 1 2
 # order(as.numeric(names(df[,-1])), decreasing = FALSE) # the desired order
 ## [1] 6 5 4 3 2 1
 names(df[,c(1,1+order(as.numeric(names(df[,-1])), decreasing = FALSE))])
 ## [1] "Year" "0.01" "0.1"  "0.5"  "1"    "5"    "10" 
 df <- df[,c(1,1+order(as.numeric(names(df[,-1])), decreasing = FALSE))]
 # Rename Variables:
 names(df) <- c("Year", paste0("Top ", c(0.01, 0.1, 0.5, 1, 5, 10), "%"))
 View(df)

 # Reshape to long format, dropping NA values:
 library("reshape")
 df <- melt(df, id = "Year", variable_name = "Fractile", na.rm = TRUE)
 # convert value to numeric
 df$value <- as.numeric(as.character(df$value))/100
 View(df)

 str(df)
 ## 'data.frame':	592 obs. of  3 variables:
 ##  $ Year    : Date, format: "1913-01-01" ...
 ##  $ Fractile: Factor w/ 6 levels "Top 0.01%","Top 0.1%",..: 1 1 1 1 1 1 1 1 1 1 ...
 ##  $ value   : num  0.0276 0.0273 0.0436 0.044 0.0333 ...
  
 # Save Data
 # save(df, file = "ps-income-shares.rda")
 load("ps-income-shares.rda") # local copy
 # write.csv(df, file = "ps-income-shares.csv")

 # Figure 1
 # Plot all top fractile income shares, colored by income fractile. 
 # This plot shows some of the same data as Figure 1 in the original Piketty-Saez paper
 # It reveals a U-shaped trend, starting with a general contraction of the income share of 
 # top income earners at the end of World War II, and followed by an expansion in recent decades.
 library("ggplot2")
 library("scales")
 p <- ggplot(data = df, aes(x = Year, y = value, color = Fractile))
 p <- p + geom_line()
 p <- p + theme_hc()
 p <- p + scale_x_date(limits = as.Date(c("1911-01-01", "2023-01-01")), labels = date_format("%Y"))
 p <- p + scale_y_continuous(labels = percent)
 p <- p + theme(legend.position = "none")
 p <- p + geom_text(data = subset(df, Year == "2012-01-01"), aes(x = Year, label = Fractile, hjust = -0.2), size = 4)
 p <- p + xlab("")
 p <- p + ylab("")
 p <- p + ggtitle("U.S. top income shares (%)")
 p

 ggsave(p, file = "ps-us-top-income-shares.pdf", width = 10, height = 6)


 # Data on Income Levels:
 # Table_Incomegrowth: real income levels (including capital gains) for the lowest 90%, top 10% and top 1% income fractiles.
 library("xlsx")
 data <- "TabFig2012prel.xls"
 df <- read.xlsx(data, sheetName = "Table_Incomegrowth", startRow = 1, endRow = 104, colIndex = c(10, 5, 3))
 df <- df[-1, ] # Remove empty row
 # Fix Dates:
 df <- cbind("Year" = as.Date(paste0(1913:2012, "-01-01"), format = "%Y-%m-%d"), df)
 # Reorder Variables:
 names(df) <- c("Year", c(10, 1, 90))
 names(df[,c(1,1+order(as.numeric(names(df[,-1])), decreasing = FALSE))])
 ## [1] "Year" "1"    "10"   "90"
 df <- df[,c(1,1+order(as.numeric(names(df[,-1])), decreasing = FALSE))]
 # Rename Variables:
 names(df) <- c("Year", "Top 1%", "Top 10%", "Bottom 90%")
 View(df)

 # Reshape to long format:
 library("reshape")
 df <- melt(df, id = "Year", variable_name = "Fractile", na.rm = TRUE)
 # convert value to numeric
 df$value <- as.numeric(as.character(df$value))
 View(df)

 str(df)
 ## 'data.frame':	292 obs. of  3 variables:
 ##  $ Year    : Date, format: "1917-01-01" ...
 ##  $ Fractile: Factor w/ 3 levels "Top 1%","Top 10%",..: 1 1 1 1 1 1 1 1 1 1 ...
 ##  $ value   : num  279656 276230 269987 327844 304496 ...

 # Save Data
 # save(df, file = "ps-income-levels.rda")
 load("ps-income-levels.rda") # local copy

 # Figure 2
 # The plots for real income growth in the United States show a sharp difference for top earners versus the rest of the population 
 # The difference in income growth is much more pronounced for those higher on the income scale. 
 library("ggplot2")
 library("scales")
 p <- ggplot(data = df, aes(x = Year, y = value, color = Fractile))
 p <- p + geom_line()
 p <- p + theme_hc()
 p <- p + scale_x_date(limits = as.Date(c("1911-01-01", "2027-01-01")), labels = date_format("%Y"))
 p <- p + scale_y_continuous(labels = dollar)
 p <- p + theme(legend.position = "none")
 p <- p + geom_text(data = subset(df, Year == "2012-01-01"), aes(x = Year, label = Fractile, hjust = -0.2), size = 4)
 p <- p + xlab("")
 p <- p + ylab("")
 p <- p + ggtitle("Real incomes in the United States($)")
 p

 # Figure 3
 ggsave(p, file = "ps-us-real-income.pdf", width = 10, height = 6)

 # The dollar scale makes it difficult to see changes in the bottom quantiles. 
 # The bottom 90% income earners seem to enjoy no income growth over the entire time period. 
 # Using a logarithmic scale of base 10 for real income corrects for that issue.
 # Change y-units to log10 dollar:
 p <- p + scale_y_log10(labels = dollar)
 p

 ggsave(p, file = "ps-us-real-incomes-log.pdf", width = 10, height = 6)

 # Income inequality is clearly apparent and growing over the recent period.
 # Income levels in income fractiles that do not rely on capital gains have stagnated.
 # Plot of the growth rate for each series.
 # Add lagged series to data frame:
 df <- ddply(df, .(Fractile), transform, lagged = c(NA, value[-length(value)]))
 # Compute growth rate:
 df$Growth <- with(df, (value / lagged) - 1)

 # Figure 4
 # Plot real income growth rates:
 p <- ggplot(data = df, aes(x = Year, y = Growth,
  color = ifelse(Growth > 0, "positive", "negative")))
 p <- p + geom_linerange(aes(ymin = 0, ymax = Growth))
 #p <- p + scale_colour_manual("", values = c("positive" = "blue", "negative" = "red"))
 p <- p + geom_hline(y = 0, color = "gray")
 p <- p + theme_hc()
 p <- p + facet_grid(Fractile ~ .)
 p <- p + scale_x_date(limits = as.Date(c("1911-01-01", "2023-01-01")), labels = date_format("%Y"))
 p <- p + scale_y_continuous(labels = percent)
 p <- p + theme(legend.position = "none")
 p <- p + xlab("")
 p <- p + ylab("")
 p <- p + ggtitle("Real income growth rates (%)")
 p

 ggsave(p, file = "ps-us-real-income-growth.pdf", width = 10, height = 10)


 # Plot the actual dollar values.
 # Add differenced series:
 df <- ddply(df, .(Fractile), transform, Difference = c(NA, diff(value)))

 # Figure 5
 # Plot real income changes:
 p <- ggplot(data = df, aes(x = Year, y = Difference, 
  color = ifelse(Growth > 0, "positive", "negative")))
 p <- p + geom_linerange(aes(ymin = 0, ymax = Difference))
 p <- p + geom_hline(y = 0, color = "gray")
 p <- p + theme_hc()
 p <- p + facet_grid(Fractile ~ ., scale = "free_y")
 p <- p + scale_x_date(limits = as.Date(c("1911-01-01", "2023-01-01")), labels = date_format("%Y"))
 p <- p + scale_y_continuous(labels = dollar)
 p <- p + theme(legend.position = "none")
 p <- p + xlab("")
 p <- p + ylab("")
 p <- p + ggtitle("Changes in real income ($)")
 p

 ggsave(p, file = "ps-us-real-income-changes.pdf", width = 10, height = 6)

 # Focus on Top 1% 
 # Subset the top 1% of incomes:
 top1 <- subset(df, Fractile == "Top 1%")
 # Create a time series 'zoo' object:
 library("zoo")
 top1 <- with(top1, zoo(value, Year))
 str(top1)
 ## ‘zoo’ series from 1913-01-01 to 2012-01-01
 ##   Data: num [1:100] 279656 276230 269987 327844 304496 ...
 ##   Index:  Date[1:100], format: "1913-01-01" "1914-01-01" "1915-01-01" ...

 # Detrend the series:
 dz <- lm(coredata(top1) ~ index(top1))
 ## Call:
 ## lm(formula = coredata(top1) ~ index(top1))
 ## 
 ## Coefficients:
 ## (Intercept)  index(top1)  
 ##   586431.79        25.75  
  
 # Plot the residuals:
 p <- ggplot(data = dz, aes(x = index(top1),
  color = ifelse(resid(dz) > 0, "positive", "negative")))
 p <- p + geom_linerange(aes(ymin = 0, ymax = resid(dz)))
 p <- p + theme_hc()
 p <- p + scale_x_date(limits = as.Date(c("1911-01-01", "2023-01-01")), labels = date_format("%Y"))
 p <- p + scale_y_continuous(labels = dollar)
 p <- p + theme(legend.position = "none")
 p <- p + xlab("")
 p <- p + ylab("")
 p <- p + ggtitle("Detrended series of top 1% income growth")
 p

 ggsave(p, file = "ps-us-real-income-top-1-percent-detrended.pdf", width = 10, height = 6)

 # The series is not stationary, 
 # insofar as its past values fail to predict large amounts of its present values.
 # The last fifteen years are particularly remarkable in that respect: 
 # while some of the rise in income inequality has been absorbed by the model, 
 # the most recent years are robust to detrending.
	# 3 May 2014, Patrick Toche
	# Very minor edits: experiment with other styles, use of ggplot() instead of qplot().
	# First version: François Briatte
	# http://f.briatte.org/teaching/ida/093_practice.html
	# Data from Thomas Piketty & Emmanuel Saez
	# http://elsa.berkeley.edu/users/saez/TabFig2012prel.xls

	# Set directory:
	if(.Platform$OS.type == "windows"){
	setwd("c:/R/capital21c")
	} else {
	setwd("~/R/capital21c")
	}

	# Plot Aesthetics
	# http://jkunst.com/blog/post/ggplot-highcharts-taste/
	# Set Colors:
	colors_hc <- c("#7CB5EC", "#313131", "#F7A35C",
	"#90EE7E", "#7798BF", "#AAEEEE",
	"#FF0066", "#EEAAEE", "#55BF3B",
	"#DF5353", "#7798BF", "#AAEEEE")
	# Load theme font:
	library("extrafont")
	# font_import(open-sans)
	loadfonts()
	# Define theme:
	theme_hc <- function(){
	theme(
	text = element_text(family = "Open Sans", size = 12), # raised size
	title = element_text(hjust = 0),
	axis.title.x = element_text(hjust = .5),
	axis.title.y = element_text(hjust = .5),
	panel.grid.major.y = element_line(color = 'gray', size = .3),
	panel.grid.minor.y = element_blank(),
	panel.grid.major.x = element_blank(),
	panel.grid.minor.x = element_blank(),
	panel.border = element_blank(),
	panel.background = element_blank(),
	legend.position = "bottom",
	legend.title = element_blank()
	)
	}


	# Data on Income Shares:
	# Table A1: income share of the top 1% income earners (excluding capital gains)
	library("xlsx")
	data <- "TabFig2012prel.xls"
	df <- read.xlsx(data, sheetName = "Table A1", startRow = 4, endRow = 105, colIndex = 1:7)
	df <- df[-1, ] # Remove empty row
	# Fix Dates:
	colnames(df)[1] <- "Year"
	df[,1] <- as.Date(paste0(df$Year, "-01-01"), format = "%Y-%m-%d")

	# Reorder Variables:
	names(df) <- c("Year", c(10, 5, 1, 0.5, 0.1, 0.01))
	# order(names(df[,-1]), decreasing = FALSE) # not the desired order
	## [1] 6 5 4 3 1 2
	# order(as.numeric(names(df[,-1])), decreasing = FALSE) # the desired order
	## [1] 6 5 4 3 2 1
	names(df[,c(1,1+order(as.numeric(names(df[,-1])), decreasing = FALSE))])
	## [1] "Year" "0.01" "0.1" "0.5" "1" "5" "10"
	df <- df[,c(1,1+order(as.numeric(names(df[,-1])), decreasing = FALSE))]
	# Rename Variables:
	names(df) <- c("Year", paste0("Top ", c(0.01, 0.1, 0.5, 1, 5, 10), "%"))
	View(df)

	# Reshape to long format, dropping NA values:
	library("reshape")
	df <- melt(df, id = "Year", variable_name = "Fractile", na.rm = TRUE)
	# convert value to numeric
	df$value <- as.numeric(as.character(df$value))/100
	View(df)

	str(df)
	## 'data.frame': 592 obs. of 3 variables:
	## $ Year : Date, format: "1913-01-01" ...
	## $ Fractile: Factor w/ 6 levels "Top 0.01%","Top 0.1%",..: 1 1 1 1 1 1 1 1 1 1 ...
	## $ value : num 0.0276 0.0273 0.0436 0.044 0.0333 ...

	# Save Data
	# save(df, file = "ps-income-shares.rda")
	load("ps-income-shares.rda") # local copy
	# write.csv(df, file = "ps-income-shares.csv")

	# Figure 1
	# Plot all top fractile income shares, colored by income fractile.
	# This plot shows some of the same data as Figure 1 in the original Piketty-Saez paper
	# It reveals a U-shaped trend, starting with a general contraction of the income share of
	# top income earners at the end of World War II, and followed by an expansion in recent decades.
	library("ggplot2")
	library("scales")
	p <- ggplot(data = df, aes(x = Year, y = value, color = Fractile))
	p <- p + geom_line()
	p <- p + theme_hc()
	p <- p + scale_x_date(limits = as.Date(c("1911-01-01", "2023-01-01")), labels = date_format("%Y"))
	p <- p + scale_y_continuous(labels = percent)
	p <- p + theme(legend.position = "none")
	p <- p + geom_text(data = subset(df, Year == "2012-01-01"), aes(x = Year, label = Fractile, hjust = -0.2), size = 4)
	p <- p + xlab("")
	p <- p + ylab("")
	p <- p + ggtitle("U.S. top income shares (%)")
	p

	ggsave(p, file = "ps-us-top-income-shares.pdf", width = 10, height = 6)


	# Data on Income Levels:
	# Table_Incomegrowth: real income levels (including capital gains) for the lowest 90%, top 10% and top 1% income fractiles.
	library("xlsx")
	data <- "TabFig2012prel.xls"
	df <- read.xlsx(data, sheetName = "Table_Incomegrowth", startRow = 1, endRow = 104, colIndex = c(10, 5, 3))
	df <- df[-1, ] # Remove empty row
	# Fix Dates:
	df <- cbind("Year" = as.Date(paste0(1913:2012, "-01-01"), format = "%Y-%m-%d"), df)
	# Reorder Variables:
	names(df) <- c("Year", c(10, 1, 90))
	names(df[,c(1,1+order(as.numeric(names(df[,-1])), decreasing = FALSE))])
	## [1] "Year" "1" "10" "90"
	df <- df[,c(1,1+order(as.numeric(names(df[,-1])), decreasing = FALSE))]
	# Rename Variables:
	names(df) <- c("Year", "Top 1%", "Top 10%", "Bottom 90%")
	View(df)

	# Reshape to long format:
	library("reshape")
	df <- melt(df, id = "Year", variable_name = "Fractile", na.rm = TRUE)
	# convert value to numeric
	df$value <- as.numeric(as.character(df$value))
	View(df)

	str(df)
	## 'data.frame': 292 obs. of 3 variables:
	## $ Year : Date, format: "1917-01-01" ...
	## $ Fractile: Factor w/ 3 levels "Top 1%","Top 10%",..: 1 1 1 1 1 1 1 1 1 1 ...
	## $ value : num 279656 276230 269987 327844 304496 ...

	# Save Data
	# save(df, file = "ps-income-levels.rda")
	load("ps-income-levels.rda") # local copy

	# Figure 2
	# The plots for real income growth in the United States show a sharp difference for top earners versus the rest of the population
	# The difference in income growth is much more pronounced for those higher on the income scale.
	library("ggplot2")
	library("scales")
	p <- ggplot(data = df, aes(x = Year, y = value, color = Fractile))
	p <- p + geom_line()
	p <- p + theme_hc()
	p <- p + scale_x_date(limits = as.Date(c("1911-01-01", "2027-01-01")), labels = date_format("%Y"))
	p <- p + scale_y_continuous(labels = dollar)
	p <- p + theme(legend.position = "none")
	p <- p + geom_text(data = subset(df, Year == "2012-01-01"), aes(x = Year, label = Fractile, hjust = -0.2), size = 4)
	p <- p + xlab("")
	p <- p + ylab("")
	p <- p + ggtitle("Real incomes in the United States($)")
	p

	# Figure 3
	ggsave(p, file = "ps-us-real-income.pdf", width = 10, height = 6)

	# The dollar scale makes it difficult to see changes in the bottom quantiles.
	# The bottom 90% income earners seem to enjoy no income growth over the entire time period.
	# Using a logarithmic scale of base 10 for real income corrects for that issue.
	# Change y-units to log10 dollar:
	p <- p + scale_y_log10(labels = dollar)
	p

	ggsave(p, file = "ps-us-real-incomes-log.pdf", width = 10, height = 6)

	# Income inequality is clearly apparent and growing over the recent period.
	# Income levels in income fractiles that do not rely on capital gains have stagnated.
	# Plot of the growth rate for each series.
	# Add lagged series to data frame:
	df <- ddply(df, .(Fractile), transform, lagged = c(NA, value[-length(value)]))
	# Compute growth rate:
	df$Growth <- with(df, (value / lagged) - 1)

	# Figure 4
	# Plot real income growth rates:
	p <- ggplot(data = df, aes(x = Year, y = Growth,
	color = ifelse(Growth > 0, "positive", "negative")))
	p <- p + geom_linerange(aes(ymin = 0, ymax = Growth))
	#p <- p + scale_colour_manual("", values = c("positive" = "blue", "negative" = "red"))
	p <- p + geom_hline(y = 0, color = "gray")
	p <- p + theme_hc()
	p <- p + facet_grid(Fractile ~ .)
	p <- p + scale_x_date(limits = as.Date(c("1911-01-01", "2023-01-01")), labels = date_format("%Y"))
	p <- p + scale_y_continuous(labels = percent)
	p <- p + theme(legend.position = "none")
	p <- p + xlab("")
	p <- p + ylab("")
	p <- p + ggtitle("Real income growth rates (%)")
	p

	ggsave(p, file = "ps-us-real-income-growth.pdf", width = 10, height = 10)


	# Plot the actual dollar values.
	# Add differenced series:
	df <- ddply(df, .(Fractile), transform, Difference = c(NA, diff(value)))

	# Figure 5
	# Plot real income changes:
	p <- ggplot(data = df, aes(x = Year, y = Difference,
	color = ifelse(Growth > 0, "positive", "negative")))
	p <- p + geom_linerange(aes(ymin = 0, ymax = Difference))
	p <- p + geom_hline(y = 0, color = "gray")
	p <- p + theme_hc()
	p <- p + facet_grid(Fractile ~ ., scale = "free_y")
	p <- p + scale_x_date(limits = as.Date(c("1911-01-01", "2023-01-01")), labels = date_format("%Y"))
	p <- p + scale_y_continuous(labels = dollar)
	p <- p + theme(legend.position = "none")
	p <- p + xlab("")
	p <- p + ylab("")
	p <- p + ggtitle("Changes in real income ($)")
	p

	ggsave(p, file = "ps-us-real-income-changes.pdf", width = 10, height = 6)

	# Focus on Top 1%
	# Subset the top 1% of incomes:
	top1 <- subset(df, Fractile == "Top 1%")
	# Create a time series 'zoo' object:
	library("zoo")
	top1 <- with(top1, zoo(value, Year))
	str(top1)
	## ‘zoo’ series from 1913-01-01 to 2012-01-01
	## Data: num [1:100] 279656 276230 269987 327844 304496 ...
	## Index: Date[1:100], format: "1913-01-01" "1914-01-01" "1915-01-01" ...

	# Detrend the series:
	dz <- lm(coredata(top1) ~ index(top1))
	## Call:
	## lm(formula = coredata(top1) ~ index(top1))
	##
	## Coefficients:
	## (Intercept) index(top1)
	## 586431.79 25.75

	# Plot the residuals:
	p <- ggplot(data = dz, aes(x = index(top1),
	color = ifelse(resid(dz) > 0, "positive", "negative")))
	p <- p + geom_linerange(aes(ymin = 0, ymax = resid(dz)))
	p <- p + theme_hc()
	p <- p + scale_x_date(limits = as.Date(c("1911-01-01", "2023-01-01")), labels = date_format("%Y"))
	p <- p + scale_y_continuous(labels = dollar)
	p <- p + theme(legend.position = "none")
	p <- p + xlab("")
	p <- p + ylab("")
	p <- p + ggtitle("Detrended series of top 1% income growth")
	p

	ggsave(p, file = "ps-us-real-income-top-1-percent-detrended.pdf", width = 10, height = 6)

	# The series is not stationary,
	# insofar as its past values fail to predict large amounts of its present values.
	# The last fifteen years are particularly remarkable in that respect:
	# while some of the rise in income inequality has been absorbed by the model,
	# the most recent years are robust to detrending.