Last active
August 29, 2015 14:00
-
-
Save ptoche/872a77b5363356ff5399 to your computer and use it in GitHub Desktop.
Piketty & Saez, Top Incomes in the U.S., 1913-2012
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 3 May 2014, Patrick Toche | |
# Very minor edits: experiment with other styles, use of ggplot() instead of qplot(). | |
# First version: François Briatte | |
# http://f.briatte.org/teaching/ida/093_practice.html | |
# Data from Thomas Piketty & Emmanuel Saez | |
# http://elsa.berkeley.edu/users/saez/TabFig2012prel.xls | |
# Set directory: | |
if(.Platform$OS.type == "windows"){ | |
setwd("c:/R/capital21c") | |
} else { | |
setwd("~/R/capital21c") | |
} | |
# Plot Aesthetics | |
# http://jkunst.com/blog/post/ggplot-highcharts-taste/ | |
# Set Colors: | |
colors_hc <- c("#7CB5EC", "#313131", "#F7A35C", | |
"#90EE7E", "#7798BF", "#AAEEEE", | |
"#FF0066", "#EEAAEE", "#55BF3B", | |
"#DF5353", "#7798BF", "#AAEEEE") | |
# Load theme font: | |
library("extrafont") | |
# font_import(open-sans) | |
loadfonts() | |
# Define theme: | |
theme_hc <- function(){ | |
theme( | |
text = element_text(family = "Open Sans", size = 12), # raised size | |
title = element_text(hjust = 0), | |
axis.title.x = element_text(hjust = .5), | |
axis.title.y = element_text(hjust = .5), | |
panel.grid.major.y = element_line(color = 'gray', size = .3), | |
panel.grid.minor.y = element_blank(), | |
panel.grid.major.x = element_blank(), | |
panel.grid.minor.x = element_blank(), | |
panel.border = element_blank(), | |
panel.background = element_blank(), | |
legend.position = "bottom", | |
legend.title = element_blank() | |
) | |
} | |
# Data on Income Shares: | |
# Table A1: income share of the top 1% income earners (excluding capital gains) | |
library("xlsx") | |
data <- "TabFig2012prel.xls" | |
df <- read.xlsx(data, sheetName = "Table A1", startRow = 4, endRow = 105, colIndex = 1:7) | |
df <- df[-1, ] # Remove empty row | |
# Fix Dates: | |
colnames(df)[1] <- "Year" | |
df[,1] <- as.Date(paste0(df$Year, "-01-01"), format = "%Y-%m-%d") | |
# Reorder Variables: | |
names(df) <- c("Year", c(10, 5, 1, 0.5, 0.1, 0.01)) | |
# order(names(df[,-1]), decreasing = FALSE) # not the desired order | |
## [1] 6 5 4 3 1 2 | |
# order(as.numeric(names(df[,-1])), decreasing = FALSE) # the desired order | |
## [1] 6 5 4 3 2 1 | |
names(df[,c(1,1+order(as.numeric(names(df[,-1])), decreasing = FALSE))]) | |
## [1] "Year" "0.01" "0.1" "0.5" "1" "5" "10" | |
df <- df[,c(1,1+order(as.numeric(names(df[,-1])), decreasing = FALSE))] | |
# Rename Variables: | |
names(df) <- c("Year", paste0("Top ", c(0.01, 0.1, 0.5, 1, 5, 10), "%")) | |
View(df) | |
# Reshape to long format, dropping NA values: | |
library("reshape") | |
df <- melt(df, id = "Year", variable_name = "Fractile", na.rm = TRUE) | |
# convert value to numeric | |
df$value <- as.numeric(as.character(df$value))/100 | |
View(df) | |
str(df) | |
## 'data.frame': 592 obs. of 3 variables: | |
## $ Year : Date, format: "1913-01-01" ... | |
## $ Fractile: Factor w/ 6 levels "Top 0.01%","Top 0.1%",..: 1 1 1 1 1 1 1 1 1 1 ... | |
## $ value : num 0.0276 0.0273 0.0436 0.044 0.0333 ... | |
# Save Data | |
# save(df, file = "ps-income-shares.rda") | |
load("ps-income-shares.rda") # local copy | |
# write.csv(df, file = "ps-income-shares.csv") | |
# Figure 1 | |
# Plot all top fractile income shares, colored by income fractile. | |
# This plot shows some of the same data as Figure 1 in the original Piketty-Saez paper | |
# It reveals a U-shaped trend, starting with a general contraction of the income share of | |
# top income earners at the end of World War II, and followed by an expansion in recent decades. | |
library("ggplot2") | |
library("scales") | |
p <- ggplot(data = df, aes(x = Year, y = value, color = Fractile)) | |
p <- p + geom_line() | |
p <- p + theme_hc() | |
p <- p + scale_x_date(limits = as.Date(c("1911-01-01", "2023-01-01")), labels = date_format("%Y")) | |
p <- p + scale_y_continuous(labels = percent) | |
p <- p + theme(legend.position = "none") | |
p <- p + geom_text(data = subset(df, Year == "2012-01-01"), aes(x = Year, label = Fractile, hjust = -0.2), size = 4) | |
p <- p + xlab("") | |
p <- p + ylab("") | |
p <- p + ggtitle("U.S. top income shares (%)") | |
p | |
ggsave(p, file = "ps-us-top-income-shares.pdf", width = 10, height = 6) | |
# Data on Income Levels: | |
# Table_Incomegrowth: real income levels (including capital gains) for the lowest 90%, top 10% and top 1% income fractiles. | |
library("xlsx") | |
data <- "TabFig2012prel.xls" | |
df <- read.xlsx(data, sheetName = "Table_Incomegrowth", startRow = 1, endRow = 104, colIndex = c(10, 5, 3)) | |
df <- df[-1, ] # Remove empty row | |
# Fix Dates: | |
df <- cbind("Year" = as.Date(paste0(1913:2012, "-01-01"), format = "%Y-%m-%d"), df) | |
# Reorder Variables: | |
names(df) <- c("Year", c(10, 1, 90)) | |
names(df[,c(1,1+order(as.numeric(names(df[,-1])), decreasing = FALSE))]) | |
## [1] "Year" "1" "10" "90" | |
df <- df[,c(1,1+order(as.numeric(names(df[,-1])), decreasing = FALSE))] | |
# Rename Variables: | |
names(df) <- c("Year", "Top 1%", "Top 10%", "Bottom 90%") | |
View(df) | |
# Reshape to long format: | |
library("reshape") | |
df <- melt(df, id = "Year", variable_name = "Fractile", na.rm = TRUE) | |
# convert value to numeric | |
df$value <- as.numeric(as.character(df$value)) | |
View(df) | |
str(df) | |
## 'data.frame': 292 obs. of 3 variables: | |
## $ Year : Date, format: "1917-01-01" ... | |
## $ Fractile: Factor w/ 3 levels "Top 1%","Top 10%",..: 1 1 1 1 1 1 1 1 1 1 ... | |
## $ value : num 279656 276230 269987 327844 304496 ... | |
# Save Data | |
# save(df, file = "ps-income-levels.rda") | |
load("ps-income-levels.rda") # local copy | |
# Figure 2 | |
# The plots for real income growth in the United States show a sharp difference for top earners versus the rest of the population | |
# The difference in income growth is much more pronounced for those higher on the income scale. | |
library("ggplot2") | |
library("scales") | |
p <- ggplot(data = df, aes(x = Year, y = value, color = Fractile)) | |
p <- p + geom_line() | |
p <- p + theme_hc() | |
p <- p + scale_x_date(limits = as.Date(c("1911-01-01", "2027-01-01")), labels = date_format("%Y")) | |
p <- p + scale_y_continuous(labels = dollar) | |
p <- p + theme(legend.position = "none") | |
p <- p + geom_text(data = subset(df, Year == "2012-01-01"), aes(x = Year, label = Fractile, hjust = -0.2), size = 4) | |
p <- p + xlab("") | |
p <- p + ylab("") | |
p <- p + ggtitle("Real incomes in the United States($)") | |
p | |
# Figure 3 | |
ggsave(p, file = "ps-us-real-income.pdf", width = 10, height = 6) | |
# The dollar scale makes it difficult to see changes in the bottom quantiles. | |
# The bottom 90% income earners seem to enjoy no income growth over the entire time period. | |
# Using a logarithmic scale of base 10 for real income corrects for that issue. | |
# Change y-units to log10 dollar: | |
p <- p + scale_y_log10(labels = dollar) | |
p | |
ggsave(p, file = "ps-us-real-incomes-log.pdf", width = 10, height = 6) | |
# Income inequality is clearly apparent and growing over the recent period. | |
# Income levels in income fractiles that do not rely on capital gains have stagnated. | |
# Plot of the growth rate for each series. | |
# Add lagged series to data frame: | |
df <- ddply(df, .(Fractile), transform, lagged = c(NA, value[-length(value)])) | |
# Compute growth rate: | |
df$Growth <- with(df, (value / lagged) - 1) | |
# Figure 4 | |
# Plot real income growth rates: | |
p <- ggplot(data = df, aes(x = Year, y = Growth, | |
color = ifelse(Growth > 0, "positive", "negative"))) | |
p <- p + geom_linerange(aes(ymin = 0, ymax = Growth)) | |
#p <- p + scale_colour_manual("", values = c("positive" = "blue", "negative" = "red")) | |
p <- p + geom_hline(y = 0, color = "gray") | |
p <- p + theme_hc() | |
p <- p + facet_grid(Fractile ~ .) | |
p <- p + scale_x_date(limits = as.Date(c("1911-01-01", "2023-01-01")), labels = date_format("%Y")) | |
p <- p + scale_y_continuous(labels = percent) | |
p <- p + theme(legend.position = "none") | |
p <- p + xlab("") | |
p <- p + ylab("") | |
p <- p + ggtitle("Real income growth rates (%)") | |
p | |
ggsave(p, file = "ps-us-real-income-growth.pdf", width = 10, height = 10) | |
# Plot the actual dollar values. | |
# Add differenced series: | |
df <- ddply(df, .(Fractile), transform, Difference = c(NA, diff(value))) | |
# Figure 5 | |
# Plot real income changes: | |
p <- ggplot(data = df, aes(x = Year, y = Difference, | |
color = ifelse(Growth > 0, "positive", "negative"))) | |
p <- p + geom_linerange(aes(ymin = 0, ymax = Difference)) | |
p <- p + geom_hline(y = 0, color = "gray") | |
p <- p + theme_hc() | |
p <- p + facet_grid(Fractile ~ ., scale = "free_y") | |
p <- p + scale_x_date(limits = as.Date(c("1911-01-01", "2023-01-01")), labels = date_format("%Y")) | |
p <- p + scale_y_continuous(labels = dollar) | |
p <- p + theme(legend.position = "none") | |
p <- p + xlab("") | |
p <- p + ylab("") | |
p <- p + ggtitle("Changes in real income ($)") | |
p | |
ggsave(p, file = "ps-us-real-income-changes.pdf", width = 10, height = 6) | |
# Focus on Top 1% | |
# Subset the top 1% of incomes: | |
top1 <- subset(df, Fractile == "Top 1%") | |
# Create a time series 'zoo' object: | |
library("zoo") | |
top1 <- with(top1, zoo(value, Year)) | |
str(top1) | |
## ‘zoo’ series from 1913-01-01 to 2012-01-01 | |
## Data: num [1:100] 279656 276230 269987 327844 304496 ... | |
## Index: Date[1:100], format: "1913-01-01" "1914-01-01" "1915-01-01" ... | |
# Detrend the series: | |
dz <- lm(coredata(top1) ~ index(top1)) | |
## Call: | |
## lm(formula = coredata(top1) ~ index(top1)) | |
## | |
## Coefficients: | |
## (Intercept) index(top1) | |
## 586431.79 25.75 | |
# Plot the residuals: | |
p <- ggplot(data = dz, aes(x = index(top1), | |
color = ifelse(resid(dz) > 0, "positive", "negative"))) | |
p <- p + geom_linerange(aes(ymin = 0, ymax = resid(dz))) | |
p <- p + theme_hc() | |
p <- p + scale_x_date(limits = as.Date(c("1911-01-01", "2023-01-01")), labels = date_format("%Y")) | |
p <- p + scale_y_continuous(labels = dollar) | |
p <- p + theme(legend.position = "none") | |
p <- p + xlab("") | |
p <- p + ylab("") | |
p <- p + ggtitle("Detrended series of top 1% income growth") | |
p | |
ggsave(p, file = "ps-us-real-income-top-1-percent-detrended.pdf", width = 10, height = 6) | |
# The series is not stationary, | |
# insofar as its past values fail to predict large amounts of its present values. | |
# The last fifteen years are particularly remarkable in that respect: | |
# while some of the rise in income inequality has been absorbed by the model, | |
# the most recent years are robust to detrending. | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Glad you liked it!