-
-
Save Sandy4321/a580789a5cadf0e2a2c3 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#http://rpubs.com/bradleyboehmke/median_income_trends | |
# Preprocessing & summarizing data | |
library(dplyr) | |
library(tidyr) | |
library(RCurl) | |
library(XLConnect) | |
library(magrittr) | |
appURL <- "https://www.census.gov/hhes/www/income/data/historical/household/2013/h08.xls" | |
f = CFILE("exfile.xls", mode="wb") | |
curlPerform(url = appURL, writedata = f@ref, ssl.verifypeer = FALSE) | |
close(f) | |
out <- readWorksheetFromFile(file = "exfile.xls", sheet = "h08", header = FALSE | |
, startRow = 62, endRow = 113) | |
'State' -> names(out)[1] | |
out[,c(1,seq(2,61,2))] -> data | |
paste0("X",2013:1984) -> names(data)[2:31] | |
data %>% tbl_df -> data | |
data %>% | |
filter(State == "United States") %>% | |
gather(Year, Income, X2013:X1984) %>% | |
separate(Year, c("left","Year"), sep="X") %>% | |
select(-left) %>% | |
arrange(Year) -> us | |
# extract 1984 value as the baseline value and add to us dataframe | |
us[us$Year==1984,3] -> us_base | |
us_base$Income -> us$us_baseline | |
# calculate the percent change in U.S. median income for each year as compared | |
# to 1984 (aka the baseline) | |
us %>% | |
mutate(us_change = (Income-us_baseline)/us_baseline) -> us | |
# create a states dataframe, clean and turn into long format | |
data %>% | |
filter(State != "United States") %>% | |
gather(Year, Income, X2013:X1984) %>% | |
separate(Year, c("left","Year"), sep="X") %>% | |
select(-left) %>% | |
arrange(Year) %>% | |
filter(Income != "NA") -> states | |
# create baselines for each state | |
state_base <- states %>% | |
filter(Year == 1984) %>% | |
select(State, State_Baseline = Income) | |
# add baseline to the states and calculate the percent change in median income | |
# for each state as compared to 1984 (aka the baseline) | |
states %<>% | |
left_join(state_base) %>% | |
arrange(State) %>% | |
mutate(state_change = (Income-State_Baseline)/State_Baseline) | |
# change year variables from character to numeric | |
states$Year %<>% as.numeric | |
us$Year %<>% as.numeric | |
# get top 5 and bottom 5 states which will allow me to identify them | |
# graphically | |
states %>% | |
filter(Year == 2013) %>% | |
arrange(desc(state_change)) %>% | |
mutate(rank = seq(1,length(State), by=1)) %>% | |
filter(rank < 6 | rank > 46 ) -> rank | |
# Visualizatin development | |
library(grid) | |
library(scales) | |
library(ggplot2) | |
## Plot It | |
states %>% | |
ggplot(aes(Year, state_change, group=State)) + | |
theme_bw() + | |
theme(plot.background = element_blank(), | |
panel.grid.minor = element_blank(), | |
panel.grid.major.x = element_blank(), | |
panel.grid.major.y = element_line(linetype = 3, colour = "grey50"), | |
panel.border = element_blank(), | |
panel.background = element_blank(), | |
axis.ticks = element_blank(), | |
axis.title = element_blank()) + | |
geom_line(colour="grey90", alpha=.9) -> p | |
p | |
#Step 2 | |
#Next, I plot the U.S. overall average with a dashed line. | |
p %<>% + | |
geom_line(data=us, aes(Year, us_change, group=1), linetype=5) | |
p | |
#Step 3 | |
#I also wanted to identify how my own state (Ohio) has trended over the years so I filtered the states dataframe for only Ohio data and singled it out with a blue color. | |
p %<>% + | |
geom_line(data=filter(states, State=="Ohio"), | |
aes(Year, state_change, group=State), colour="dodgerblue", | |
size = 1) | |
p | |
#Step 4 | |
#I wanted to create some boundaries with the biggest economic “winner” and “loser”. So I identified the state that had the largest growth from 1984 to 2013 and the state that had the greatest contraction and singled these out with a slightly darker grey than all the other states. | |
p %<>% + | |
geom_line(data=filter(states, State=="D.C."), | |
aes(Year, state_change, group=State), colour="grey70") + | |
geom_line(data=filter(states, State=="Nevada"), | |
aes(Year, state_change, group=State), colour="grey70") | |
p | |
#Step 5 | |
#Next, I identify the top 5 and bottom 5 states along the overall US by plotting points on their 2013 values. | |
p %<>% + | |
geom_point(data=rank, aes(Year, state_change), shape=21, size=1.5, alpha=.6) + | |
geom_point(data=filter(us, Year == 2013), aes(Year, us_change), size=2.5, alpha=.6) | |
p | |
#Step 6 | |
#the last step I performed in R was to format the x- and y-axis. For the y-axis I fixed the limits and breaks (this was primarily because I was tinkering around with the dimensions of the chart but wanted to keep the breaks fixed) and turned the labels to a percent format. For the x-axis I increased the breaks to every 5 years and reduced the padding at the ends of the axis. Also note that I extend the x-axis to 1983 even though my data only goes back to 1984. This is to add more space on the left side of the x-axis; the reason for this becomes evident in the final graphic where I move the y-axis labels. | |
p %<>% + | |
scale_y_continuous(limits=c(-.2,.55), breaks=seq(-.2,.4,by=.2), label=percent) + | |
scale_x_continuous(limits=c(1983,2013),breaks=seq(1985,2010,by=5), expand=c(0,.25)) | |
p + ggtitle("Brad's Beautiful Time Series Plot") + | |
theme(plot.title=element_text(face="bold",hjust=.012,vjust=.8,colour="#3C3C3C",size=20)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment