mattspence · April 7, 2014 20:41
diff --git a/HH_DataViz.R b/HH_DataViz.R
 ## This is a comment.
 ## 
 ## R code to do two data visualizations for hospitals and health data.
 ## Requires healthandhospitals.csv
 ## Also requires 4 packages: dplyr, ggplot2, reshape2, and scales
 ## To download those packages, type the following:

 ## One-time setup: fetch the four packages this script needs.
 ## setInternet2() only exists on Windows builds of R, and newer R releases
 ## have retired it, so call it only when it is available and never let a
 ## failure there stop the install.
 if (exists("setInternet2", mode = "function")) {
   tryCatch(
     setInternet2(TRUE), ## This helps download packages in our firewalled environment
     error = function(e) message("setInternet2() not usable here: ", conditionMessage(e))
   )
 }
 ## Install only what is not already present, so re-running these lines does
 ## not re-download packages that are already installed.
 needed_pkgs <- c("dplyr", "ggplot2", "reshape2", "scales")
 missing_pkgs <- setdiff(needed_pkgs, rownames(installed.packages()))
 if (length(missing_pkgs) > 0) {
   install.packages(missing_pkgs, dependencies = TRUE)
 }

 ## Otherwise, run from below:

 ## Start from an empty workspace: drop every object currently defined.
 rm(list = ls())

 ## Point R at the folder that holds the .csv file (edit this path to
 ## wherever you downloaded it).
 setwd("H:/code/HH_DataViz")

 ## Read in the Health and Hospitals data. Cells holding "." or -11111 are
 ## read as NA, and text columns stay character instead of becoming factors.
 ## Also read in dataset with GOVS codes, state name and state abbreviation.
 data <- read.csv(
   "healthandhospitals.csv",
   stringsAsFactors = FALSE,
   na.strings = c(".", -11111)
 )
 ## Lookup table: GOVS state code -> state name and postal abbreviation.
 ## The three vectors are positional (element i of each describes the same
 ## state); there are 51 entries, the 50 states plus the District of Columbia.
 govs_codes <- data.frame(
   GOVS = sprintf("%02d", seq_len(51)), ## zero-padded codes "01", "02", ..., "51"
   NAME = c(
     "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado",
     "Connecticut", "Delaware", "District of Columbia", "Florida", "Georgia",
     "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky",
     "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan",
     "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada",
     "New Hampshire", "New Jersey", "New Mexico", "New York",
     "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon",
     "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota",
     "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington",
     "West Virginia", "Wisconsin", "Wyoming"
   ),
   ABB = c(
     "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "DC", "FL", "GA", "HI",
     "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI", "MN",
     "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "OH",
     "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA",
     "WV", "WI", "WY"
   ),
   ## Keep text columns as character on every R version. Without this, R < 4.0
   ## builds factors, and as.numeric() on a factor returns level positions
   ## rather than the parsed code values.
   stringsAsFactors = FALSE
 )
 govs_codes$GOVS <- as.numeric(govs_codes$GOVS) ## Convert the GOVS code to Numeric to match the data
 ## Merge the two datasets: attach NAME and ABB to every data row by matching
 ## State against GOVS. all.x = TRUE keeps data rows that have no match in
 ## govs_codes (their NAME/ABB are NA).
 data_merged <- merge(data, govs_codes, by.x = "State", by.y = "GOVS", all.x = TRUE)

 library(dplyr) ## Load the dplyr library.
 ## dplyr makes transforming the data really easy. The following code uses dplyr's syntax to filter the data, select columns, arrange, etc.
 ## Steps are chained with %>%; it replaces dplyr's original %.% operator,
 ## which current dplyr releases no longer provide.

 dataStates <- data_merged %>% filter(Level == 2 & State != 0) ## Filter: Only level 2 and no State 0
 dataStatesAndLocal <- data_merged %>% filter(Level == 1 & State != 0) ## Filter: Only level 1 and no State 0

 ## Convert to a tibble and arrange by State and Year (not a necessary step).
 ## as_tibble() is the current replacement for the deprecated tbl_df().
 data <- as_tibble(dataStatesAndLocal) %>% arrange(State, Year)

 ## Here, "group" the data by State (that is, do all the transformations by state).
 ## Filter only years later than 1970, then create new variables.
 ## Each Indexed_* column divides a state's series by that same state's 1972
 ## value, so every state equals 1 in 1972.
 ## NOTE(review): this assumes each state has exactly one 1972 row -- confirm
 ## against the .csv.
 ## Steps are chained with %>% (the %.% operator is gone from current dplyr).
 indexed_data <- data %>%
   group_by(State) %>%
   filter(Year > 1970) %>%
   mutate(
     Indexed_TotalEmp = TotalEmp / TotalEmp[Year == 1972],
     HH_Emp = HealthEmp + HospitalsEmp, ## health + hospitals combined
     Indexed_HH_Emp = HH_Emp / HH_Emp[Year == 1972],
     Indexed_TotalPay = TotalPay / TotalPay[Year == 1972],
     HH_FTEmp = HealthFTEmp + HospitalsFTEmp,
     HH_PTEmp = HealthPTEmp + HospitalsPTEmp,
     HealthvsHospitals = HealthFTE / (HealthFTE + HospitalsFTE) ## Health share of the combined FTE
   ) %>%
   ungroup() ## drop the State grouping so later steps work on plain rows

 library(ggplot2) ## Use the ggplot2 library to make pretty graphs
 ## ggplot2 has a bit of a learning curve, but think of each part as a "layer", so geom_line is one layer, then geom_smooth is another, etc.
 ## Try taking off or adding layers to see how the graphic differs

 ## Basic version: Health and Hospitals employment index for every state over time.
 ## A warning about 51 rows containing missing values is expected -- those are the 2012 data that are missing.
 ggplot(data = indexed_data, mapping = aes(x = Year, y = Indexed_HH_Emp)) + ## base plot: data and axes only, nothing is drawn yet
   geom_line(mapping = aes(group = State), colour = "gray") + ## one gray line per State
   geom_smooth(mapping = aes(group = 1), size = 2, se = FALSE) ## one smoothed line across all States (loess, to be exact)

 ## Total Health and Hospitals Employment index by state over time -- highlighting DC and NC as the extreme values in 2007.
 ## Layer-specific subsets are built with %>% (the %.% operator is gone from
 ## current dplyr releases).
 ggplot(indexed_data, aes(x = Year, y = Indexed_HH_Emp)) +
   ## Dashed reference line at index = 1. annotate() draws it exactly once; a
   ## geom_segment() with constant aesthetics inherits the plot data and draws
   ## one identical segment per data row.
   annotate("segment", x = 1970, xend = 2007, y = 1, yend = 1, linetype = 2, color = "black") +
   ## All other states as gray background lines
   geom_line(data = indexed_data %>% filter(ABB != "NC" & ABB != "DC"), aes(group = State), color = "gray") +
   ## NC: black line plus a label placed just right of its 2007 value
   geom_line(data = indexed_data %>% filter(ABB == "NC"), aes(x = Year, y = Indexed_HH_Emp), color = "black") +
   geom_text(data = indexed_data %>% filter(ABB == "NC", Year == 2007), aes(x = 2008, y = Indexed_HH_Emp, label = "NC"), color = "black", size = 3) +
   ## DC: same treatment
   geom_line(data = indexed_data %>% filter(ABB == "DC"), aes(x = Year, y = Indexed_HH_Emp), color = "black") +
   geom_text(data = indexed_data %>% filter(ABB == "DC", Year == 2007), aes(x = 2008, y = Indexed_HH_Emp, label = "DC"), color = "black", size = 3) +
   ## Smoothed trend over all states, with its own text label
   geom_smooth(aes(group = 1), size = 2, se = FALSE) +
   annotate("text", x = 2008, y = 1.55, hjust = 0, vjust = 0, label = "Avg All States", color = "blue", size = 3) +
   ggtitle("State-by-state Growth in the Number of Hospital and Healthcare Employees (1972 = 1)") +
   ylab("Index of Health and Hospitals Employees (1972 = 1)") +
   theme_minimal()
 ## ggsave() with no plot argument writes the most recently created plot
 ggsave("index_plot.pdf", width = 8, height = 8)
 ggsave("index_plot.png", width = 8, height = 8)

 library(reshape2) ## needed to transform a dataset from wide to long
 library(scales)

 ## Transform the data from wide to long, because that's the format that is easier to plug in to ggplot.
 ## Input: the 2007 rows with the part-time and full-time totals per state.
 ## Output: one row per state and employment type, with the column name
 ## (HH_PTEmp / HH_FTEmp) in EmpType and the count in Employees.
 ## Steps are chained with %>% (the %.% operator is gone from current dplyr).
 tmpPTvsFT <- melt(
   indexed_data %>%
     filter(Year == 2007) %>%
     select(State, ABB, HH_PTEmp, HH_FTEmp),
   id.vars = c("ABB", "State"),
   variable.name = "EmpType",
   value.name = "Employees"
 )

 ## Dodged bar chart: part-time vs full-time Hospitals and Health employees per state.
 ggplot(data = tmpPTvsFT) +
   geom_bar(
     mapping = aes(
       x = reorder(as.factor(ABB), Employees, FUN = max), ## order the states by the larger of their Employees values
       y = Employees,
       fill = EmpType
     ),
     stat = "identity",
     position = "dodge"
   ) +
   labs(
     x = "State",
     y = "Employees, 2007",
     title = "Number of Part-time and Full-Time Employees in Hospitals and Health"
   ) +
   scale_y_continuous(labels = comma) + ## comma-separated y labels instead of scientific notation
   coord_flip() + ## bars run sideways instead of up and down
   theme_minimal(base_size = 12) +
   scale_fill_discrete(h.start = 180, labels = c("Part-time", "Full-time")) +
   guides(fill = guide_legend(title = "Employment Type", reverse = TRUE))
 ## Write the chart just built to disk in both formats
 ggsave(filename = "bar_plot.pdf", width = 8, height = 8)
 ggsave(filename = "bar_plot.png", width = 8, height = 8)
	## This is a comment.
	##
	## R code to do two data visualizations for hospitals and health data.
	## Requires healthandhospitals.csv
	## Also requires 4 packages: dplyr, ggplot2, reshape2, and scales
	## To download those packages, type the following:

	## One-time setup: fetch the four packages this script needs.
	## setInternet2() only exists on Windows builds of R, and newer R releases
	## have retired it, so call it only when it is available and never let a
	## failure there stop the install.
	if (exists("setInternet2", mode = "function")) {
	  tryCatch(
	    setInternet2(TRUE), ## This helps download packages in our firewalled environment
	    error = function(e) message("setInternet2() not usable here: ", conditionMessage(e))
	  )
	}
	## Install only what is not already present, so re-running these lines does
	## not re-download packages that are already installed.
	needed_pkgs <- c("dplyr", "ggplot2", "reshape2", "scales")
	missing_pkgs <- setdiff(needed_pkgs, rownames(installed.packages()))
	if (length(missing_pkgs) > 0) {
	  install.packages(missing_pkgs, dependencies = TRUE)
	}

	## Otherwise, run from below:

	## Start from an empty workspace: drop every object currently defined.
	rm(list = ls())

	## Point R at the folder that holds the .csv file (edit this path to
	## wherever you downloaded it).
	setwd("H:/code/HH_DataViz")

	## Read in the Health and Hospitals data. Cells holding "." or -11111 are
	## read as NA, and text columns stay character instead of becoming factors.
	## Also read in dataset with GOVS codes, state name and state abbreviation.
	data <- read.csv(
	  "healthandhospitals.csv",
	  stringsAsFactors = FALSE,
	  na.strings = c(".", -11111)
	)
	## Lookup table: GOVS state code -> state name and postal abbreviation.
	## The three vectors are positional (element i of each describes the same
	## state); there are 51 entries, the 50 states plus the District of Columbia.
	govs_codes <- data.frame(
	  GOVS = sprintf("%02d", seq_len(51)), ## zero-padded codes "01", "02", ..., "51"
	  NAME = c(
	    "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado",
	    "Connecticut", "Delaware", "District of Columbia", "Florida", "Georgia",
	    "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky",
	    "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan",
	    "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada",
	    "New Hampshire", "New Jersey", "New Mexico", "New York",
	    "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon",
	    "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota",
	    "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington",
	    "West Virginia", "Wisconsin", "Wyoming"
	  ),
	  ABB = c(
	    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "DC", "FL", "GA", "HI",
	    "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI", "MN",
	    "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "OH",
	    "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA",
	    "WV", "WI", "WY"
	  ),
	  ## Keep text columns as character on every R version. Without this, R < 4.0
	  ## builds factors, and as.numeric() on a factor returns level positions
	  ## rather than the parsed code values.
	  stringsAsFactors = FALSE
	)
	govs_codes$GOVS <- as.numeric(govs_codes$GOVS) ## Convert the GOVS code to Numeric to match the data
	## Merge the two datasets: attach NAME and ABB to every data row by matching
	## State against GOVS. all.x = TRUE keeps data rows that have no match in
	## govs_codes (their NAME/ABB are NA).
	data_merged <- merge(data, govs_codes, by.x = "State", by.y = "GOVS", all.x = TRUE)

	library(dplyr) ## Load the dplyr library.
	## dplyr makes transforming the data really easy. The following code uses dplyr's syntax to filter the data, select columns, arrange, etc.
	## Steps are chained with %>%; it replaces dplyr's original %.% operator,
	## which current dplyr releases no longer provide.

	dataStates <- data_merged %>% filter(Level == 2 & State != 0) ## Filter: Only level 2 and no State 0
	dataStatesAndLocal <- data_merged %>% filter(Level == 1 & State != 0) ## Filter: Only level 1 and no State 0

	## Convert to a tibble and arrange by State and Year (not a necessary step).
	## as_tibble() is the current replacement for the deprecated tbl_df().
	data <- as_tibble(dataStatesAndLocal) %>% arrange(State, Year)

	## Here, "group" the data by State (that is, do all the transformations by state).
	## Filter only years later than 1970, then create new variables.
	## Each Indexed_* column divides a state's series by that same state's 1972
	## value, so every state equals 1 in 1972.
	## NOTE(review): this assumes each state has exactly one 1972 row -- confirm
	## against the .csv.
	## Steps are chained with %>% (the %.% operator is gone from current dplyr).
	indexed_data <- data %>%
	  group_by(State) %>%
	  filter(Year > 1970) %>%
	  mutate(
	    Indexed_TotalEmp = TotalEmp / TotalEmp[Year == 1972],
	    HH_Emp = HealthEmp + HospitalsEmp, ## health + hospitals combined
	    Indexed_HH_Emp = HH_Emp / HH_Emp[Year == 1972],
	    Indexed_TotalPay = TotalPay / TotalPay[Year == 1972],
	    HH_FTEmp = HealthFTEmp + HospitalsFTEmp,
	    HH_PTEmp = HealthPTEmp + HospitalsPTEmp,
	    HealthvsHospitals = HealthFTE / (HealthFTE + HospitalsFTE) ## Health share of the combined FTE
	  ) %>%
	  ungroup() ## drop the State grouping so later steps work on plain rows

	library(ggplot2) ## Use the ggplot2 library to make pretty graphs
	## ggplot2 has a bit of a learning curve, but think of each part as a "layer", so geom_line is one layer, then geom_smooth is another, etc.
	## Try taking off or adding layers to see how the graphic differs

	## Basic version: Health and Hospitals employment index for every state over time.
	## A warning about 51 rows containing missing values is expected -- those are the 2012 data that are missing.
	ggplot(data = indexed_data, mapping = aes(x = Year, y = Indexed_HH_Emp)) + ## base plot: data and axes only, nothing is drawn yet
	  geom_line(mapping = aes(group = State), colour = "gray") + ## one gray line per State
	  geom_smooth(mapping = aes(group = 1), size = 2, se = FALSE) ## one smoothed line across all States (loess, to be exact)

	## Total Health and Hospitals Employment index by state over time -- highlighting DC and NC as the extreme values in 2007.
	## Layer-specific subsets are built with %>% (the %.% operator is gone from
	## current dplyr releases).
	ggplot(indexed_data, aes(x = Year, y = Indexed_HH_Emp)) +
	  ## Dashed reference line at index = 1. annotate() draws it exactly once; a
	  ## geom_segment() with constant aesthetics inherits the plot data and draws
	  ## one identical segment per data row.
	  annotate("segment", x = 1970, xend = 2007, y = 1, yend = 1, linetype = 2, color = "black") +
	  ## All other states as gray background lines
	  geom_line(data = indexed_data %>% filter(ABB != "NC" & ABB != "DC"), aes(group = State), color = "gray") +
	  ## NC: black line plus a label placed just right of its 2007 value
	  geom_line(data = indexed_data %>% filter(ABB == "NC"), aes(x = Year, y = Indexed_HH_Emp), color = "black") +
	  geom_text(data = indexed_data %>% filter(ABB == "NC", Year == 2007), aes(x = 2008, y = Indexed_HH_Emp, label = "NC"), color = "black", size = 3) +
	  ## DC: same treatment
	  geom_line(data = indexed_data %>% filter(ABB == "DC"), aes(x = Year, y = Indexed_HH_Emp), color = "black") +
	  geom_text(data = indexed_data %>% filter(ABB == "DC", Year == 2007), aes(x = 2008, y = Indexed_HH_Emp, label = "DC"), color = "black", size = 3) +
	  ## Smoothed trend over all states, with its own text label
	  geom_smooth(aes(group = 1), size = 2, se = FALSE) +
	  annotate("text", x = 2008, y = 1.55, hjust = 0, vjust = 0, label = "Avg All States", color = "blue", size = 3) +
	  ggtitle("State-by-state Growth in the Number of Hospital and Healthcare Employees (1972 = 1)") +
	  ylab("Index of Health and Hospitals Employees (1972 = 1)") +
	  theme_minimal()
	## ggsave() with no plot argument writes the most recently created plot
	ggsave("index_plot.pdf", width = 8, height = 8)
	ggsave("index_plot.png", width = 8, height = 8)

	library(reshape2) ## needed to transform a dataset from wide to long
	library(scales)

	## Transform the data from wide to long, because that's the format that is easier to plug in to ggplot.
	## Input: the 2007 rows with the part-time and full-time totals per state.
	## Output: one row per state and employment type, with the column name
	## (HH_PTEmp / HH_FTEmp) in EmpType and the count in Employees.
	## Steps are chained with %>% (the %.% operator is gone from current dplyr).
	tmpPTvsFT <- melt(
	  indexed_data %>%
	    filter(Year == 2007) %>%
	    select(State, ABB, HH_PTEmp, HH_FTEmp),
	  id.vars = c("ABB", "State"),
	  variable.name = "EmpType",
	  value.name = "Employees"
	)

	## Dodged bar chart: part-time vs full-time Hospitals and Health employees per state.
	ggplot(data = tmpPTvsFT) +
	  geom_bar(
	    mapping = aes(
	      x = reorder(as.factor(ABB), Employees, FUN = max), ## order the states by the larger of their Employees values
	      y = Employees,
	      fill = EmpType
	    ),
	    stat = "identity",
	    position = "dodge"
	  ) +
	  labs(
	    x = "State",
	    y = "Employees, 2007",
	    title = "Number of Part-time and Full-Time Employees in Hospitals and Health"
	  ) +
	  scale_y_continuous(labels = comma) + ## comma-separated y labels instead of scientific notation
	  coord_flip() + ## bars run sideways instead of up and down
	  theme_minimal(base_size = 12) +
	  scale_fill_discrete(h.start = 180, labels = c("Part-time", "Full-time")) +
	  guides(fill = guide_legend(title = "Employment Type", reverse = TRUE))
	## Write the chart just built to disk in both formats
	ggsave(filename = "bar_plot.pdf", width = 8, height = 8)
	ggsave(filename = "bar_plot.png", width = 8, height = 8)