Skip to content

Instantly share code, notes, and snippets.

@mattspence
Created April 7, 2014 20:41
Show Gist options
  • Save mattspence/10048533 to your computer and use it in GitHub Desktop.
Save mattspence/10048533 to your computer and use it in GitHub Desktop.
## This is a comment.
##
## R code to do two data visualizations for hospitals and health data.
## Requires healthandhospitals.csv
## Also requires 4 packages: dplyr, ggplot2, reshape2, and scales
## To download those packages, type the following:
## Install only the required packages that are not already available, so
## re-running this section does not download everything again.
required_pkgs <- c("dplyr", "ggplot2", "reshape2", "scales")
is_installed <- vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
missing_pkgs <- required_pkgs[!is_installed]
if (length(missing_pkgs) > 0) {
  ## setInternet2() only exists on Windows builds of R. Guard the call so the
  ## script does not stop with "could not find function" on other platforms.
  if (exists("setInternet2", mode = "function")) {
    ## This helps download packages in our firewalled environment
    tryCatch(
      setInternet2(TRUE),
      error = function(e) {
        message("setInternet2() is unavailable: ", conditionMessage(e))
      }
    )
  }
  install.packages(missing_pkgs, dependencies = TRUE)
}
## Otherwise, run from below:
rm(list = ls()) ## remove any data currently in the workspace
## Set the working directory to wherever you downloaded the .csv file.
## The path is named once and checked first, so a wrong location produces a
## message that says what to edit instead of a generic setwd() failure.
data_dir <- "H:/code/HH_DataViz"
if (!isTRUE(file.info(data_dir)$isdir)) {
  stop(
    "Working directory not found: ", data_dir,
    " -- edit data_dir to point at the folder holding healthandhospitals.csv",
    call. = FALSE
  )
}
setwd(data_dir)
## Load the Health and Hospitals data from the working directory.
## Cells containing "." or -11111 are treated as missing and read in as NA.
## (The lookup of GOVS codes, state names and abbreviations is built next.)
data <- read.csv(
  file = "healthandhospitals.csv",
  na.strings = c(".", "-11111"),
  stringsAsFactors = FALSE
)
## Lookup table: GOVS state code (1-51, states in alphabetical order with the
## District of Columbia in ninth position), state name, and abbreviation.
govs_codes <- data.frame(
  ## The code is built as a number directly so it matches the numeric State
  ## column in the data. Converting "01".."51" after the fact is fragile:
  ## if data.frame() turns the strings into a factor, as.numeric() returns
  ## the factor's level indices rather than the codes themselves.
  GOVS = as.numeric(seq_len(51)),
  NAME = c(
    "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado",
    "Connecticut", "Delaware", "District of Columbia", "Florida", "Georgia",
    "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky",
    "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan",
    "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada",
    "New Hampshire", "New Jersey", "New Mexico", "New York",
    "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon",
    "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota",
    "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington",
    "West Virginia", "Wisconsin", "Wyoming"
  ),
  ABB = c(
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "DC", "FL", "GA", "HI",
    "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI", "MN",
    "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "OH",
    "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA",
    "WV", "WI", "WY"
  ),
  ## Keep NAME and ABB as plain character vectors on every R version.
  stringsAsFactors = FALSE
)
## Merge the two datasets: attach NAME and ABB to each data row whose State
## equals a GOVS code. all.x = TRUE keeps every row of `data`, matched or not.
data_merged <- merge(
  x = data,
  y = govs_codes,
  by.x = "State",
  by.y = "GOVS",
  all.x = TRUE
)
library(dplyr) ## Load the dplyr library.
## dplyr makes transforming the data really easy. The following code uses
## dplyr's syntax to filter the data, select columns, arrange, etc.
## Steps are chained with the %>% pipe; the older %.% operator is no longer
## provided by current dplyr releases.
dataStates <- data_merged %>%
  filter(Level == 2 & State != 0) ## Filter: only Level 2 and no State 0
dataStatesAndLocal <- data_merged %>%
  filter(Level == 1 & State != 0) ## Filter: only Level 1 and no State 0
## Convert the dataset to a tibble and arrange by State and Year
## (not a necessary step). as_tibble() replaces the deprecated tbl_df().
data <- as_tibble(dataStatesAndLocal) %>%
  arrange(State, Year)
## Here, "group" the data by State (that is, do all the transformations
## state by state). Keep only years later than 1970, then create new
## variables. The Indexed_* columns divide each value by the same state's
## 1972 value, so this assumes every state has exactly one 1972 row.
indexed_data <- data %>%
  group_by(State) %>%
  filter(Year > 1970) %>%
  mutate(
    Indexed_TotalEmp = TotalEmp / TotalEmp[Year == 1972],
    HH_Emp = HealthEmp + HospitalsEmp,
    Indexed_HH_Emp = HH_Emp / HH_Emp[Year == 1972],
    Indexed_TotalPay = TotalPay / TotalPay[Year == 1972],
    HH_FTEmp = HealthFTEmp + HospitalsFTEmp,
    HH_PTEmp = HealthPTEmp + HospitalsPTEmp,
    HealthvsHospitals = HealthFTE / (HealthFTE + HospitalsFTE)
  ) %>%
  ## Drop the grouping once the per-state calculations are done, so later
  ## steps do not silently operate state by state.
  ungroup()
library(ggplot2) ## Use the ggplot2 library to make pretty graphs
## ggplot2 has a bit of a learning curve, but think of each part as a "layer",
## so geom_line is one layer, then geom_smooth is another, etc.
## Try taking off or adding layers to see how the graphic differs.
## Total Health and Hospitals Employment index by state over time -- very basic edition
## Don't worry about the warning message about 51 rows containing missing
## values -- those are the 2012 data that are missing.
## NOTE(review): newer ggplot2 releases prefer linewidth= over size= for line
## widths -- confirm against the installed version before switching.
basic_index_plot <-
  ggplot(indexed_data, aes(x = Year, y = Indexed_HH_Emp)) + ## this sets up the plot but doesn't graph anything
  geom_line(aes(group = State), color = "gray") + ## this puts a line for each State
  geom_smooth(aes(group = 1), size = 2, se = FALSE) ## this puts a smoothed line for all States
## Print explicitly so the plot is also drawn when the script is run with
## source(), where top-level expressions are not auto-printed.
print(basic_index_plot)
## Total Health and Hospitals Employment index by state over time --
## highlighting DC and NC as the extreme values in 2007.
## The subsets are built once with the %>% pipe (the older %.% operator is no
## longer provided by current dplyr releases) and reused by the layers below.
nc_rows <- indexed_data %>% filter(ABB == "NC")
dc_rows <- indexed_data %>% filter(ABB == "DC")
other_rows <- indexed_data %>% filter(ABB != "NC" & ABB != "DC")
highlight_plot <-
  ggplot(indexed_data, aes(x = Year, y = Indexed_HH_Emp)) +
  ## Dashed reference line at index = 1. annotate() draws it once; a
  ## geom_segment() with constant aesthetics would draw one copy per data row.
  annotate("segment", x = 1970, xend = 2007, y = 1, yend = 1,
           linetype = 2, color = "black") +
  ## Every other state as a gray background line
  geom_line(data = other_rows, aes(group = State), color = "gray") +
  ## NC and DC in black, each labelled just to the right of its 2007 value
  geom_line(data = nc_rows, aes(x = Year, y = Indexed_HH_Emp), color = "black") +
  geom_text(data = nc_rows %>% filter(Year == 2007),
            aes(x = 2008, y = Indexed_HH_Emp, label = "NC"),
            color = "black", size = 3) +
  geom_line(data = dc_rows, aes(x = Year, y = Indexed_HH_Emp), color = "black") +
  geom_text(data = dc_rows %>% filter(Year == 2007),
            aes(x = 2008, y = Indexed_HH_Emp, label = "DC"),
            color = "black", size = 3) +
  ## Smoothed line across all states, with a hand-placed label
  geom_smooth(aes(group = 1), size = 2, se = FALSE) +
  annotate("text", x = 2008, y = 1.55, hjust = 0, vjust = 0,
           label = "Avg All States", color = "blue", size = 3) +
  ggtitle("State-by-state Growth in the Number of Hospital and Healthcare Employees (1972 = 1)") +
  ylab("Index of Health and Hospitals Employees (1972 = 1)") +
  theme_minimal()
## Print explicitly so the plot is also drawn under source(), and hand the
## plot object to ggsave() rather than relying on the "last plot" state.
print(highlight_plot)
ggsave("index_plot.pdf", plot = highlight_plot, width = 8, height = 8)
ggsave("index_plot.png", plot = highlight_plot, width = 8, height = 8)
library(reshape2) ## needed to transform a dataset from wide to long
library(scales) ## provides the comma label formatter used on the y axis
## Keep the 2007 rows and the two employee-count columns. The %>% pipe is
## used because the older %.% operator is no longer provided by current dplyr
## releases; ungroup() drops any leftover per-State grouping.
emp_2007 <- indexed_data %>%
  ungroup() %>%
  filter(Year == 2007) %>%
  select(State, ABB, HH_PTEmp, HH_FTEmp)
## Transform the data from wide to long, because that's the format that is
## easier to plug in to ggplot: one row per state and employment type.
tmpPTvsFT <- melt(
  as.data.frame(emp_2007),
  id.vars = c("ABB", "State"),
  variable.name = "EmpType",
  value.name = "Employees"
)
bar_plot <- ggplot(data = tmpPTvsFT) +
  ## reorder(..., FUN = max) sorts the states by the larger of their two
  ## counts (presumably the full-time count for most states).
  geom_bar(
    aes(x = reorder(as.factor(ABB), Employees, FUN = max), y = Employees, fill = EmpType),
    stat = "identity", position = "dodge"
  ) +
  xlab("State") +
  ylab("Employees, 2007") +
  ggtitle("Number of Part-time and Full-Time Employees in Hospitals and Health") +
  scale_y_continuous(labels = comma) + ## change the y scale to commas rather than scientific notation
  coord_flip() + ## flip the coordinates so the bars go sideways instead of up and down
  theme_minimal(base_size = 12) +
  ## Labels follow the EmpType level order: HH_PTEmp first, then HH_FTEmp
  scale_fill_discrete(h.start = 180, labels = c("Part-time", "Full-time")) +
  guides(fill = guide_legend(title = "Employment Type", reverse = TRUE))
## Print explicitly so the plot is also drawn under source(), and hand the
## plot object to ggsave() rather than relying on the "last plot" state.
print(bar_plot)
ggsave(filename = "bar_plot.pdf", plot = bar_plot, width = 8, height = 8)
ggsave(filename = "bar_plot.png", plot = bar_plot, width = 8, height = 8)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment