Created
April 7, 2014 20:41
-
-
Save mattspence/10048533 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## R code to produce two data visualizations of hospitals and health data.
##
## Requires healthandhospitals.csv
## Also requires 4 packages: dplyr, ggplot2, reshape2, and scales.

## setInternet2() helped download packages in a firewalled Windows
## environment. It is only available on Windows builds of R (and has no
## effect in recent versions), so call it only where it actually exists
## instead of aborting the script on other platforms.
if (.Platform$OS.type == "windows" && exists("setInternet2", mode = "function")) {
  setInternet2(TRUE)
}

## Install only the packages that are not already present, so re-running the
## script does not re-download everything each time.
required_pkgs <- c("dplyr", "ggplot2", "reshape2", "scales")
missing_pkgs <- setdiff(required_pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs, dependencies = TRUE)
}

## NOTE: the workspace is deliberately NOT cleared with rm(list = ls()).
## Every object used below is created by this script, so wiping the user's
## session is unnecessary and destructive.

## Set the working directory to wherever you downloaded the .csv file.
## If the directory does not exist on this machine, the current working
## directory is left untouched and the .csv is looked up there instead.
data_dir <- "H:/code/HH_DataViz"
if (dir.exists(data_dir)) {
  setwd(data_dir)
}
## Load the Health and Hospitals data. The placeholder values "." and
## -11111 are read in as NA. (The lookup table of GOVS codes, state names
## and state abbreviations is built right after this.)
data <- read.csv(
  file = "healthandhospitals.csv",
  na.strings = c(".", "-11111"),
  stringsAsFactors = FALSE
)
## Lookup table: numeric GOVS code (1-51, in the order listed below), full
## state name, and two-letter abbreviation. The GOVS column is numeric so
## that it can be matched against the State column of the data.
govs_codes <- data.frame(
  GOVS = as.numeric(seq_len(51)),
  NAME = c(
    "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado",
    "Connecticut", "Delaware", "District of Columbia", "Florida", "Georgia",
    "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky",
    "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan",
    "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada",
    "New Hampshire", "New Jersey", "New Mexico", "New York",
    "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon",
    "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota",
    "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington",
    "West Virginia", "Wisconsin", "Wyoming"
  ),
  ABB = c(
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "DC", "FL", "GA", "HI",
    "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI", "MN",
    "MS", "MO", "MT", "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "OH",
    "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA",
    "WV", "WI", "WY"
  )
)
## Attach state names/abbreviations to the data. all.x = TRUE keeps every
## data row even when its State code has no match in govs_codes.
data_merged <- merge(data, govs_codes, by.x = "State", by.y = "GOVS", all.x = TRUE)

library(dplyr) ## Load the dplyr library.

## dplyr makes transforming the data really easy. The following code uses
## dplyr's syntax to filter the data, select columns, arrange, etc.
## The pipe is written %>% because the older %.% operator was deprecated
## and later removed from dplyr.
dataStates <- data_merged %>%
  filter(Level == 2 & State != 0) ## Filter: only Level 2 and no State 0
dataStatesAndLocal <- data_merged %>%
  filter(Level == 1 & State != 0) ## Filter: only Level 1 and no State 0

## Convert to a tibble and arrange by State and Year (not a necessary step).
## as_tibble() replaces the deprecated tbl_df().
data <- as_tibble(dataStatesAndLocal) %>%
  arrange(State, Year)

## Divide a series by its value in the base year.
##   x         numeric vector of values
##   year      vector of years, parallel to x
##   base_year year whose value becomes 1 in the index
## Uses the first row matching base_year; if there is none, the whole index
## is NA rather than raising an error inside mutate().
index_to_base <- function(x, year, base_year = 1972) {
  x / x[match(base_year, year)]
}

## "Group" the data by State (that is, do all the transformations per
## state), keep only years later than 1970, then create the new variables.
indexed_data <- data %>%
  group_by(State) %>%
  filter(Year > 1970) %>%
  mutate(
    Indexed_TotalEmp = index_to_base(TotalEmp, Year),
    HH_Emp = HealthEmp + HospitalsEmp,
    Indexed_HH_Emp = index_to_base(HH_Emp, Year),
    Indexed_TotalPay = index_to_base(TotalPay, Year),
    HH_FTEmp = HealthFTEmp + HospitalsFTEmp,
    HH_PTEmp = HealthPTEmp + HospitalsPTEmp,
    HealthvsHospitals = HealthFTE / (HealthFTE + HospitalsFTE)
  ) %>%
  ungroup() ## drop the grouping so later steps are not silently per-state
library(ggplot2) ## ggplot2 draws the graphs

## ggplot2 has a bit of a learning curve. Think of each part as a "layer":
## geom_line is one layer, geom_smooth is another, and so on. Try removing
## or adding layers to see how the graphic changes.

## Total Health and Hospitals employment index by state over time --
## very basic edition.
## The warning about 51 rows containing missing values can be ignored:
## those are the 2012 data, which are missing.
ggplot(data = indexed_data, mapping = aes(x = Year, y = Indexed_HH_Emp)) + ## sets up the plot; draws nothing yet
  geom_line(mapping = aes(group = State), colour = "gray") +               ## one line per State
  geom_smooth(mapping = aes(group = 1), se = FALSE, size = 2)              ## one smoothed line across all States
## Total Health and Hospitals employment index by state over time,
## highlighting DC and NC as the extreme values in 2007.
## The pipe is written %>% because the older %.% operator was deprecated
## and later removed from dplyr.

## Split the data once: the two highlighted states versus everything else.
hh_highlighted <- indexed_data %>% filter(ABB %in% c("NC", "DC"))
hh_background <- indexed_data %>% filter(ABB != "NC" & ABB != "DC")

index_plot <- ggplot(indexed_data, aes(x = Year, y = Indexed_HH_Emp)) +
  ## Dashed reference line at index = 1. annotate() draws it exactly once;
  ## geom_segment() with constant aesthetics would redraw the same segment
  ## for every row of the data.
  annotate("segment", x = 1970, xend = 2007, y = 1, yend = 1,
           linetype = 2, color = "black") +
  geom_line(data = hh_background, aes(group = State), color = "gray") +
  geom_line(data = hh_highlighted, aes(group = State), color = "black") +
  ## Label each highlighted line just to the right of its 2007 value.
  geom_text(data = hh_highlighted %>% filter(Year == 2007),
            aes(x = 2008, y = Indexed_HH_Emp, label = ABB),
            color = "black", size = 3) +
  geom_smooth(aes(group = 1), size = 2, se = FALSE) +
  annotate("text", x = 2008, y = 1.55, hjust = 0, vjust = 0,
           label = "Avg All States", color = "blue", size = 3) +
  ggtitle("State-by-state Growth in the Number of Hospital and Healthcare Employees (1972 = 1)") +
  ylab("Index of Health and Hospitals Employees (1972 = 1)") +
  theme_minimal()

print(index_plot) ## display the plot, also when the script is source()d

## Pass the plot explicitly so the saved file does not depend on whichever
## plot happened to be displayed last.
ggsave("index_plot.pdf", plot = index_plot, width = 8, height = 8)
ggsave("index_plot.png", plot = index_plot, width = 8, height = 8)
library(reshape2) ## needed to transform a dataset from wide to long
library(scales)   ## provides the comma label formatter used for the y axis

## Transform the 2007 data from wide to long, because that is the format
## that is easier to plug in to ggplot: one row per state and employment
## type. The pipe is written %>% because the older %.% operator was
## deprecated and later removed from dplyr.
tmpPTvsFT <- melt(
  indexed_data %>%
    filter(Year == 2007) %>%
    select(State, ABB, HH_PTEmp, HH_FTEmp) %>%
    as.data.frame(), ## hand reshape2 a plain data frame
  id.vars = c("ABB", "State"),
  variable.name = "EmpType",
  value.name = "Employees"
)

bar_plot <- ggplot(data = tmpPTvsFT) +
  ## reorder() sorts the states by the larger of their two bars (FUN = max),
  ## which is presumably the full-time count for every state.
  geom_bar(aes(x = reorder(as.factor(ABB), Employees, FUN = max),
               y = Employees, fill = EmpType),
           stat = "identity", position = "dodge") +
  xlab("State") +
  ylab("Employees, 2007") +
  ggtitle("Number of Part-time and Full-Time Employees in Hospitals and Health") +
  scale_y_continuous(labels = comma) + ## commas rather than scientific notation
  coord_flip() + ## flip the coordinates so the bars go sideways instead of up and down
  theme_minimal(base_size = 12) +
  ## Labels follow the order of the melted columns: HH_PTEmp, then HH_FTEmp.
  scale_fill_discrete(h.start = 180, labels = c("Part-time", "Full-time")) +
  guides(fill = guide_legend(title = "Employment Type", reverse = TRUE))

print(bar_plot) ## display the plot, also when the script is source()d

## Pass the plot explicitly so the saved file does not depend on whichever
## plot happened to be displayed last.
ggsave(filename = "bar_plot.pdf", plot = bar_plot, width = 8, height = 8)
ggsave(filename = "bar_plot.png", plot = bar_plot, width = 8, height = 8)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment