Last active
January 18, 2017 03:07
-
-
Save mGalarnyk/47ca288c12dede3bc40c8fa95fc25763 to your computer and use it in GitHub Desktop.
Coursera Johns Hopkins Data Science Specialization Programming Assignment 3 https://medium.com/@GalarnykMichael/in-progress-review-course-2-r-programming-jhu-coursera-ad27086d8438#.i1tezvk53
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Question 1 | |
# Plot the 30-day mortality rates for heart attack | |
# Read the outcome data into R via the read.csv function and look at the first few rows. | |
# outcome <- read.csv("outcome-of-care-measures.csv", colClasses = "character") | |
# head(outcome) | |
# There are many columns in this dataset. You can see how many by typing ncol(outcome) (you can see the number of rows with the nrow function). In addition, you can see the names of each column by typing names(outcome) (the names are also in the PDF document).
# To make a simple histogram of the 30-day death rates from heart attack (column 11 in the outcome dataset), run | |
library(data.table)

# Reading in data
outcome <- data.table::fread("outcome-of-care-measures.csv")

# Column 11 is the 30-day heart-attack death rate (per the assignment text
# above). Resolve it to a name once so the index is not repeated.
rate_col <- names(outcome)[11]

# Coerce the column to numeric in place. Entries that are not numbers become
# NA (with a coercion warning) and are ignored by hist().
# NOTE(review): presumably missing rates are coded as text in the CSV - confirm.
outcome[, (rate_col) := as.numeric(get(rate_col))]

# Call hist() directly on the column. Running it through lapply(.SD, ...) in
# `j` made data.table coerce the returned histogram object into a table and
# print it, which is noise; only the plot is wanted here.
hist(
  outcome[[rate_col]],
  xlab = "Deaths",
  main = "Hospital 30-Day Death (Mortality) Rates from Heart Attack",
  col = "lightblue"
)
# Question 2 | |
# Finding the best hospital in a state | |
# Find the hospital with the lowest 30-day mortality rate in one state.
#
# Args:
#   state:   State abbreviation; must appear in the file's "State" column.
#   outcome: One of "heart attack", "heart failure" or "pneumonia"
#            (matched case-insensitively).
#
# Returns:
#   A length-one character vector with the hospital name. Ties on the rate are
#   broken alphabetically by hospital name. NA_character_ is returned when no
#   hospital in the state has a numeric rate for the outcome.
#
# Errors:
#   "invalid state" / "invalid outcome" when an argument fails validation.
best <- function(state, outcome) {
  # Read outcome data
  out_dt <- data.table::fread("outcome-of-care-measures.csv")
  outcome <- tolower(outcome)

  # After the rename below there is a column called `state`, which would shadow
  # the argument inside `[.data.table`; keep the argument under another name.
  chosen_state <- state

  # Check that state and outcome are valid
  if (!chosen_state %in% unique(out_dt[["State"]])) {
    stop("invalid state")
  }
  if (!outcome %in% c("heart attack", "heart failure", "pneumonia")) {
    stop("invalid outcome")
  }

  # Strip the verbose mortality prefix and lowercase every column name, so the
  # three rate columns are called exactly "heart attack", "heart failure" and
  # "pneumonia". gsub() is vectorised, so no sapply() wrapper is needed.
  data.table::setnames(
    out_dt,
    tolower(gsub(
      pattern = "^Hospital 30-Day Death \\(Mortality\\) Rates from ",
      replacement = "",
      colnames(out_dt)
    ))
  )

  # Filter by state
  in_state <- out_dt[state == chosen_state]

  # Keep only what is needed; non-numeric rate entries become NA here.
  rates <- data.table::data.table(
    hospital = in_state[["hospital name"]],
    rate = as.numeric(in_state[[outcome]])
  )

  # Drop hospitals without a usable rate, then sort best-first.
  rates <- rates[complete.cases(rates)]
  rates <- rates[order(rate, hospital)]

  # Return the name itself (a character vector), matching what
  # rankhospital(state, outcome, "best") returns, rather than a 1x1 data.table.
  rates[["hospital"]][1]
}
# Question 3 | |
# Ranking hospitals by outcome in a state | |
# Return the hospital holding a given mortality rank within one state.
#
# Args:
#   state:   State abbreviation; must appear in the file's "State" column.
#   outcome: One of "heart attack", "heart failure" or "pneumonia"
#            (matched case-insensitively).
#   num:     "best", "worst", or a single number giving the rank (1 = lowest
#            rate).
#
# Returns:
#   A length-one character vector with the hospital name. Hospitals are ranked
#   by ascending rate, ties broken alphabetically by name; hospitals without a
#   numeric rate are excluded. NA_character_ is returned when the requested
#   rank does not exist (rank below 1, rank beyond the number of ranked
#   hospitals, or no ranked hospitals at all).
#
# Errors:
#   "invalid state" / "invalid outcome" / "invalid num" on bad arguments.
rankhospital <- function(state, outcome, num = "best") {
  # Read outcome data
  out_dt <- data.table::fread("outcome-of-care-measures.csv")
  outcome <- tolower(outcome)

  # After the rename below there is a column called `state`, which would shadow
  # the argument inside `[.data.table`; keep the argument under another name.
  chosen_state <- state

  # Check that state and outcome are valid
  if (!chosen_state %in% unique(out_dt[["State"]])) {
    stop("invalid state")
  }
  if (!outcome %in% c("heart attack", "heart failure", "pneumonia")) {
    stop("invalid outcome")
  }

  # Validate num explicitly; otherwise an unsupported value reaches data.table
  # as a row selector and fails with an unrelated-looking error.
  is_keyword <- is.character(num) && length(num) == 1L &&
    num %in% c("best", "worst")
  is_rank <- is.numeric(num) && length(num) == 1L && !is.na(num)
  if (!is_keyword && !is_rank) {
    stop("invalid num")
  }

  # Strip the verbose mortality prefix and lowercase every column name, so the
  # three rate columns are called exactly "heart attack", "heart failure" and
  # "pneumonia". gsub() is vectorised, so no sapply() wrapper is needed.
  data.table::setnames(
    out_dt,
    tolower(gsub(
      pattern = "^Hospital 30-Day Death \\(Mortality\\) Rates from ",
      replacement = "",
      colnames(out_dt)
    ))
  )

  # Filter by state
  in_state <- out_dt[state == chosen_state]

  # Keep only what is needed; non-numeric rate entries become NA here.
  rates <- data.table::data.table(
    hospital = in_state[["hospital name"]],
    rate = as.numeric(in_state[[outcome]])
  )

  # Drop hospitals without a usable rate, then sort so row position == rank.
  rates <- rates[complete.cases(rates)]
  rates <- rates[order(rate, hospital)]
  ranked <- rates[["hospital"]]

  # Translate the keyword forms into a position.
  position <- if (is_keyword) {
    if (num == "best") 1L else length(ranked)
  } else {
    num
  }

  # Positions below 1 have no hospital. Plain vector indexing already yields
  # NA for positions past the end.
  if (position < 1) {
    return(NA_character_)
  }
  ranked[position]
}
# Question 4 | |
# Ranking hospitals in all states | |
# For every state, return the hospital holding a given mortality rank.
#
# Args:
#   outcome: One of "heart attack", "heart failure" or "pneumonia"
#            (matched case-insensitively).
#   num:     "best", "worst", or a single number giving the rank (1 = lowest
#            rate).
#
# Returns:
#   A data.table with one row per state, sorted by state, with columns `state`
#   and `hospital`. Within a state, hospitals are ranked by ascending rate with
#   ties broken alphabetically by name; hospitals without a numeric rate are
#   not ranked. `hospital` is NA for a state that has no hospital at the
#   requested rank.
#
# Errors:
#   "invalid outcome" / "invalid num" on bad arguments.
rankall <- function(outcome, num = "best") {
  # Read outcome data
  out_dt <- data.table::fread("outcome-of-care-measures.csv")
  outcome <- tolower(outcome)
  if (!outcome %in% c("heart attack", "heart failure", "pneumonia")) {
    stop("invalid outcome")
  }

  # Validate num explicitly so an unsupported value fails with a clear message.
  is_keyword <- is.character(num) && length(num) == 1L &&
    num %in% c("best", "worst")
  is_rank <- is.numeric(num) && length(num) == 1L && !is.na(num)
  if (!is_keyword && !is_rank) {
    stop("invalid num")
  }

  # Strip the verbose mortality prefix and lowercase every column name, so the
  # three rate columns are called exactly "heart attack", "heart failure" and
  # "pneumonia". gsub() is vectorised, so no sapply() wrapper is needed.
  data.table::setnames(
    out_dt,
    tolower(gsub(
      pattern = "^Hospital 30-Day Death \\(Mortality\\) Rates from ",
      replacement = "",
      colnames(out_dt)
    ))
  )

  # Keep only what is needed; non-numeric rate entries become NA here.
  rates <- data.table::data.table(
    state = out_dt[["state"]],
    hospital = out_dt[["hospital name"]],
    rate = as.numeric(out_dt[[outcome]])
  )

  # One sort serves every mode: by state, then rate, then name. Sorting by
  # state first also keeps the grouped output in state order for "worst".
  rates <- rates[order(state, rate, hospital)]

  # Pick the requested rank from one state's ranked (NA-free) hospital names.
  # Always returns exactly one value so that every state keeps its row.
  pick_rank <- function(ranked) {
    position <- if (is_keyword) {
      # max(., 1L) makes "worst" on an empty state index position 1 -> NA.
      if (num == "best") 1L else max(length(ranked), 1L)
    } else if (num < 1) {
      NA_integer_
    } else {
      num
    }
    ranked[position]
  }

  # Missing rates are excluded inside each group rather than by dropping rows
  # up front. Otherwise "worst" would pick a hospital with no data (NAs sort
  # last), and states with no usable rate would vanish from the result.
  rates[, list(hospital = pick_rank(hospital[!is.na(rate)])), by = state]
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment