Skip to content

Instantly share code, notes, and snippets.

@mGalarnyk
Last active January 18, 2017 03:07
Show Gist options
  • Save mGalarnyk/47ca288c12dede3bc40c8fa95fc25763 to your computer and use it in GitHub Desktop.
Save mGalarnyk/47ca288c12dede3bc40c8fa95fc25763 to your computer and use it in GitHub Desktop.
Coursera Johns Hopkins Data Science Specialization Programming Assignment 3 https://medium.com/@GalarnykMichael/in-progress-review-course-2-r-programming-jhu-coursera-ad27086d8438#.i1tezvk53
# Question 1
# Plot the 30-day mortality rates for heart attack
# Read the outcome data into R via the read.csv function and look at the first few rows.
# outcome <- read.csv("outcome-of-care-measures.csv", colClasses = "character")
# head(outcome)
# There are many columns in this dataset. You can see how many by typing ncol(outcome) (you can see the number of rows with the nrow function). In addition, you can see the names of each column by typing names(outcome) (the names are also in the PDF document).
# To make a simple histogram of the 30-day death rates from heart attack (column 11 in the outcome dataset), run
library(data.table)
# Read the outcome data from the working directory.
outcome <- data.table::fread("outcome-of-care-measures.csv")

# Column 11 is the 30-day death rate from heart attack. Convert it to numeric
# in place; entries that are not numbers become NA (with a coercion warning).
outcome[, (11) := lapply(.SD, as.numeric), .SDcols = (11)]

# Plot straight from the column vector. Calling hist() through lapply(.SD, ...)
# inside `[` draws the same figure but also hands the histogram object back as
# a data.table result, which is printed when run interactively.
hist(
  outcome[[11L]],
  xlab = "Deaths",
  main = "Hospital 30-Day Death (Mortality) Rates from Heart Attack",
  col = "lightblue"
)
# Question 2
# Finding the best hospital in a state
# Find the hospital with the lowest 30-day mortality rate for an outcome
# within one state.
#
# Args:
#   state:   state code as it appears in the "State" column of the data file.
#   outcome: "heart attack", "heart failure" or "pneumonia" (any letter case).
#
# Returns a one-row data.table with the single column `hospital name`.
# Hospitals are ranked by ascending rate; ties are broken by hospital name.
# Stops with 'invalid state' or 'invalid outcome' on unrecognised input.
best <- function(state, outcome) {
  hospitals <- data.table::fread('outcome-of-care-measures.csv')
  outcome <- tolower(outcome)

  # The table gets a lowercase `state` column below, which would shadow the
  # argument inside `[`, so keep the requested value under another name.
  target_state <- state

  # Validate the state first, then the outcome
  known_states <- unique(hospitals[["State"]])
  if (!target_state %in% known_states) {
    stop('invalid state')
  }
  known_outcomes <- c("heart attack", "heart failure", "pneumonia")
  if (!outcome %in% known_outcomes) {
    stop('invalid outcome')
  }

  # Shorten the mortality-rate column names to the bare outcome and lowercase
  # every column name
  rate_prefix <- "^Hospital 30-Day Death \\(Mortality\\) Rates from "
  setnames(hospitals, tolower(gsub(rate_prefix, "", colnames(hospitals))))

  # Restrict to the requested state
  in_state <- hospitals[state == target_state]

  # Keep only hospital name, state and the requested outcome column
  wanted <- grep(paste0("hospital name|state|^", outcome), colnames(in_state))
  in_state <- in_state[, wanted, with = FALSE]

  # The rate must be numeric to sort correctly; non-numeric entries become NA
  in_state[, (outcome) := as.numeric(get(outcome))]

  # Drop rows with any missing value, then rank by rate and hospital name
  in_state <- in_state[complete.cases(in_state)]
  ranked <- in_state[order(get(outcome), `hospital name`)]

  name_column <- ranked[, "hospital name"]
  name_column[1]
}
# Question 3
# Ranking hospitals by outcome in a state
# Return the name of the hospital holding a given rank for an outcome within
# one state.
#
# Args:
#   state:   state code as it appears in the "State" column of the data file.
#   outcome: "heart attack", "heart failure" or "pneumonia" (any letter case).
#   num:     "best" (first), "worst" (last) or a row position in the ranking.
#
# Returns the hospital name as a character vector. Hospitals are ranked by
# ascending rate; ties are broken by hospital name. Rows without a usable
# rate are excluded from the ranking.
# Stops with 'invalid state' or 'invalid outcome' on unrecognised input.
rankhospital <- function(state, outcome, num = "best") {
  hospitals <- data.table::fread('outcome-of-care-measures.csv')
  outcome <- tolower(outcome)

  # The lowercase `state` column created below would shadow the argument
  # inside `[`, so hold the requested value under a distinct name.
  target_state <- state

  # Validate the state first, then the outcome
  if (!target_state %in% unique(hospitals[["State"]])) {
    stop('invalid state')
  }
  if (!outcome %in% c("heart attack", "heart failure", "pneumonia")) {
    stop('invalid outcome')
  }

  # Shorten the mortality-rate column names to the bare outcome and lowercase
  # every column name
  rate_prefix <- "^Hospital 30-Day Death \\(Mortality\\) Rates from "
  setnames(hospitals, tolower(gsub(rate_prefix, "", colnames(hospitals))))

  # Restrict to the requested state and to the three columns of interest
  in_state <- hospitals[state == target_state]
  wanted <- grep(paste0("hospital name|state|^", outcome), colnames(in_state))
  in_state <- in_state[, wanted, with = FALSE]

  # Numeric rate for sorting; non-numeric entries become NA and are dropped
  in_state[, (outcome) := as.numeric(get(outcome))]
  in_state <- in_state[complete.cases(in_state)]

  # Row position in `ranked` is the hospital's rank
  ranked <- in_state[order(get(outcome), `hospital name`)]

  if (num == "best") {
    ranked[1, `hospital name`]
  } else if (num == "worst") {
    ranked[.N, `hospital name`]
  } else {
    ranked[num, `hospital name`]
  }
}
# Question 4
# Ranking hospitals in all states
# Rank hospitals within every state for one outcome.
#
# Args:
#   outcome: "heart attack", "heart failure" or "pneumonia" (any letter case).
#   num:     "best", "worst", or a number n.
#
# Returns a data.table ordered by state:
#   - "best" / "worst": one row per state with columns `state` and `hospital`;
#     `hospital` is NA when no hospital in that state has a usable rate.
#   - numeric n: columns `state` and `hospital name`, holding the first n
#     hospitals of each state's ranking (fewer rows when the state has fewer
#     rated hospitals).
# Hospitals are ranked by ascending rate; ties are broken by hospital name.
# Hospitals without a usable rate are never ranked.
# Stops with 'invalid outcome' for an unrecognised outcome.
rankall <- function(outcome, num = "best") {
  # Read outcome data
  out_dt <- data.table::fread("outcome-of-care-measures.csv")
  outcome <- tolower(outcome)
  if (!outcome %in% c("heart attack", "heart failure", "pneumonia")) {
    stop("invalid outcome")
  }

  # Shorten the mortality-rate column names to the bare outcome and lowercase
  # every column name (gsub is vectorised, no sapply needed)
  setnames(
    out_dt,
    tolower(gsub("^Hospital 30-Day Death \\(Mortality\\) Rates from ", "",
                 colnames(out_dt)))
  )

  # Keep only hospital name, state and the requested outcome column
  col_indices <- grep(paste0("hospital name|state|^", outcome),
                      colnames(out_dt))
  out_dt <- out_dt[, .SD, .SDcols = col_indices]

  # Numeric copy of the outcome under a fixed name; non-numeric entries
  # become NA
  out_dt[, rate := as.numeric(get(outcome))]

  # One ordering shared by every branch, so results always come back sorted
  # by state and ranked by (rate, hospital name) within each state
  ranked <- out_dt[order(state, rate, `hospital name`)]

  if (num == "best") {
    # Indexing an empty vector with [1L] yields NA, which keeps states that
    # have no rated hospital in the result
    return(ranked[
      , .(hospital = `hospital name`[!is.na(rate)][1L])
      , by = state
    ])
  }

  if (num == "worst") {
    # Missing rates must be excluded explicitly: they sort to the end, so a
    # plain tail() would return a hospital with no data. The leading NA is
    # the fallback for a state without any rated hospital.
    return(ranked[
      , .(hospital = tail(c(NA_character_, `hospital name`[!is.na(rate)]), 1L))
      , by = state
    ])
  }

  ranked[!is.na(rate), head(.SD, num), by = state, .SDcols = "hospital name"]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment