Last active
January 25, 2019 21:15
-
-
Save levithatcher/070496ca48c165d7ced37e0ffcd24dc7 to your computer and use it in GitHub Desktop.
Create county-level choropleths of low-birth weight, premature death, and their relationship to county income
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
removeCommasInNumber <- function(column) { | |
column <- as.numeric(gsub(",", "", column)) | |
column | |
} | |
createChoropleth <- function(df, | |
colToPlot, | |
title, | |
legend, | |
numColors=1, | |
NAReplace=NULL) { | |
library(choroplethr) | |
library(choroplethrMaps) | |
library(ggplot2) | |
# Remove commas from column of interest | |
if (!is.numeric(df[[colToPlot]])) { | |
df[[colToPlot]] <- removeCommasInNumber(df[[colToPlot]]) | |
} | |
# Remove state rows from dataset | |
df <- subset(df, COUNTYCODE != 0) | |
df$STATECODE <- as.integer(df$STATECODE) | |
df$COUNTYCODE <- as.integer(df$COUNTYCODE) | |
# Pad county digits | |
df$COUNTYCODE <- sprintf("%03d", df$COUNTYCODE) | |
# Concatenate and create FIPS | |
df$FIPSCODE <- as.numeric(paste0(df$STATECODE,df$COUNTYCODE)) | |
# Reduce dataset and rename cols for county_choropleth func | |
df <- subset(df, select = c('FIPSCODE', colToPlot)) | |
colnames(df) <- c("region","value") | |
print("NA count:") | |
print(count(is.na(df["value"]))) | |
# Fill NA cells with something (so choropleth works) | |
if (!is.null(NAReplace)) { | |
df["value"][is.na(df["value"])] <- NAReplace | |
} | |
str(df) | |
# Plot data on a US map | |
county_choropleth(df, num_colors = numColors, legend = legend) + | |
ggtitle(title) + | |
theme(plot.title = element_text(hjust = 0.5)) | |
} | |
createPercentiles <- function(x) { | |
(x - min(x, na.rm = TRUE))/(max(x, na.rm = TRUE) - min(x, na.rm = TRUE)) | |
} | |
library(healthcareai) | |
# Start analysis -- read in data | |
# Data comes from here: http://www.countyhealthrankings.org/rankings/data | |
df <- read.csv('2015 CHR Analytic Data.csv') | |
# Remove state-summary rows | |
df <- subset(df, COUNTYCODE != 0) | |
# Prepare columns | |
# Convert to percentage when plotting its choropleth | |
#df$Low.birthweight.Value <- df$Low.birthweight.Value * 100 | |
df$Median.household.income.Value <- removeCommasInNumber(df$Median.household.income.Value) | |
df$Premature.death.Value <- removeCommasInNumber(df$Premature.death.Value) | |
# Change from factors to numerics | |
#df$Median.household.income.Value <- as.numeric(df$Median.household.income.Value) | |
#df$Premature.death.Value <- as.numeric(df$Premature.death.Value) | |
# Change to percentiles | |
df$Median.household.income.Percentile <- round(createPercentiles(df$Median.household.income.Value), 2) | |
# Change to percentiles | |
# We subtract 1 from income and do abs, since we want 100th percentile to be desirable | |
df$Low.birthweight.Percentile <- round(abs(createPercentiles(df$Low.birthweight.Value) - 1), 2) | |
df$Premature.death.Percentile <- round(abs(createPercentiles(df$Premature.death.Value) - 1), 2) | |
# Calculate diff between county income and health outcomes (leads to -1 to 1) | |
df$LBW.PATWI <- round(df$Low.birthweight.Percentile - | |
df$Median.household.income.Percentile, 2) | |
df$Early.Death.PATWI <- round(df$Premature.death.Percentile - | |
df$Median.household.income.Percentile, 2) | |
df <- subset(df, select = c('County','State','Median.household.income.Value','Median.household.income.Percentile','Low.birthweight.Percentile','LBW.PATWI','Premature.death.Percentile','Early.Death.PATWI')) | |
str(df) | |
# Plot choropleth | |
createChoropleth(df, | |
#colToPlot = 'Median.household.income.Value', | |
#colToPlot = 'Premature.death.Value', | |
#colToPlot = 'Low.birthweight.Value', | |
#colToPlot = 'LBWPcntlMinusIncomePcntl', | |
colToPlot = 'EarlyDeathPctlMinusIncomePctl', | |
title = 'Prime Years Lost Compared to Income by County', | |
legend = 'Percentile Diff', | |
numColors = 7, | |
NAReplace = 0) | |
# Find statistical relationships | |
df$Low.birthweight.Value <- imputeColumn(df$Low.birthweight.Value) | |
calculateTargetedCorrelations(df[,c('Median.household.income.Value', | |
'Premature.death.Value', | |
'Low.birthweight.Value')], | |
targetCol = 'Median.household.income.Value') | |
lm(Premature.death.Value ~ Median.household.income.Value, data = df) | |
lm(Low.birthweight.Value ~ Median.household.income.Value, data = df) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment