levithatcher · January 25, 2019 21:15
diff --git a/CountyPublicHealthChoropleth.R b/CountyPublicHealthChoropleth.R
 removeCommasInNumber <- function(column) {
  column <- as.numeric(gsub(",", "", column))
  column
 }

 createChoropleth <- function(df, 
                             colToPlot, 
                             title,
                             legend,
                             numColors=1, 
                             NAReplace=NULL) {
  
  library(choroplethr)
  library(choroplethrMaps)
  library(ggplot2)
  
  # Remove commas from column of interest
  if (!is.numeric(df[[colToPlot]])) {
    df[[colToPlot]] <- removeCommasInNumber(df[[colToPlot]])
  }
  
  # Remove state rows from dataset
  df <- subset(df, COUNTYCODE != 0)
  
  df$STATECODE <- as.integer(df$STATECODE)
  df$COUNTYCODE <- as.integer(df$COUNTYCODE)
  
  # Pad county digits
  df$COUNTYCODE <- sprintf("%03d", df$COUNTYCODE)
  
  # Concatenate and create FIPS
  df$FIPSCODE <- as.numeric(paste0(df$STATECODE,df$COUNTYCODE))
  
  # Reduce dataset and rename cols for county_choropleth func
  df <- subset(df, select = c('FIPSCODE', colToPlot))
  colnames(df) <- c("region","value")
  
  print("NA count:")
  print(count(is.na(df["value"])))
  
  # Fill NA cells with something (so choropleth works)
  if (!is.null(NAReplace)) {
    df["value"][is.na(df["value"])] <- NAReplace
  }
  
  str(df)
  
  # Plot data on a US map
  county_choropleth(df, num_colors = numColors, legend = legend) +
    ggtitle(title) +
    theme(plot.title = element_text(hjust = 0.5))
 }

 createPercentiles <- function(x) {
  (x - min(x, na.rm = TRUE))/(max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
 }


 library(healthcareai)

 # Start analysis -- read in data
 # Data comes from here: http://www.countyhealthrankings.org/rankings/data
 df <- read.csv('2015 CHR Analytic Data.csv')

 # Remove state-summary rows
 df <- subset(df, COUNTYCODE != 0)

 # Prepare columns
 # Convert to percentage when plotting its choropleth
 #df$Low.birthweight.Value <- df$Low.birthweight.Value * 100

 df$Median.household.income.Value <- removeCommasInNumber(df$Median.household.income.Value)
 df$Premature.death.Value <- removeCommasInNumber(df$Premature.death.Value)

 # Change from factors to numerics
 #df$Median.household.income.Value <- as.numeric(df$Median.household.income.Value)
 #df$Premature.death.Value <- as.numeric(df$Premature.death.Value)

 # Change to percentiles
 df$Median.household.income.Percentile <- round(createPercentiles(df$Median.household.income.Value), 2)

 # Change to percentiles
 # We subtract 1 from income and do abs, since we want 100th percentile to be desirable
 df$Low.birthweight.Percentile <- round(abs(createPercentiles(df$Low.birthweight.Value) - 1), 2)
 df$Premature.death.Percentile <- round(abs(createPercentiles(df$Premature.death.Value) - 1), 2)

 # Calculate diff between county income and health outcomes (leads to -1 to 1)
 df$LBW.PATWI <- round(df$Low.birthweight.Percentile - 
                                       df$Median.household.income.Percentile, 2)

 df$Early.Death.PATWI <- round(df$Premature.death.Percentile - 
                                            df$Median.household.income.Percentile, 2)

 df <- subset(df, select = c('County','State','Median.household.income.Value','Median.household.income.Percentile','Low.birthweight.Percentile','LBW.PATWI','Premature.death.Percentile','Early.Death.PATWI'))

 str(df)

 # Plot choropleth
 createChoropleth(df,
                 #colToPlot = 'Median.household.income.Value',
                 #colToPlot = 'Premature.death.Value', 
                 #colToPlot = 'Low.birthweight.Value',
                 #colToPlot = 'LBWPcntlMinusIncomePcntl',
                 colToPlot = 'EarlyDeathPctlMinusIncomePctl',
                 title = 'Prime Years Lost Compared to Income by County',
                 legend = 'Percentile Diff',
                 numColors = 7,
                 NAReplace = 0)

 # Find statistical relationships
 df$Low.birthweight.Value <- imputeColumn(df$Low.birthweight.Value)

 calculateTargetedCorrelations(df[,c('Median.household.income.Value',
                                    'Premature.death.Value',
                                    'Low.birthweight.Value')],
                              targetCol = 'Median.household.income.Value')

 lm(Premature.death.Value ~ Median.household.income.Value, data = df)

 lm(Low.birthweight.Value ~ Median.household.income.Value, data = df)
	removeCommasInNumber <- function(column) {
	column <- as.numeric(gsub(",", "", column))
	column
	}

	createChoropleth <- function(df,
	colToPlot,
	title,
	legend,
	numColors=1,
	NAReplace=NULL) {

	library(choroplethr)
	library(choroplethrMaps)
	library(ggplot2)

	# Remove commas from column of interest
	if (!is.numeric(df[[colToPlot]])) {
	df[[colToPlot]] <- removeCommasInNumber(df[[colToPlot]])
	}

	# Remove state rows from dataset
	df <- subset(df, COUNTYCODE != 0)

	df$STATECODE <- as.integer(df$STATECODE)
	df$COUNTYCODE <- as.integer(df$COUNTYCODE)

	# Pad county digits
	df$COUNTYCODE <- sprintf("%03d", df$COUNTYCODE)

	# Concatenate and create FIPS
	df$FIPSCODE <- as.numeric(paste0(df$STATECODE,df$COUNTYCODE))

	# Reduce dataset and rename cols for county_choropleth func
	df <- subset(df, select = c('FIPSCODE', colToPlot))
	colnames(df) <- c("region","value")

	print("NA count:")
	print(count(is.na(df["value"])))

	# Fill NA cells with something (so choropleth works)
	if (!is.null(NAReplace)) {
	df["value"][is.na(df["value"])] <- NAReplace
	}

	str(df)

	# Plot data on a US map
	county_choropleth(df, num_colors = numColors, legend = legend) +
	ggtitle(title) +
	theme(plot.title = element_text(hjust = 0.5))
	}

	createPercentiles <- function(x) {
	(x - min(x, na.rm = TRUE))/(max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
	}


	library(healthcareai)

	# Start analysis -- read in data
	# Data comes from here: http://www.countyhealthrankings.org/rankings/data
	df <- read.csv('2015 CHR Analytic Data.csv')

	# Remove state-summary rows
	df <- subset(df, COUNTYCODE != 0)

	# Prepare columns
	# Convert to percentage when plotting its choropleth
	#df$Low.birthweight.Value <- df$Low.birthweight.Value * 100

	df$Median.household.income.Value <- removeCommasInNumber(df$Median.household.income.Value)
	df$Premature.death.Value <- removeCommasInNumber(df$Premature.death.Value)

	# Change from factors to numerics
	#df$Median.household.income.Value <- as.numeric(df$Median.household.income.Value)
	#df$Premature.death.Value <- as.numeric(df$Premature.death.Value)

	# Change to percentiles
	df$Median.household.income.Percentile <- round(createPercentiles(df$Median.household.income.Value), 2)

	# Change to percentiles
	# We subtract 1 from income and do abs, since we want 100th percentile to be desirable
	df$Low.birthweight.Percentile <- round(abs(createPercentiles(df$Low.birthweight.Value) - 1), 2)
	df$Premature.death.Percentile <- round(abs(createPercentiles(df$Premature.death.Value) - 1), 2)

	# Calculate diff between county income and health outcomes (leads to -1 to 1)
	df$LBW.PATWI <- round(df$Low.birthweight.Percentile -
	df$Median.household.income.Percentile, 2)

	df$Early.Death.PATWI <- round(df$Premature.death.Percentile -
	df$Median.household.income.Percentile, 2)

	df <- subset(df, select = c('County','State','Median.household.income.Value','Median.household.income.Percentile','Low.birthweight.Percentile','LBW.PATWI','Premature.death.Percentile','Early.Death.PATWI'))

	str(df)

	# Plot choropleth
	createChoropleth(df,
	#colToPlot = 'Median.household.income.Value',
	#colToPlot = 'Premature.death.Value',
	#colToPlot = 'Low.birthweight.Value',
	#colToPlot = 'LBWPcntlMinusIncomePcntl',
	colToPlot = 'EarlyDeathPctlMinusIncomePctl',
	title = 'Prime Years Lost Compared to Income by County',
	legend = 'Percentile Diff',
	numColors = 7,
	NAReplace = 0)

	# Find statistical relationships
	df$Low.birthweight.Value <- imputeColumn(df$Low.birthweight.Value)

	calculateTargetedCorrelations(df[,c('Median.household.income.Value',
	'Premature.death.Value',
	'Low.birthweight.Value')],
	targetCol = 'Median.household.income.Value')

	lm(Premature.death.Value ~ Median.household.income.Value, data = df)

	lm(Low.birthweight.Value ~ Median.household.income.Value, data = df)