ctesta01 · January 18, 2024 21:02
diff --git a/id529_review_student_choice.R b/id529_review_student_choice.R
 # Let's Review

 # We circulated a survey about what would be the most useful material for us to
 # review and revisit, as well as if there are any questions that would be useful
 # for us to go over together.

 # Overall topics to try to cover: 
 # 
 #   - dplyr and data manipulation 
 #   - color in ggplot2 
 #   - pseudocoding
 #   - time since last recession
 #   - logistic regression
 #   - chi squared test

 # dependencies ------------------------------------------------------------

 library(ID529data)
 library(tidyverse)
 library(gtsummary)
 library(magrittr)

 # dplyr and data manipulation ---------------------------------------------

 # Most requested topic: dplyr and data manipulation 
 # 
 # Remember you can load dplyr with either library(dplyr) or just load the 
 # whole tidyverse like I've done using library(tidyverse)
 # 
 # Let's think of what are some of the most important tasks using dplyr and data 
 # manipulation: 
 # 
 #   - Making a new column out of information from other columns
 #     - Cleaning a single column 
 #     - case_when 
 #     - if_else 
 #   - Creating summaries
 #   - Filtering data 
 #   - Joining data
 # 
 # Remember you can access the dplyr cheat sheet here: 
 # https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf 

 View(nhanes_id529) # typically I don't write View() into a script and only run it on the console

 # dplyr::mutate 
 # 
 # Cleaning a single column: 
 # Sometimes we want to create a TRUE/FALSE variable out of a numeric variable:

 nhanes_id529 |> 
  mutate(age_over_45 = age > 45) |> 
  select(id, age, age_over_45) |> 
  View()

 # Or clean string text using functions from the stringr package: 

 nhanes_id529 |> 
  mutate(hispanic = stringr::str_detect(race_ethnicity, pattern = "^Hispanic")) |> 
  select(id, race_ethnicity, hispanic) |> 
  View()
  
 # Using case_when in a mutate: 
 # If we want to create multiple levels to the new column beyond just TRUE/FALSE, 
 # we could use a case_when: 

 nhanes_id529 |> 
  mutate(age_category = 
           case_when(
             age < 25 ~ "Age: [12, 25)",
             age < 50 ~ "Age: [25, 50)",
             age < 75 ~ "Age: [50, 75)",
             age >= 75 ~ "Age: [75, 80]")) |>
  select(id, age, age_category) |> 
  View()

 # Now is a good time to mention that if I wanted to make any of these updates 
 # to the nhanes_id529 object, I'd have to write them back to the object like so:

 nhanes_id529 <- nhanes_id529 |> 
  mutate(age_category = 
           case_when(
             age < 25 ~ "Age: [12, 25)",
             age < 50 ~ "Age: [25, 50)",
             age < 75 ~ "Age: [50, 75)",
             age >= 75 ~ "Age: [75, 80]"))

 # if_else takes 3 arguments: 
 #   - A conditional value (can be a vector) (TRUE/FALSE)
 #   - A return value for when the conditional value is TRUE (can be a vector)
 #   - A return value for when the conditional value is FALSE (also can be a vector). 
 # 
 # dplyr::if_else has the following advantages over the ifelse in base R: 
 #   - if_else checks that the two return values have the same type 
 #   - it has a 'missing' argument for handling NA values 
 #   - it doesn't automatically convert Dates to numeric type 
 # 
 # here's two examples, one not vectorized and one vectorized: 
 if_else(15 > 5, 'yes', 'no')

 if_else(c(TRUE, FALSE, TRUE), 
        c('yes', 'affirmative', 'si'), 
        c('no', 'negative', 'incorrect'))

 # when do we use if_else in a mutate? 
 # 
 # suppose we were trying to improve a plot by controlling when we plot a label 
 # for each point. 
 # 
 # what we might do is construct a column with an indicator for whether or not 
 # to plot the label and then use an if_else to construct the string for the 
 # label to be plotted. 

 df <- data.frame(
  x = c(1,2,3,4),
  y = c(5,6,7,8),
  lgl_plot_label = c(TRUE, FALSE, FALSE, TRUE),
  name = c('observation 1', 'observation 2', 'observation 3', 'observation 4'))

 # suppose I had already created the indicator column lgl_plot_label: 
 # 
 # now I can use if_else to create the label for some points:
 df <- df |> mutate(
  label = if_else(lgl_plot_label,
                  # create the string "(x,y)" rounded to 0 digits past the decimal
                  name,
                  ''
                  ))

 ggplot(df, aes(x, y, label = label)) + 
  geom_point() + 
  geom_text(mapping = aes(x = x+.25))


 # creating summaries ------------------------------------------------------

 # one of the most useful functions in dplyr is summarize. it let's you 
 # condense a lot of observations down into summary statistics. 

 # you can actually use summarize without group_by to just get overall summary 
 # statistics on whatever columns you want: 

 # for example, to get the mean age in a dataset:
 nhanes_id529 |> summarize(mean_age = mean(age))

 # or blood pressure across an entire dataset:
 # remember, mean_BP is the individual's mean of probably 3 blood pressure
 # measurements
 nhanes_id529 |> summarize(mean_bp = mean(mean_BP, na.rm = TRUE))

 # but if I want to get those summary statistics broken down by 
 # some other category, then I want to use group_by() |> summarize() 

 nhanes_id529 |> 
  group_by(age_category) |> 
  summarize(mean_bp = mean(mean_BP, na.rm=TRUE))

 # I can use multiple categories to group_by if I want to:

 nhanes_id529 |> 
  group_by(sex_gender, age_category) |> 
  summarize(mean_bp = mean(mean_BP, na.rm=TRUE))

 # I can also have multiple summary statistics

 nhanes_id529 |> 
  group_by(sex_gender, age_category) |>
  summarize(
    count = n(), # n() counts the number of observations / rows in the category 
    mean_bp = mean(mean_BP, na.rm = TRUE),
    mean_PFAS_total = mean(PFAS_total, na.rm=TRUE), # sd calculates the standard deviation
    sd_bp = sd(mean_BP, na.rm = TRUE),
    sd_PFAS_total = sd(PFAS_total, na.rm = TRUE))


 # filtering data ----------------------------------------------------------

 # sometimes we want to filter our data, like if we ran a survey-based study
 # and wanted to impose strict study-inclusion criteria, like we only wanted
 # to study those aged 25-74. 
 # 
 # we can use a filter to restrict our data 

 nhanes_id529 |> 
  filter(age >= 25, age <= 74) |> 
  View()

 # we could overwrite the data in our R session if we wanted to with something
 # like this:
 # 
 # nhanes_id529 <- nhanes_id529 |> 
 #   filter(age > 25, age <= 74) |> 
 #     head()
 nhanes_id529 %<>% filter(age >= 25, age <= 74)



 # left join  --------------------------------------------------------------

 new_labels <- data.frame(
  age_category = c("Age: [12, 25)",
  "Age: [25, 50)",
  "Age: [50, 75)",
  "Age: [75, 80]"),
  new_label = c('Young Adult', 'Adult', 'Older', 'Oldest'))


 nhanes_id529 <- nhanes_id529 |> 
  left_join(new_labels, by = c('age_category' = 'age_category')) |> 
  select(id, age, age_category, new_label) |> 
  View()


 # color in ggplot2 --------------------------------------------------------

 ggplot(nhanes_id529, aes(x = age, y = mean_BP, 
                         color = sex_gender)) + 
  geom_point(alpha = .5) + 
  scale_color_manual(
    values = c(
      'Female' = '#00b894', 
      'Male' = '#fd79a8'
    )
  )

 nhanes_id529 |> 
  filter(PFAS_total < 200) |> 
  ggplot(
       aes(x = mean_BP,
           y = PFAS_total,
           color = age)) + 
  geom_point() + 
  scale_color_distiller(
    palette = 'BrBG',
    direction = 1)

 # pseudocoding ------------------------------------------------------------

 # pseudocode for the COVID OSHA project 
 # 
 # https://github.com/ctesta01/covid_osha 
 # 
 # 0. dependencies 
 # 
 # 1. load_data
 #     covid <- load_covid_data()
 #     osha <- load_osha_data()
 # 
 # 2. clean data 
 #     covid <- clean_covid(covid)
 #     osha <- clean_osha(osha)
 # 
 # 3. draft function for regional plot 
 # 
 # create_regional_plot <- function(
 #     region,
 #     covid,
 #     osha
 #     ) {
 #   inset_plot <- create_inset_map(region)
 #   
 #   # ggplot with two axes 
 #   osha_and_covid_plot <- ggplot() + 
 #     geom_bar(osha, ...) +  # stacked bar plot by industry 
 #     geom_line() + 
 #     scale_y_continuous( ... # configure for a second y axis)
 # 
 #   # combine osha and covid plot with the inset 
 #   osha_and_covid_plot_w_inset <- ... 
 # } 
 # 
 # 4. create 5-panel figure 
 #     cowplot::plot_grid( # not sure how to do layout yet 
 #       create_regional_plot('national', covid, osha),
 #       create_regional_plot('midwest', covid, osha),
 #       create_regional_plot('south', covid, osha),
 #       create_regional_plot('northeast', covid, osha),
 #       create_regional_plot('west', covid, osha)
 #  )
 #      

 # time since last recession -----------------------------------------------

 # get time since the most recent recession 

 recessions <- c(
  1929,
  1937,
  1945,
  1949,
  1953,
  1958,
  1960,
  1969,
  1973,
  1980,
  1981,
  1990,
  2000,
  2008)

 observation_years <- c(
  2011, 
  2021, 
  2005,
  2001,
  2004,
  2005)

 time_since_most_recent_recession <- function(
    recessions, 
    observation_years) {
  
  purrr::map_dbl(observation_years,
                 ~ . - max(recessions[recessions <= .]))
 }

 # example working through intermediate steps:
 purrr::map(observation_years,
               ~ . - max(recessions[recessions <= .]))

 time_since_most_recent_recession(
  recessions,
  observation_years)


 # logistic regression -----------------------------------------------------

 # let's do a logistic regression on stage two or higher elevated
 # blood pressure 

 nhanes_id529 <- nhanes_id529 |> 
  mutate(high_blood_pressure = mean_BP >= 140)

 logistic_model <- glm(
  high_blood_pressure ~ age + sex_gender + 
    race_ethnicity, 
  family = binomial(link = 'logit'),
  data = nhanes_id529)

 gtsummary::tbl_regression(logistic_model, exponentiate = TRUE)

 vip::vip(logistic_model)

 # chi squared test --------------------------------------------------------

 observations <- matrix(
  c(134, 155, # treated (success, failure)
    133, 195), # untreated (success, failure)
  byrow = TRUE,
  ncol = 2)

 colnames(observations) <- c('success', 'failure')
 rownames(observations) <- c('treated', 'untreated')

 observations

 chisq.test(observations)
	# Let's Review

	# We circulated a survey about what would be the most useful material for us to
	# review and revisit, as well as if there are any questions that would be useful
	# for us to go over together.

	# Overall topics to try to cover:
	#
	# - dplyr and data manipulation
	# - color in ggplot2
	# - pseudocoding
	# - time since last recession
	# - logistic regression
	# - chi squared test

	# dependencies ------------------------------------------------------------

	library(ID529data)
	library(tidyverse)
	library(gtsummary)
	library(magrittr)

	# dplyr and data manipulation ---------------------------------------------

	# Most requested topic: dplyr and data manipulation
	#
	# Remember you can load dplyr with either library(dplyr) or just load the
	# whole tidyverse like I've done using library(tidyverse)
	#
	# Let's think of what are some of the most important tasks using dplyr and data
	# manipulation:
	#
	# - Making a new column out of information from other columns
	# - Cleaning a single column
	# - case_when
	# - if_else
	# - Creating summaries
	# - Filtering data
	# - Joining data
	#
	# Remember you can access the dplyr cheat sheet here:
	# https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf

	View(nhanes_id529) # typically I don't write View() into a script and only run it on the console

	# dplyr::mutate
	#
	# Cleaning a single column:
	# Sometimes we want to create a TRUE/FALSE variable out of a numeric variable:

	nhanes_id529 \|>
	mutate(age_over_45 = age > 45) \|>
	select(id, age, age_over_45) \|>
	View()

	# Or clean string text using functions from the stringr package:

	nhanes_id529 \|>
	mutate(hispanic = stringr::str_detect(race_ethnicity, pattern = "^Hispanic")) \|>
	select(id, race_ethnicity, hispanic) \|>
	View()

	# Using case_when in a mutate:
	# If we want to create multiple levels to the new column beyond just TRUE/FALSE,
	# we could use a case_when:

	nhanes_id529 \|>
	mutate(age_category =
	case_when(
	age < 25 ~ "Age: [12, 25)",
	age < 50 ~ "Age: [25, 50)",
	age < 75 ~ "Age: [50, 75)",
	age >= 75 ~ "Age: [75, 80]")) \|>
	select(id, age, age_category) \|>
	View()

	# Now is a good time to mention that if I wanted to make any of these updates
	# to the nhanes_id529 object, I'd have to write them back to the object like so:

	nhanes_id529 <- nhanes_id529 \|>
	mutate(age_category =
	case_when(
	age < 25 ~ "Age: [12, 25)",
	age < 50 ~ "Age: [25, 50)",
	age < 75 ~ "Age: [50, 75)",
	age >= 75 ~ "Age: [75, 80]"))

	# if_else takes 3 arguments:
	# - A conditional value (can be a vector) (TRUE/FALSE)
	# - A return value for when the conditional value is TRUE (can be a vector)
	# - A return value for when the conditional value is FALSE (also can be a vector).
	#
	# dplyr::if_else has the following advantages over the ifelse in base R:
	# - if_else checks that the two return values have the same type
	# - it has a 'missing' argument for handling NA values
	# - it doesn't automatically convert Dates to numeric type
	#
	# here's two examples, one not vectorized and one vectorized:
	if_else(15 > 5, 'yes', 'no')

	if_else(c(TRUE, FALSE, TRUE),
	c('yes', 'affirmative', 'si'),
	c('no', 'negative', 'incorrect'))

	# when do we use if_else in a mutate?
	#
	# suppose we were trying to improve a plot by controlling when we plot a label
	# for each point.
	#
	# what we might do is construct a column with an indicator for whether or not
	# to plot the label and then use an if_else to construct the string for the
	# label to be plotted.

	df <- data.frame(
	x = c(1,2,3,4),
	y = c(5,6,7,8),
	lgl_plot_label = c(TRUE, FALSE, FALSE, TRUE),
	name = c('observation 1', 'observation 2', 'observation 3', 'observation 4'))

	# suppose I had already created the indicator column lgl_plot_label:
	#
	# now I can use if_else to create the label for some points:
	df <- df \|> mutate(
	label = if_else(lgl_plot_label,
	# create the string "(x,y)" rounded to 0 digits past the decimal
	name,
	''
	))

	ggplot(df, aes(x, y, label = label)) +
	geom_point() +
	geom_text(mapping = aes(x = x+.25))


	# creating summaries ------------------------------------------------------

	# one of the most useful functions in dplyr is summarize. it let's you
	# condense a lot of observations down into summary statistics.

	# you can actually use summarize without group_by to just get overall summary
	# statistics on whatever columns you want:

	# for example, to get the mean age in a dataset:
	nhanes_id529 \|> summarize(mean_age = mean(age))

	# or blood pressure across an entire dataset:
	# remember, mean_BP is the individual's mean of probably 3 blood pressure
	# measurements
	nhanes_id529 \|> summarize(mean_bp = mean(mean_BP, na.rm = TRUE))

	# but if I want to get those summary statistics broken down by
	# some other category, then I want to use group_by() \|> summarize()

	nhanes_id529 \|>
	group_by(age_category) \|>
	summarize(mean_bp = mean(mean_BP, na.rm=TRUE))

	# I can use multiple categories to group_by if I want to:

	nhanes_id529 \|>
	group_by(sex_gender, age_category) \|>
	summarize(mean_bp = mean(mean_BP, na.rm=TRUE))

	# I can also have multiple summary statistics

	nhanes_id529 \|>
	group_by(sex_gender, age_category) \|>
	summarize(
	count = n(), # n() counts the number of observations / rows in the category
	mean_bp = mean(mean_BP, na.rm = TRUE),
	mean_PFAS_total = mean(PFAS_total, na.rm=TRUE), # sd calculates the standard deviation
	sd_bp = sd(mean_BP, na.rm = TRUE),
	sd_PFAS_total = sd(PFAS_total, na.rm = TRUE))


	# filtering data ----------------------------------------------------------

	# sometimes we want to filter our data, like if we ran a survey-based study
	# and wanted to impose strict study-inclusion criteria, like we only wanted
	# to study those aged 25-74.
	#
	# we can use a filter to restrict our data

	nhanes_id529 \|>
	filter(age >= 25, age <= 74) \|>
	View()

	# we could overwrite the data in our R session if we wanted to with something
	# like this:
	#
	# nhanes_id529 <- nhanes_id529 \|>
	# filter(age > 25, age <= 74) \|>
	# head()
	nhanes_id529 %<>% filter(age >= 25, age <= 74)



	# left join --------------------------------------------------------------

	new_labels <- data.frame(
	age_category = c("Age: [12, 25)",
	"Age: [25, 50)",
	"Age: [50, 75)",
	"Age: [75, 80]"),
	new_label = c('Young Adult', 'Adult', 'Older', 'Oldest'))


	nhanes_id529 <- nhanes_id529 \|>
	left_join(new_labels, by = c('age_category' = 'age_category')) \|>
	select(id, age, age_category, new_label) \|>
	View()


	# color in ggplot2 --------------------------------------------------------

	ggplot(nhanes_id529, aes(x = age, y = mean_BP,
	color = sex_gender)) +
	geom_point(alpha = .5) +
	scale_color_manual(
	values = c(
	'Female' = '#00b894',
	'Male' = '#fd79a8'
	)
	)

	nhanes_id529 \|>
	filter(PFAS_total < 200) \|>
	ggplot(
	aes(x = mean_BP,
	y = PFAS_total,
	color = age)) +
	geom_point() +
	scale_color_distiller(
	palette = 'BrBG',
	direction = 1)

	# pseudocoding ------------------------------------------------------------

	# pseudocode for the COVID OSHA project
	#
	# https://github.com/ctesta01/covid_osha
	#
	# 0. dependencies
	#
	# 1. load_data
	# covid <- load_covid_data()
	# osha <- load_osha_data()
	#
	# 2. clean data
	# covid <- clean_covid(covid)
	# osha <- clean_osha(osha)
	#
	# 3. draft function for regional plot
	#
	# create_regional_plot <- function(
	# region,
	# covid,
	# osha
	# ) {
	# inset_plot <- create_inset_map(region)
	#
	# # ggplot with two axes
	# osha_and_covid_plot <- ggplot() +
	# geom_bar(osha, ...) + # stacked bar plot by industry
	# geom_line() +
	# scale_y_continuous( ... # configure for a second y axis)
	#
	# # combine osha and covid plot with the inset
	# osha_and_covid_plot_w_inset <- ...
	# }
	#
	# 4. create 5-panel figure
	# cowplot::plot_grid( # not sure how to do layout yet
	# create_regional_plot('national', covid, osha),
	# create_regional_plot('midwest', covid, osha),
	# create_regional_plot('south', covid, osha),
	# create_regional_plot('northeast', covid, osha),
	# create_regional_plot('west', covid, osha)
	# )
	#

	# time since last recession -----------------------------------------------

	# get time since the most recent recession

	recessions <- c(
	1929,
	1937,
	1945,
	1949,
	1953,
	1958,
	1960,
	1969,
	1973,
	1980,
	1981,
	1990,
	2000,
	2008)

	observation_years <- c(
	2011,
	2021,
	2005,
	2001,
	2004,
	2005)

	time_since_most_recent_recession <- function(
	recessions,
	observation_years) {

	purrr::map_dbl(observation_years,
	~ . - max(recessions[recessions <= .]))
	}

	# example working through intermediate steps:
	purrr::map(observation_years,
	~ . - max(recessions[recessions <= .]))

	time_since_most_recent_recession(
	recessions,
	observation_years)


	# logistic regression -----------------------------------------------------

	# let's do a logistic regression on stage two or higher elevated
	# blood pressure

	nhanes_id529 <- nhanes_id529 \|>
	mutate(high_blood_pressure = mean_BP >= 140)

	logistic_model <- glm(
	high_blood_pressure ~ age + sex_gender +
	race_ethnicity,
	family = binomial(link = 'logit'),
	data = nhanes_id529)

	gtsummary::tbl_regression(logistic_model, exponentiate = TRUE)

	vip::vip(logistic_model)

	# chi squared test --------------------------------------------------------

	observations <- matrix(
	c(134, 155, # treated (success, failure)
	133, 195), # untreated (success, failure)
	byrow = TRUE,
	ncol = 2)

	colnames(observations) <- c('success', 'failure')
	rownames(observations) <- c('treated', 'untreated')

	observations

	chisq.test(observations)
No results found