Created
August 2, 2024 15:58
-
-
Save battenr/0c9b8753a477c493b9a44f59f6135da9 to your computer and use it in GitHub Desktop.
Variable Selection - Based on P-Values vs Based on DAGs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # Title: Variable Selection Based on P-Values vs DAGs | |
| # Description: The purpose of this code is to show how variable selection | |
| # based on p-values can lead to more biased results than using DAGs. | |
| # Setup ---- | |
| library(tidyverse) # ol faithful | |
| # Simulating Data ---- | |
| n = 250 # arbitrary sample size | |
| # Notation for the variables below: | |
| # - X: Coffee | |
| # - Y: Happiness level | |
| # - C1: Number of donuts per week | |
| # - C2: Sunshine | |
| # - C3: Sleep | |
| # - C4: Lifting | |
| # Simulating data with arbitrary parameters | |
| set.seed(456) # setting seed for reproducibility | |
| df <- data.frame( | |
| c1 = rnorm(n = n, mean = 7, sd = 1.5), | |
| c2 = rbinom(n = n, size = 1, prob = 0.65), | |
| c3 = rnorm(n = n, mean = 10, sd = 2) | |
| ) %>% | |
| dplyr::mutate( | |
| trt = rbinom(n = n, size = 1, prob = plogis(0.1*c1 + 0.3*c2 - 0.05*c3)), | |
| c4 = 1.5 + 3*c3 - 2*trt, | |
| y = 0.5*c1 + 0.2*c2 + 0.4*c3 + 0.5*c4 + 2*trt | |
| ) | |
| # Variable Selection Using P-Value ---- | |
| #... Univariate Selection ---- | |
| # Here, testing the association between the outcome and each variable. Not | |
| # testing treatment because that needs to be included regardless. Using a | |
| # cutoff value of 0.20 | |
| # For C1 | |
| unimod <- glm(y ~ c1, | |
| family = stats::gaussian(), | |
| data = df) | |
| broom::tidy(unimod) | |
| # For C2 | |
| unimod <- glm(y ~ c2, | |
| family = stats::gaussian(), | |
| data = df) | |
| broom::tidy(unimod) | |
| # For C3 | |
| unimod <- glm(y ~ c3, | |
| family = stats::gaussian(), | |
| data = df) | |
| broom::tidy(unimod) | |
| # For C4 | |
| unimod <- glm(y ~ c4, | |
| family = stats::gaussian(), | |
| data = df) | |
| broom::tidy(unimod) | |
| #... Multivariable Model ---- | |
| # Based on running the above, I ended up with including c1, c3 and c4. | |
| # Including these variables to see if still significant in multivariable model. | |
| multimod <- glm(y ~ trt + c1 + c3 + c4, | |
| family = stats::gaussian(), | |
| data = df) | |
| broom::tidy(multimod) # all are significant in this example. | |
| # Variable Selection Based on DAG ---- | |
| # Using daggity.net know I need to adjust for c1, c2 and c3 (not c4) | |
| dagmod <- glm(y ~ trt + c1 + c2 + c3, | |
| family = stats::gaussian(), | |
| data = df) | |
| broom::tidy(dagmod) # just to look at results from the model. | |
| # Repeating to Measure Bias ---- | |
| #... Function for Using P-values ---- | |
| # Note: For simplicity using the same variables, but in practice I'd be curious results | |
| # of doing univariate analyses each time to determine the variables to include. | |
| pvaluemod <- function(n){ | |
| # Simulating the data, similar as above. | |
| df <- data.frame( | |
| c1 = rnorm(n = n, mean = 7, sd = 1.5), | |
| c2 = rbinom(n = n, size = 1, prob = 0.65), | |
| c3 = rnorm(n = n, mean = 10, sd = 2) | |
| ) %>% | |
| dplyr::mutate( | |
| trt = rbinom(n = n, size = 1, prob = plogis(0.1*c1 + 0.3*c2 + 0.05*c3)), | |
| c4 = 3*c3 + 2*trt, | |
| y = 0.05*c1 + 0.2*c2 + 0.04*c3 + 0.03*c4 + 2*trt + rnorm(n = n) | |
| ) | |
| # Fitting a GLM with a Gaussian distribution for the outcome and identity link | |
| mod <- glm(y ~ trt + c1 + c3 + c4, | |
| family = gaussian(), | |
| data = df) | |
| # Extracting the estimate | |
| estimate = mod$coefficients[2] | |
| # Calcuating the bias as estimated effect - actual effect (2 from the above, 2*trt) | |
| bias = estimate - 2 | |
| return(bias) | |
| } | |
| #... Function for using DAGs ---- | |
| # See above function for comments. Very similar here only different variables are included. | |
| dagmod <- function(n){ | |
| df <- data.frame( | |
| c1 = rnorm(n = n, mean = 7, sd = 1.5), | |
| c2 = rbinom(n = n, size = 1, prob = 0.65), | |
| c3 = rnorm(n = n, mean = 10, sd = 2) | |
| ) %>% | |
| dplyr::mutate( | |
| trt = rbinom(n = n, size = 1, prob = plogis(0.1*c1 + 0.3*c2 + 0.05*c3)), | |
| c4 = 3*c3 + 2*trt, | |
| y = 0.05*c1 + 0.2*c2 + 0.04*c3 + 0.03*c4 + 2*trt + rnorm(n = n) | |
| ) | |
| mod <- glm(y ~ trt + c1 + c2 + c3, | |
| family = gaussian(), | |
| data = df) | |
| estimate = mod$coefficients[2] | |
| bias = estimate - 2 | |
| return(bias) | |
| } | |
| #... Repeating 1000 times! ---- | |
| n.sim = 250 # sample size of 250 is arbitrary. | |
| # Based on variables from p-values | |
| pvalue_bias <- replicate(1000, pvaluemod(n = n.sim), simplify = FALSE) | |
| pvalue_bias <- do.call(rbind, pvalue_bias) # reformatting | |
| # Based on variables from a DAG | |
| dag_bias <- replicate(1000, dagmod(n = n.sim), simplify = FALSE) | |
| dag_bias <- do.call(rbind, dag_bias) # reformatting | |
| # Combining Results Into a Dataframe ---- | |
| # This code is messy but it's basically using the results to combine into a dataframe | |
| # then calculating the value _num which will be used in the monte carlo estimate of SE for the | |
| # bias. | |
| df_bias <- data.frame( | |
| dag = (dag_bias %>% as.data.frame())$trt, | |
| pvalue = (pvalue_bias %>% as.data.frame())$trt | |
| ) %>% | |
| dplyr::mutate( | |
| dag_num = (dag - mean(dag))^2, | |
| pvalue_num = (pvalue - mean(pvalue))^2 | |
| # _squared wasn't used in final version but could be used if you want to look at MSE. | |
| # dag_squared = ((dag-2)^2)/n.sim, | |
| # pvalue_squared = ((pvalue-2)^2)/n.sim | |
| ) | |
| # Combining final results. Again, this is messy and should be fixed. Basically just | |
| # calculating the final results (mean of bias and monte carlo se of bias) | |
| bias_results <- data.frame( | |
| bias = c(mean(df_bias$dag), mean(df_bias$pvalue)), | |
| mc_se = c(sqrt(sum(df_bias$dag_num)/(n.sim*(n.sim-1))), | |
| sqrt(sum(df_bias$pvalue_num)/(n.sim*(n.sim-1)))), | |
| method = c("DAG", "P-Value") | |
| ) | |
| # Plots! ---- | |
| # Who doesn't love a good plot? | |
| #... Reformatting Data ---- | |
| # Reformatting data to plot. | |
| df_plot <- df_bias %>% | |
| pivot_longer( | |
| cols = c(dag, pvalue), | |
| names_to = "method", | |
| values_to = "bias" | |
| ) | |
| #... Actual Plot! ---- | |
| ggplot(df_plot, aes(x = method, y = bias, fill = method)) + | |
| # Using boxplot. Typically I don't like using boxplots but this is for linkedin/teaching | |
| # so not a big deal. In general recommend a different method to show the distribution like | |
| # density or ggridges package, etc. | |
| geom_boxplot() + | |
| # Adding labels and title/subtitle. | |
| labs(title = "Bias in Treatment Effect Estimates using Variable Selection", | |
| subtitle = "Selection based on P-Values vs. Selection based on DAG", | |
| y = "Bias", | |
| x = "Variable Selection Method") + | |
| # Making theme minimal as a basic starting point. | |
| theme_minimal() + | |
| # Renaming the label | |
| scale_x_discrete(labels = c(dag = "DAG", pvalue = "P-Value")) + | |
| # Reformatting theme to have bold title text and a larger font size. | |
| theme( | |
| plot.title = element_text(hjust = 0.5, face = "bold"), | |
| plot.subtitle = element_text(hjust = 0.5, face = "bold"), | |
| text = element_text(size = 20), | |
| legend.position = "none" | |
| ) + | |
| # Changing Y scale. | |
| scale_y_continuous(limits = c(-0.7, 0.7), | |
| breaks = c(-0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment