Last active
November 15, 2019 11:23
-
-
Save medewitt/c0fed33553f491763a6e560f3d97d093 to your computer and use it in GitHub Desktop.
looking at a simulation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
library(glmnet) | |
library(furrr) | |
plan("multisession") | |
run_simulation <- function(n = 100, num_pred = 50, | |
rho = 0, true_beta= -.1, | |
error = 0.01){ | |
# Build Prediction Matrix ---- | |
Sigma_star <- matrix(rep(r, num_pred^2), | |
nrow = num_pred) | |
diag(Sigma_star) <- 1 | |
X <- MASS::mvrnorm(n = n, mu = rep(0, num_pred), | |
Sigma = Sigma_star) | |
X[,1] <- rep(1:10, times = n/10) | |
pop <- sample(500:2000, n, replace = T) | |
beta <- c(true_beta, .2, .1, rep(0, num_pred-3)) | |
eta <- rnorm(n, X %*% beta + log(pop), error) | |
eta_star <- exp(eta) | |
y <- rpois(n = n, lambda = eta_star) | |
#glm( y ~ X[,c(1:3)] + offset(log(pop)), family = "poisson") | |
# Run Lasso ---- | |
log_pop <- log(pop) | |
log_y <- log(y+1) | |
new_X <- cbind(X, log_pop) | |
fit_lasso <- cv.glmnet(x = new_X, | |
y = log_y, | |
nfolds = 5) | |
bestlam <- fit_lasso$lambda.min | |
lasso_coef <- predict(fit_lasso, type = "coefficients", s = bestlam) | |
keepers <- lasso_coef %>% | |
as.matrix() %>% | |
as.data.frame() %>% | |
mutate(var_name = rownames(.)) %>% | |
mutate(keep = ifelse(`1` != 0, TRUE, FALSE)) %>% | |
mutate(keep = ifelse(var_name == "X.Intercept.", FALSE, keep)) %>% | |
mutate(keep = ifelse(var_name == "log_pop", FALSE, keep)) %>% | |
mutate(keep = ifelse(var_name == "treatment" | | |
var_name == "population", TRUE, keep)) %>% | |
filter(!var_name %in% c("log_pop", "X.Intercept.")) | |
num_variables <- sum(keepers$keep) | |
reduced_X <- X[, pull(keepers, keep)] | |
# Fit GLM --- | |
fit_glm <- | |
glm(y ~ reduced_X + offset(log(pop)), family = "poisson") | |
data.frame(n = n, | |
rho = rho, | |
num_pred = num_pred, | |
true_beta = true_beta, | |
estimated_beta = coef(fit_glm)[2]) | |
} | |
library(purrr) | |
combos <- expand.grid(n = seq(50, 400, by = 50), | |
rho = seq(0, 1, by = .1), | |
true_beta = seq(-1, 1, by = .1), | |
num_pred = c(50, 75, 100)) %>% | |
as.data.frame() | |
simulated_results <-rerun(.n = 1000, future_pmap_dfr(.l = list(n = combos$n, | |
rho = combos$rho, | |
true_beta = combos$true_beta), | |
run_simulation)) | |
condensed_results <- simulated_results %>% | |
bind_rows(.id = "iteration") | |
condensed_results %>% | |
readr::write_rds("lasso-simulation-results.rds") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
library(glmnet) | |
library(doParallel) | |
set.seed(42) | |
registerDoParallel(detectCores()-1) | |
out <- vector() | |
for(i in 1:100){ | |
independent_predictors <- 50 | |
correlated_predictors <- 10 | |
n <- 200 | |
r <- 0.8 | |
noise <- 0.01 | |
Sigma_star <- matrix(rep(r, correlated_predictors*correlated_predictors), | |
nrow = correlated_predictors) | |
diag(Sigma_star) <- 1 | |
X_ind <- matrix(rnorm(n*independent_predictors), | |
ncol = independent_predictors) | |
X_corr <- MASS::mvrnorm(n = n, mu = rep(0, correlated_predictors), | |
Sigma = Sigma_star) | |
X_treatment <- rep(seq(1:10), times = n/10) | |
X_population <- sample(100:10000, size = n, replace = T) | |
X <- cbind(X_treatment ,X_ind, X_corr, X_population) | |
colnames(X) <- c("treatment", sprintf("var_%00s", 1:(ncol(X)-2)), "population") | |
ncol(X) | |
true_betas <- c(-.1, .2, -.1, .8, 1.2, rep(0,ncol(X)-7),-.1) | |
dim(X) | |
length(true_betas) | |
y_star <- rnorm(n, X[,-ncol(X)] %*% true_betas+ log(X[,ncol(X), drop = TRUE]), | |
sd = noise) | |
y <- rpois(n, exp(y_star)) | |
dat <- data.frame(y, X) | |
plot(y~ population, data = dat); abline(a = 0, b = 1, col = "red") | |
# Lasso Training Section | |
x_train_lasso <- dat %>% | |
select(-y) %>% | |
mutate(population = log(population)) | |
y_train_lasso <- dat %>% | |
select(y) %>% | |
mutate(y = log(y)) | |
fit_lasso <- cv.glmnet(alpha = 1, | |
x = as.matrix(x_train_lasso), | |
y = as.matrix(y_train_lasso), | |
nfolds = 5, | |
parallel = TRUE) | |
bestlam <- fit_lasso$lambda.min | |
lasso_coef <- predict(fit_lasso, type = "coefficients", s = bestlam) | |
# Pull out the coefficients to keep in the model | |
keepers <- lasso_coef %>% | |
as.matrix() %>% | |
as.data.frame() %>% | |
mutate(var_name = rownames(.)) %>% | |
mutate(keep = ifelse(`1` > 0, TRUE, FALSE)) %>% | |
mutate(keep = ifelse(var_name == "(Intercept)", FALSE, keep)) %>% | |
mutate(keep = ifelse(var_name == "treatment" | var_name == "population", TRUE, keep)) %>% | |
filter(keep) %>% | |
pull(var_name) | |
dat_for_model <- dat[,c("y", keepers)] | |
pop_offset <- dat_for_model %>% | |
select(population) %>% | |
unlist() | |
glm_preds <- dat_for_model %>% | |
select(-population) | |
fit_glm <- glm(y ~ ., data = glm_preds, offset = log(pop_offset), family = "poisson") | |
summary(fit_glm) | |
est_effect <- exp(coef(fit_glm)[2]) | |
out[i] <- est_effect | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(dplyr) | |
set.seed(42) | |
independent_predictors <- 50 | |
correlated_predictors <- 10 | |
n <- 100 | |
r <- 0.8 | |
noise <- 0.01 | |
Sigma_star <- matrix(rep(r, correlated_predictors*correlated_predictors), | |
nrow = correlated_predictors) | |
diag(Sigma_star) <- 1 | |
X_ind <- matrix(rnorm(n*independent_predictors), | |
ncol = independent_predictors) | |
X_corr <- MASS::mvrnorm(n = n, mu = rep(0, correlated_predictors), | |
Sigma = Sigma_star) | |
X_treatment <- rep(seq(1:10), times = n/10) | |
X_population <- sample(100:10000, size = n, replace = T) | |
X <- cbind(X_treatment ,X_ind, X_corr, X_population) | |
colnames(X) <- c("treatment", sprintf("var_%00s", 1:(ncol(X)-2)), "population") | |
ncol(X) | |
true_betas <- c(-.1, .2, -.1, .1, .1, rep(0,ncol(X)-6),-.1) | |
y_star <- rnorm(n, X[,-ncol(X)] %*% true_betas+ log(X[,ncol(X), drop = TRUE]), | |
sd = noise) | |
y <- rpois(n, exp(y_star)) | |
dat <- data.frame(y, X) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment