Skip to content

Instantly share code, notes, and snippets.

@medewitt
Last active November 15, 2019 11:23
Show Gist options
  • Save medewitt/c0fed33553f491763a6e560f3d97d093 to your computer and use it in GitHub Desktop.
Save medewitt/c0fed33553f491763a6e560f3d97d093 to your computer and use it in GitHub Desktop.
looking at a simulation
library(dplyr)
library(glmnet)
library(furrr)
plan("multisession")
run_simulation <- function(n = 100, num_pred = 50,
rho = 0, true_beta= -.1,
error = 0.01){
# Build Prediction Matrix ----
Sigma_star <- matrix(rep(r, num_pred^2),
nrow = num_pred)
diag(Sigma_star) <- 1
X <- MASS::mvrnorm(n = n, mu = rep(0, num_pred),
Sigma = Sigma_star)
X[,1] <- rep(1:10, times = n/10)
pop <- sample(500:2000, n, replace = T)
beta <- c(true_beta, .2, .1, rep(0, num_pred-3))
eta <- rnorm(n, X %*% beta + log(pop), error)
eta_star <- exp(eta)
y <- rpois(n = n, lambda = eta_star)
#glm( y ~ X[,c(1:3)] + offset(log(pop)), family = "poisson")
# Run Lasso ----
log_pop <- log(pop)
log_y <- log(y+1)
new_X <- cbind(X, log_pop)
fit_lasso <- cv.glmnet(x = new_X,
y = log_y,
nfolds = 5)
bestlam <- fit_lasso$lambda.min
lasso_coef <- predict(fit_lasso, type = "coefficients", s = bestlam)
keepers <- lasso_coef %>%
as.matrix() %>%
as.data.frame() %>%
mutate(var_name = rownames(.)) %>%
mutate(keep = ifelse(`1` != 0, TRUE, FALSE)) %>%
mutate(keep = ifelse(var_name == "X.Intercept.", FALSE, keep)) %>%
mutate(keep = ifelse(var_name == "log_pop", FALSE, keep)) %>%
mutate(keep = ifelse(var_name == "treatment" |
var_name == "population", TRUE, keep)) %>%
filter(!var_name %in% c("log_pop", "X.Intercept."))
num_variables <- sum(keepers$keep)
reduced_X <- X[, pull(keepers, keep)]
# Fit GLM ---
fit_glm <-
glm(y ~ reduced_X + offset(log(pop)), family = "poisson")
data.frame(n = n,
rho = rho,
num_pred = num_pred,
true_beta = true_beta,
estimated_beta = coef(fit_glm)[2])
}
library(purrr)
combos <- expand.grid(n = seq(50, 400, by = 50),
rho = seq(0, 1, by = .1),
true_beta = seq(-1, 1, by = .1),
num_pred = c(50, 75, 100)) %>%
as.data.frame()
simulated_results <-rerun(.n = 1000, future_pmap_dfr(.l = list(n = combos$n,
rho = combos$rho,
true_beta = combos$true_beta),
run_simulation))
condensed_results <- simulated_results %>%
bind_rows(.id = "iteration")
condensed_results %>%
readr::write_rds("lasso-simulation-results.rds")
library(dplyr)
library(glmnet)
library(doParallel)
set.seed(42)
registerDoParallel(detectCores()-1)
out <- vector()
for(i in 1:100){
independent_predictors <- 50
correlated_predictors <- 10
n <- 200
r <- 0.8
noise <- 0.01
Sigma_star <- matrix(rep(r, correlated_predictors*correlated_predictors),
nrow = correlated_predictors)
diag(Sigma_star) <- 1
X_ind <- matrix(rnorm(n*independent_predictors),
ncol = independent_predictors)
X_corr <- MASS::mvrnorm(n = n, mu = rep(0, correlated_predictors),
Sigma = Sigma_star)
X_treatment <- rep(seq(1:10), times = n/10)
X_population <- sample(100:10000, size = n, replace = T)
X <- cbind(X_treatment ,X_ind, X_corr, X_population)
colnames(X) <- c("treatment", sprintf("var_%00s", 1:(ncol(X)-2)), "population")
ncol(X)
true_betas <- c(-.1, .2, -.1, .8, 1.2, rep(0,ncol(X)-7),-.1)
dim(X)
length(true_betas)
y_star <- rnorm(n, X[,-ncol(X)] %*% true_betas+ log(X[,ncol(X), drop = TRUE]),
sd = noise)
y <- rpois(n, exp(y_star))
dat <- data.frame(y, X)
plot(y~ population, data = dat); abline(a = 0, b = 1, col = "red")
# Lasso Training Section
x_train_lasso <- dat %>%
select(-y) %>%
mutate(population = log(population))
y_train_lasso <- dat %>%
select(y) %>%
mutate(y = log(y))
fit_lasso <- cv.glmnet(alpha = 1,
x = as.matrix(x_train_lasso),
y = as.matrix(y_train_lasso),
nfolds = 5,
parallel = TRUE)
bestlam <- fit_lasso$lambda.min
lasso_coef <- predict(fit_lasso, type = "coefficients", s = bestlam)
# Pull out the coefficients to keep in the model
keepers <- lasso_coef %>%
as.matrix() %>%
as.data.frame() %>%
mutate(var_name = rownames(.)) %>%
mutate(keep = ifelse(`1` > 0, TRUE, FALSE)) %>%
mutate(keep = ifelse(var_name == "(Intercept)", FALSE, keep)) %>%
mutate(keep = ifelse(var_name == "treatment" | var_name == "population", TRUE, keep)) %>%
filter(keep) %>%
pull(var_name)
dat_for_model <- dat[,c("y", keepers)]
pop_offset <- dat_for_model %>%
select(population) %>%
unlist()
glm_preds <- dat_for_model %>%
select(-population)
fit_glm <- glm(y ~ ., data = glm_preds, offset = log(pop_offset), family = "poisson")
summary(fit_glm)
est_effect <- exp(coef(fit_glm)[2])
out[i] <- est_effect
}
library(dplyr)
set.seed(42)
independent_predictors <- 50
correlated_predictors <- 10
n <- 100
r <- 0.8
noise <- 0.01
Sigma_star <- matrix(rep(r, correlated_predictors*correlated_predictors),
nrow = correlated_predictors)
diag(Sigma_star) <- 1
X_ind <- matrix(rnorm(n*independent_predictors),
ncol = independent_predictors)
X_corr <- MASS::mvrnorm(n = n, mu = rep(0, correlated_predictors),
Sigma = Sigma_star)
X_treatment <- rep(seq(1:10), times = n/10)
X_population <- sample(100:10000, size = n, replace = T)
X <- cbind(X_treatment ,X_ind, X_corr, X_population)
colnames(X) <- c("treatment", sprintf("var_%00s", 1:(ncol(X)-2)), "population")
ncol(X)
true_betas <- c(-.1, .2, -.1, .1, .1, rep(0,ncol(X)-6),-.1)
y_star <- rnorm(n, X[,-ncol(X)] %*% true_betas+ log(X[,ncol(X), drop = TRUE]),
sd = noise)
y <- rpois(n, exp(y_star))
dat <- data.frame(y, X)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment