Last active
February 24, 2020 22:45
-
-
Save BioSciEconomist/6e76fb7333d6d35ae9ba10b3b72898db to your computer and use it in GitHub Desktop.
Power calculations based on the gamma distribution for positive claims costs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# *----------------------------------------------------------------- | |
# | PROGRAM NAME: Gamma Power Calculator 26Nov19.R | |
# | DATE: 11/20/19 | |
# | CREATED BY: MATT BOGARD | |
# | PROJECT FILE: | |
# *---------------------------------------------------------------- | |
# | PURPOSE: compare conventional vs nGLM power calculations to estimate power and sample size for tests involving costs | |
# *---------------------------------------------------------------- | |
# based on: Cundill, Bonnie & Alexander, Neal. (2015). Sample size calculations for skewed distributions. | |
# BMC medical research methodology. 15. 28. 10.1186/s12874-015-0023-0. | |
# first run nGLM.R to load nGLM function for power calculations based on the gamma model | |
# rm(list=ls()) # get rid of any existing data if necessary | |
options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation | |
library(MASS) # for estimating GLM models | |
library(dplyr) # for data manipulation | |
#------------------------------------------------ | |
# determine values of kappa for gamma power calculation using historical data | |
#------------------------------------------------ | |
### this is assuming you have baseline or historical reference data | |
df1 <- read.csv('demo_dat_claims.csv') | |
names(df1) = c("ID","treat","Admits","cost","CostFlag") | |
df1 <- df1[df1$cost > 0,] | |
treat <- df1[df1$treat == 1,] | |
ctrl <- df1[df1$treat == 0,] | |
### derive parameters based on Cundill & Alexander BMC Med Res Meth | |
# if y ~ gammma with shape parameter k and scale parameter theta, then: | |
# E(y) = k*theta = mu | |
# v(mu) = k*theta^2 = mu^2 / k | |
# this means shape*scale = mean | |
# this also implies we could estimate kappa as mean^2 / variance or CV^2 | |
mu1 <- 22863 # treatment mean mean(treat$cost) | |
mu0 <- 13244 # control mean mean(ctrl$cost) | |
cv1 <- 1.85 # sd(treat$cost)/mean(treat$cost) # 1.85 | |
cv0 <- 1.43 # sd(ctrl$cost)/mean(ctrl$cost) #1.43 | |
kappa_0 <- .489 # 1/(cv0^2) | |
kappa_1 <- .292 # 1/(cv1^2) | |
### additional parameters for power calculations below | |
Q0 <- .5 # proportion in control group | |
sigma_0 = mu0/sqrt(kappa_0) # this is really close to sd(treat$cost) | |
sigma_1 = mu1/sqrt(kappa_1) # this is really close to sd(control$cost) | |
# common std deviation - this is ~ sd(df1$cost) | |
sigma <- sqrt((sigma_0^2*(1/Q0 -1 ) + sigma_1^2*(1/(1-Q0) -1 ) )/((1/Q0 -1 ) + (1/(1-Q0) -1 ))) | |
#------------------------------------ | |
# 'standard' power analysis | |
#----------------------------------- | |
library(pwr) | |
# making common error variance equal to population variance | |
t0 <- abs(mu1 - mu0)/sigma # effect size | |
pwr.t.test(n = , d = t0, sig.level = .05 , power = .8 ,type = c("two.sample")) # 183 ~ per group | |
rm(t0) # cleanup | |
#----------------------------------------------- | |
# required sample size based on nGLM | |
#----------------------------------------------- | |
nGLM(link="log", family="Gamma", mu0= mu0, mu1=mu1, Q0= Q0, kappa0 = kappa_0, kappa1 = kappa_1, alpha = .05,power=0.8, method=2)/2 | |
# required n = ~ 144 per group) | |
# formula by hand assuming 5% level of significance and 80% power ('by hand' calculation) | |
Za <- qnorm(.975) # Z(1-alpha/2) | |
Zb <- qnorm(.8) # Z(1-beta) | |
Q1 <- .5 # % of n in treatment | |
Q0 <- .5 # % of n in control | |
k1 <- kappa_1 # kappa for treatment | |
k0 <- kappa_0 # kappa for control | |
logMuo <- log(mu0) # log of control mean (this implies a log link?) | |
logMu1 <- log(mu1) # log of control mean (this implies a log link?) | |
sqrt_n <- ((Za + Zb)*((1/(Q1*k1)+(1/(Q0*k0))))^.5)/(logMuo - logMu1) | |
(sqrt_n^2)/2 # required sample size 144 (per group) | |
rm(Za,Zb,Q1,Q0,k1,k0,logMuo,logMu1) # cleanup | |
#-------------------------------------------------- | |
# power simulation | |
#-------------------------------------------------- | |
# 1. Simulate data from the model for each group's population. These are the samples. The | |
# populations are chosen so that the true difference between the population means is ?? > 0. (The | |
# null hypothesis is false.) | |
# | |
# 2. Run the procedure on the samples. For each sample, record whether the test rejects the null | |
# hypothesis. | |
# | |
# 3. Count how many times the test rejects the null hypothesis. This proportion is an estimate for | |
# the power of the test. | |
#----------------------------------- | |
# 1 - simulate data that reflects the hypothesized treatment and control groups / effect size of interest | |
#----------------------------------- | |
n <- 10000 | |
### treatment data | |
theta1 <- mu1/kappa_1 | |
# simulate gamma claims data using parameters defined above | |
set.seed(1234567) | |
y1 <- rgamma(n, shape = kappa_1, scale = theta1) | |
hist(y1, breaks = 50) | |
summary(y1) | |
y1 <- data.frame(y1) # convert to dataframe | |
y1$treat <- rep(1,n) # add trt/control flag | |
names(y1) <- c("cost","treat") # rename columns | |
### control data | |
theta0 <- mu0/kappa_0 | |
# simulate gamma claims data using parameters defined above | |
set.seed(1234567) | |
y2 <- rgamma(n, shape = kappa_0, scale = theta0) | |
hist(y2, breaks = 50) | |
summary(y2) | |
sd(y2) | |
y2 <- data.frame(y2) # convert to dataframe | |
y2$treat <- rep(0,n) # add trt/control flag | |
names(y2) <- c("cost","treat") # rename columns | |
### combine treatment and control data | |
id <- seq(1,n*2) # id vector | |
y3 <- rbind(y1,y2) # combine / stack costs | |
simdat <- data.frame(id,y3) # combine simulated features | |
hist(simdat$cost, breaks = 30) # check | |
simdat%>% | |
group_by(treat)%>% | |
summarize(avg = mean(cost), | |
sd = sd(cost)) | |
# compare to real data | |
# df1%>% | |
# group_by(treat)%>% | |
# summarize(avg = mean(cost), | |
# sd = sd(cost)) | |
rm(n,id,y1,y2,y3,theta1,theta0) # cleanup | |
#----------------------------------- | |
# 2 - repeated sampling of our estimate with a given sample size from our simulated data | |
#----------------------------------- | |
# create treatment and control groups | |
treat <- simdat[simdat$treat == 1,] | |
ctrl <- simdat[simdat$treat == 0,] | |
t0 <- mean(treat$cost) - mean(ctrl$cost) # this is mean difference (effect size) in the simulated population | |
Sn <- 144 # this is our sample size we want to use per group i.e. determine how much power for given Sn, effect size, and significance level | |
set.seed(1234567) # make this repeatable | |
bstrap <- c() # initialize vector for bootstrapped estimates | |
for (i in 1:500) { | |
sample1 <- treat[sample(nrow(treat), Sn, replace = T), ] # sample from treatment group | |
sample2 <- ctrl[sample(nrow(ctrl), Sn, replace = T), ] # sample from control group | |
bs <- rbind(sample1,sample2) # bootstrap sample | |
fit <- glm(cost ~ treat, data = bs, family = "Gamma"(link = 'log')) # our estimator | |
summ <- summary(fit) # get parameter estimates and p-values from our estimator | |
bstrap <- c(bstrap, coef(summ)[8]) # get p-value from this bs sample | |
print(cbind("Iteration:", i)) # comment this out to stop printing to log | |
} | |
rm(treat,ctrl,t0,Sn,i,sample1,sample2,bs,fit,summ) # cleanup | |
hist(bstrap, breaks = 30) # simulated distribution (see: http://varianceexplained.org/statistics/interpreting-pvalue-histogram/) | |
summary(bstrap) | |
#----------------------------------------------------------------------- | |
# 3. Count how many times the test rejects the null hypothesis. This proportion is an estimate for | |
# the power of the test. | |
#------------------------------------------------------------------------ | |
# power - how many times p < .05 | |
sum(bstrap < .05)/length(bstrap) # 79% power for Sn = 230 (115 per group) | |
rm(bstrap) # cleanup | |
################################################################### | |
################################################################### | |
############# changing assumptions ########################## | |
################################################################### | |
################################################################### | |
#----------------------------------- | |
# 2b - repeated sampling of our estimate with a given sample size from our simulated data using linear regression | |
# instead of GLM | |
#----------------------------------- | |
# create treatment and control groups | |
treat <- simdat[simdat$treat == 1,] | |
ctrl <- simdat[simdat$treat == 0,] | |
t0 <- mean(treat$cost) - mean(ctrl$cost) # this is mean difference (effect size) in the simulated population | |
Sn <- 183 # this is our sample size we want to use per group i.e. determine how much power for given Sn, effect size, and significance level | |
set.seed(1234567) # make this repeatable | |
bstrap <- c() # initialize vector for bootstrapped estimates | |
for (i in 1:500) { | |
sample1 <- treat[sample(nrow(treat), Sn, replace = T), ] # sample from treatment group | |
sample2 <- ctrl[sample(nrow(ctrl), Sn, replace = T), ] # sample from control group | |
bs <- rbind(sample1,sample2) # bootstrap sample | |
fit <- lm(cost ~ treat, data = bs) # our standard regression estimator | |
summ <- summary(fit) # get parameter estimates and p-values from our estimator | |
bstrap <- c(bstrap, coef(summ)[8]) # get p-value from this bs sample | |
print(cbind("Iteration:", i)) # comment this out to stop printing to log | |
} | |
rm(treat,ctrl,t0,Sn,sample1,sample2,bs,fit,summ) # cleanup | |
hist(bstrap) # simulated distribution (see: http://varianceexplained.org/statistics/interpreting-pvalue-histogram/) | |
summary(bstrap) | |
#----------------------------------------------------------------------- | |
# 3b Count how many times the test rejects the null hypothesis. This proportion is an estimate for | |
# the power of the test. | |
#------------------------------------------------------------------------ | |
# power - how many times p < .05 | |
sum(bstrap < .05)/length(bstrap) # 76% power for Sn = 144 (per group) | |
sum(bstrap < .05)/length(bstrap) # 84% power for Sn = 183 (per group) | |
rm(bstrap) # clean up | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment