Skip to content

Instantly share code, notes, and snippets.

@BioSciEconomist
Last active February 24, 2020 22:45
Show Gist options
  • Save BioSciEconomist/6e76fb7333d6d35ae9ba10b3b72898db to your computer and use it in GitHub Desktop.
Save BioSciEconomist/6e76fb7333d6d35ae9ba10b3b72898db to your computer and use it in GitHub Desktop.
Power calculations based on the gamma distribution for positive claims costs
# *-----------------------------------------------------------------
# | PROGRAM NAME: Gamma Power Calculator 26Nov19.R
# | DATE: 11/20/19
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: compare conventional vs nGLM power calculations to estimate power and sample size for tests involving costs
# *----------------------------------------------------------------
# based on: Cundill, Bonnie & Alexander, Neal. (2015). Sample size calculations for skewed distributions.
# BMC medical research methodology. 15. 28. 10.1186/s12874-015-0023-0.
# first run nGLM.R to load nGLM function for power calculations based on the gamma model
# rm(list=ls()) # get rid of any existing data if necessary
options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation
library(MASS) # for estimating GLM models
library(dplyr) # for data manipulation
#------------------------------------------------
# determine values of kappa for gamma power calculation using historical data
#------------------------------------------------
### this is assuming you have baseline or historical reference data
df1 <- read.csv('demo_dat_claims.csv')
names(df1) = c("ID","treat","Admits","cost","CostFlag")
df1 <- df1[df1$cost > 0,]
treat <- df1[df1$treat == 1,]
ctrl <- df1[df1$treat == 0,]
### derive parameters based on Cundill & Alexander BMC Med Res Meth
# if y ~ gammma with shape parameter k and scale parameter theta, then:
# E(y) = k*theta = mu
# v(mu) = k*theta^2 = mu^2 / k
# this means shape*scale = mean
# this also implies we could estimate kappa as mean^2 / variance or CV^2
mu1 <- 22863 # treatment mean mean(treat$cost)
mu0 <- 13244 # control mean mean(ctrl$cost)
cv1 <- 1.85 # sd(treat$cost)/mean(treat$cost) # 1.85
cv0 <- 1.43 # sd(ctrl$cost)/mean(ctrl$cost) #1.43
kappa_0 <- .489 # 1/(cv0^2)
kappa_1 <- .292 # 1/(cv1^2)
### additional parameters for power calculations below
Q0 <- .5 # proportion in control group
sigma_0 = mu0/sqrt(kappa_0) # this is really close to sd(treat$cost)
sigma_1 = mu1/sqrt(kappa_1) # this is really close to sd(control$cost)
# common std deviation - this is ~ sd(df1$cost)
sigma <- sqrt((sigma_0^2*(1/Q0 -1 ) + sigma_1^2*(1/(1-Q0) -1 ) )/((1/Q0 -1 ) + (1/(1-Q0) -1 )))
#------------------------------------
# 'standard' power analysis
#-----------------------------------
library(pwr)
# making common error variance equal to population variance
t0 <- abs(mu1 - mu0)/sigma # effect size
pwr.t.test(n = , d = t0, sig.level = .05 , power = .8 ,type = c("two.sample")) # 183 ~ per group
rm(t0) # cleanup
#-----------------------------------------------
# required sample size based on nGLM
#-----------------------------------------------
nGLM(link="log", family="Gamma", mu0= mu0, mu1=mu1, Q0= Q0, kappa0 = kappa_0, kappa1 = kappa_1, alpha = .05,power=0.8, method=2)/2
# required n = ~ 144 per group)
# formula by hand assuming 5% level of significance and 80% power ('by hand' calculation)
Za <- qnorm(.975) # Z(1-alpha/2)
Zb <- qnorm(.8) # Z(1-beta)
Q1 <- .5 # % of n in treatment
Q0 <- .5 # % of n in control
k1 <- kappa_1 # kappa for treatment
k0 <- kappa_0 # kappa for control
logMuo <- log(mu0) # log of control mean (this implies a log link?)
logMu1 <- log(mu1) # log of control mean (this implies a log link?)
sqrt_n <- ((Za + Zb)*((1/(Q1*k1)+(1/(Q0*k0))))^.5)/(logMuo - logMu1)
(sqrt_n^2)/2 # required sample size 144 (per group)
rm(Za,Zb,Q1,Q0,k1,k0,logMuo,logMu1) # cleanup
#--------------------------------------------------
# power simulation
#--------------------------------------------------
# 1. Simulate data from the model for each group's population. These are the samples. The
# populations are chosen so that the true difference between the population means is ?? > 0. (The
# null hypothesis is false.)
#
# 2. Run the procedure on the samples. For each sample, record whether the test rejects the null
# hypothesis.
#
# 3. Count how many times the test rejects the null hypothesis. This proportion is an estimate for
# the power of the test.
#-----------------------------------
# 1 - simulate data that reflects the hypothesized treatment and control groups / effect size of interest
#-----------------------------------
n <- 10000
### treatment data
theta1 <- mu1/kappa_1
# simulate gamma claims data using parameters defined above
set.seed(1234567)
y1 <- rgamma(n, shape = kappa_1, scale = theta1)
hist(y1, breaks = 50)
summary(y1)
y1 <- data.frame(y1) # convert to dataframe
y1$treat <- rep(1,n) # add trt/control flag
names(y1) <- c("cost","treat") # rename columns
### control data
theta0 <- mu0/kappa_0
# simulate gamma claims data using parameters defined above
set.seed(1234567)
y2 <- rgamma(n, shape = kappa_0, scale = theta0)
hist(y2, breaks = 50)
summary(y2)
sd(y2)
y2 <- data.frame(y2) # convert to dataframe
y2$treat <- rep(0,n) # add trt/control flag
names(y2) <- c("cost","treat") # rename columns
### combine treatment and control data
id <- seq(1,n*2) # id vector
y3 <- rbind(y1,y2) # combine / stack costs
simdat <- data.frame(id,y3) # combine simulated features
hist(simdat$cost, breaks = 30) # check
simdat%>%
group_by(treat)%>%
summarize(avg = mean(cost),
sd = sd(cost))
# compare to real data
# df1%>%
# group_by(treat)%>%
# summarize(avg = mean(cost),
# sd = sd(cost))
rm(n,id,y1,y2,y3,theta1,theta0) # cleanup
#-----------------------------------
# 2 - repeated sampling of our estimate with a given sample size from our simulated data
#-----------------------------------
# create treatment and control groups
treat <- simdat[simdat$treat == 1,]
ctrl <- simdat[simdat$treat == 0,]
t0 <- mean(treat$cost) - mean(ctrl$cost) # this is mean difference (effect size) in the simulated population
Sn <- 144 # this is our sample size we want to use per group i.e. determine how much power for given Sn, effect size, and significance level
set.seed(1234567) # make this repeatable
bstrap <- c() # initialize vector for bootstrapped estimates
for (i in 1:500) {
sample1 <- treat[sample(nrow(treat), Sn, replace = T), ] # sample from treatment group
sample2 <- ctrl[sample(nrow(ctrl), Sn, replace = T), ] # sample from control group
bs <- rbind(sample1,sample2) # bootstrap sample
fit <- glm(cost ~ treat, data = bs, family = "Gamma"(link = 'log')) # our estimator
summ <- summary(fit) # get parameter estimates and p-values from our estimator
bstrap <- c(bstrap, coef(summ)[8]) # get p-value from this bs sample
print(cbind("Iteration:", i)) # comment this out to stop printing to log
}
rm(treat,ctrl,t0,Sn,i,sample1,sample2,bs,fit,summ) # cleanup
hist(bstrap, breaks = 30) # simulated distribution (see: http://varianceexplained.org/statistics/interpreting-pvalue-histogram/)
summary(bstrap)
#-----------------------------------------------------------------------
# 3. Count how many times the test rejects the null hypothesis. This proportion is an estimate for
# the power of the test.
#------------------------------------------------------------------------
# power - how many times p < .05
sum(bstrap < .05)/length(bstrap) # 79% power for Sn = 230 (115 per group)
rm(bstrap) # cleanup
###################################################################
###################################################################
############# changing assumptions ##########################
###################################################################
###################################################################
#-----------------------------------
# 2b - repeated sampling of our estimate with a given sample size from our simulated data using linear regression
# instead of GLM
#-----------------------------------
# create treatment and control groups
treat <- simdat[simdat$treat == 1,]
ctrl <- simdat[simdat$treat == 0,]
t0 <- mean(treat$cost) - mean(ctrl$cost) # this is mean difference (effect size) in the simulated population
Sn <- 183 # this is our sample size we want to use per group i.e. determine how much power for given Sn, effect size, and significance level
set.seed(1234567) # make this repeatable
bstrap <- c() # initialize vector for bootstrapped estimates
for (i in 1:500) {
sample1 <- treat[sample(nrow(treat), Sn, replace = T), ] # sample from treatment group
sample2 <- ctrl[sample(nrow(ctrl), Sn, replace = T), ] # sample from control group
bs <- rbind(sample1,sample2) # bootstrap sample
fit <- lm(cost ~ treat, data = bs) # our standard regression estimator
summ <- summary(fit) # get parameter estimates and p-values from our estimator
bstrap <- c(bstrap, coef(summ)[8]) # get p-value from this bs sample
print(cbind("Iteration:", i)) # comment this out to stop printing to log
}
rm(treat,ctrl,t0,Sn,sample1,sample2,bs,fit,summ) # cleanup
hist(bstrap) # simulated distribution (see: http://varianceexplained.org/statistics/interpreting-pvalue-histogram/)
summary(bstrap)
#-----------------------------------------------------------------------
# 3b Count how many times the test rejects the null hypothesis. This proportion is an estimate for
# the power of the test.
#------------------------------------------------------------------------
# power - how many times p < .05
sum(bstrap < .05)/length(bstrap) # 76% power for Sn = 144 (per group)
sum(bstrap < .05)/length(bstrap) # 84% power for Sn = 183 (per group)
rm(bstrap) # clean up
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment