BioSciEconomist · February 24, 2020 22:45
diff --git a/Gamma Power Calculator.R b/Gamma Power Calculator.R
 # *-----------------------------------------------------------------
 # | PROGRAM NAME: Gamma Power Calculator 26Nov19.R
 # | DATE: 11/20/19 
 # | CREATED BY: MATT BOGARD 
 # | PROJECT FILE:        
 # *----------------------------------------------------------------
 # | PURPOSE: compare conventional vs nGLM power calculations to estimate power and sample size for tests involving costs
 # *----------------------------------------------------------------

 # based on: Cundill, Bonnie & Alexander, Neal. (2015). Sample size calculations for skewed distributions. 
 # BMC medical research methodology. 15. 28. 10.1186/s12874-015-0023-0.

 # first run nGLM.R to load nGLM function for power calculations based on the gamma model

 # rm(list=ls()) # get rid of any existing data if necessary

 options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation


 library(MASS)  # for estimating GLM models
 library(dplyr) # for data manipulation 

 #------------------------------------------------
 # determine values of kappa for gamma power calculation using historical data
 #------------------------------------------------

 ### this is assuming you have baseline or historical reference data

 df1 <- read.csv('demo_dat_claims.csv')

 names(df1) = c("ID","treat","Admits","cost","CostFlag")
 df1 <- df1[df1$cost > 0,]

 treat <- df1[df1$treat == 1,]
 ctrl <- df1[df1$treat == 0,]

 ### derive parameters based on Cundill & Alexander BMC Med Res Meth

 # if y ~ gammma with shape parameter k and scale parameter theta, then:
 # E(y) = k*theta = mu
 # v(mu) =  k*theta^2 = mu^2 / k
 # this means shape*scale = mean
 # this also implies we could estimate kappa as mean^2 / variance or CV^2

 mu1 <- 22863  # treatment mean mean(treat$cost)
 mu0 <- 13244  # control mean mean(ctrl$cost)
 cv1 <- 1.85 # sd(treat$cost)/mean(treat$cost) # 1.85
 cv0 <- 1.43 # sd(ctrl$cost)/mean(ctrl$cost)  #1.43
 kappa_0 <- .489   # 1/(cv0^2)
 kappa_1 <- .292  # 1/(cv1^2)

 ### additional parameters for power calculations below
 Q0 <- .5 # proportion in control group
 sigma_0 = mu0/sqrt(kappa_0) # this is really close to sd(treat$cost)
 sigma_1 = mu1/sqrt(kappa_1) # this is really close to sd(control$cost)

 # common std deviation -  this is ~  sd(df1$cost)
 sigma <- sqrt((sigma_0^2*(1/Q0 -1 ) + sigma_1^2*(1/(1-Q0) -1 ) )/((1/Q0 -1 ) + (1/(1-Q0) -1 )))


 #------------------------------------
 # 'standard' power analysis
 #-----------------------------------

 library(pwr)

 # making common error variance equal to population variance

 t0 <- abs(mu1 - mu0)/sigma # effect size
 pwr.t.test(n = , d = t0, sig.level = .05 , power = .8 ,type = c("two.sample"))  # 183 ~  per group

 rm(t0) # cleanup

 #-----------------------------------------------
 # required sample size based on nGLM
 #-----------------------------------------------

 nGLM(link="log",  family="Gamma", mu0= mu0, mu1=mu1, Q0= Q0, kappa0 = kappa_0, kappa1 = kappa_1, alpha = .05,power=0.8, method=2)/2

 # required n = ~ 144 per group)

 # formula by hand assuming 5% level of significance and 80% power ('by hand' calculation)

 Za <- qnorm(.975) # Z(1-alpha/2)
 Zb <- qnorm(.8) # Z(1-beta)
 Q1 <- .5 # % of n in treatment
 Q0 <- .5 # % of n in control
 k1 <- kappa_1 # kappa for treatment
 k0 <- kappa_0 # kappa for control
 logMuo <- log(mu0) # log of control mean (this implies a log link?)
 logMu1 <- log(mu1) # log of control mean (this implies a log link?)

 sqrt_n <- ((Za + Zb)*((1/(Q1*k1)+(1/(Q0*k0))))^.5)/(logMuo - logMu1)

 (sqrt_n^2)/2 # required sample size 144 (per group)

 rm(Za,Zb,Q1,Q0,k1,k0,logMuo,logMu1) # cleanup

 #--------------------------------------------------
 #  power simulation
 #--------------------------------------------------

 # 1.  Simulate data from the model for each group's population. These are the samples. The
 #     populations are chosen so that the true difference between the population means is ?? > 0. (The
 #     null hypothesis is false.)
 # 
 # 2. Run the procedure on the samples. For each sample, record whether the test rejects the null
 #    hypothesis.
 #
 # 3. Count how many times the test rejects the null hypothesis. This proportion is an estimate for
 #    the power of the test.


 #-----------------------------------
 #  1 -  simulate data that reflects the hypothesized treatment and control groups / effect size of interest
 #-----------------------------------

 n <- 10000

 ### treatment data

 theta1 <- mu1/kappa_1

 # simulate gamma claims data using parameters defined above
 set.seed(1234567)
 y1 <- rgamma(n, shape = kappa_1, scale = theta1)

 hist(y1, breaks = 50)
 summary(y1)

 y1 <- data.frame(y1) # convert to dataframe
 y1$treat <- rep(1,n) # add trt/control flag
 names(y1) <- c("cost","treat") # rename columns

 ### control data

 theta0 <- mu0/kappa_0

 # simulate gamma claims data using parameters defined above
 set.seed(1234567)
 y2 <- rgamma(n, shape = kappa_0, scale = theta0)

 hist(y2, breaks = 50)
 summary(y2)
 sd(y2)

 y2 <- data.frame(y2) # convert to dataframe
 y2$treat <- rep(0,n) # add trt/control flag
 names(y2) <- c("cost","treat") # rename columns


 ### combine treatment and control data

 id <- seq(1,n*2) # id vector
 y3 <- rbind(y1,y2) # combine / stack costs
 simdat <- data.frame(id,y3) # combine simulated features

 hist(simdat$cost, breaks = 30) # check

 simdat%>%
  group_by(treat)%>%
  summarize(avg = mean(cost),
            sd = sd(cost))

 # compare to real data
 # df1%>%
 #   group_by(treat)%>%
 #   summarize(avg = mean(cost),
 #             sd = sd(cost))

 rm(n,id,y1,y2,y3,theta1,theta0) # cleanup

 #-----------------------------------
 #  2 -  repeated sampling of our estimate with a given sample size from our simulated data
 #-----------------------------------

 # create treatment and control groups
 treat <- simdat[simdat$treat == 1,]
 ctrl <- simdat[simdat$treat == 0,]

 t0 <- mean(treat$cost) - mean(ctrl$cost) # this is mean difference (effect size) in the simulated population 
 Sn <- 144 # this is our sample size we want to use per group i.e. determine how much power for given Sn, effect size, and significance level


 set.seed(1234567) # make this repeatable
 bstrap <- c() # initialize vector for bootstrapped estimates
 for (i in 1:500) {
  sample1 <- treat[sample(nrow(treat), Sn, replace = T), ] # sample from treatment group
  sample2 <- ctrl[sample(nrow(ctrl), Sn, replace = T), ]  # sample from control group
  bs <- rbind(sample1,sample2) # bootstrap sample
  fit <- glm(cost ~ treat, data = bs, family = "Gamma"(link = 'log')) # our estimator
  summ <- summary(fit) # get parameter estimates and p-values from our estimator
  bstrap <- c(bstrap, coef(summ)[8]) # get p-value from this bs sample
  print(cbind("Iteration:", i)) # comment this out to stop printing to log
 }

 rm(treat,ctrl,t0,Sn,i,sample1,sample2,bs,fit,summ) # cleanup

 hist(bstrap, breaks = 30) # simulated distribution (see: http://varianceexplained.org/statistics/interpreting-pvalue-histogram/)
 summary(bstrap)


 #-----------------------------------------------------------------------
 # 3. Count how many times the test rejects the null hypothesis. This proportion is an estimate for
 #    the power of the test.
 #------------------------------------------------------------------------

 # power - how many times p < .05
 sum(bstrap < .05)/length(bstrap) # 79% power for Sn = 230 (115 per group)

 rm(bstrap) # cleanup

 ###################################################################
 ###################################################################
 #############     changing assumptions   ##########################
 ###################################################################
 ###################################################################

 #-----------------------------------
 #  2b -  repeated sampling of our estimate with a given sample size from our simulated data using linear regression
 #        instead of GLM
 #-----------------------------------

 # create treatment and control groups
 treat <- simdat[simdat$treat == 1,]
 ctrl <- simdat[simdat$treat == 0,]

 t0 <- mean(treat$cost) - mean(ctrl$cost) # this is mean difference (effect size) in the simulated population 
 Sn <- 183 # this is our sample size we want to use per group i.e. determine how much power for given Sn, effect size, and significance level

 set.seed(1234567) # make this repeatable
 bstrap <- c() # initialize vector for bootstrapped estimates
 for (i in 1:500) {
  sample1 <- treat[sample(nrow(treat), Sn, replace = T), ] # sample from treatment group
  sample2 <- ctrl[sample(nrow(ctrl), Sn, replace = T), ]  # sample from control group
  bs <- rbind(sample1,sample2) # bootstrap sample
  fit <- lm(cost ~ treat, data = bs) # our standard regression estimator
  summ <- summary(fit) # get parameter estimates and p-values from our estimator
  bstrap <- c(bstrap, coef(summ)[8]) # get p-value from this bs sample
  print(cbind("Iteration:", i)) # comment this out to stop printing to log
 }

 rm(treat,ctrl,t0,Sn,sample1,sample2,bs,fit,summ) # cleanup

 hist(bstrap) # simulated distribution (see: http://varianceexplained.org/statistics/interpreting-pvalue-histogram/)
 summary(bstrap)


 #-----------------------------------------------------------------------
 # 3b Count how many times the test rejects the null hypothesis. This proportion is an estimate for
 #    the power of the test.
 #------------------------------------------------------------------------

 # power - how many times p < .05
 sum(bstrap < .05)/length(bstrap) # 76% power for Sn = 144 (per group)
 sum(bstrap < .05)/length(bstrap) # 84% power for Sn = 183 (per group)


 rm(bstrap) # clean up
	# *-----------------------------------------------------------------
	# \| PROGRAM NAME: Gamma Power Calculator 26Nov19.R
	# \| DATE: 11/20/19
	# \| CREATED BY: MATT BOGARD
	# \| PROJECT FILE:
	# *----------------------------------------------------------------
	# \| PURPOSE: compare conventional vs nGLM power calculations to estimate power and sample size for tests involving costs
	# *----------------------------------------------------------------

	# based on: Cundill, Bonnie & Alexander, Neal. (2015). Sample size calculations for skewed distributions.
	# BMC medical research methodology. 15. 28. 10.1186/s12874-015-0023-0.

	# first run nGLM.R to load nGLM function for power calculations based on the gamma model

	# rm(list=ls()) # get rid of any existing data if necessary

	options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation


	library(MASS) # for estimating GLM models
	library(dplyr) # for data manipulation

	#------------------------------------------------
	# determine values of kappa for gamma power calculation using historical data
	#------------------------------------------------

	### this is assuming you have baseline or historical reference data

	df1 <- read.csv('demo_dat_claims.csv')

	names(df1) = c("ID","treat","Admits","cost","CostFlag")
	df1 <- df1[df1$cost > 0,]

	treat <- df1[df1$treat == 1,]
	ctrl <- df1[df1$treat == 0,]

	### derive parameters based on Cundill & Alexander BMC Med Res Meth

	# if y ~ gammma with shape parameter k and scale parameter theta, then:
	# E(y) = k*theta = mu
	# v(mu) = k*theta^2 = mu^2 / k
	# this means shape*scale = mean
	# this also implies we could estimate kappa as mean^2 / variance or CV^2

	mu1 <- 22863 # treatment mean mean(treat$cost)
	mu0 <- 13244 # control mean mean(ctrl$cost)
	cv1 <- 1.85 # sd(treat$cost)/mean(treat$cost) # 1.85
	cv0 <- 1.43 # sd(ctrl$cost)/mean(ctrl$cost) #1.43
	kappa_0 <- .489 # 1/(cv0^2)
	kappa_1 <- .292 # 1/(cv1^2)

	### additional parameters for power calculations below
	Q0 <- .5 # proportion in control group
	sigma_0 = mu0/sqrt(kappa_0) # this is really close to sd(treat$cost)
	sigma_1 = mu1/sqrt(kappa_1) # this is really close to sd(control$cost)

	# common std deviation - this is ~ sd(df1$cost)
	sigma <- sqrt((sigma_0^2(1/Q0 -1 ) + sigma_1^2(1/(1-Q0) -1 ) )/((1/Q0 -1 ) + (1/(1-Q0) -1 )))


	#------------------------------------
	# 'standard' power analysis
	#-----------------------------------

	library(pwr)

	# making common error variance equal to population variance

	t0 <- abs(mu1 - mu0)/sigma # effect size
	pwr.t.test(n = , d = t0, sig.level = .05 , power = .8 ,type = c("two.sample")) # 183 ~ per group

	rm(t0) # cleanup

	#-----------------------------------------------
	# required sample size based on nGLM
	#-----------------------------------------------

	nGLM(link="log", family="Gamma", mu0= mu0, mu1=mu1, Q0= Q0, kappa0 = kappa_0, kappa1 = kappa_1, alpha = .05,power=0.8, method=2)/2

	# required n = ~ 144 per group)

	# formula by hand assuming 5% level of significance and 80% power ('by hand' calculation)

	Za <- qnorm(.975) # Z(1-alpha/2)
	Zb <- qnorm(.8) # Z(1-beta)
	Q1 <- .5 # % of n in treatment
	Q0 <- .5 # % of n in control
	k1 <- kappa_1 # kappa for treatment
	k0 <- kappa_0 # kappa for control
	logMuo <- log(mu0) # log of control mean (this implies a log link?)
	logMu1 <- log(mu1) # log of control mean (this implies a log link?)

	sqrt_n <- ((Za + Zb)((1/(Q1k1)+(1/(Q0*k0))))^.5)/(logMuo - logMu1)

	(sqrt_n^2)/2 # required sample size 144 (per group)

	rm(Za,Zb,Q1,Q0,k1,k0,logMuo,logMu1) # cleanup

	#--------------------------------------------------
	# power simulation
	#--------------------------------------------------

	# 1. Simulate data from the model for each group's population. These are the samples. The
	# populations are chosen so that the true difference between the population means is ?? > 0. (The
	# null hypothesis is false.)
	#
	# 2. Run the procedure on the samples. For each sample, record whether the test rejects the null
	# hypothesis.
	#
	# 3. Count how many times the test rejects the null hypothesis. This proportion is an estimate for
	# the power of the test.


	#-----------------------------------
	# 1 - simulate data that reflects the hypothesized treatment and control groups / effect size of interest
	#-----------------------------------

	n <- 10000

	### treatment data

	theta1 <- mu1/kappa_1

	# simulate gamma claims data using parameters defined above
	set.seed(1234567)
	y1 <- rgamma(n, shape = kappa_1, scale = theta1)

	hist(y1, breaks = 50)
	summary(y1)

	y1 <- data.frame(y1) # convert to dataframe
	y1$treat <- rep(1,n) # add trt/control flag
	names(y1) <- c("cost","treat") # rename columns

	### control data

	theta0 <- mu0/kappa_0

	# simulate gamma claims data using parameters defined above
	set.seed(1234567)
	y2 <- rgamma(n, shape = kappa_0, scale = theta0)

	hist(y2, breaks = 50)
	summary(y2)
	sd(y2)

	y2 <- data.frame(y2) # convert to dataframe
	y2$treat <- rep(0,n) # add trt/control flag
	names(y2) <- c("cost","treat") # rename columns


	### combine treatment and control data

	id <- seq(1,n*2) # id vector
	y3 <- rbind(y1,y2) # combine / stack costs
	simdat <- data.frame(id,y3) # combine simulated features

	hist(simdat$cost, breaks = 30) # check

	simdat%>%
	group_by(treat)%>%
	summarize(avg = mean(cost),
	sd = sd(cost))

	# compare to real data
	# df1%>%
	# group_by(treat)%>%
	# summarize(avg = mean(cost),
	# sd = sd(cost))

	rm(n,id,y1,y2,y3,theta1,theta0) # cleanup

	#-----------------------------------
	# 2 - repeated sampling of our estimate with a given sample size from our simulated data
	#-----------------------------------

	# create treatment and control groups
	treat <- simdat[simdat$treat == 1,]
	ctrl <- simdat[simdat$treat == 0,]

	t0 <- mean(treat$cost) - mean(ctrl$cost) # this is mean difference (effect size) in the simulated population
	Sn <- 144 # this is our sample size we want to use per group i.e. determine how much power for given Sn, effect size, and significance level


	set.seed(1234567) # make this repeatable
	bstrap <- c() # initialize vector for bootstrapped estimates
	for (i in 1:500) {
	sample1 <- treat[sample(nrow(treat), Sn, replace = T), ] # sample from treatment group
	sample2 <- ctrl[sample(nrow(ctrl), Sn, replace = T), ] # sample from control group
	bs <- rbind(sample1,sample2) # bootstrap sample
	fit <- glm(cost ~ treat, data = bs, family = "Gamma"(link = 'log')) # our estimator
	summ <- summary(fit) # get parameter estimates and p-values from our estimator
	bstrap <- c(bstrap, coef(summ)[8]) # get p-value from this bs sample
	print(cbind("Iteration:", i)) # comment this out to stop printing to log
	}

	rm(treat,ctrl,t0,Sn,i,sample1,sample2,bs,fit,summ) # cleanup

	hist(bstrap, breaks = 30) # simulated distribution (see: http://varianceexplained.org/statistics/interpreting-pvalue-histogram/)
	summary(bstrap)


	#-----------------------------------------------------------------------
	# 3. Count how many times the test rejects the null hypothesis. This proportion is an estimate for
	# the power of the test.
	#------------------------------------------------------------------------

	# power - how many times p < .05
	sum(bstrap < .05)/length(bstrap) # 79% power for Sn = 230 (115 per group)

	rm(bstrap) # cleanup

	###################################################################
	###################################################################
	############# changing assumptions ##########################
	###################################################################
	###################################################################

	#-----------------------------------
	# 2b - repeated sampling of our estimate with a given sample size from our simulated data using linear regression
	# instead of GLM
	#-----------------------------------

	# create treatment and control groups
	treat <- simdat[simdat$treat == 1,]
	ctrl <- simdat[simdat$treat == 0,]

	t0 <- mean(treat$cost) - mean(ctrl$cost) # this is mean difference (effect size) in the simulated population
	Sn <- 183 # this is our sample size we want to use per group i.e. determine how much power for given Sn, effect size, and significance level

	set.seed(1234567) # make this repeatable
	bstrap <- c() # initialize vector for bootstrapped estimates
	for (i in 1:500) {
	sample1 <- treat[sample(nrow(treat), Sn, replace = T), ] # sample from treatment group
	sample2 <- ctrl[sample(nrow(ctrl), Sn, replace = T), ] # sample from control group
	bs <- rbind(sample1,sample2) # bootstrap sample
	fit <- lm(cost ~ treat, data = bs) # our standard regression estimator
	summ <- summary(fit) # get parameter estimates and p-values from our estimator
	bstrap <- c(bstrap, coef(summ)[8]) # get p-value from this bs sample
	print(cbind("Iteration:", i)) # comment this out to stop printing to log
	}

	rm(treat,ctrl,t0,Sn,sample1,sample2,bs,fit,summ) # cleanup

	hist(bstrap) # simulated distribution (see: http://varianceexplained.org/statistics/interpreting-pvalue-histogram/)
	summary(bstrap)


	#-----------------------------------------------------------------------
	# 3b Count how many times the test rejects the null hypothesis. This proportion is an estimate for
	# the power of the test.
	#------------------------------------------------------------------------

	# power - how many times p < .05
	sum(bstrap < .05)/length(bstrap) # 76% power for Sn = 144 (per group)
	sum(bstrap < .05)/length(bstrap) # 84% power for Sn = 183 (per group)


	rm(bstrap) # clean up