BioSciEconomist · March 3, 2021 23:41
diff --git a/simulate claims cost data.R b/simulate claims cost data.R
 # *-----------------------------------------------------------------
 # | PROGRAM NAME: simulate claims cost data.R
 # | DATE: 3/2/21
 # | CREATED BY: MATT BOGARD 
 # | PROJECT FILE:          
 # *----------------------------------------------------------------
 # | PURPOSE: simualte example treatment and control claims cost data
 # *----------------------------------------------------------------

 # https://data.library.virginia.edu/assessing-type-s-and-type-m-errors/ 

 library(MASS)
 library(dplyr)

 options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation

 #
 # treatent group
 #

 mu <- 4000  # mean
 sd <- 12000 # standard deviation
 cv <- sd/mu # CV
 kappa <- 1/(cv^2)  # shape parameter
 theta <-  (sd*sd)/mu  # this is theta or scale parameter


 # simulate costs based on parameter assumptions above
 set.seed(1234567)
 cost <- rgamma(100000, shape = kappa, scale = theta)

 # check descriptive results
 summary(cost)
 sd(cost)
 hist(cost, breaks = 50)

 # simulate ID and treatment indicator
 ID <- seq(1,100000)
 D <- rep(1,100000) 

 trt <- data.frame(ID,cost,D) # combine metrics

 #
 # control group
 #

 mu <- 6000  # mean
 sd <- 18000 # standard deviation
 cv <- sd/mu # CV 
 kappa <- 1/(cv^2)  # shape parameter
 theta <-  (sd*sd)/mu  # this is theta or scale parameter

 # simulate costs based on parameter assumptions above
 set.seed(1234567)
 cost <- rgamma(100000, shape = kappa, scale = theta)

 # check descriptive results
 summary(cost)
 sd(cost)
 hist(cost, breaks = 50)

 # simulate ID and treatment indicator
 ID <- seq(100001,200000)
 D <- rep(0,100000) 

 ctrl <- data.frame(ID,cost,D) # combine metrics

 #
 # create population data frame
 #

 df <- rbind(trt,ctrl)

 # check descriptives
 summary(df)
 df%>% 
  group_by(D) %>%
  summarize(y = mean(cost),
            n = n()) 
 hist(df$cost, breaks = 50)

 summary(lm(cost ~ D, data = df)) # regression on full model 
 summary(glm(cost ~ D, data = df, family = "Gamma"(link = 'log'),epsilon = 1e-8, maxit = 500, trace = FALSE)) # gamma reg
	# *-----------------------------------------------------------------
	# \| PROGRAM NAME: simulate claims cost data.R
	# \| DATE: 3/2/21
	# \| CREATED BY: MATT BOGARD
	# \| PROJECT FILE:
	# *----------------------------------------------------------------
	# \| PURPOSE: simualte example treatment and control claims cost data
	# *----------------------------------------------------------------

	# https://data.library.virginia.edu/assessing-type-s-and-type-m-errors/

	library(MASS)
	library(dplyr)

	options("scipen" =100, "digits" = 4) # override R's tendency to use scientific notation

	#
	# treatent group
	#

	mu <- 4000 # mean
	sd <- 12000 # standard deviation
	cv <- sd/mu # CV
	kappa <- 1/(cv^2) # shape parameter
	theta <- (sd*sd)/mu # this is theta or scale parameter


	# simulate costs based on parameter assumptions above
	set.seed(1234567)
	cost <- rgamma(100000, shape = kappa, scale = theta)

	# check descriptive results
	summary(cost)
	sd(cost)
	hist(cost, breaks = 50)

	# simulate ID and treatment indicator
	ID <- seq(1,100000)
	D <- rep(1,100000)

	trt <- data.frame(ID,cost,D) # combine metrics

	#
	# control group
	#

	mu <- 6000 # mean
	sd <- 18000 # standard deviation
	cv <- sd/mu # CV
	kappa <- 1/(cv^2) # shape parameter
	theta <- (sd*sd)/mu # this is theta or scale parameter

	# simulate costs based on parameter assumptions above
	set.seed(1234567)
	cost <- rgamma(100000, shape = kappa, scale = theta)

	# check descriptive results
	summary(cost)
	sd(cost)
	hist(cost, breaks = 50)

	# simulate ID and treatment indicator
	ID <- seq(100001,200000)
	D <- rep(0,100000)

	ctrl <- data.frame(ID,cost,D) # combine metrics

	#
	# create population data frame
	#

	df <- rbind(trt,ctrl)

	# check descriptives
	summary(df)
	df%>%
	group_by(D) %>%
	summarize(y = mean(cost),
	n = n())
	hist(df$cost, breaks = 50)

	summary(lm(cost ~ D, data = df)) # regression on full model
	summary(glm(cost ~ D, data = df, family = "Gamma"(link = 'log'),epsilon = 1e-8, maxit = 500, trace = FALSE)) # gamma reg