BioSciEconomist · January 19, 2021 18:18
diff --git a/claims cost analysis demo.py b/claims cost analysis demo.py
 #  ------------------------------------------------------------------
 #  |PROGRAM NAME: claims cost analysis demo.py
 #  |DATE: 11/13/19
 #  |CREATED BY: MATT BOGARD
 #  |PROJECT FILE:
 #  |----------------------------------------------------------------
 #  | PURPOSE: analysis of cost data using 2- part GLM model
 #  |----------------------------------------------------------------
  
 # TO DO: add bootstrapping
  
 # this may not always work within model output
 pd.set_option('display.float_format', lambda x: '%3.f' % x)
  
 #-----------------------------------
 # read latest cohort file
 #-----------------------------------
  
 import pandas as pd
 import numpy as np
  
 # read initial analysis file
 df = pd.read_csv('//demo data//demo_dat_claims.csv')
  
 # inspect data
 df.columns
 df.head()
 df.info()
 df.shape
  
  
 #---------------------------------------
 # cost analysis
 #---------------------------------------
  
 # visualization and descriptives
 df.TotalAllowed.plot( kind='hist', normed=True, bins=50, range=(0,300000))
 df.TotalAllowed.describe()
  
 # Create the boxplo
 import matplotlib.pyplot as plt
 df.boxplot(column='TotalAllowed', by='treat', rot=90)
  
 # descriptive stats by treatment status
 df.groupby('treat')['TotalAllowed'].mean()
 df.groupby('treat')['ID'].count()
  
 # p(cost > 0) part 1
 df.groupby('treat')['CostFlag'].mean()
 df.groupby('treat')['ID'].count()
   
 # E(cost | cost > 0) part 2  
 tmp = df['TotalAllowed'] > 0 # defin filter for positive costs
 tmp2 = df.loc[tmp] # get positive cost data set
 tmp2.groupby('treat')['TotalAllowed'].mean()
 tmp2.groupby('treat')['ID'].count()
  
 # cleanup
 del tmp
 del tmp2
  
 #-----------------------------------------
 # two part model
 #----------------------------------------
  
 # benchmark against standard linear regression
 import statsmodels.api as sm # import stastmodels
 import statsmodels.formula.api as smf # this allows us to use an explicit formulation
  
 # regression using smf
 results = smf.ols('TotalAllowed ~ treat', data=df).fit()
 results.summary() # one of the problems with stats models I have read is you can't override scientific notation?
  
 del results # cleanup
               
 # part 1: logistic regression
  
 # note because we are making inferences we want logisticregression using
 # stats models NOT sciekitlearn see also: https://stats.stackexchange.com/questions/203740/logistic-regression-scikit-learn-vs-statsmodels
 # https://www.statsmodels.org/stable/glm.html
  
  
 model = smf.glm('CostFlag ~ treat', data=df, family=sm.families.Binomial(link = sm.genmod.families.links.logit))
 result = model.fit()
 print(result.summary())
  
 # cleanup
 del model
 del result
  
 # part 2: glm with log link and gamma error on positive claims
  
 tmp = df['TotalAllowed'] > 0 # define filter for positive costs
 tmp2 = df.loc[tmp] # get positive cost data set
  
 model = smf.glm('TotalAllowed ~ treat', data=df, family=sm.families.Gamma(link = sm.genmod.families.links.log))
 result = model.fit()
 print(result.summary())
	# ------------------------------------------------------------------
	# \|PROGRAM NAME: claims cost analysis demo.py
	# \|DATE: 11/13/19
	# \|CREATED BY: MATT BOGARD
	# \|PROJECT FILE:
	# \|----------------------------------------------------------------
	# \| PURPOSE: analysis of cost data using 2- part GLM model
	# \|----------------------------------------------------------------

	# TO DO: add bootstrapping

	# this may not always work within model output
	pd.set_option('display.float_format', lambda x: '%3.f' % x)

	#-----------------------------------
	# read latest cohort file
	#-----------------------------------

	import pandas as pd
	import numpy as np

	# read initial analysis file
	df = pd.read_csv('//demo data//demo_dat_claims.csv')

	# inspect data
	df.columns
	df.head()
	df.info()
	df.shape


	#---------------------------------------
	# cost analysis
	#---------------------------------------

	# visualization and descriptives
	df.TotalAllowed.plot( kind='hist', normed=True, bins=50, range=(0,300000))
	df.TotalAllowed.describe()

	# Create the boxplo
	import matplotlib.pyplot as plt
	df.boxplot(column='TotalAllowed', by='treat', rot=90)

	# descriptive stats by treatment status
	df.groupby('treat')['TotalAllowed'].mean()
	df.groupby('treat')['ID'].count()

	# p(cost > 0) part 1
	df.groupby('treat')['CostFlag'].mean()
	df.groupby('treat')['ID'].count()

	# E(cost \| cost > 0) part 2
	tmp = df['TotalAllowed'] > 0 # defin filter for positive costs
	tmp2 = df.loc[tmp] # get positive cost data set
	tmp2.groupby('treat')['TotalAllowed'].mean()
	tmp2.groupby('treat')['ID'].count()

	# cleanup
	del tmp
	del tmp2

	#-----------------------------------------
	# two part model
	#----------------------------------------

	# benchmark against standard linear regression
	import statsmodels.api as sm # import stastmodels
	import statsmodels.formula.api as smf # this allows us to use an explicit formulation

	# regression using smf
	results = smf.ols('TotalAllowed ~ treat', data=df).fit()
	results.summary() # one of the problems with stats models I have read is you can't override scientific notation?

	del results # cleanup

	# part 1: logistic regression

	# note because we are making inferences we want logisticregression using
	# stats models NOT sciekitlearn see also: https://stats.stackexchange.com/questions/203740/logistic-regression-scikit-learn-vs-statsmodels
	# https://www.statsmodels.org/stable/glm.html


	model = smf.glm('CostFlag ~ treat', data=df, family=sm.families.Binomial(link = sm.genmod.families.links.logit))
	result = model.fit()
	print(result.summary())

	# cleanup
	del model
	del result

	# part 2: glm with log link and gamma error on positive claims

	tmp = df['TotalAllowed'] > 0 # define filter for positive costs
	tmp2 = df.loc[tmp] # get positive cost data set

	model = smf.glm('TotalAllowed ~ treat', data=df, family=sm.families.Gamma(link = sm.genmod.families.links.log))
	result = model.fit()
	print(result.summary())