Skip to content

Instantly share code, notes, and snippets.

@BioSciEconomist
Last active January 19, 2021 18:18
Show Gist options
  • Save BioSciEconomist/73d50efc88a57686b077bf6047c6afc4 to your computer and use it in GitHub Desktop.
Save BioSciEconomist/73d50efc88a57686b077bf6047c6afc4 to your computer and use it in GitHub Desktop.
analysis of cost data using 2 part GLM models in python
#  ------------------------------------------------------------------
#  |PROGRAM NAME: claims cost analysis demo.py
#  |DATE: 11/13/19
#  |CREATED BY: MATT BOGARD
#  |PROJECT FILE:
#  |----------------------------------------------------------------
#  | PURPOSE: analysis of cost data using 2- part GLM model
#  |----------------------------------------------------------------
 
# TO DO: add bootstrapping
 
# this may not always work within model output
pd.set_option('display.float_format', lambda x: '%3.f' % x)
 
#-----------------------------------
# read latest cohort file
#-----------------------------------
 
import pandas as pd
import numpy as np
 
# read initial analysis file
df = pd.read_csv('//demo data//demo_dat_claims.csv')
 
# inspect data
df.columns
df.head()
df.info()
df.shape
 
 
#---------------------------------------
# cost analysis
#---------------------------------------
 
# visualization and descriptives
df.TotalAllowed.plot( kind='hist', normed=True, bins=50, range=(0,300000))
df.TotalAllowed.describe()
 
# Create the boxplo
import matplotlib.pyplot as plt
df.boxplot(column='TotalAllowed', by='treat', rot=90)
 
# descriptive stats by treatment status
df.groupby('treat')['TotalAllowed'].mean()
df.groupby('treat')['ID'].count()
 
# p(cost > 0) part 1
df.groupby('treat')['CostFlag'].mean()
df.groupby('treat')['ID'].count()
  
# E(cost | cost > 0) part 2  
tmp = df['TotalAllowed'] > 0 # defin filter for positive costs
tmp2 = df.loc[tmp] # get positive cost data set
tmp2.groupby('treat')['TotalAllowed'].mean()
tmp2.groupby('treat')['ID'].count()
 
# cleanup
del tmp
del tmp2
 
#-----------------------------------------
# two part model
#----------------------------------------
 
# benchmark against standard linear regression
import statsmodels.api as sm # import stastmodels
import statsmodels.formula.api as smf # this allows us to use an explicit formulation
 
# regression using smf
results = smf.ols('TotalAllowed ~ treat', data=df).fit()
results.summary() # one of the problems with stats models I have read is you can't override scientific notation?
 
del results # cleanup
              
# part 1: logistic regression
 
# note because we are making inferences we want logisticregression using
# stats models NOT sciekitlearn see also: https://stats.stackexchange.com/questions/203740/logistic-regression-scikit-learn-vs-statsmodels
# https://www.statsmodels.org/stable/glm.html
 
 
model = smf.glm('CostFlag ~ treat', data=df, family=sm.families.Binomial(link = sm.genmod.families.links.logit))
result = model.fit()
print(result.summary())
 
# cleanup
del model
del result
 
# part 2: glm with log link and gamma error on positive claims
 
tmp = df['TotalAllowed'] > 0 # define filter for positive costs
tmp2 = df.loc[tmp] # get positive cost data set
 
model = smf.glm('TotalAllowed ~ treat', data=df, family=sm.families.Gamma(link = sm.genmod.families.links.log))
result = model.fit()
print(result.summary())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment