Last active
January 19, 2021 18:18
-
-
Save BioSciEconomist/73d50efc88a57686b077bf6047c6afc4 to your computer and use it in GitHub Desktop.
analysis of cost data using 2 part GLM models in python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ------------------------------------------------------------------ | |
# |PROGRAM NAME: claims cost analysis demo.py | |
# |DATE: 11/13/19 | |
# |CREATED BY: MATT BOGARD | |
# |PROJECT FILE: | |
# |---------------------------------------------------------------- | |
# | PURPOSE: analysis of cost data using 2- part GLM model | |
# |---------------------------------------------------------------- | |
# TO DO: add bootstrapping | |
# this may not always work within model output | |
pd.set_option('display.float_format', lambda x: '%3.f' % x) | |
#----------------------------------- | |
# read latest cohort file | |
#----------------------------------- | |
import pandas as pd | |
import numpy as np | |
# read initial analysis file | |
df = pd.read_csv('//demo data//demo_dat_claims.csv') | |
# inspect data | |
df.columns | |
df.head() | |
df.info() | |
df.shape | |
#--------------------------------------- | |
# cost analysis | |
#--------------------------------------- | |
# visualization and descriptives | |
df.TotalAllowed.plot( kind='hist', normed=True, bins=50, range=(0,300000)) | |
df.TotalAllowed.describe() | |
# Create the boxplo | |
import matplotlib.pyplot as plt | |
df.boxplot(column='TotalAllowed', by='treat', rot=90) | |
# descriptive stats by treatment status | |
df.groupby('treat')['TotalAllowed'].mean() | |
df.groupby('treat')['ID'].count() | |
# p(cost > 0) part 1 | |
df.groupby('treat')['CostFlag'].mean() | |
df.groupby('treat')['ID'].count() | |
# E(cost | cost > 0) part 2 | |
tmp = df['TotalAllowed'] > 0 # defin filter for positive costs | |
tmp2 = df.loc[tmp] # get positive cost data set | |
tmp2.groupby('treat')['TotalAllowed'].mean() | |
tmp2.groupby('treat')['ID'].count() | |
# cleanup | |
del tmp | |
del tmp2 | |
#----------------------------------------- | |
# two part model | |
#---------------------------------------- | |
# benchmark against standard linear regression | |
import statsmodels.api as sm # import stastmodels | |
import statsmodels.formula.api as smf # this allows us to use an explicit formulation | |
# regression using smf | |
results = smf.ols('TotalAllowed ~ treat', data=df).fit() | |
results.summary() # one of the problems with stats models I have read is you can't override scientific notation? | |
del results # cleanup | |
# part 1: logistic regression | |
# note because we are making inferences we want logisticregression using | |
# stats models NOT sciekitlearn see also: https://stats.stackexchange.com/questions/203740/logistic-regression-scikit-learn-vs-statsmodels | |
# https://www.statsmodels.org/stable/glm.html | |
model = smf.glm('CostFlag ~ treat', data=df, family=sm.families.Binomial(link = sm.genmod.families.links.logit)) | |
result = model.fit() | |
print(result.summary()) | |
# cleanup | |
del model | |
del result | |
# part 2: glm with log link and gamma error on positive claims | |
tmp = df['TotalAllowed'] > 0 # define filter for positive costs | |
tmp2 = df.loc[tmp] # get positive cost data set | |
model = smf.glm('TotalAllowed ~ treat', data=df, family=sm.families.Gamma(link = sm.genmod.families.links.log)) | |
result = model.fit() | |
print(result.summary()) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment