Created
November 16, 2019 02:34
-
-
Save BioSciEconomist/2c8014c1e577a7704f6d36042cb16927 to your computer and use it in GitHub Desktop.
example analysis of cost data using GLM gamma model with log link
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ------------------------------------------------------------------ | |
# |PROGRAM NAME:gamma model demo.py | |
# |DATE: 11/13/19 | |
# |CREATED BY: MATT BOGARD | |
# |PROJECT FILE: | |
# |---------------------------------------------------------------- | |
# | PURPOSE: example analysis of cost data using GLM gamma model with log link | |
# |---------------------------------------------------------------- | |
# TO DO: add bootstrapping | |
#----------------------------------- | |
# read latest cohort file | |
#----------------------------------- | |
import pandas as pd | |
import numpy as np | |
# read simulated analysis file | |
df = pd.read_csv('//data//simcost.csv') | |
# inspect data | |
df.columns | |
df.head() | |
df.info() | |
df.shape | |
#--------------------------------------- | |
# cost analysis | |
#--------------------------------------- | |
# visualization and descriptives | |
df.cost.plot( kind='hist', normed=True, bins=50, range=(0,300000)) | |
df.cost.describe() | |
# Create the boxplo | |
import matplotlib.pyplot as plt | |
df.boxplot(column='cost', by='treat', rot=90) | |
# descriptive stats by treatment status | |
df.groupby('treat')['cost'].mean() | |
df.groupby('treat')['id'].count() | |
df.groupby('treat')['cost'].mean() | |
df.groupby('treat')['id'].count() | |
#----------------------------------------- | |
# analysis | |
#---------------------------------------- | |
# benchmark against standard linear regression | |
import statsmodels.api as sm # import stastmodels | |
import statsmodels.formula.api as smf # this allows us to use an explicit formulation | |
# regression using smf | |
results = smf.ols('cost ~ treat', data=df).fit() | |
results.summary() # one of the problems with stats models I have read is you can't override scientific notation? | |
del results # cleanup | |
# glm with log link and gamma error on positive claims | |
model = smf.glm('cost ~ treat', data=df, family=sm.families.Gamma(link = sm.genmod.families.links.log)) | |
result = model.fit() | |
print(result.summary()) | |
import math | |
math.exp(.5831) # 1.79 as ratio of treatment and control means | |
25331/14139 # 1.79 same as exponentiated coefficient from gamma model | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment