Skip to content

Instantly share code, notes, and snippets.

from sklearn.datasets import load_iris
import pandas as pd
from pandasql import sqldf
import re
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df['species'] = pd.Factor(iris.target, levels=iris.target_names)
import matplotlib.pyplot as plt
from pandasql import *
import pandas as pd
pysqldf = lambda q: sqldf(q, globals())
q = """
SELECT
m.date
, m.beef
import pandas as pd
import statsmodels.api as sm
import pylab as pl
import numpy as np
# read the data in
df = pd.read_csv("http://www.ats.ucla.edu/stat/data/binary.csv")
# take a look at the dataset
print df.head()
# summarize the data
print df.describe()
# admit gre gpa prestige
# count 400.000000 400.000000 400.000000 400.00000
# mean 0.317500 587.700000 3.389900 2.48500
# std 0.466087 115.516536 0.380567 0.94446
# min 0.000000 220.000000 2.260000 1.00000
# 25% 0.000000 520.000000 3.130000 2.00000
# 50% 0.000000 580.000000 3.395000 2.00000
# 75% 1.000000 660.000000 3.670000 3.00000
# dummify rank
dummy_ranks = pd.get_dummies(df['prestige'], prefix='prestige')
print dummy_ranks.head()
# prestige_1 prestige_2 prestige_3 prestige_4
# 0 0 0 1 0
# 1 0 0 1 0
# 2 1 0 0 0
# 3 0 0 0 1
# 4 0 0 0 1
train_cols = data.columns[1:]
# Index([gre, gpa, prestige_2, prestige_3, prestige_4], dtype=object)
logit = sm.Logit(data['admit'], data[train_cols])
# fit the model
result = logit.fit()
# cool enough to deserve it's own gist
print result.summary()
# look at the confidence interval of each coeffecient
print result.conf_int()
# 0 1
# gre 0.000120 0.004409
# gpa 0.153684 1.454391
# prestige_2 -1.295751 -0.055135
# prestige_3 -2.016992 -0.663416
# prestige_4 -2.370399 -0.732529
# intercept -6.224242 -1.755716
# odds ratios only
print np.exp(result.params)
# gre 1.002267
# gpa 2.234545
# prestige_2 0.508931
# prestige_3 0.261792
# prestige_4 0.211938
# intercept 0.018500
# odds ratios and 95% CI
params = result.params
conf = result.conf_int()
conf['OR'] = params
conf.columns = ['2.5%', '97.5%', 'OR']
print np.exp(conf)
# 2.5% 97.5% OR
# gre 1.000120 1.004418 1.002267
# gpa 1.166122 4.281877 2.234545
# prestige_2 0.273692 0.946358 0.508931