Skip to content

Instantly share code, notes, and snippets.

@ghl3
Last active August 29, 2015 14:06
Show Gist options
  • Save ghl3/b90536923d53636150da to your computer and use it in GitHub Desktop.
Save ghl3/b90536923d53636150da to your computer and use it in GitHub Desktop.
Data and Modeling
Data and Modeling
Boilerplate for manipulating data and modeling using Python libraries.
# Fill every missing value with the mean of its column.
# DataFrame.mean() skips NA entries, so the means
# themselves are well defined.
column_means = df.mean()
df.fillna(column_means)
# Fill gaps by interpolating between neighbouring values:
# linear by default, or a second-order spline.
df.interpolate()
df.interpolate(order=2, method='spline')
# Fraction of null vs. non-null entries for each feature
df.isnull().apply(lambda col: col.value_counts(normalize=True)).T
# Switch both axes of the current plot to a logarithmic scale
current_axes = plt.gca()
current_axes.set_xscale('log')
current_axes.set_yscale('log')
# See here for a nice example:
# http://blog.yhathq.com/posts/logistic-regression-and-python.html
#
# Using the formula
#
import statsmodels.formula.api as smf
# Ensure that we fill NA values or we'll get
# NA results in our fit
df_filled = df.fillna(df.mean())
# Ordinary least squares of 'target' on 'feature' using the formula interface
result = smf.ols('target ~ feature', data=df_filled).fit()
# Call print as a function so this runs on both Python 2 and Python 3
print(result.summary())
# Plotting: scatter of the data with the fitted line drawn on top
df_filled.plot(x='feature', y='target', kind='scatter')
plt.plot(df_filled['feature'], result.predict(df_filled))
#
# Using explicit matrices
#
import statsmodels.api as sm
# Fill NA values with the column mean before fitting
target = df['target'].fillna(df['target'].mean())
features = df['feature'].fillna(df['feature'].mean())
# Note that we add a y-intercept column by hand.
# The matrix-based OLS class is taken from statsmodels.api (sm),
# the module imported above.
result = sm.OLS(target, sm.add_constant(features)).fit()
# Call print as a function so this runs on both Python 2 and Python 3
print(result.summary())
# Plotting
# Note that we use the double brackets in the predict function
# so that it returns a matrix.
# We could also have used: np.column_stack(column)
# We also have to add the constant here to be consistent
# (df_filled is the NA-filled frame: df.fillna(df.mean()))
df_filled.plot(x='feature', y='target', kind='scatter')
plt.plot(df_filled['feature'], result.predict(sm.add_constant(df_filled[['feature']])), '-')
#
# Other plotting
#
from statsmodels.graphics.regressionplots import plot_fit
# plot_fit draws observed vs. fitted values against one regressor
# of a fitted model; `result` is the fitted regression result.
# Using the feature name
val = plot_fit(result, 'feature')
# Using the column index instead:
# 0 = intercept, 1 = first variable
val = plot_fit(result, 1)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment