ghl3 · August 29, 2015 14:06
diff --git a/DataAndModeling b/DataAndModeling
 Data and Modeling

 Boilerplate for manipulating data and modeling using Python libraries.
diff --git a/fillna.py b/fillna.py
 # Fill each column's gaps with that column's mean.
 # Computing the mean skips NA entries, so the fill
 # values are well defined even when data is missing.
 df.fillna(value=df.mean())

 # Fill missing values by interpolation: the default call
 # (linear), then a second-order spline
 df.interpolate()
 df.interpolate(order=2, method='spline')


 # Fraction of null vs. non-null entries for each feature
 # (after the transpose there is one row per feature)
 df.apply(lambda column: column.isnull().value_counts(normalize=True)).T
diff --git a/plotting.py b/plotting.py

 # Put both axes of the current Axes on a logarithmic scale
 axes = plt.gca()
 axes.set_xscale('log')
 axes.set_yscale('log')
diff --git a/regression.py b/regression.py

 # See here for a nice example:
 # http://blog.yhathq.com/posts/logistic-regression-and-python.html


 #
 # Using the formula
 #
 # Fits an ordinary least squares model described by a formula
 # string and draws the predictions over a scatter plot of the data.
 #

 import statsmodels.formula.api as smf

 # Ensure that we fill NA values or we'll get
 # NA results in our fit
 df_filled = df.fillna(df.mean())
 result = smf.ols('target ~ feature', data=df_filled).fit()

 # print(...) with a single argument behaves the same on Python 2
 # and Python 3; the bare print statement is a SyntaxError on Python 3
 print(result.summary())

 # Plotting: scatter of the filled data, then the model's predictions
 df_filled.plot(x='feature', y='target', kind='scatter')
 plt.plot(df_filled['feature'], result.predict(df_filled))



 #
 # Using explicit matrices
 #
 # Fits the same kind of model by passing the target and the
 # feature columns directly instead of a formula string.
 #

 import statsmodels.api as sm

 target = df['target'].fillna(df['target'].mean())
 features = df['feature'].fillna(df['feature'].mean())

 # Note that we add a y-intercept column by hand.
 # The array-based model class lives in statsmodels.api (sm.OLS);
 # the formula module (smf) is for the lowercase formula helpers.
 result = sm.OLS(target, sm.add_constant(features)).fit()

 # print(...) with a single argument works on both Python 2 and 3
 print(result.summary())

 # Plotting
 # Note that we use the double brackets in the predict function
 # so that we pass a one-column DataFrame (2-D) rather than a Series.
 # We could also have used: np.column_stack(column)
 # We also have to add the constant here to be consistent with the fit.
 # df_filled is the NA-filled frame built in the formula section.
 df_filled.plot(x='feature', y='target', kind='scatter')
 plt.plot(df_filled['feature'], result.predict(sm.add_constant(df_filled[['feature']])), '-')


 #
 # Other plotting
 #
 # plot_fit draws the fit against one explanatory variable, which can
 # be selected either by name or by its column index.
 #

 from statsmodels.graphics.regressionplots import plot_fit

 # Using the feature name
 val = plot_fit(result, 'feature')

 # 0 = intercept, 1 = first variable
 # (the fitted object is named `result`; `results` was never defined)
 val = plot_fit(result, 1)
	Data and Modeling

	Boilerplate for manipulating data and modeling using Python libraries.
	# Fill each column's gaps with that column's mean.
	# Computing the mean skips NA entries, so the fill
	# values are well defined even when data is missing.
	df.fillna(value=df.mean())

	# Fill missing values by interpolation: the default call
	# (linear), then a second-order spline
	df.interpolate()
	df.interpolate(order=2, method='spline')


	# Fraction of null vs. non-null entries for each feature
	# (after the transpose there is one row per feature)
	df.apply(lambda column: column.isnull().value_counts(normalize=True)).T

	# Put both axes of the current Axes on a logarithmic scale
	axes = plt.gca()
	axes.set_xscale('log')
	axes.set_yscale('log')

	# See here for a nice example:
	# http://blog.yhathq.com/posts/logistic-regression-and-python.html


	#
	# Using the formula
	#
	# Fits an ordinary least squares model described by a formula
	# string and draws the predictions over a scatter plot of the data.
	#

	import statsmodels.formula.api as smf

	# Ensure that we fill NA values or we'll get
	# NA results in our fit
	df_filled = df.fillna(df.mean())
	result = smf.ols('target ~ feature', data=df_filled).fit()

	# print(...) with a single argument behaves the same on Python 2
	# and Python 3; the bare print statement is a SyntaxError on Python 3
	print(result.summary())

	# Plotting: scatter of the filled data, then the model's predictions
	df_filled.plot(x='feature', y='target', kind='scatter')
	plt.plot(df_filled['feature'], result.predict(df_filled))



	#
	# Using explicit matrices
	#
	# Fits the same kind of model by passing the target and the
	# feature columns directly instead of a formula string.
	#

	import statsmodels.api as sm

	target = df['target'].fillna(df['target'].mean())
	features = df['feature'].fillna(df['feature'].mean())

	# Note that we add a y-intercept column by hand.
	# The array-based model class lives in statsmodels.api (sm.OLS);
	# the formula module (smf) is for the lowercase formula helpers.
	result = sm.OLS(target, sm.add_constant(features)).fit()

	# print(...) with a single argument works on both Python 2 and 3
	print(result.summary())

	# Plotting
	# Note that we use the double brackets in the predict function
	# so that we pass a one-column DataFrame (2-D) rather than a Series.
	# We could also have used: np.column_stack(column)
	# We also have to add the constant here to be consistent with the fit.
	# df_filled is the NA-filled frame built in the formula section.
	df_filled.plot(x='feature', y='target', kind='scatter')
	plt.plot(df_filled['feature'], result.predict(sm.add_constant(df_filled[['feature']])), '-')


	#
	# Other plotting
	#
	# plot_fit draws the fit against one explanatory variable, which can
	# be selected either by name or by its column index.
	#

	from statsmodels.graphics.regressionplots import plot_fit

	# Using the feature name
	val = plot_fit(result, 'feature')

	# 0 = intercept, 1 = first variable
	# (the fitted object is named `result`; `results` was never defined)
	val = plot_fit(result, 1)