Skip to content

Instantly share code, notes, and snippets.

@ejcer
Created October 21, 2015 16:18
Show Gist options
  • Save ejcer/b269895a39f1d6318e8b to your computer and use it in GitHub Desktop.
Save ejcer/b269895a39f1d6318e8b to your computer and use it in GitHub Desktop.
#Fit a linear regression line to model IncomePerCapita 'y' as a function of
#PercentCollegeGrad 'x' from the states.csv data set.
### Imports ###
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import os
os.chdir('/home/edward/workspace/school/datavis')
os.getcwd()
from scipy import stats
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from numpy.random import randn
### Data Reading & Cleaning ###
# Path is relative to the working directory set by os.chdir() above.
states_df = pd.read_csv('./datafiles/states.csv')

### Linear Regression ###
# Quick visual alternative, kept for reference:
#   sns.lmplot(x='PercentCollegeGrad', y='IncomePerCapita', data=states_df)

# Response variable: per-capita income.
y = states_df['IncomePerCapita']

# Design matrix: the predictor plus the constant column added by
# sm.add_constant, so the fit includes an intercept term.
x = sm.add_constant(states_df['PercentCollegeGrad'])

# Build the ordinary-least-squares model and fit it in one step; `est` ends up
# holding the fitted results object.
est = sm.OLS(y, x).fit()

# Bare expressions: they only echo output in an interactive session
# (IPython/Jupyter) and print nothing when run as a plain script.
est.summary()
est.params
def find_resid(row, b0=est.params.iloc[0], b1=est.params.iloc[1]):
    """Return the regression residual for one row of the states data.

    The residual is the observed ``IncomePerCapita`` minus the value
    predicted by the fitted line ``b0 + b1 * PercentCollegeGrad``.

    Args:
        row: A mapping-like row (e.g. a pandas Series produced by
            ``DataFrame.apply(..., axis=1)``) with ``'IncomePerCapita'``
            and ``'PercentCollegeGrad'`` entries.
        b0: Intercept of the fitted line. Defaults to the first fitted
            parameter of ``est``.
        b1: Slope of the fitted line. Defaults to the second fitted
            parameter of ``est``.

    Returns:
        Observed income minus predicted income for ``row``.

    Note:
        The defaults are evaluated once, when this ``def`` statement runs,
        so refitting ``est`` afterwards does not change them. ``.iloc`` is
        used because ``est.params`` is label-indexed; integer ``[]`` lookups
        on such a Series are deprecated positional fallbacks in recent
        pandas.
    """
    predicted = b0 + b1 * row['PercentCollegeGrad']
    return row['IncomePerCapita'] - predicted
# Attach each row's residual (observed income minus the fitted-line
# prediction) as a new column, computed row by row with find_resid.
states_df['Residual'] = states_df.apply(find_resid, axis=1)
### Sorting & Printing ###
# DataFrame.sort() was removed in pandas 0.20; sort_values() is its
# replacement. Descending order puts the largest positive residuals first.
states_df_sorted = states_df.sort_values(by=['Residual'], ascending=False)
# NOTE(review): assumes states.csv has a 'Name' column -- confirm.
print(states_df_sorted['Name'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment