Created
September 10, 2016 21:04
-
-
Save netskink/51e1ffe54f82869200d0c0c5d919bbbc to your computer and use it in GitHub Desktop.
Notes on the computing regression parameters walkthru
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
# # Computing Regression parameters (closed form example) | |
# The data | |
# Consider the following 5 point synthetic data set | |
# In[1]: | |
import graphlab | |
import numpy as np | |
import matplotlib.pyplot as plt | |
# In[2]: | |
# construct a one-dimensional vector using graphlab | |
sx = graphlab.SArray([0,1,2,3,4]) | |
# In[3]: | |
# construct a second one-dimensional vector | |
sy = graphlab.SArray([1,3,7,13,21]) | |
# In[4]: | |
sx | |
# In[5]: | |
sy | |
# In[6]: | |
st = graphlab.SFrame([sx,sy]) | |
# ## which is plotted below | |
# In[7]: | |
st.show() | |
# # What we need | |
# We want the line that "best fits" this data set as measured by the residual sum of squares (RSS) - the simple linear regression cost. We have a closed-form solution that involves the following terms:
# | |
# * The number of data points N | |
# * The sum (or mean) of the Ys | |
# * The sum (or mean) of the Xs | |
# * The sum (or mean) of the product of Xs and Ys
# * The sum (or mean) of the Xs squared | |
# | |
# Then, once we have calculated all the terms, we can use the formulas to compute the slope and intercept. Recall that we first solve for the slope and then use the value of the slope to solve for the intercept. The formula for the slope is a fraction with the following numerator and denominator:
# In[8]: | |
# From Week 1 - simple regression - Approach 1: set gradient = 0 - slide 68
# The formula for w1, which is the "estimate of the slope", is:
# numerator = (sum of X*Y) - (1/N) * (sum of X) * (sum of Y) | |
# denominator = (sum of X*X) - (1/N) * (sum of X) * (sum of X) | |
# Note: you can divide both the numerator and the denominator by N (which does not change the answer!) to get:
# In[9]: | |
# numerator = (mean of X*Y) - (mean of X) * (mean of Y) | |
# denominator = (mean of X*X) - (mean of X) * (mean of X) | |
# Hence, we can use either the sums or the means | |
# | |
# The formula in action | |
# ## Method 1: (Using sums) | |
# | |
# * N=5 | |
# * The sum of the Y's = 45 | |
# * The sum of the X's = 10 | |
# * The sum of the product of the Xs and the Ys = 140 | |
# * The sum of the X's squared = 30 | |
# | |
# So that: | |
# In[10]: | |
# Slope terms via Method 1 (sums): N = 5, sum(X*Y) = 140, sum(X) = 10,
# sum(Y) = 45, sum(X*X) = 30.
# Dividing by the float 5.0 keeps the 1/N factor from being truncated to
# zero by integer division.
numerator = 140 - (45 * 10) / 5.0  # 140 - 90 = 50
denominator = 30 - (10 * 10) / 5.0  # 30 - 20 = 10
# hence: | |
# In[11]: | |
slope = numerator/denominator # 50/10 = 5 | |
# In[12]: | |
slope | |
# Then we can use the computed slope to compute the intercept: | |
# In[13]: | |
#intercept = (mean of Y) - slope * (mean of X) = 9 - 5 * 2 = -1 | |
intercept = sy.mean() - slope * sx.mean() | |
# In[14]: | |
intercept | |
# Finally we can add the line to our plot from above: | |
# In[15]: | |
def f(x, slope=5, intercept=-1):
    """Evaluate the fitted regression line ``slope * x + intercept``.

    The defaults are the slope (5) and intercept (-1) derived above for the
    5-point data set, so ``f(x)`` still returns ``5*x - 1``.

    Args:
        x: A number, or any object that supports ``*`` and ``+`` with
            numbers element-wise (the notebook passes the SArray ``sx``).
        slope: Slope of the line (w1).
        intercept: Intercept of the line (w0).

    Returns:
        The value(s) of the line at ``x``, in the same kind of container
        as ``x``.
    """
    return slope * x + intercept
st.add_column(f(sx), name="RSS f(x)") | |
# In[24]: | |
# I could not see both plots on the same graph using the SFrame; you can
# only select one at a time to graph. So here is how to do it with numpy.
import numpy as np | |
import matplotlib.pyplot as plt | |
# convert the sframe to a numpy array | |
aNP = st.to_numpy() | |
# In[17]: | |
aNP | |
# In[33]: | |
# plot() takes triples of arguments: the first param is the X vector, the second param is the Y
# vector, and the last param is the style. 'bs' is blue squares, 'r--' is a red dashed line,
# 'g^' is a green triangle.
# The green triangle is the mean value | |
# | |
# each set of three's is a (x,y,style) | |
# | |
# For the mean values, create a NaN at each element except for the mean location | |
plt.plot(aNP[:,0], aNP[:,1],'bs', aNP[:,0], aNP[:,2], 'r--', sx.mean(), sy.mean(), 'g^') | |
# axis() sets the extent of the graph; the params are xmin, xmax, ymin, ymax
plt.axis([-1,5,-1,25]) | |
plt.ylabel('the y values') | |
plt.xlabel('the x values') | |
plt.show() | |
# In[23]: | |
aNP[:,0] | |
# In[ ]: | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment