@sergeyprokudin
Created January 9, 2019 22:55
Linear Regression with Stochastic Gradient Descent
import numpy as np


def linear_regression(x, y, lr=1.0e-4, n_steps=10000, lr_decay=0.5):
    r"""Solves the linear regression problem: finds w = (w_1, ..., w_n) such that

        w* = argmin_{w} L(w),   where
        L(w) = \sum_{i=1}^{m} (<w, x_i> - y_i)^2    (sum of squared errors)
        <w, x_i> = w_1*x_{i1} + w_2*x_{i2} + ... + w_n*x_{in}

    Solution: gradient descent.
    L(w) is a convex function (can you prove it?) => every local minimum is also a global minimum!
    The vector dL/dw points in the direction of steepest growth of L, so we move along -dL/dw.
    At the minimum the gradient vanishes:

        dL/dw_j = \sum_{i=1}^{m} 2*x_{ij}*(<w, x_i> - y_i) = 0,   j = 1, ..., n

    or, in matrix form,

        dL/dw = 2*X^T@X@w - 2*X^T@y

    At every step we update the vector w:

        w_t = w_{t-1} - lr*dL/dw

    Questions:
    1. How to select a learning rate?
       (a) simple heuristic: whenever the MSE increases, decrease the rate by some factor;
       (b) constant decay: at every step, decrease it a bit:
           lr_t = (eps^t) * lr_ini
    """

    def _get_gradient(w):
        # dL/dw_j = \sum_{i=1}^{m} 2*x_{ij}*(<w, x_i> - y_i)
        # <=> dL/dw = 2*(X^T@X@w - X^T@y)
        dw = 2*(x.T@x@w - x.T@y)
        return dw

    def _mse(x, y, w):
        # sum of squared residuals, used to monitor progress
        return np.sum(np.square(x@w - y))

    n_samples, n_params = x.shape
    w = np.random.normal(size=(n_params, 1))
    curr_mse = _mse(x, y, w)

    for i in range(0, n_steps):
        dw = _get_gradient(w)
        w = w - lr*dw
        new_mse = _mse(x, y, w)
        # heuristic (a): if the error went up, the step was too large => shrink the learning rate
        if new_mse > curr_mse:
            lr *= lr_decay
            print("decreasing learning rate, current: %f" % lr)
        curr_mse = new_mse

    return w
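# The docstring notes that setting dL/dw = 0 gives the normal equations X^T@X@w = X^T@y.
# A minimal closed-form sketch (assuming X^T@X is invertible / well conditioned),
# useful as a sanity check for the iterative solution:
def linear_regression_closed_form(x, y):
    # solve X^T@X@w = X^T@y directly instead of iterating
    return np.linalg.solve(x.T@x, x.T@y)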
# DEMO
from sklearn.datasets import load_boston  # note: removed in scikit-learn 1.2, needs an older sklearn to run
import matplotlib.pyplot as plt
%matplotlib inline
# Artificial data: a simple 1-D regression task with known weights (1, 2) plus Gaussian noise
x = np.arange(0, 1, 0.01)
x = np.vstack([np.ones(x.shape), x]).T
y = [email protected]([[1], [2]]) + np.random.normal(size=[x.shape[0], 1], scale=0.1)
w = linear_regression(x, y, lr=1.0e-4, n_steps=10000, lr_decay=0.5)
xr = np.arange(0, 1, 0.01)
xr = np.vstack([np.ones(xr.shape), xr]).T
yr = xr@w
ypreds = x@w
mse = np.mean(np.square(y-ypreds))
print("mean squared error: %f" % mse)
plt.title("Artificial data")
plt.scatter(xr[:, 1], yr, s=4)
plt.scatter(x[:, 1], y)
plt.show()
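# Sanity check: compare the gradient-descent estimate with numpy's built-in
# least-squares solver; both should recover weights close to the true (1, 2).
w_lstsq, *_ = np.linalg.lstsq(x, y, rcond=None)
print("gradient descent w:", w.ravel())
print("np.linalg.lstsq w: ", w_lstsq.ravel())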
# Regressing 1 value of Boston housing data
ds = load_boston()
x = ds['data'][:, 12]
x = (x - np.mean(x, axis=0)) / np.std(x, axis=0)
x = np.vstack([np.ones(x.shape), x]).T
y = ds['target'].reshape([-1, 1])
w = linear_regression(x, y, lr=1.0e-2, n_steps=100, lr_decay=0.1)
xr = np.arange(-3, 3, 0.1)
xr = np.vstack([np.ones(xr.shape), xr]).T
yr = xr@w
ypreds = x@w
mse = np.mean(np.square(y-ypreds))
print("mean squared error: %f" % mse)
plt.title("Boston housing data")
plt.scatter(xr[:, 1], yr, s=4, label='learned regressor')
plt.scatter(x[:, 1], y, label='data')
plt.legend()
plt.show()
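# The title mentions *stochastic* gradient descent, while linear_regression above uses the
# full-batch gradient at every step. A minimal mini-batch sketch of the stochastic variant
# (batch_size and the fixed learning rate here are illustrative choices, not tuned values):
def linear_regression_sgd(x, y, lr=1.0e-3, n_steps=10000, batch_size=32):
    n_samples, n_params = x.shape
    w = np.random.normal(size=(n_params, 1))
    for i in range(n_steps):
        # sample a random mini-batch and take a gradient step on it only
        idx = np.random.choice(n_samples, size=batch_size, replace=False)
        xb, yb = x[idx], y[idx]
        dw = 2*(xb.T@xb@w - xb.T@yb)
        w = w - lr*dw
    return w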