@sergeyprokudin
Created January 9, 2019 22:55
Linear Regression with Stochastic Gradient Descent
import numpy as np


def linear_regression(x, y, lr=1.0e-4, n_steps=10000, lr_decay=0.5):
    r"""Solves the linear regression problem: finds w = (w_1, ..., w_n) such that

        w* = argmin_{w} L(w),   where
        L(w) = \sum_{i=1}^{m} (<w, x_i> - y_i)^2    (sum of squared errors)
        <w, x_i> = w_1*x_{i1} + w_2*x_{i2} + ... + w_n*x_{in}

    Solution: gradient descent.
    L(w) is a convex function (can you prove it?) => every local minimum is also a global minimum!
    The vector dL/dw points in the direction of steepest growth of L, so we move along -dL/dw.
    At the minimum the gradient vanishes:

        dL/dw_j = \sum_{i=1}^{m} 2*x_{ij}*(<w, x_i> - y_i) = 0,   j = 1, ..., n

    or, in matrix form,

        dL/dw = 2*X^T@X@w - 2*X^T@y

    At every step we update the vector w:

        w_t = w_{t-1} - lr*dL/dw

    Questions:
    1. How to select a learning rate?
       (a) simple heuristic: whenever the MSE increases, decrease the rate by some factor;
       (b) constant decay: at every step, decrease it a bit:
           lr_t = (eps^t) * lr_ini
    """

    def _get_gradient(w):
        # dL/dw_j = \sum_{i=1}^{m} 2*x_{ij}*(<w, x_i> - y_i)
        # <=> dL/dw = 2*(X^T@X@w - X^T@y)
        dw = 2*(x.T@x@w - x.T@y)
        return dw

    def _mse(x, y, w):
        # sum of squared residuals, used to monitor progress
        return np.sum(np.square(x@w - y))

    n_samples, n_params = x.shape
    w = np.random.normal(size=(n_params, 1))
    curr_mse = _mse(x, y, w)

    for i in range(0, n_steps):
        dw = _get_gradient(w)
        w = w - lr*dw
        new_mse = _mse(x, y, w)
        # heuristic (a): if the error went up, the step was too large => shrink the learning rate
        if new_mse > curr_mse:
            lr *= lr_decay
            print("decreasing learning rate, current: %f" % lr)
        curr_mse = new_mse

    return w
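# The docstring notes that setting dL/dw = 0 gives the normal equations X^T@X@w = X^T@y.
# A minimal closed-form sketch (assuming X^T@X is invertible / well conditioned),
# useful as a sanity check for the iterative solution:
def linear_regression_closed_form(x, y):
    # solve X^T@X@w = X^T@y directly instead of iterating
    return np.linalg.solve(x.T@x, x.T@y)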
# DEMO
from sklearn.datasets import load_boston  # note: removed in scikit-learn 1.2, needs an older sklearn to run
import matplotlib.pyplot as plt
%matplotlib inline
# Artificial data: a simple 1-D regression task with known weights (1, 2) plus Gaussian noise
x = np.arange(0, 1, 0.01)
x = np.vstack([np.ones(x.shape), x]).T
y = [email protected]([[1], [2]]) + np.random.normal(size=[x.shape[0], 1], scale=0.1)
w = linear_regression(x, y, lr=1.0e-4, n_steps=10000, lr_decay=0.5)
xr = np.arange(0, 1, 0.01)
xr = np.vstack([np.ones(xr.shape), xr]).T
yr = xr@w
ypreds = x@w
mse = np.mean(np.square(y-ypreds))
print("mean squared error: %f" % mse)
plt.title("Artificial data")
plt.scatter(xr[:, 1], yr, s=4)
plt.scatter(x[:, 1], y)
plt.show()
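# Sanity check: compare the gradient-descent estimate with numpy's built-in
# least-squares solver; both should recover weights close to the true (1, 2).
w_lstsq, *_ = np.linalg.lstsq(x, y, rcond=None)
print("gradient descent w:", w.ravel())
print("np.linalg.lstsq w: ", w_lstsq.ravel())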
# Regressing 1 value of Boston housing data
ds = load_boston()
x = ds['data'][:, 12]
x = (x - np.mean(x, axis=0)) / np.std(x, axis=0)
x = np.vstack([np.ones(x.shape), x]).T
y = ds['target'].reshape([-1, 1])
w = linear_regression(x, y, lr=1.0e-2, n_steps=100, lr_decay=0.1)
xr = np.arange(-3, 3, 0.1)
xr = np.vstack([np.ones(xr.shape), xr]).T
yr = xr@w
ypreds = x@w
mse = np.mean(np.square(y-ypreds))
print("mean squared error: %f" % mse)
plt.title("Boston housing data")
plt.scatter(xr[:, 1], yr, s=4, label='learned regressor')
plt.scatter(x[:, 1], y, label='data')
plt.legend()
plt.show()
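# The title mentions *stochastic* gradient descent, while linear_regression above uses the
# full-batch gradient at every step. A minimal mini-batch sketch of the stochastic variant
# (batch_size and the fixed learning rate here are illustrative choices, not tuned values):
def linear_regression_sgd(x, y, lr=1.0e-3, n_steps=10000, batch_size=32):
    n_samples, n_params = x.shape
    w = np.random.normal(size=(n_params, 1))
    for i in range(n_steps):
        # sample a random mini-batch and take a gradient step on it only
        idx = np.random.choice(n_samples, size=batch_size, replace=False)
        xb, yb = x[idx], y[idx]
        dw = 2*(xb.T@xb@w - xb.T@yb)
        w = w - lr*dw
    return w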