Created: October 15, 2016 17:02
Save yakovenkodenis/7c6ca690a200ffab88ebf79bda27dbe6 to your computer and use it in GitHub Desktop.
Grid Search for Locally Weighted Linear Regression
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from statistics import mean | |
from sklearn.grid_search import ParameterGrid | |
from sklearn.preprocessing import StandardScaler | |
def getPredict(theta, x):
    """Linear-regression hypothesis: dot product of the features and weights.

    theta -- parameter vector (ndarray)
    x     -- feature vector or design matrix (ndarray)
    """
    prediction = np.dot(x, theta)
    return prediction
def getError(h, y):
    """Element-wise squared error between prediction h and target y."""
    residual = h - y
    return np.power(residual, 2)
def getWeightedLRThetaParams(alpha, n_iterations, x_query, tau, x, y):
    """Fit locally weighted linear regression parameters by batch gradient descent.

    alpha        -- learning rate
    n_iterations -- number of gradient-descent steps
    x_query      -- query point (intercept term included) the fit is localized around
    tau          -- Gaussian-kernel bandwidth; smaller tau means a more local fit
    x            -- design matrix (n_samples, n_features), intercept column included
    y            -- target vector (n_samples,)
    Returns the learned parameter vector theta with the same shape as x_query.
    """
    theta = np.zeros(shape=x_query.shape)
    # Gaussian kernel weights, vectorized over all samples at once instead of
    # a per-row Python list comprehension: w_i = exp(-||x_i - x_query||^2 / (2 tau^2)).
    diffs = x - x_query
    w = np.exp(-np.sum(diffs * diffs, axis=1) / (2 * tau ** 2))
    for _ in range(n_iterations):
        # Gradient of the weighted squared-error cost: 2 * X^T W (X theta - y).
        theta -= alpha * ((w * (x.dot(theta) - y)).dot(x) * 2)
    return theta
def normalize_dataset(X_train, X_test):
    """Standardize features (zero mean, unit variance per column).

    The scaler is fitted on the training split only and the same transform is
    applied to the test split, so no test-set statistics leak into training.
    Returns the pair (scaled_train, scaled_test).
    """
    scaler = StandardScaler().fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_test)
def create_train_test(data, train_percentage):
    """Split a DataFrame into normalized train/test design matrices and targets.

    data             -- DataFrame whose last column is the target
    train_percentage -- fraction in (0, 1] of rows used for training; the
                        split is positional (no shuffling)
    Returns (X_train, X_test, y_train, y_test) where both X matrices are
    standardized and carry a leading column of ones (intercept term).
    """
    split_at = int(np.ceil(len(data) * train_percentage))
    # Positional row split; .iloc replaces the long-removed DataFrame.ix
    # indexer (deprecated in pandas 0.20, removed in 1.0) used originally.
    data_train, data_test = data.iloc[:split_at], data.iloc[split_at:]
    X_train, y_train = data_train.iloc[:, :-1], data_train.iloc[:, [-1]].values
    X_test, y_test = data_test.iloc[:, :-1], data_test.iloc[:, [-1]].values
    X_train_norm, X_test_norm = normalize_dataset(X_train, X_test)
    # Prepend the intercept column of ones to both design matrices.
    X_train_norm = np.insert(X_train_norm, 0, np.ones(len(X_train_norm)), axis=1)
    X_test_norm = np.insert(X_test_norm, 0, np.ones(len(X_test_norm)), axis=1)
    return X_train_norm, X_test_norm, y_train.ravel(), y_test.ravel()
def perform_grid_search_for_single_instance(X, y, x_query, y_query, list_of_params_sets):
    """Evaluate every hyper-parameter combination on a single query point.

    Returns the tuple (theta, prediction, squared_error, params_set) whose
    squared error against y_query is smallest.
    """
    def evaluate(params_set):
        # Fit LWR around x_query with this parameter combination.
        theta = getWeightedLRThetaParams(
            params_set['alpha'], params_set['n_iterations'], x_query, params_set['tau'], X, y
        )
        prediction = getPredict(theta, x_query)
        return theta, prediction, getError(prediction, y_query), params_set

    # min() keeps the first candidate on ties and raises on an empty grid,
    # matching the original behavior.
    return min((evaluate(p) for p in list_of_params_sets), key=lambda score: score[2])
def perform_grid_search(X_train, y_train, X_test, y_test):
    """Grid-search LWR hyper-parameters, tuned per test example then averaged.

    For each test point, pick the (theta, params) pair minimizing squared
    error over the parameter grid, then average the winning thetas
    component-wise and the winning hyper-parameter values per name.
    Returns (tuned_mean_theta, tuned_mean_params).
    """
    params_grid = {
        'tau': np.linspace(0.1, 1.0, num=5),
        'alpha': np.linspace(0.001, 0.1, num=10),
        'n_iterations': np.arange(10, 1000, step=100)
    }
    list_of_params_sets = list(ParameterGrid(params_grid))
    best_thetas, best_params = [], []
    for x_q, y_q in zip(X_test, y_test):
        best = perform_grid_search_for_single_instance(
            X_train, y_train, x_q, y_q, list_of_params_sets
        )
        best_thetas.append(best[0])
        best_params.append(best[-1])
    # Component-wise average of the winning theta across all test examples.
    tuned_mean_theta = np.mean(best_thetas, axis=0)
    # BUG FIX: the original zipped params_grid.keys() (insertion order:
    # tau, alpha, n_iterations) against values drawn from ParameterGrid
    # dicts, which sklearn emits in *sorted* key order (alpha, n_iterations,
    # tau) — so each averaged value was assigned to the wrong parameter
    # name. Average per key explicitly instead.
    tuned_mean_params = {
        key: mean(d[key] for d in best_params) for key in params_grid
    }
    return tuned_mean_theta, tuned_mean_params
def test():
    """Smoke test: load the pricing data, tune LWR via grid search, and print
    the tuned parameters plus predictions (rescaled by 1000) for the first
    three test rows."""
    data = pd.read_csv("prices.csv").astype(np.float32)
    X_train, X_test, y_train, y_test = create_train_test(data, 0.8)
    theta, params = perform_grid_search(X_train, y_train, X_test, y_test)
    print('Best theta: ', theta)
    print('Best params: ', params)
    for row in X_test[:3]:
        print(getPredict(theta, row) * 1000)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.