Skip to content

Instantly share code, notes, and snippets.

@toniesteves
Created December 7, 2019 02:30
Show Gist options
  • Save toniesteves/f092f365eab7fa3006986b042bb18512 to your computer and use it in GitHub Desktop.
Save toniesteves/f092f365eab7fa3006986b042bb18512 to your computer and use it in GitHub Desktop.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict, KFold
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
plt.style.use('seaborn')
file = pd.read_csv('Final.csv', sep=';')
df = pd.DataFrame(file)
def preprocessing(data):
#Do first stage pre-processing (i.e. exclude non-numeric prices and convert prices to numeric variables)
data = data[data.price_new.str.contains("aanvraag") == False].dropna()
data[['price_new']] = data[['price_new']].apply(pd.to_numeric)
# exclude everything with a price above or below 3 standard deviations (i.e. outliers)
data = data[np.abs(data["price_new"]-data["price_new"].mean())<=(3*data["price_new"].std())]
# Set x and y (dropping zipcode and rooms as latitude, longitude and surface pretty much capture the former)
y = data.price_new
X = data.drop('price_new', axis = 1).drop('zipcode_new', axis = 1).drop('rooms_new', axis = 1)
return X, y
def model(pipeline, parameters, X_train, y_train, X, y):
grid_obj = GridSearchCV(estimator=pipeline,
param_grid=parameters,
cv=3,
scoring='r2',
verbose=2,
n_jobs=1,
refit=True)
grid_obj.fit(X_train, y_train)
'''Results'''
results = pd.DataFrame(pd.DataFrame(grid_obj.cv_results_))
results_sorted = results.sort_values(by=['mean_test_score'], ascending=False)
print("##### Results")
print(results_sorted)
print("best_index", grid_obj.best_index_)
print("best_score", grid_obj.best_score_)
print("best_params", grid_obj.best_params_)
'''Cross Validation'''
estimator = grid_obj.best_estimator_
'''
if estimator.named_steps['scl'] == True:
X = (X - X.mean()) / (X.std())
y = (y - y.mean()) / (y.std())
'''
shuffle = KFold(n_splits=5,
shuffle=True,
random_state=0)
cv_scores = cross_val_score(estimator,
X,
y.values.ravel(),
cv=shuffle,
scoring='r2')
print("##### CV Results")
print("mean_score", cv_scores.mean())
'''Show model coefficients or feature importances'''
try:
print("Model coefficients: ", list(zip(list(X), estimator.named_steps['clf'].coef_)))
except:
print("Model does not support model coefficients")
try:
print("Feature importances: ", list(zip(list(X), estimator.named_steps['clf'].feature_importances_)))
except:
print("Model does not support feature importances")
'''Predict along CV and plot y vs. y_predicted in scatter'''
y_pred = cross_val_predict(estimator, X, y, cv=shuffle)
plt.scatter(y, y_pred)
xmin, xmax = plt.xlim()
ymin, ymax = plt.ylim()
plt.plot([xmin, xmax], [ymin, ymax], "g--", lw=1, alpha=0.4)
plt.xlabel("True prices")
plt.ylabel("Predicted prices")
plt.annotate(' R-squared CV = {}'.format(round(float(cv_scores.mean()), 3)), size=9,
xy=(xmin,ymax), xytext=(10, -15), textcoords='offset points')
plt.annotate(grid_obj.best_params_, size=9,
xy=(xmin, ymax), xytext=(10, -35), textcoords='offset points', wrap=True)
plt.title('Predicted prices (EUR) vs. True prices (EUR)')
plt.show()
# Pipeline and Parameters - Linear Regression
pipe_ols = Pipeline([('scl', StandardScaler()),
('clf', LinearRegression())])
param_ols = {}
# Pipeline and Parameters - XGBoost
pipe_xgb = Pipeline([('clf', xgb.XGBRegressor())])
param_xgb = {'clf__max_depth':[5],
'clf__min_child_weight':[6],
'clf__gamma':[0.01],
'clf__subsample':[0.7],
'clf__colsample_bytree':[1]}
# Pipeline and Parameters - KNN
pipe_knn = Pipeline([('clf', KNeighborsRegressor())])
param_knn = {'clf__n_neighbors':[5, 10, 15, 25, 30]}
# Pipeline and Parameters - Lasso
pipe_lasso = Pipeline([('scl', StandardScaler()),
('clf', Lasso(max_iter=1500))])
param_lasso = {'clf__alpha': [0.01, 0.1, 1, 10]}
# Pipeline and Parameters - Ridge
pipe_ridge = Pipeline([('scl', StandardScaler()),
('clf', Ridge())])
param_ridge = {'clf__alpha': [0.01, 0.1, 1, 10]}
# Pipeline and Parameters - Polynomial Regression
pipe_poly = Pipeline([('scl', StandardScaler()),
('polynomial', PolynomialFeatures()),
('clf', LinearRegression())])
param_poly = {'polynomial__degree': [2, 4, 6]}
# Pipeline and Parameters - Decision Tree Regression
pipe_tree = Pipeline([('clf', DecisionTreeRegressor())])
param_tree = {'clf__max_depth': [2, 5, 10],
'clf__min_samples_leaf': [5,10,50,100]}
# Pipeline and Parameters - Random Forest
pipe_forest = Pipeline([('clf', RandomForestRegressor())])
param_forest = {'clf__n_estimators': [10, 20, 50],
'clf__max_features': [None, 1, 2],
'clf__max_depth': [1, 2, 5]}
# Pipeline and Parameters - MLP Regression
pipe_neural = Pipeline([('scl', StandardScaler()),
('clf', MLPRegressor())])
param_neural = {'clf__alpha': [0.001, 0.01, 0.1, 1, 10, 100],
'clf__hidden_layer_sizes': [(5),(10,10),(7,7,7)],
'clf__solver': ['lbfgs'],
'clf__activation': ['relu', 'tanh'],
'clf__learning_rate' : ['constant', 'invscaling']}
# Execute preprocessing & train/test split
X, y = preprocessing(df)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
# Execute model hyperparameter tuning and crossvalidation
model(pipe_ols, param_ols, X_train, y_train, X, y)
model(pipe_xgb, param_xgb, X_train, y_train, X, y)
model(pipe_knn, param_knn, X_train, y_train, X, y)
model(pipe_lasso, param_lasso, X_train, y_train, X, y)
model(pipe_ridge, param_ridge, X_train, y_train, X, y)
model(pipe_poly, param_poly, X_train, y_train, X, y)
model(pipe_tree, param_tree, X_train, y_train, X, y)
model(pipe_forest, param_forest, X_train, y_train, X, y)
model(pipe_neural, param_neural, X_train, y_train, X, y)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment