#!/usr/local/bin/python2
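# King County house price regression: clean the data, engineer features,
# select the columns most correlated with price, and compare scaled
# regressors with 10-fold cross-validation.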
import numpy as np
import pandas as pd
from sklearn import datasets
import seaborn as sns
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import os
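# Load the King County house sales dataset and inspect missing values and dtypes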
data = pd.read_csv('./pcorr/examples/kc_house_data_1.csv')
print(data.head(3))
print(data.isnull().any())
print(data.dtypes)
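# Parse the sale date, index by house id, and cast numeric columns to int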
data['date'] = pd.to_datetime(data['date'])
data = data.set_index('id')
data.price = data.price.astype(int)
data.bathrooms = data.bathrooms.astype(int)
data.floors = data.floors.astype(int)
print(data.head(5))
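# Feature engineering: house age at time of sale, plus a binary "renovated"
# flag; the raw year columns are then dropped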
data["house_age"] = data["date"].dt.year - data['yr_built']
data['renovated'] = data['yr_renovated'].apply(lambda yr: 0 if yr == 0 else 1)
data = data.drop(['date', 'yr_renovated', 'yr_built'], axis = 1)
print(data.head(5))
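# Summary statistics, then Pearson correlation: keep the 10 columns most
# strongly correlated with price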
pd.set_option('display.precision', 2)
print(data.describe())
correlation = data.corr(method = 'pearson')
columns = correlation.nlargest(10, 'price').index
print(columns)
correlation_map = np.corrcoef(data[columns].values.T)
print(correlation_map)
# sns.set(font_scale = 1.0)
# heatmap = sns.heatmap(correlation_map, cbar = True, annot = True, square = True, fmt = '.2f', yticklabels = columns.values, xticklabels = columns.values)
# plt.show()
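# Log-transform price and lot size to reduce right skew before modeling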
data['price'] = np.log(data['price'])
data['sqft_lot'] = np.log(data['sqft_lot'])
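# Build the feature matrix from the selected columns and hold out 20% as a test set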
X = data[columns]
Y = X['price'].values
X = X.drop('price', axis = 1).values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.20, random_state = 42)
from sklearn.linear_model import LinearRegression, Ridge, LassoCV, Lasso, LassoLarsCV, ElasticNet, BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
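# Wrap each candidate regressor in a pipeline with standard scaling so that
# scale-sensitive models (KNN, regularized linear models) are treated fairly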
pipelines = []
pipelines.append(('ScaledLR', Pipeline([('Scaler', StandardScaler()), ('LR', LinearRegression())])))
pipelines.append(('ScaledRIDGE', Pipeline([('Scaler', StandardScaler()), ('RIDGE', Ridge())])))
pipelines.append(('ScaledLASSO', Pipeline([('Scaler', StandardScaler()), ('LASSO', Lasso())])))
pipelines.append(('ScaledLASSOCV', Pipeline([('Scaler', StandardScaler()), ('LASSOCV', LassoCV())])))
pipelines.append(('ScaledLASSOLarsCV', Pipeline([('Scaler', StandardScaler()), ('LASSOLarsCV', LassoLarsCV())])))
pipelines.append(('ScaledEN', Pipeline([('Scaler', StandardScaler()), ('EN', ElasticNet())])))
pipelines.append(('ScaledBAYESIAN', Pipeline([('Scaler', StandardScaler()), ('BAYESIAN', BayesianRidge())])))
pipelines.append(('ScaledKNN', Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsRegressor())])))
pipelines.append(('ScaledCART', Pipeline([('Scaler', StandardScaler()), ('CART', DecisionTreeRegressor())])))
pipelines.append(('ScaledGBM', Pipeline([('Scaler', StandardScaler()), ('GBM', GradientBoostingRegressor())])))
# AdaBoostRegressor, not AdaBoostClassifier: the target (price) is continuous
pipelines.append(('ScaledADA', Pipeline([('Scaler', StandardScaler()), ('ADABOOST', AdaBoostRegressor(n_estimators=100))])))
results = []
names = []
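# Score every pipeline with 10-fold CV on negative mean squared error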
for name, model in pipelines:
    # shuffle = True is required by newer scikit-learn whenever random_state is set
    kfold = KFold(n_splits = 10, shuffle = True, random_state = 21)
    cv_results = cross_val_score(model, X_train, Y_train, cv = kfold, scoring = 'neg_mean_squared_error')
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
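# Optional: grid-search n_estimators for GradientBoostingRegressor on scaled features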
# from sklearn.model_selection import GridSearchCV
# scaler = StandardScaler().fit(X_train)
# rescaledX = scaler.transform(X_train)
# param_grid = dict(n_estimators = np.array([50, 100, 200, 300, 400]))
# model = GradientBoostingRegressor(random_state = 21)
# kfold = KFold(n_splits = 10, shuffle = True, random_state = 21)
# grid = GridSearchCV(estimator = model, param_grid = param_grid, scoring = 'neg_mean_squared_error', cv = kfold)
# grid_result = grid.fit(rescaledX, Y_train)
# means = grid_result.cv_results_['mean_test_score']
# stds = grid_result.cv_results_['std_test_score']
# params = grid_result.cv_results_['params']
# for mean, stdev, param in zip(means, stds, params):
#     print("%f (%f) with: %r" % (mean, stdev, param))
# print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
# from sklearn.metrics import mean_squared_error
# scaler = StandardScaler().fit(X_train)
# rescaled_X_train = scaler.transform(X_train)
# model = GradientBoostingRegressor(random_state=21, n_estimators=400)
# model.fit(rescaled_X_train, Y_train)
# # transform the validation dataset
# rescaled_X_test = scaler.transform(X_test)
# predictions = model.predict(rescaled_X_test)
# print (mean_squared_error(Y_test, predictions))
# compare = pd.DataFrame({'Prediction': predictions, 'Test Data' : Y_test})
# compare.head(10)
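# Prices were log-transformed earlier, so exponentiate to compare in original dollars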
# actual_y_test = np.exp(Y_test)
# actual_predicted = np.exp(predictions)
# diff = abs(actual_y_test - actual_predicted)
# compare_actual = pd.DataFrame({'Test Data': actual_y_test, 'Predicted Price' : actual_predicted, 'Difference' : diff})
# compare_actual = compare_actual.astype(int)
# compare_actual.head(10)