Last active
February 21, 2021 17:14
-
-
Save ksv-muralidhar/2fa8bca485ba576b2dda08b14a7967e6 to your computer and use it in GitHub Desktop.
data leakage
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import pandas as pd | |
from sklearn.datasets import load_boston | |
from sklearn.preprocessing import StandardScaler | |
from sklearn.pipeline import Pipeline | |
from sklearn.impute import SimpleImputer | |
from sklearn.neighbors import KNeighborsRegressor | |
from sklearn.model_selection import cross_validate, train_test_split | |
from sklearn.metrics import mean_squared_error | |
# Load the Boston housing dataset once and reuse the returned bunch
# (the original called load_boston() three times).
# NOTE(review): load_boston is deprecated/removed in recent scikit-learn
# releases -- confirm the scikit-learn version this script is run with.
boston = load_boston()
data = pd.DataFrame(boston['data'], columns=boston['feature_names'])
data['target'] = boston['target']

# Split the input and target features; 'target' is the last column.
X = data.iloc[:, :-1].copy()
y = data.iloc[:, -1].copy()

# Inject missing values: draw 100 random (row, column) positions and set them
# to NaN. Draws are seeded for reproducibility; a repeated position simply
# hits the same cell again, so at most 100 cells end up missing.
# The column draw must stay before the row draw to keep the seeded sequence.
np.random.seed(11)
rand_cols = np.random.randint(0, X.shape[1], 100)
rand_rows = np.random.randint(0, X.shape[0], 100)
for i, j in zip(rand_rows, rand_cols):
    X.iloc[i, j] = np.nan

# Hold out 20% of the rows as the unseen test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)
# Mode imputer: fills each missing cell with the column's most frequent value.
imp = SimpleImputer(strategy='most_frequent')
# Scaler used to standardize the imputed features.
standard_scaler = StandardScaler()
# KNN regressor with scikit-learn's default settings.
knn = KNeighborsRegressor()

# Impute and scale the WHOLE training set before cross-validating.
# This is the leakage being demonstrated: the imputer and scaler are fitted on
# rows that later serve as validation folds.
X_train_impute = imp.fit_transform(X_train).copy()
X_train_scaled = standard_scaler.fit_transform(X_train_impute).copy()

# 5-fold cross-validation on the already-preprocessed data.
# Scores are negated RMSE values, so the sign is flipped when reporting.
cv = cross_validate(
    estimator=knn,
    X=X_train_scaled,
    y=y_train,
    cv=5,
    scoring="neg_root_mean_squared_error",
    return_train_score=True,
)

# Mean RMSE over the training parts of the folds.
print(f'Training RMSE (with data leakage): {-np.mean(cv["train_score"])}')
# Mean RMSE over the validation parts of the folds.
print(f'validation RMSE (with data leakage): {-np.mean(cv["test_score"])}')
# Fit the KNN regressor on the imputed and scaled training data.
# (The original referenced an undefined name `lr`, which raises NameError;
# the regressor created in this script is `knn`.)
knn.fit(X_train_scaled, y_train)

# Preprocess the test data with the imputer and scaler that were fitted on the
# training data: transform only, no refitting.
X_test_impute = imp.transform(X_test).copy()
X_test_scaled = standard_scaler.transform(X_test_impute).copy()

# Predict on the unseen data and report the RMSE.
pred = knn.predict(X_test_scaled)
print(f'RMSE on unseen data: {np.sqrt(mean_squared_error(y_test,pred))}')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Bundle imputation, scaling and the regressor into a single estimator.
pipeline = Pipeline(
    steps=[
        ('imputer', imp),
        ('scaler', standard_scaler),
        ('regressor', knn),
    ]
)

# 5-fold cross-validation on the RAW training data with the pipeline as the
# estimator, so preprocessing is fitted as part of each fold's fit.
# Scores are negated RMSE values, so the sign is flipped when reporting.
cv = cross_validate(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    cv=5,
    scoring="neg_root_mean_squared_error",
    return_train_score=True,
)

# Mean RMSE over the training parts of the folds.
print(f'Training RMSE (without data leakage): {-np.mean(cv["train_score"])}')
# Mean RMSE over the validation parts of the folds.
print(f'validation RMSE (without data leakage): {-np.mean(cv["test_score"])}')

# Fit the full pipeline (imputer -> scaler -> regressor) on the training data.
pipeline.fit(X_train, y_train)

# The pipeline applies its own preprocessing to X_test before predicting.
pred = pipeline.predict(X_test)
print(f'RMSE on unseen data: {np.sqrt(mean_squared_error(y_test,pred))}')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment