Skip to content

Instantly share code, notes, and snippets.

@ksv-muralidhar
Last active February 21, 2021 17:14
Show Gist options
  • Save ksv-muralidhar/2fa8bca485ba576b2dda08b14a7967e6 to your computer and use it in GitHub Desktop.
Save ksv-muralidhar/2fa8bca485ba576b2dda08b14a7967e6 to your computer and use it in GitHub Desktop.
data leakage
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.metrics import mean_squared_error
# Importing the dataset: load it once and reuse the returned bunch instead of
# calling the loader separately for the data, the feature names and the target.
boston = load_boston()
data = pd.DataFrame(boston['data'], columns=boston['feature_names'])
data['target'] = boston['target']
# Split the input and target features ('target' was appended as the last column)
X = data.iloc[:, :-1].copy()
y = data.iloc[:, -1].copy()
# Adding 100 random missing values: with a fixed seed, draw 100 column indices
# and 100 row indices and blank out each (row, column) cell. Positions are drawn
# independently, so repeated pairs are possible.
np.random.seed(11)
rand_cols = np.random.randint(0, X.shape[1], 100)
rand_rows = np.random.randint(0, X.shape[0], 100)
for i, j in zip(rand_rows, rand_cols):
    X.iloc[i, j] = np.nan
# Splitting the data into training and test sets (test_size=0.2, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)
# Initializing KNN Regressor
knn = KNeighborsRegressor()
# Initializing mode imputer
imp = SimpleImputer(strategy='most_frequent')
# Initializing StandardScaler
standard_scaler = StandardScaler()
# Imputing and scaling X_train.
# NOTE: the imputer and the scaler are fitted on the whole training set BEFORE
# cross-validation, so each validation fold has already contributed to the
# preprocessing statistics. This is intentional: it is the "with data leakage"
# variant that the printed labels below refer to.
X_train_impute = imp.fit_transform(X_train)
X_train_scaled = standard_scaler.fit_transform(X_train_impute)
# Running 5-fold cross-validation on the already-preprocessed training data
cv = cross_validate(
    estimator=knn,
    X=X_train_scaled,
    y=y_train,
    cv=5,
    scoring="neg_root_mean_squared_error",
    return_train_score=True,
)
# The scorer returns negated RMSE, hence the multiplication by -1.
# Calculating mean of the training scores of cross-validation
print(f'Training RMSE (with data leakage): {-1 * np.mean(cv["train_score"])}')
# Calculating mean of the validation scores of cross-validation
print(f'validation RMSE (with data leakage): {-1 * np.mean(cv["test_score"])}')
# Fitting the model to the training data (the estimator is `knn`; the
# previously referenced name `lr` was never defined)
knn.fit(X_train_scaled, y_train)
# Preprocessing the test data with the imputer/scaler fitted on the training set
X_test_impute = imp.transform(X_test)
X_test_scaled = standard_scaler.transform(X_test_impute)
# Predictions and model evaluation on unseen data
pred = knn.predict(X_test_scaled)
print(f'RMSE on unseen data: {np.sqrt(mean_squared_error(y_test,pred))}')
# Chain imputation, scaling and the regressor into a single estimator so the
# preprocessing steps are fitted together with the model on whatever data the
# pipeline is given.
pipeline = Pipeline(
    steps=[
        ('imputer', imp),
        ('scaler', standard_scaler),
        ('regressor', knn),
    ]
)
# 5-fold cross-validation with the pipeline as the estimator, on the raw
# (un-imputed, un-scaled) training data
cv = cross_validate(
    estimator=pipeline,
    X=X_train,
    y=y_train,
    cv=5,
    scoring="neg_root_mean_squared_error",
    return_train_score=True,
)
# Scores are negated RMSE values; flip the sign of each fold-average to report RMSE
mean_train_rmse = -1 * np.mean(cv["train_score"])
mean_validation_rmse = -1 * np.mean(cv["test_score"])
print(f'Training RMSE (without data leakage): {mean_train_rmse}')
print(f'validation RMSE (without data leakage): {mean_validation_rmse}')
# Fit the full pipeline on the training set
pipeline.fit(X_train, y_train)
# Predict on the held-out test set and report its RMSE
pred = pipeline.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, pred))
print(f'RMSE on unseen data: {test_rmse}')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment