This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Our main dataframe is df
# Continuous features whose skewed distributions warrant a log transform.
continuous = ['price', 'sqft_living', 'sqft_lot', 'sqft_living15', 'sqft_lot15']
# Column labels that flag the transform, e.g. 'price_log'.
log_names = [f'{col}_log' for col in continuous]
# Base-10 log compresses the long right tails of these features.
df_log = np.log10(df[continuous])
df_log.columns = log_names
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Write function to standard normalize one feature
def std_normalize_feature(feature):
    """
    Z-score a single feature column.

    input: a pandas Series of feature values
    returns: the same Series standard-normalized (mean 0, sample std 1)
    """
    mu = feature.mean()
    sigma = feature.std()  # pandas default: sample std (ddof=1)
    return (feature - mu) / sigma
# Standard-normalize every log-transformed column, one Series at a time.
df_log_normal = df_log.apply(std_normalize_feature)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def inv_normalize_price(feature_normalized):
    """
    Undo the standard-normal scaling of the target.

    input: the standard-normal-scaled target values as an array
    output: the same array back on the (log10) price scale
    """
    # Rescale with the same column statistics the forward scaling used.
    price_log = df_log['price_log']
    return feature_normalized * price_log.std() + price_log.mean()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Transform back to regular $USD price (not log price):
# inv_normalize_price undoes the z-scaling, 10** undoes the log10 transform.
train_mse_non_log = mean_squared_error(
    10 ** inv_normalize_price(y_train),
    10 ** inv_normalize_price(y_hat_train),
)
test_mse_non_log = mean_squared_error(
    10 ** inv_normalize_price(y_test),
    10 ** inv_normalize_price(y_hat_test),
)
# Take the square root of mse to find rmse (error back in dollar units).
print('Train rmse non-log:', np.sqrt(train_mse_non_log))
print('Test rmse non-log:', np.sqrt(test_mse_non_log))
Train rmse non-log: 130614.39183027687 | |
Test rmse non-log: 131683.5255367141 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Since we are working with a pandas dataframe I included that, even though I did not actively use the library in this example | |
import pandas as pd | |
import numpy as np | |
from sklearn.metrics import mean_squared_error |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Two-step pipeline: scale features, then classify.
# Bug fix: the imported class is `Pipeline` (capital P) — the original
# `pipeline(steps=...)` would raise NameError at runtime.
basic_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('estimator', RandomForestClassifier())
])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# First fit the pipeline to your training data, like you would with an estimator.
basic_pipe.fit(X_train, y_train)

# Once fitted, the pipeline exposes the familiar estimator API.
test_preds = basic_pipe.predict(X_test)
score = basic_pipe.score(X_test, y_test)

# You can even reference your feature importances for certain types of estimators:
# index the 'estimator' step and the method within the tuple to access all the
# RandomForestClassifier attributes.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: keys use '<step name>__<param>' so each setting is
# routed to the 'estimator' step inside basic_pipe.
param_grid = {"estimator__n_estimators": [100, 150, 200],
              "estimator__criterion": ["gini", "entropy"],
              "estimator__max_depth": [3, 4, 5]}

# You can change the scoring parameter here depending on which score you want to maximize.
# You can also change the cv parameter to perform a cross validation with n folds for each model you fit.
# Fix: the original call was truncated mid-argument-list (a SyntaxError as pasted);
# closed here with sklearn's defaults for scoring/cv.
grid_rf = GridSearchCV(estimator=basic_pipe,
                       param_grid=param_grid)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier

# imblearn's Pipeline accepts resamplers (like SMOTE) as intermediate steps,
# which sklearn's own Pipeline does not.
steps = [
    ("SMOTE", SMOTE()),
    ("GradientBooster", GradientBoostingClassifier()),
]
pipeline = Pipeline(steps=steps)
pipeline.fit(X_train, y_train)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.ensemble import BaseEnsemble | |
# Build custom classes to add to the pipeline | |
class SelectColumnsTransformer(BaseEnsemble): | |
def __init__(self, columns=None): | |
self.columns = columns | |
def transform(self, X, **transform_params): |
OlderNewer