Skip to content

Instantly share code, notes, and snippets.

View tiaplagata's full-sized avatar

Tia Plagata tiaplagata

View GitHub Profile
@tiaplagata
tiaplagata / gist:9082274f6eb15000d9694f5b699ece20
Created October 9, 2020 15:57
Log Transformation of Target Feature in Linear Model
# Pull the continuous features out of the main dataframe `df`,
# log10-transform them, and tag each column name with a '_log' suffix.
continuous = ['price', 'sqft_living', 'sqft_lot', 'sqft_living15', 'sqft_lot15']
df_log = np.log10(df[continuous])
df_log.columns = [f'{name}_log' for name in df_log.columns]
@tiaplagata
tiaplagata / gist:77b3eb6a9c7a8a9e1c05cc7958c914bc
Created October 9, 2020 16:07
Standard Normalize Features for Linear Regression
# Z-score a single feature column: subtract the mean, divide by the std.
def std_normalize_feature(feature):
    """Return *feature* rescaled to standard-normal (z-score) values.

    input: a feature column (pandas Series)
    returns: Series of standard-normalized feature values
    """
    centered = feature - feature.mean()
    return centered / feature.std()
# Standard-normalize every log-transformed column of df_log.
df_log_normal = df_log.apply(lambda col: std_normalize_feature(col))
@tiaplagata
tiaplagata / gist:593c6b3c989665d3b528286cbb6f24c4
Created October 19, 2020 14:01
Function to Inverse a Standard Normal Scaled Target
def inv_normalize_price(feature_normalized):
    """Invert the standard-normal scaling of the target.

    input: the standard-normal scaled target feature as an array
    output: the same array back on the original (log-price) scale

    NOTE: reads the module-level `df_log` for the mean/std used to scale.
    """
    price_log = df_log['price_log']
    return feature_normalized * price_log.std() + price_log.mean()
@tiaplagata
tiaplagata / gist:0b4d3fec61b963481f4c46c0e3b8357d
Created November 6, 2020 18:17
Transform back to USD and Calculate RMSE
# Undo the standard-normal scaling, then the log10, to compare in $USD.
train_mse_non_log = mean_squared_error(
    10 ** inv_normalize_price(y_train),
    10 ** inv_normalize_price(y_hat_train),
)
test_mse_non_log = mean_squared_error(
    10 ** inv_normalize_price(y_test),
    10 ** inv_normalize_price(y_hat_test),
)
# RMSE is just the square root of MSE.
print('Train rmse non-log:', np.sqrt(train_mse_non_log))
print('Test rmse non-log:', np.sqrt(test_mse_non_log))
Train rmse non-log: 130614.39183027687
Test rmse non-log: 131683.5255367141
@tiaplagata
tiaplagata / gist:7c5b41431f7ad3c6b8ae54c1cf7cfe96
Created November 6, 2020 18:37
Import statements needed for interpreting MSE
# Since we are working with a pandas dataframe I included that, even though I did not actively use the library in this example
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
@tiaplagata
tiaplagata / basic_pipe.py
Created November 15, 2020 23:38
Define Basic Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Basic two-step pipeline: standard-scale the features, then fit a
# random forest. BUG FIX: the imported class is `Pipeline` (capital P);
# the original lowercase `pipeline(...)` call raises NameError.
basic_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('estimator', RandomForestClassifier())
])
@tiaplagata
tiaplagata / basic_pipe_abilities.py
Created November 15, 2020 23:49
Basic Pipeline Abilities
# A fitted pipeline behaves just like a fitted estimator: fit it on the
# training split, then score/predict through the same object.
basic_pipe.fit(X_train, y_train)
score = basic_pipe.score(X_test, y_test)
test_preds = basic_pipe.predict(X_test)
# For estimators that expose them, feature importances are reachable by
# indexing the named estimator step inside the pipeline and reading the
# attribute on that RandomForestClassifier instance.
@tiaplagata
tiaplagata / basic_pipe_gridsearch.py
Created November 16, 2020 02:33
Using a Pipeline with GridSearch
from sklearn.model_selection import GridSearchCV
# Hyperparameter grid for the pipeline's 'estimator' step; the
# "<step_name>__<param>" key convention routes each value to that step.
param_grid = {"estimator__n_estimators" : [100, 150, 200],
"estimator__criterion" : ["gini", "entropy"],
"estimator__max_depth" : [3, 4, 5]}
# You can change the scoring parameter here depending on which score you want to maximize
# You can also change the cv parameter to perform a cross validation with n folds for each model you fit
grid_rf = GridSearchCV(estimator= basic_pipe,
param_grid = param_grid,
# NOTE(review): this call looks truncated in the snippet — the closing
# parenthesis (and any scoring=/cv= arguments) are missing; confirm
# against the original gist before running.
@tiaplagata
tiaplagata / class_imbalance_pipe.py
Created November 17, 2020 02:48
Using Smote in a Pipeline
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier

# imblearn's Pipeline accepts a resampler (SMOTE) as an intermediate
# step ahead of the gradient-boosting classifier.
steps = [
    ("SMOTE", SMOTE()),
    ("GradientBooster", GradientBoostingClassifier()),
]
pipeline = Pipeline(steps=steps)
pipeline.fit(X_train, y_train)
@tiaplagata
tiaplagata / custom_pipeline_1.py
Last active November 17, 2020 03:02
Custom Classes for Pipeline
from sklearn.ensemble import BaseEnsemble
# Build custom classes to add to the pipeline
class SelectColumnsTransformer(BaseEnsemble):
def __init__(self, columns=None):
self.columns = columns
def transform(self, X, **transform_params):