This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Our main dataframe is df
# Continuous features whose skewed distributions warrant a log transform.
continuous = ['price', 'sqft_living', 'sqft_lot', 'sqft_living15', 'sqft_lot15']
# Column labels that flag the transform, e.g. 'price_log'.
log_names = [f'{col}_log' for col in continuous]
# Base-10 log compresses the long right tails of these features.
df_log = np.log10(df[continuous])
df_log.columns = log_names
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Write function to standard normalize one feature
def std_normalize_feature(feature):
    """
    Z-score a single feature column.

    input: a pandas Series of feature values
    returns: the same Series standard-normalized (mean 0, sample std 1)
    """
    mu = feature.mean()
    sigma = feature.std()  # pandas default: sample std (ddof=1)
    return (feature - mu) / sigma
# Standard-normalize every log-transformed column, one Series at a time.
df_log_normal = df_log.apply(std_normalize_feature)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def inv_normalize_price(feature_normalized):
    """
    Undo the standard-normal scaling of the target.

    input: the standard-normal-scaled target values as an array
    output: the same array back on the (log10) price scale
    """
    # Rescale with the same column statistics the forward scaling used.
    price_log = df_log['price_log']
    return feature_normalized * price_log.std() + price_log.mean()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Transform back to regular $USD price (not log price):
# inv_normalize_price undoes the z-scaling, 10** undoes the log10 transform.
train_mse_non_log = mean_squared_error(
    10 ** inv_normalize_price(y_train),
    10 ** inv_normalize_price(y_hat_train),
)
test_mse_non_log = mean_squared_error(
    10 ** inv_normalize_price(y_test),
    10 ** inv_normalize_price(y_hat_test),
)
# Take the square root of mse to find rmse (error back in dollar units).
print('Train rmse non-log:', np.sqrt(train_mse_non_log))
print('Test rmse non-log:', np.sqrt(test_mse_non_log))
Train rmse non-log: 130614.39183027687 | |
Test rmse non-log: 131683.5255367141 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Since we are working with a pandas dataframe I included that, even though I did not actively use the library in this example | |
import pandas as pd | |
import numpy as np | |
from sklearn.metrics import mean_squared_error |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Two-step pipeline: scale features, then classify.
# Bug fix: the imported class is `Pipeline` (capital P) — the original
# `pipeline(steps=...)` would raise NameError at runtime.
basic_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('estimator', RandomForestClassifier())
])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# First fit the pipeline to your training data, like you would with an estimator.
basic_pipe.fit(X_train, y_train)

# Once fitted, the pipeline exposes the familiar estimator API.
test_preds = basic_pipe.predict(X_test)
score = basic_pipe.score(X_test, y_test)

# You can even reference your feature importances for certain types of estimators:
# index the 'estimator' step and the method within the tuple to access all the
# RandomForestClassifier attributes.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: keys use '<step name>__<param>' so each setting is
# routed to the 'estimator' step inside basic_pipe.
param_grid = {"estimator__n_estimators": [100, 150, 200],
              "estimator__criterion": ["gini", "entropy"],
              "estimator__max_depth": [3, 4, 5]}

# You can change the scoring parameter here depending on which score you want to maximize.
# You can also change the cv parameter to perform a cross validation with n folds for each model you fit.
# Fix: the original call was truncated mid-argument-list (a SyntaxError as pasted);
# closed here with sklearn's defaults for scoring/cv.
grid_rf = GridSearchCV(estimator=basic_pipe,
                       param_grid=param_grid)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier

# imblearn's Pipeline accepts resamplers (like SMOTE) as intermediate steps,
# which sklearn's own Pipeline does not.
steps = [
    ("SMOTE", SMOTE()),
    ("GradientBooster", GradientBoostingClassifier()),
]
pipeline = Pipeline(steps=steps)
pipeline.fit(X_train, y_train)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.ensemble import BaseEnsemble | |
# Build custom classes to add to the pipeline | |
class SelectColumnsTransformer(BaseEnsemble): | |
def __init__(self, columns=None): | |
self.columns = columns | |
def transform(self, X, **transform_params): |
OlderNewer