Tia Plagata tiaplagata

@tiaplagata
tiaplagata / colab_setup.py
Last active December 15, 2020 13:31
Code to set up data in Google Colab
from google.colab import drive
import os
# Mount Google Drive
drive.mount('/gdrive', force_remount=True)
# Location of Zip File
drive_path = '/gdrive/MyDrive/Data/pneumonia_data.zip'
local_path = '/content'
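The snippet above stops before the zip is actually extracted to the Colab-local path. A minimal sketch of that step, using the standard-library zipfile module (the helper name is an assumption, not part of the original gist):

```python
import zipfile

def extract_zip(zip_path, dest_dir):
    # Extract every file in the archive into dest_dir
    with zipfile.ZipFile(zip_path, 'r') as zf:
        zf.extractall(dest_dir)

# e.g. extract_zip(drive_path, local_path)
```

Reading the data from the Colab-local disk rather than the mounted Drive is noticeably faster for training.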
@tiaplagata
tiaplagata / function_transformer.py
Created November 18, 2020 00:31
FunctionTransformer in Pipeline
from sklearn.preprocessing import FunctionTransformer

# The function we want to wrap as a transformer for our pipeline
def transform_yes_no(X):
    X['international plan'] = X['international plan'].apply(lambda x: 1 if x.lower() == 'yes' else 0)
    X['voice mail plan'] = X['voice mail plan'].apply(lambda x: 1 if x.lower() == 'yes' else 0)
    return X

# Wrap the function in a transformer object so it can be used as a pipeline step
YesNoTransformer = FunctionTransformer(transform_yes_no)
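As a quick sanity check, the wrapped transformer can be applied outside a pipeline. This is a sketch on a toy DataFrame assumed to have the same column names as the project data:

```python
import pandas as pd
from sklearn.preprocessing import FunctionTransformer

def transform_yes_no(X):
    # Map 'yes'/'no' strings (any casing) to 1/0
    X['international plan'] = X['international plan'].apply(lambda x: 1 if x.lower() == 'yes' else 0)
    X['voice mail plan'] = X['voice mail plan'].apply(lambda x: 1 if x.lower() == 'yes' else 0)
    return X

YesNoTransformer = FunctionTransformer(transform_yes_no)

# Toy data with the same columns (an assumption for illustration)
df = pd.DataFrame({'international plan': ['Yes', 'no'],
                   'voice mail plan': ['NO', 'yes']})
encoded = YesNoTransformer.fit_transform(df)
```

Note that the function mutates its input in place; pass a copy if you need to keep the raw strings around.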
@tiaplagata
tiaplagata / custom_pipeline_2.py
Last active November 17, 2020 03:00
Custom Pipeline with Grid Search
from sklearn.metrics import recall_score, make_scorer
from imblearn.pipeline import Pipeline

# Define the pipeline (imblearn's Pipeline is required since SMOTE is a step)
pipeline = Pipeline(steps=[
    ("ColumnTransformer", SelectColumnsTransformer(columns=features_to_use)),
    ("TransformCategorical", Transform_Categorical()),
    ("SMOTE", SMOTE()),
    ("GradientBooster", GradientBoostingClassifier())
])
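The grid search itself then runs over the whole pipeline, scoring on recall via the imported make_scorer. The custom transformer steps above are project-specific, so this sketch stands in a plain scaler and toy data; the step-name prefix in the param grid is the part that carries over:

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import recall_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Toy data and a stand-in pipeline (scaler instead of the custom steps)
X, y = make_classification(n_samples=200, random_state=42)
pipe = Pipeline(steps=[
    ("Scaler", StandardScaler()),
    ("GradientBooster", GradientBoostingClassifier(random_state=42))
])

# "<step name>__<param>" targets that step's hyperparameters
param_grid = {"GradientBooster__n_estimators": [50, 100]}

grid = GridSearchCV(pipe, param_grid=param_grid,
                    scoring=make_scorer(recall_score), cv=3)
grid.fit(X, y)
```

Every candidate is cross-validated on the full pipeline, so the preprocessing is re-fit inside each fold and never leaks test information.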
@tiaplagata
tiaplagata / custom_pipeline_1.py
Last active November 17, 2020 03:02
Custom Classes for Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Build custom classes to add to the pipeline
class SelectColumnsTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X, **transform_params):
        return X[self.columns].copy()
@tiaplagata
tiaplagata / class_imbalance_pipe.py
Created November 17, 2020 02:48
Using Smote in a Pipeline
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier

# imblearn's Pipeline applies SMOTE during fit only, so test data is never resampled
pipeline = Pipeline(steps=[
    ("SMOTE", SMOTE()),
    ("GradientBooster", GradientBoostingClassifier())
])
pipeline.fit(X_train, y_train)
@tiaplagata
tiaplagata / basic_pipe_gridsearch.py
Created November 16, 2020 02:33
Using a Pipeline with GridSearch
from sklearn.model_selection import GridSearchCV

param_grid = {"estimator__n_estimators": [100, 150, 200],
              "estimator__criterion": ["gini", "entropy"],
              "estimator__max_depth": [3, 4, 5]}

# You can change the scoring parameter here depending on which score you want to maximize
# You can also change the cv parameter to perform a cross validation with n folds for each model you fit
grid_rf = GridSearchCV(estimator=basic_pipe,
                       param_grid=param_grid,
                       scoring='accuracy',
                       cv=5)
@tiaplagata
tiaplagata / basic_pipe_abilities.py
Created November 15, 2020 23:49
Basic Pipeline Abilities
# First fit the pipeline to your training data, like you would with an estimator
basic_pipe.fit(X_train, y_train)

# Next you can reference the pipeline object in the same way as an estimator
score = basic_pipe.score(X_test, y_test)
test_preds = basic_pipe.predict(X_test)

# You can even reference your feature importances for certain types of estimators
# To do this, index the estimator step to access all the RandomForestClassifier attributes
importances = basic_pipe.named_steps['estimator'].feature_importances_
@tiaplagata
tiaplagata / basic_pipe.py
Created November 15, 2020 23:38
Define Basic Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

basic_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('estimator', RandomForestClassifier())
])
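End to end, the pipeline behaves like a single estimator. A self-contained sketch using toy data in place of the project's train/test split:

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Toy data standing in for the project's train/test split
X, y = make_classification(n_samples=200, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

basic_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('estimator', RandomForestClassifier(random_state=42))
])

# Scaling and model fitting happen in one call
basic_pipe.fit(X_train, y_train)
score = basic_pipe.score(X_test, y_test)
```

At predict time the pipeline re-applies the scaler fitted on the training data, which is exactly the bookkeeping that is easy to get wrong by hand.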
@tiaplagata
tiaplagata / gist:7c5b41431f7ad3c6b8ae54c1cf7cfe96
Created November 6, 2020 18:37
Import statements needed for interpreting MSE
# pandas is imported because we are working with a DataFrame, even though it is not called directly in this example
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
@tiaplagata
tiaplagata / gist:0b4d3fec61b963481f4c46c0e3b8357d
Created November 6, 2020 18:17
Transform back to USD and Calculate RMSE
# Transform back to regular $USD price (not log price)
train_mse_non_log = mean_squared_error(10**(inv_normalize_price(y_train)), 10**(inv_normalize_price(y_hat_train)))
test_mse_non_log = mean_squared_error(10**(inv_normalize_price(y_test)), 10**(inv_normalize_price(y_hat_test)))
#Take the square root of mse to find rmse
print('Train rmse non-log:', np.sqrt(train_mse_non_log))
print('Test rmse non-log:', np.sqrt(test_mse_non_log))
Train rmse non-log: 130614.39183027687
Test rmse non-log: 131683.5255367141
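The inv_normalize_price helper above is project-specific, so here is a toy sketch of the same idea assuming plain log10-transformed prices: undo the log transform first, then take the square root of MSE so the error is an RMSE in dollars:

```python
import numpy as np
from sklearn.metrics import mean_squared_error

# Toy log10 prices standing in for the model's normalized targets
y_true_log = np.array([5.0, 5.3, 5.7])
y_pred_log = np.array([5.1, 5.2, 5.8])

# Undo the log10 transform, then compute MSE in $USD units
mse_usd = mean_squared_error(10 ** y_true_log, 10 ** y_pred_log)

# RMSE is the square root of MSE, interpretable in the original units
rmse_usd = np.sqrt(mse_usd)
```

Computing the error after back-transforming matters: an RMSE on log prices would understate how far off the large predictions are in dollar terms.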