Predictive model training, validation, and scoring basic example
# *-----------------------------------------------------------------
# | PROGRAM NAME: ex bank marketing predictive model.py
# | DATE: 5/1/20
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: basic example of predictive modeling, evaluation, and scoring
# *----------------------------------------------------------------
# Import numpy and pandas
import numpy as np
import pandas as pd
# Read the CSV file into a DataFrame
df = pd.read_csv('C:/Data/bank_marketing.csv')
df.head() # inspect fields
# create binary outcome for target
df['target'] = np.where(df['y'] == 'yes', 1, 0)
df.head() # check
df.groupby(['target']).size().reset_index(name='count') # check
df.target.mean() # check
# Create arrays for the features and the response variable
y = df['target'].values # .values creates an array from pandas dataframe
X = df.drop(['target','y'], axis=1) # omit the target columns; note that multi-level categorical factors can cause issues at scoring time if levels differ
X = pd.get_dummies(X).values # convert categorical (string) fields to dummies and create the feature array
### Building a logistic regression model
# scikit-learn makes it very easy to try different models, since
# the Train-Test-Split/Instantiate/Fit/Predict paradigm applies to
# all classifiers and regressors - which are known in scikit-learn
# as 'estimators'
# Import the necessary modules
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state=42)
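# (optional) if the response is imbalanced, a stratified split keeps the response
# rate comparable across the train and test sets; shown as a sketch only, the
# unstratified split above is what is actually used below:
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.4, random_state=42, stratify=y)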
# Create the classifier: logreg
# (with unscaled features the default solver may warn about convergence; increasing max_iter or scaling the features are common fixes)
logreg = LogisticRegression()
# Fit the classifier to the training data
logreg.fit(X_train,y_train)
# Predict the labels of the test set: y_pred
y_pred = logreg.predict(X_test)
# Compute and print the confusion matrix and classification report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
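# To illustrate the point above about estimators: any other scikit-learn classifier
# follows the same instantiate/fit/predict steps. A minimal sketch with a random
# forest (the hyperparameters here are illustrative, not tuned for this data):
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
print(classification_report(y_test, rf.predict(X_test)))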
### plotting an ROC curve
# Import necessary modules
from sklearn.metrics import roc_curve
# Compute predicted probabilities: y_pred_prob
y_pred_prob = logreg.predict_proba(X_test)[:,1]
# Generate ROC curve values: fpr, tpr, thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
# Plot ROC curve
import matplotlib.pyplot as plt
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()
### computing area under the ROC curve (AUC)
# Import necessary modules
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
# Compute and print AUC score
print("AUC: {}".format(roc_auc_score(y_test, y_pred_prob)))
# Compute cross-validated AUC scores: cv_auc
cv_auc = cross_val_score(logreg, X, y, cv=5, scoring='roc_auc') # note this uses the entire data set
# because it is using 'X' and 'y' vs the training and test splits
# Print list of AUC scores
print("AUC scores computed using 5-fold cross-validation: {}".format(cv_auc))
#--------------------------------------------
# create binning/decile based calibration test
#--------------------------------------------
# are we interested in discrimination (measured by the ROC curve) or
# calibration? - see: http://econometricsense.blogspot.com/2013/04/is-roc-curve-good-metric-for-model.html
# get data frame with predicted probabilities so we can use pandas
tmp1 = pd.DataFrame({'prob':y_pred_prob})
p10 = tmp1.quantile(.10)
p20 = tmp1.quantile(.20)
p30 = tmp1.quantile(.30)
p40 = tmp1.quantile(.40)
p50 = tmp1.quantile(.50)
p60 = tmp1.quantile(.60)
p70 = tmp1.quantile(.70)
p80 = tmp1.quantile(.80)
p90 = tmp1.quantile(.90)
# use cutoffs from above to segregate data into 10 groups
# Create a temporary data frame for scoring
tmp2 = pd.DataFrame({'Predicted':y_pred_prob,'ObservedOutcome':y_test})
conditions = [
    (tmp2['Predicted'] <= p10['prob']),
    (tmp2['Predicted'] > p10['prob']) & (tmp2['Predicted'] <= p20['prob']),
    (tmp2['Predicted'] > p20['prob']) & (tmp2['Predicted'] <= p30['prob']),
    (tmp2['Predicted'] > p30['prob']) & (tmp2['Predicted'] <= p40['prob']),
    (tmp2['Predicted'] > p40['prob']) & (tmp2['Predicted'] <= p50['prob']),
    (tmp2['Predicted'] > p50['prob']) & (tmp2['Predicted'] <= p60['prob']),
    (tmp2['Predicted'] > p60['prob']) & (tmp2['Predicted'] <= p70['prob']),
    (tmp2['Predicted'] > p70['prob']) & (tmp2['Predicted'] <= p80['prob']),
    (tmp2['Predicted'] > p80['prob']) & (tmp2['Predicted'] <= p90['prob']),
    (tmp2['Predicted'] > p90['prob'])]
choices = ['D1', 'D2', 'D3','D4', 'D5', 'D6','D7', 'D8', 'D9','D10']
tmp2['Probability Decile'] = np.select(conditions, choices, default='na')
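# A more concise way to form the same kind of deciles is pd.qcut; shown here as a
# commented-out sketch only (the np.select binning above is what feeds the report below):
# tmp2['Probability Decile'] = pd.qcut(tmp2['Predicted'], 10,
#                                      labels=['D1','D2','D3','D4','D5','D6','D7','D8','D9','D10'])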
# assess average target outcome and predicted probability within each decile
# (note: the string decile labels sort lexicographically, so D10 appears right after D1 in the output)
tmp3 = pd.DataFrame(tmp2.groupby(['Probability Decile']).size().reset_index(name='count'))
tmp4 = pd.DataFrame(tmp2.groupby(['Probability Decile']).Predicted.mean().reset_index())
tmp5 = pd.DataFrame(tmp2.groupby(['Probability Decile']).ObservedOutcome.mean().reset_index())
# build report dataframe
rpt = tmp3
rpt['Predicted'] = tmp4.Predicted
rpt['Observed'] = tmp5.ObservedOutcome
print(rpt) # print report
#--------------------------------------------------
# calibration plot
#-------------------------------------------------
plt.plot(rpt.Predicted, rpt.Observed, marker='+', linestyle='none')
plt.show()
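# Recent versions of scikit-learn also ship a helper for this kind of reliability
# check; a minimal sketch using calibration_curve with 10 quantile bins:
from sklearn.calibration import calibration_curve
frac_pos, mean_pred = calibration_curve(y_test, y_pred_prob, n_bins=10, strategy='quantile')
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(mean_pred, frac_pos, marker='o', linestyle='none')
plt.xlabel('Mean predicted probability')
plt.ylabel('Observed fraction of positives')
plt.title('Calibration (reliability) plot')
plt.show()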
#-----------------------------------------------
# score a new data set (example 1)
#-----------------------------------------------
# read data bank_mkt_score.csv
# Read the CSV file into a DataFrame
df2 = pd.read_csv('C:/Data/bank_mkt_score.csv')
df2.head() # inspect fields
# create numeric target (only possible if you actually have labels, which in a real scoring scenario you often will not)
df2['target'] = np.where(df2['y'] == 'yes', 1, 0)
df2.head() # check
# create score vector -
# you will want to remove the target field y and the ID to simulate a real-world unlabeled example
# and to avoid errors with the predict method
X_score = df2.drop(['y','ID','target'], axis=1)
X_score = pd.get_dummies(X_score).values # convert characters to dummies and create feature array like training data above
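# Caveat: pd.get_dummies only creates columns for the categories present in this
# particular file, so the scoring matrix can have columns missing or ordered
# differently than the training matrix, which breaks (or silently misaligns) the
# predict step. A commented-out sketch of aligning the two, assuming the training
# dummies are kept as a DataFrame (train_dummies below is illustrative and is not
# created earlier in this script):
# train_dummies = pd.get_dummies(df.drop(['target','y'], axis=1))
# score_dummies = pd.get_dummies(df2.drop(['y','ID','target'], axis=1))
# X_score = score_dummies.reindex(columns=train_dummies.columns, fill_value=0).values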
# Create numeric target array for scoring
y_score = df2['target'].values
# go ahead and extract vector ID for merging
ID = df2['ID'].values
# use previously fit model and predict to score this data &
# Compute predicted probabilities: score_pred_prob
score_pred_prob = logreg.predict_proba(X_score)[:,1]
# add scores + ID
df3 = pd.DataFrame({'ID':ID,'prob':score_pred_prob})
df3.head()
# merge with original data
bank_mkt_scored = pd.merge(df2,df3, on='ID', how='left')
bank_mkt_scored.head() # check
#----------------------------------------------------
# create risk strata/score card
#----------------------------------------------------
# can we segment our market based on the predicted probability of responding to
# our marketing campaign?
conditions = [
(bank_mkt_scored['prob'] < .01),
(bank_mkt_scored['prob'] >= .01) & (bank_mkt_scored['prob'] < .05),
(bank_mkt_scored['prob'] >= .05) & (bank_mkt_scored['prob'] <= .10),
(bank_mkt_scored['prob'] > .10) & (bank_mkt_scored['prob'] <= .30),
(bank_mkt_scored['prob'] > .30) & (bank_mkt_scored['prob'] <= .60),
(bank_mkt_scored['prob'] > .6)]
choices = ['A - Exclude','B - Very Unlikely', 'C - Unlikely', 'D - Marginal','E - Likely','F - Most Likely']
bank_mkt_scored['Prospect Level'] = np.select(conditions, choices, default='na')
# assess counts and average predicted probability within each prospect level
bank_mkt_scored.groupby(['Prospect Level']).size() # prospects per group
bank_mkt_scored.groupby(['Prospect Level']).prob.mean() # propensity to respond
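# Because this particular scoring file happens to include labels, we can also check
# the observed response rate per group (not possible with truly unlabeled data)
bank_mkt_scored.groupby(['Prospect Level']).target.mean() # observed response rate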
# export to csv - for production or exploration in a dashboard
bank_mkt_scored.to_csv('C:/Report/test123.csv') # writes to the path given here (use a bare filename to write to the working directory)