Predictive model training, validation, and scoring basic example
# *-----------------------------------------------------------------
# | PROGRAM NAME: ex bank marketing predictive model.py
# | DATE: 5/1/20
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *-----------------------------------------------------------------
# | PURPOSE: basic example of predictive modeling, evaluation, and scoring
# *-----------------------------------------------------------------
# Import numpy and pandas
import numpy as np
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv('C:/Data/bank_marketing.csv')
df.head() # inspect fields

# create binary outcome for target
df['target'] = np.where(df['y'] == 'yes', 1, 0)
df.head() # check
df.groupby(['target']).size().reset_index(name='count') # check counts by class
df.target.mean() # check base response rate
# Create arrays for the features and the response variable
y = df['target'].values # .values creates a numpy array from the pandas DataFrame
X = df.drop(['target', 'y'], axis=1) # drop both forms of the label so they don't leak into the features
X = pd.get_dummies(X).values # convert character fields to dummies and create the feature array
### Building a logistic regression model

# scikit-learn makes it easy to try different models, since the
# Train-Test-Split/Instantiate/Fit/Predict paradigm applies to all
# classifiers and regressors - known in scikit-learn as 'estimators'
# (a short sketch swapping in a different estimator follows the
# evaluation step below)
# Import the necessary modules
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
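# With an imbalanced target like this one, you can also stratify the split so
# both sets keep the same response rate. A minimal variant of the call above,
# left commented out so the original split stands (stratify is a standard
# train_test_split argument, just not used in the original):
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42, stratify=y)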
# Create the classifier: logreg
# note: newer scikit-learn defaults (lbfgs solver) may warn about convergence
# on unscaled data; raising max_iter or scaling features addresses this
logreg = LogisticRegression()

# Fit the classifier to the training data
logreg.fit(X_train, y_train)

# Predict the labels of the test set: y_pred
y_pred = logreg.predict(X_test)

# Compute and print the confusion matrix and classification report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
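# The same Instantiate/Fit/Predict steps work unchanged with a different
# estimator. A minimal sketch swapping in a decision tree (the tree and its
# settings are illustrative, not part of the original analysis):
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth=5, random_state=42)
tree.fit(X_train, y_train)
print(classification_report(y_test, tree.predict(X_test)))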
### plotting an ROC curve

# Import necessary modules
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

# Compute predicted probabilities: y_pred_prob
y_pred_prob = logreg.predict_proba(X_test)[:,1]

# Generate ROC curve values: fpr, tpr, thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# Plot ROC curve
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()
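# One optional use of the ROC output: choose a classification cutoff from the
# curve itself. A minimal sketch using Youden's J statistic (tpr - fpr) - an
# illustrative heuristic, not part of the original analysis:
best_cutoff = thresholds[np.argmax(tpr - fpr)]
print("Cutoff maximizing Youden's J: {:.3f}".format(best_cutoff))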
### computing area under the ROC curve (AUC)

# Import necessary modules
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

# Compute and print AUC score
print("AUC: {}".format(roc_auc_score(y_test, y_pred_prob)))

# Compute cross-validated AUC scores: cv_auc
# note this uses the entire data set because it passes 'X' and 'y'
# rather than the training and test splits
cv_auc = cross_val_score(logreg, X, y, cv=5, scoring='roc_auc')

# Print list of AUC scores
print("AUC scores computed using 5-fold cross-validation: {}".format(cv_auc))
#--------------------------------------------
# create binning/decile based calibration test
#--------------------------------------------

# are we interested in discrimination (measured by the ROC curve) or
# calibration - see: http://econometricsense.blogspot.com/2013/04/is-roc-curve-good-metric-for-model.html

# get a data frame with predicted probabilities so we can use pandas
tmp1 = pd.DataFrame({'prob': y_pred_prob})

# decile cutoffs (scalars) from the predicted probabilities
p10 = tmp1['prob'].quantile(.10)
p20 = tmp1['prob'].quantile(.20)
p30 = tmp1['prob'].quantile(.30)
p40 = tmp1['prob'].quantile(.40)
p50 = tmp1['prob'].quantile(.50)
p60 = tmp1['prob'].quantile(.60)
p70 = tmp1['prob'].quantile(.70)
p80 = tmp1['prob'].quantile(.80)
p90 = tmp1['prob'].quantile(.90)

# use cutoffs from above to segregate data into 10 groups
# Create a temporary data frame for scoring
tmp2 = pd.DataFrame({'Predicted': y_pred_prob, 'ObservedOutcome': y_test})

conditions = [
    (tmp2['Predicted'] <= p10),
    (tmp2['Predicted'] > p10) & (tmp2['Predicted'] <= p20),
    (tmp2['Predicted'] > p20) & (tmp2['Predicted'] <= p30),
    (tmp2['Predicted'] > p30) & (tmp2['Predicted'] <= p40),
    (tmp2['Predicted'] > p40) & (tmp2['Predicted'] <= p50),
    (tmp2['Predicted'] > p50) & (tmp2['Predicted'] <= p60),
    (tmp2['Predicted'] > p60) & (tmp2['Predicted'] <= p70),
    (tmp2['Predicted'] > p70) & (tmp2['Predicted'] <= p80),
    (tmp2['Predicted'] > p80) & (tmp2['Predicted'] <= p90),
    (tmp2['Predicted'] > p90)]

choices = ['D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10']
tmp2['Probability Decile'] = np.select(conditions, choices, default='na')
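# An equivalent, more compact way to form these groups is pandas' qcut, which
# computes the decile cutoffs internally (edge handling may differ slightly
# from the manual cutoffs above, and qcut assumes the cutoffs are unique).
# Shown only as a cross-check; the report below keeps the np.select version.
decile_check = pd.qcut(tmp2['Predicted'], 10, labels=choices)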
# assess average target outcome and predicted probability within each decile
tmp3 = pd.DataFrame(tmp2.groupby(['Probability Decile']).size().reset_index(name='count'))
tmp4 = pd.DataFrame(tmp2.groupby(['Probability Decile']).Predicted.mean().reset_index())
tmp5 = pd.DataFrame(tmp2.groupby(['Probability Decile']).ObservedOutcome.mean().reset_index())

# build report dataframe
rpt = tmp3
rpt['Predicted'] = tmp4.Predicted
rpt['Observed'] = tmp5.ObservedOutcome
print(rpt) # print report
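# scikit-learn ships a ready-made version of this check. A minimal sketch
# using sklearn.calibration.calibration_curve with quantile bins to mirror
# the decile report above (offered as a cross-check, not part of the
# original analysis):
from sklearn.calibration import calibration_curve
obs_rate, pred_rate = calibration_curve(y_test, y_pred_prob, n_bins=10, strategy='quantile')
print(pd.DataFrame({'Predicted': pred_rate, 'Observed': obs_rate}))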
#--------------------------------------------------
# calibration plot
#--------------------------------------------------
plt.plot(rpt.Predicted, rpt.Observed, marker='+', linestyle='none')
plt.xlabel('Mean Predicted Probability')
plt.ylabel('Observed Response Rate')
plt.show()
#-----------------------------------------------
# score a new data set (example 1)
#-----------------------------------------------

# Read the CSV file bank_mkt_score.csv into a DataFrame
df2 = pd.read_csv('C:/Data/bank_mkt_score.csv')
df2.head() # inspect fields

# create numeric target (if you actually have target labels, which in practice you often won't)
df2['target'] = np.where(df2['y'] == 'yes', 1, 0)
df2.head() # check

# create the scoring feature matrix -
# you will want to remove the target field y and the ID to simulate a real-world
# unlabeled example, as well as to avoid errors with the predict method
X_score = df2.drop(['y', 'ID', 'target'], axis=1)
X_score = pd.get_dummies(X_score).values # convert character fields to dummies, creating a feature array like the training data above
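# Caveat: pd.get_dummies on the scoring data only reproduces the training
# feature matrix if both data sets contain exactly the same categorical
# levels. A minimal defensive sketch, assuming you keep the training dummies
# as a DataFrame instead of converting straight to .values (train_dummies
# here is hypothetical - it is not built in the code above):
#
# train_dummies = pd.get_dummies(df.drop(['target', 'y'], axis=1))
# score_dummies = pd.get_dummies(df2.drop(['y', 'ID', 'target'], axis=1))
# X_score = score_dummies.reindex(columns=train_dummies.columns, fill_value=0).values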
# Create numeric target array for scoring
y_score = df2['target'].values

# go ahead and extract the ID vector for merging
ID = df2['ID'].values

# use the previously fit model to score this data
# Compute predicted probabilities: score_pred_prob
score_pred_prob = logreg.predict_proba(X_score)[:,1]

# add scores + ID
df3 = pd.DataFrame({'ID': ID, 'prob': score_pred_prob})
df3.head()

# merge with original data
bank_mkt_scored = pd.merge(df2, df3, on='ID', how='left')
bank_mkt_scored.head() # check
#----------------------------------------------------
# create risk strata/score card
#----------------------------------------------------

# can we segment our market based on the predicted probability of responding
# to our marketing campaign?
conditions = [
    (bank_mkt_scored['prob'] < .01),
    (bank_mkt_scored['prob'] >= .01) & (bank_mkt_scored['prob'] < .05),
    (bank_mkt_scored['prob'] >= .05) & (bank_mkt_scored['prob'] <= .10),
    (bank_mkt_scored['prob'] > .10) & (bank_mkt_scored['prob'] <= .30),
    (bank_mkt_scored['prob'] > .30) & (bank_mkt_scored['prob'] <= .60),
    (bank_mkt_scored['prob'] > .60)]

choices = ['A - Exclude', 'B - Very Unlikely', 'C - Unlikely', 'D - Marginal', 'E - Likely', 'F - Most Likely']
bank_mkt_scored['Prospect Level'] = np.select(conditions, choices, default='na')

# assess prospect counts and average predicted probability within each prospect level
bank_mkt_scored.groupby(['Prospect Level']).size() # prospects per group
bank_mkt_scored.groupby(['Prospect Level']).prob.mean() # propensity to respond
# export to csv - for production or exploration in a dashboard
bank_mkt_scored.to_csv('C:/Report/test123.csv', index=False) # writes to the specified path; index=False drops the row index
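# To reuse this fitted model for future scoring runs without refitting, it
# can be persisted to disk. A minimal sketch using joblib (the file path is
# illustrative, not part of the original workflow):
import joblib
joblib.dump(logreg, 'C:/Models/bank_marketing_logreg.joblib')
# later, in the scoring job: logreg = joblib.load('C:/Models/bank_marketing_logreg.joblib')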