Predictive model training, validation, and scoring basic example
# *-----------------------------------------------------------------
# | PROGRAM NAME: ex bank marketing predictive model.py
# | DATE: 5/1/20
# | CREATED BY: MATT BOGARD
# | PROJECT FILE:
# *----------------------------------------------------------------
# | PURPOSE: basic example of predictive modeling, evaluation, and scoring
# *----------------------------------------------------------------
# Import numpy and pandas
import numpy as np
import pandas as pd
# Read the CSV file into a DataFrame
df = pd.read_csv('C:/Data/bank_marketing.csv')
df.head() # inspect fields
# create binary outcome for target
df['target'] = np.where(df['y'] == 'yes', 1, 0)
df.head() # check
df.groupby(['target']).size().reset_index(name='count') # check
df.target.mean() # check
# Create arrays for the features and the response variable
y = df['target'].values # .values creates an array from pandas dataframe
X = df.drop(['target','y'], axis=1) # omit the target columns; note that multi-level categorical factors can cause issues at scoring time if levels differ
X = pd.get_dummies(X).values # convert categorical (string) fields to dummies and create the feature array
### Building a logistic regression model
# scikit-learn makes it very easy to try different models, since
# the Train-Test-Split/Instantiate/Fit/Predict paradigm applies to
# all classifiers and regressors - which are known in scikit-learn
# as 'estimators'
# Import the necessary modules
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state=42)
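# (optional) if the response is imbalanced, a stratified split keeps the response
# rate comparable across the train and test sets; shown as a sketch only, the
# unstratified split above is what is actually used below:
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.4, random_state=42, stratify=y)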
# Create the classifier: logreg
# (with unscaled features the default solver may warn about convergence; increasing max_iter or scaling the features are common fixes)
logreg = LogisticRegression()
# Fit the classifier to the training data
logreg.fit(X_train,y_train)
# Predict the labels of the test set: y_pred
y_pred = logreg.predict(X_test)
# Compute and print the confusion matrix and classification report
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
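# To illustrate the point above about estimators: any other scikit-learn classifier
# follows the same instantiate/fit/predict steps. A minimal sketch with a random
# forest (the hyperparameters here are illustrative, not tuned for this data):
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
print(classification_report(y_test, rf.predict(X_test)))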
### plotting an ROC curve
# Import necessary modules
from sklearn.metrics import roc_curve
# Compute predicted probabilities: y_pred_prob
y_pred_prob = logreg.predict_proba(X_test)[:,1]
# Generate ROC curve values: fpr, tpr, thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
# Plot ROC curve
import matplotlib.pyplot as plt
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()
### computing area under the ROC curve (AUC)
# Import necessary modules
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
# Compute and print AUC score
print("AUC: {}".format(roc_auc_score(y_test, y_pred_prob)))
# Compute cross-validated AUC scores: cv_auc
cv_auc = cross_val_score(logreg, X, y, cv=5, scoring='roc_auc') # note this uses the entire data set
# because it is using 'X' and 'y' vs the training and test splits
# Print list of AUC scores
print("AUC scores computed using 5-fold cross-validation: {}".format(cv_auc))
#--------------------------------------------
# create binning/decile based calibration test
#--------------------------------------------
# are we interested in discrimination (measured by the ROC curve) or
# calibration? - see: http://econometricsense.blogspot.com/2013/04/is-roc-curve-good-metric-for-model.html
# get data frame with predicted probabilities so we can use pandas
tmp1 = pd.DataFrame({'prob':y_pred_prob})
p10 = tmp1.quantile(.10)
p20 = tmp1.quantile(.20)
p30 = tmp1.quantile(.30)
p40 = tmp1.quantile(.40)
p50 = tmp1.quantile(.50)
p60 = tmp1.quantile(.60)
p70 = tmp1.quantile(.70)
p80 = tmp1.quantile(.80)
p90 = tmp1.quantile(.90)
# use cutoffs from above to segregate data into 10 groups
# Create a temporary data frame for scoring
tmp2 = pd.DataFrame({'Predicted':y_pred_prob,'ObservedOutcome':y_test})
conditions = [
    (tmp2['Predicted'] <= p10['prob']),
    (tmp2['Predicted'] > p10['prob']) & (tmp2['Predicted'] <= p20['prob']),
    (tmp2['Predicted'] > p20['prob']) & (tmp2['Predicted'] <= p30['prob']),
    (tmp2['Predicted'] > p30['prob']) & (tmp2['Predicted'] <= p40['prob']),
    (tmp2['Predicted'] > p40['prob']) & (tmp2['Predicted'] <= p50['prob']),
    (tmp2['Predicted'] > p50['prob']) & (tmp2['Predicted'] <= p60['prob']),
    (tmp2['Predicted'] > p60['prob']) & (tmp2['Predicted'] <= p70['prob']),
    (tmp2['Predicted'] > p70['prob']) & (tmp2['Predicted'] <= p80['prob']),
    (tmp2['Predicted'] > p80['prob']) & (tmp2['Predicted'] <= p90['prob']),
    (tmp2['Predicted'] > p90['prob'])]
choices = ['D1', 'D2', 'D3','D4', 'D5', 'D6','D7', 'D8', 'D9','D10']
tmp2['Probability Decile'] = np.select(conditions, choices, default='na')
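# A more concise way to form the same kind of deciles is pd.qcut; shown here as a
# commented-out sketch only (the np.select binning above is what feeds the report below):
# tmp2['Probability Decile'] = pd.qcut(tmp2['Predicted'], 10,
#                                      labels=['D1','D2','D3','D4','D5','D6','D7','D8','D9','D10'])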
# assess average target outcome and predicted probability within each decile
# (note: the string decile labels sort lexicographically, so D10 appears right after D1 in the output)
tmp3 = pd.DataFrame(tmp2.groupby(['Probability Decile']).size().reset_index(name='count'))
tmp4 = pd.DataFrame(tmp2.groupby(['Probability Decile']).Predicted.mean().reset_index())
tmp5 = pd.DataFrame(tmp2.groupby(['Probability Decile']).ObservedOutcome.mean().reset_index())
# build report dataframe
rpt = tmp3
rpt['Predicted'] = tmp4.Predicted
rpt['Observed'] = tmp5.ObservedOutcome
print(rpt) # print report
#--------------------------------------------------
# calibration plot
#-------------------------------------------------
plt.plot(rpt.Predicted, rpt.Observed, marker='+', linestyle='none')
plt.show()
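# Recent versions of scikit-learn also ship a helper for this kind of reliability
# check; a minimal sketch using calibration_curve with 10 quantile bins:
from sklearn.calibration import calibration_curve
frac_pos, mean_pred = calibration_curve(y_test, y_pred_prob, n_bins=10, strategy='quantile')
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(mean_pred, frac_pos, marker='o', linestyle='none')
plt.xlabel('Mean predicted probability')
plt.ylabel('Observed fraction of positives')
plt.title('Calibration (reliability) plot')
plt.show()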
#-----------------------------------------------
# score a new data set (example 1)
#-----------------------------------------------
# read data bank_mkt_score.csv
# Read the CSV file into a DataFrame
df2 = pd.read_csv('C:/Data/bank_mkt_score.csv')
df2.head() # inspect fields
# create numeric target (only possible if you actually have labels, which in a real scoring scenario you often will not)
df2['target'] = np.where(df2['y'] == 'yes', 1, 0)
df2.head() # check
# create score vector -
# you will want to remove the target field y and the ID to simulate a real-world unlabeled example
# and to avoid errors with the predict method
X_score = df2.drop(['y','ID','target'], axis=1)
X_score = pd.get_dummies(X_score).values # convert characters to dummies and create feature array like training data above
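# Caveat: pd.get_dummies only creates columns for the categories present in this
# particular file, so the scoring matrix can have columns missing or ordered
# differently than the training matrix, which breaks (or silently misaligns) the
# predict step. A commented-out sketch of aligning the two, assuming the training
# dummies are kept as a DataFrame (train_dummies below is illustrative and is not
# created earlier in this script):
# train_dummies = pd.get_dummies(df.drop(['target','y'], axis=1))
# score_dummies = pd.get_dummies(df2.drop(['y','ID','target'], axis=1))
# X_score = score_dummies.reindex(columns=train_dummies.columns, fill_value=0).values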
# Create numeric target array for scoring
y_score = df2['target'].values
# go ahead and extract vector ID for merging
ID = df2['ID'].values
# use previously fit model and predict to score this data &
# Compute predicted probabilities: score_pred_prob
score_pred_prob = logreg.predict_proba(X_score)[:,1]
# add scores + ID
df3 = pd.DataFrame({'ID':ID,'prob':score_pred_prob})
df3.head()
# merge with original data
bank_mkt_scored = pd.merge(df2,df3, on='ID', how='left')
bank_mkt_scored.head() # check
#----------------------------------------------------
# create risk strata/score card
#----------------------------------------------------
# can we segment our market based on the predicted probability of responding to
# our marketing campaign?
conditions = [
(bank_mkt_scored['prob'] < .01),
(bank_mkt_scored['prob'] >= .01) & (bank_mkt_scored['prob'] < .05),
(bank_mkt_scored['prob'] >= .05) & (bank_mkt_scored['prob'] <= .10),
(bank_mkt_scored['prob'] > .10) & (bank_mkt_scored['prob'] <= .30),
(bank_mkt_scored['prob'] > .30) & (bank_mkt_scored['prob'] <= .60),
(bank_mkt_scored['prob'] > .6)]
choices = ['A - Exclude','B - Very Unlikely', 'C - Unlikely', 'D - Marginal','E - Likely','F - Most Likely']
bank_mkt_scored['Prospect Level'] = np.select(conditions, choices, default='na')
# assess counts and average predicted probability within each prospect level
bank_mkt_scored.groupby(['Prospect Level']).size() # prospects per group
bank_mkt_scored.groupby(['Prospect Level']).prob.mean() # propensity to respond
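# Because this particular scoring file happens to include labels, we can also check
# the observed response rate per group (not possible with truly unlabeled data)
bank_mkt_scored.groupby(['Prospect Level']).target.mean() # observed response rate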
# export to csv - for production or exploration in a dashboard
bank_mkt_scored.to_csv('C:/Report/test123.csv') # writes to the path given here (use a bare filename to write to the working directory)