Created
October 19, 2018 03:44
-
-
Save HarshSingh16/3e4689b892cf8ec822e6fadadd85be81 to your computer and use it in GitHub Desktop.
Predicting Customer Default for a US based financial institution
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
if("pacman" %in% rownames(installed.packages()) == FALSE) {install.packages("pacman")} # Check if you have universal installer package, install if not | |
pacman::p_load("caret","ROCR","lift","glmnet","MASS","e1071","readxl") #Check, and if needed install the necessary packages | |
###########LOADING THE CREDIT DATA FILE | |
creditdata1<-read_excel(file.choose()) | |
names(creditdata1)[1]<-"CurrentId" | |
str(creditdata1) | |
creditdata1$`OLD SYSTEM ID`<-NULL | |
###########LOADING THE NEW APPLICATIONS FILE | |
NEWAPPLICATIONS<-read.csv(choose.files()) | |
###########THE NEXT 25 LINES JUST CONVERT THE TYPES OF NEWAPPLICATIONS.CSV TO MATCH THAT OF CREDITDATA. PROPER VARIABLE CONVERSIONS HAVE BEEN PERFORMED LATER ONCE THE TWO DATASETS HAVE BEEN COMINED USING RBIND | |
names(NEWAPPLICATIONS)[1]<-"CurrentId" | |
NEWAPPLICATIONS$LIMIT_BAL<-as.character(NEWAPPLICATIONS$LIMIT_BAL) | |
NEWAPPLICATIONS$SEX<-as.numeric(NEWAPPLICATIONS$SEX) | |
NEWAPPLICATIONS$EDUCATION<-as.numeric(NEWAPPLICATIONS$EDUCATION) | |
NEWAPPLICATIONS$MARRIAGE<-as.numeric(NEWAPPLICATIONS$MARRIAGE) | |
NEWAPPLICATIONS$AGE<-as.numeric(NEWAPPLICATIONS$AGE) | |
NEWAPPLICATIONS$PAY_0<-as.numeric(NEWAPPLICATIONS$PAY_0) | |
NEWAPPLICATIONS$PAY_2<-as.numeric(NEWAPPLICATIONS$PAY_2) | |
NEWAPPLICATIONS$PAY_3<-as.numeric(NEWAPPLICATIONS$PAY_3) | |
NEWAPPLICATIONS$PAY_4<-as.numeric(NEWAPPLICATIONS$PAY_4) | |
NEWAPPLICATIONS$PAY_5<-as.numeric(NEWAPPLICATIONS$PAY_5) | |
NEWAPPLICATIONS$PAY_6<-as.numeric(NEWAPPLICATIONS$PAY_6) | |
NEWAPPLICATIONS$BILL_AMT1<-as.numeric(NEWAPPLICATIONS$BILL_AMT1) | |
NEWAPPLICATIONS$BILL_AMT2<-as.numeric(NEWAPPLICATIONS$BILL_AMT2) | |
NEWAPPLICATIONS$BILL_AMT3<-as.numeric(NEWAPPLICATIONS$BILL_AMT3) | |
NEWAPPLICATIONS$BILL_AMT4<-as.numeric(NEWAPPLICATIONS$BILL_AMT4) | |
NEWAPPLICATIONS$BILL_AMT5<-as.numeric(NEWAPPLICATIONS$BILL_AMT5) | |
NEWAPPLICATIONS$BILL_AMT6<-as.numeric(NEWAPPLICATIONS$BILL_AMT6) | |
NEWAPPLICATIONS$PAY_AMT1<-as.character(NEWAPPLICATIONS$PAY_AMT1) | |
NEWAPPLICATIONS$PAY_AMT2<-as.numeric(NEWAPPLICATIONS$PAY_AMT2) | |
NEWAPPLICATIONS$PAY_AMT3<-as.character(NEWAPPLICATIONS$PAY_AMT3) | |
NEWAPPLICATIONS$PAY_AMT4<-as.character(NEWAPPLICATIONS$PAY_AMT4) | |
NEWAPPLICATIONS$PAY_AMT5<-as.character(NEWAPPLICATIONS$PAY_AMT5) | |
NEWAPPLICATIONS$PAY_AMT6<-as.numeric(NEWAPPLICATIONS$PAY_AMT6) | |
str(NEWAPPLICATIONS) | |
NEWAPPLICATIONS[1,1]<-15001 | |
NEWAPPLICATIONS[2,1]<-15002 | |
NEWAPPLICATIONS$default.payment.next.month<-NA | |
creditdata<-rbind(creditdata1,NEWAPPLICATIONS) | |
str(creditdata) | |
str(NEWAPPLICATIONS) | |
creditdata$SEX<-as.factor(creditdata$SEX) | |
creditdata$EDUCATION<-as.factor(creditdata$EDUCATION) | |
creditdata$MARRIAGE<-as.factor(creditdata$MARRIAGE) | |
str(creditdata) | |
creditdata$PAY_0<-as.factor(creditdata$PAY_0) | |
creditdata$PAY_2<-as.factor(creditdata$PAY_2) | |
creditdata$PAY_3<-as.factor(creditdata$PAY_3) | |
creditdata$PAY_4<-as.factor(creditdata$PAY_4) | |
creditdata$PAY_5<-as.factor(creditdata$PAY_5) | |
creditdata$PAY_6<-as.factor(creditdata$PAY_6) | |
creditdata$LIMIT_BAL<-as.numeric(creditdata$LIMIT_BAL) | |
creditdata$BILL_AMT1<-as.numeric(creditdata$BILL_AMT1) | |
creditdata$BILL_AMT2<-as.numeric(creditdata$BILL_AMT2) | |
creditdata$BILL_AMT3<-as.numeric(creditdata$BILL_AMT3) | |
creditdata$BILL_AMT4<-as.numeric(creditdata$BILL_AMT4) | |
creditdata$BILL_AMT5<-as.numeric(creditdata$BILL_AMT5) | |
creditdata$BILL_AMT6<-as.numeric(creditdata$BILL_AMT6) | |
creditdata$PAY_AMT1<-as.numeric(creditdata$PAY_AMT1) | |
creditdata$PAY_AMT2<-as.numeric(creditdata$PAY_AMT2) | |
creditdata$PAY_AMT3<-as.numeric(creditdata$PAY_AMT3) | |
creditdata$PAY_AMT4<-as.numeric(creditdata$PAY_AMT4) | |
creditdata$PAY_AMT5<-as.numeric(creditdata$PAY_AMT5) | |
creditdata$PAY_AMT6<-as.numeric(creditdata$PAY_AMT6) | |
str(creditdata) | |
creditdata$AGE<-as.numeric(creditdata$AGE) | |
creditdata$default.payment.next.month<-as.factor(creditdata$default.payment.next.month) | |
str(creditdata) | |
######CHECKING NA's | |
sapply(creditdata, function(x)sum(is.na(x))) | |
#####SEPERATING THE OBSERVATIONS TO PREDICT FROM THE REST | |
tail(creditdata) | |
CREDIT1<-subset(creditdata,CurrentId<=15000) | |
CREDIT2<-subset(creditdata,CurrentId>15000) | |
###################Creating Training and Test Data | |
set.seed(77850) #set a random number generation seed to ensure that the split is the same everytime | |
inTrain <- createDataPartition(y = CREDIT1$default.payment.next.month, | |
p = 12000/15000, list = FALSE) | |
training <- CREDIT1[ inTrain,] | |
testing <- CREDIT1[ -inTrain,] | |
str(testing) | |
#########################Training the model using logisitc Regression | |
library(glmnet) | |
model_logistic<-glm(default.payment.next.month~.-CurrentId-SEX, data=training, family="binomial"(link="logit")) | |
summary(model_logistic) | |
model_logistic_stepwiseAIC<-model_logistic | |
summary(model_logistic_stepwiseAIC) | |
par(mfrow=c(1,4)) | |
plot(model_logistic_stepwiseAIC) #Error plots: similar nature to lm plots | |
par(mfrow=c(1,1)) | |
str(testing) | |
par(mar=c(6,4,4,2)) | |
###Finding predicitons: probabilities and classification | |
logistic_probabilities<-predict(model_logistic_stepwiseAIC,newdata=testing,type="response") #Predict probabilities | |
logistic_classification<-rep("1",2999) | |
logistic_classification[logistic_probabilities<0.5]="0" #Predict classification using 0.6073 threshold. Why 0.6073 - that's the average probability of being retained in the data. An alternative code: logistic_classification <- as.integer(logistic_probabilities > mean(testing$Retained.in.2012. == "1")) | |
logistic_classification<-as.factor(logistic_classification) | |
str(testing) | |
str(training) | |
###Confusion matrix | |
confusionMatrix(logistic_classification,testing$default.payment.next.month) #Display confusion matrix | |
####ROC Curve | |
logistic_ROC_prediction <- prediction(logistic_probabilities, testing$default.payment.next.month) | |
logistic_ROC <- performance(logistic_ROC_prediction,"tpr","fpr") #Create ROC curve data | |
plot(logistic_ROC) #Plot ROC curve | |
####AUC (area under curve) | |
auc.tmp <- performance(logistic_ROC_prediction,"auc") #Create AUC data | |
logistic_auc_testing <- as.numeric([email protected]) #Calculate AUC | |
logistic_auc_testing #Display AUC value: 90+% - excellent, 80-90% - very good, 70-80% - good, 60-70% - so so, below 60% - not much value | |
###################################Training the model using Random Forest | |
str(training) | |
library(randomForest) | |
model_forest <- randomForest(default.payment.next.month~ .-CurrentId, data=training, | |
importance=TRUE,proximity=TRUE, | |
cutoff = c(0.5, 0.5),type="classification") #cutoffs need to be determined for class 0 and class 1. By default 50/50, but need not be those necessarily | |
print(model_forest) | |
plot(model_forest) | |
importance(model_forest) | |
varImpPlot(model_forest) | |
###Finding predicitons: probabilities and classification | |
forest_probabilities<-predict(model_forest,newdata=testing,type="prob") #Predict probabilities -- an array with 2 columns: for not retained (class 0) and for retained (class 1) | |
forest_classification<-rep("1",2999) | |
forest_classification[forest_probabilities[,2]<0.5]="0" #Predict classification using 0.5 threshold. Why 0.5 and not 0.6073? Use the same as in cutoff above | |
forest_classification<-as.factor(forest_classification) | |
confusionMatrix(forest_classification,testing$default.payment.next.month) #Display confusion matrix. Note, confusion matrix actually displays a better accuracy with threshold of 50% | |
#There is also a "shortcut" forest_prediction<-predict(model_forest,newdata=testing, type="response") | |
#But it by default uses threshold of 50%: actually works better (more accuracy) on this data | |
####ROC Curve | |
forest_ROC_prediction <- prediction(forest_probabilities[,2], testing$default.payment.next.month) #Calculate errors | |
forest_ROC <- performance(forest_ROC_prediction,"tpr","fpr") #Create ROC curve data | |
plot(forest_ROC) #Plot ROC curve | |
plot(forest_ROC, add=TRUE, col="blue") #For comparison, overlay/add the ROC curve from (A) in red | |
legend("right", legend=c("Logistic","Random Forest"), col=c("red","blue"), lty=1:2, cex=0.6) | |
####AUC (area under curve) | |
AUC.tmp <- performance(forest_ROC_prediction,"auc") #Create AUC data | |
forest_AUC <- as.numeric([email protected]) #Calculate AUC | |
forest_AUC #Display AUC value: 90+% - excellent, 80-90% - very good, 70-80% - good, 60-70% - so so, below 60% - not much value | |
######################################### TRAINING THE MODEL USING XGBOOST | |
library(xgboost) | |
training.x <-model.matrix(default.payment.next.month~ ., data = training) | |
testing.x <-model.matrix(default.payment.next.month~ ., data = testing) | |
model_XGboost<-xgboost(data = data.matrix(training.x[,-1:-3]), | |
label = as.numeric(as.character(training$default.payment.next.month)), | |
eta = 0.1, | |
max_depth = 20, | |
nround=50, | |
objective = "binary:logistic") | |
XGboost_prediction<-predict(model_XGboost,newdata=testing.x[,-1:-3], type="response") #Predict classification (for confusion matrix) | |
XGboost_prediction2<-as.factor(ifelse(XGboost_prediction>0.5,1,0)) | |
confusionMatrix(XGboost_prediction2,testing$default.payment.next.month) #Display confusion matrix | |
####ROC Curve | |
XGboost_pred_testing <- prediction(XGboost_prediction, testing$default.payment.next.month) #Calculate errors | |
XGboost_ROC_testing <- performance(XGboost_pred_testing,"tpr","fpr") #Create ROC curve data | |
plot(XGboost_ROC_testing) #Plot ROC curve | |
plot(XGboost_ROC_testing, add=TRUE, col="green") #For comparison, overlay/add the ROC curve from (A) in red | |
legend("right", legend=c("Logistic","Random Forest","XGBoost"), col=c("red","blue","green"), lty=1:2, cex=0.6) | |
####AUC | |
auc.tmp <- performance(XGboost_pred_testing,"auc") #Create AUC data | |
XGboost_auc_testing <- as.numeric([email protected]) #Calculate AUC | |
XGboost_auc_testing #Display AUC value: 90+% - excellent, 80-90% - very good, 70-80% - good, 60-70% - so so, below 60% - not much value | |
######################TRAINING THE MODEL USING GRADIENT BOOSTING | |
library(gbm) | |
str(training) | |
training$default.payment.next.month<-as.character(training$default.payment.next.month) | |
model_ExtremeGradientBoosting<-gbm(default.payment.next.month~.-CurrentId, | |
distribution="bernoulli", | |
data=training, | |
n.trees=1000, | |
interaction.depth = 4, | |
shrinkage = 0.01) | |
summary(model_ExtremeGradientBoosting) | |
GBM_prediction<-predict(model_ExtremeGradientBoosting,testing,n.trees = 1000,type = "response") #Predict classification (for confusion matrix) | |
GBM_prediction2<-as.factor(ifelse(GBM_prediction>0.5,1,0)) | |
confusionMatrix(GBM_prediction2,testing$default.payment.next.month) #Display confusion matrix | |
####ROC Curve | |
GBM_pred_testing <- prediction(GBM_prediction, testing$default.payment.next.month) #Calculate errors | |
GBM_ROC_testing <- performance(GBM_pred_testing,"tpr","fpr") #Create ROC curve data | |
plot(GBM_ROC_testing) #Plot ROC curve | |
plot(GBM_ROC_testing, add=TRUE, col="green") #For comparison, overlay/add the ROC curve from (A) in red | |
legend("right", legend=c("Logistic","Random Forest","XGBoost"), col=c("red","blue","green"), lty=1:2, cex=0.6) | |
####AUC | |
auc.tmp <- performance(GBM_pred_testing,"auc") #Create AUC data | |
gbm_auc_testing <- as.numeric([email protected]) #Calculate AUC | |
gbm_auc_testing ##AUC Score WITHOUT Feature Engineering | |
plot(GBM_ROC_testing, add=TRUE, col="green") #For comparison, overlay/add the ROC curve from (A) in red | |
legend("right", legend=c("Without feature engineering"), col=c("green"), lty=1:2, cex=0.6) | |
#########################PREDICTING THE TWO NEW OBSERVATIONS | |
str(creditdata) | |
tail(creditdata) | |
CREDIT2<-subset(creditdata,CurrentId>15000) | |
tail(CREDIT2) | |
library(gbm) | |
str(training) | |
CREDIT1$default.payment.next.month<-as.character(CREDIT1$default.payment.next.month) | |
model_ExtremeGradientBoosting<-gbm(default.payment.next.month~.-CurrentId, | |
distribution="bernoulli", | |
data=CREDIT1, | |
n.trees=1000, | |
interaction.depth = 4, | |
shrinkage = 0.01) | |
summary(model_ExtremeGradientBoosting) | |
GBM_prediction<-predict(model_ExtremeGradientBoosting,CREDIT2,n.trees = 1000,type = "response") #Predict classification (for confusion matrix) | |
GBM_prediction2<-as.factor(ifelse(GBM_prediction>0.5,1,0)) | |
CREDIT2 | |
str(CREDIT2) | |
str(creditdata) | |
##################################### PERFORMING FEATURE ENGINEERING | |
creditdata<-CREDIT1 | |
str(creditdata) | |
creditdata$SEX<-as.factor(creditdata$SEX) | |
creditdata$EDUCATION<-as.factor(creditdata$EDUCATION) | |
creditdata$MARRIAGE<-as.factor(creditdata$MARRIAGE) | |
creditdata$PAY_0<-as.factor(creditdata$PAY_0) | |
creditdata$PAY_2<-as.factor(creditdata$PAY_2) | |
creditdata$PAY_3<-as.factor(creditdata$PAY_3) | |
creditdata$PAY_4<-as.factor(creditdata$PAY_4) | |
creditdata$PAY_5<-as.factor(creditdata$PAY_5) | |
creditdata$PAY_6<-as.factor(creditdata$PAY_6) | |
creditdata$LIMIT_BAL<-as.numeric(creditdata$LIMIT_BAL) | |
creditdata$BILL_AMT1<-as.numeric(creditdata$BILL_AMT1) | |
creditdata$BILL_AMT2<-as.numeric(creditdata$BILL_AMT2) | |
creditdata$BILL_AMT3<-as.numeric(creditdata$BILL_AMT3) | |
creditdata$BILL_AMT4<-as.numeric(creditdata$BILL_AMT4) | |
creditdata$BILL_AMT5<-as.numeric(creditdata$BILL_AMT5) | |
creditdata$BILL_AMT6<-as.numeric(creditdata$BILL_AMT6) | |
creditdata$PAY_AMT1<-as.numeric(creditdata$PAY_AMT1) | |
creditdata$PAY_AMT3<-as.numeric(creditdata$PAY_AMT3) | |
creditdata$PAY_AMT4<-as.numeric(creditdata$PAY_AMT4) | |
creditdata$PAY_AMT5<-as.numeric(creditdata$PAY_AMT5) | |
creditdata$AGE<-as.numeric(creditdata$AGE) | |
creditdata$default.payment.next.month<-as.factor(creditdata$default.payment.next.month) | |
str(creditdata) | |
table(creditdata$AGE) | |
sapply(creditdata, function(x)sum(is.na(x))) | |
names(creditdata)[1]<-"CurrentId" | |
summary(creditdata) | |
str(creditdata) | |
######################feature engineering for sex | |
#creditdata$SEX1<-as.numeric(creditdata$SEX) | |
#creditdata$SexLimit<-creditdata$LIMIT_BAL*creditdata$SEX1 | |
#creditdata$Billamt1sex<-creditdata$BILL_AMT1*creditdata$SEX1 | |
#creditdata$Billamt2sex<-creditdata$BILL_AMT2*creditdata$SEX1 | |
#creditdata$Billamt3sex<-creditdata$BILL_AMT3*creditdata$SEX1 | |
#creditdata$Billamt4sex<-creditdata$BILL_AMT4*creditdata$SEX1 | |
#creditdata$Billamt5sex<-creditdata$BILL_AMT5*creditdata$SEX1 | |
#creditdata$Billamt6sex<-creditdata$BILL_AMT6*creditdata$SEX1 | |
#creditdata$PayAmt1sex<-creditdata$PAY_AMT1*creditdata$SEX1 | |
#creditdata$PayAmt2sex<-creditdata$PAY_AMT2*creditdata$SEX1 | |
#creditdata$PayAmt3sex<-creditdata$PAY_AMT3*creditdata$SEX1 | |
#creditdata$PayAmt4sex<-creditdata$PAY_AMT4*creditdata$SEX1 | |
#creditdata$PayAmt5sex<-creditdata$PAY_AMT5*creditdata$SEX1 | |
#creditdata$PayAmt6sex<-creditdata$PAY_AMT6*creditdata$SEX1 | |
######################feature engineering for education | |
#########creditdata$EDUCATION2<-creditdata$EDUCATION | |
##########creditdata$EDUCATION2<-as.character(creditdata$EDUCATION2) | |
###########creditdata[which(creditdata$EDUCATION2==0),"EDUCATION2"]<-4 | |
#creditdata[which(creditdata$EDUCATION2==4),"EDUCATION2"]<-4 | |
#creditdata[which(creditdata$EDUCATION2==5),"EDUCATION2"]<-4 | |
#creditdata[which(creditdata$EDUCATION2==6),"EDUCATION2"]<-4 | |
#creditdata$EDUCATION2<-as.factor(creditdata$EDUCATION2) | |
#creditdata$EDUCATION1<-as.numeric(creditdata$EDUCATION) | |
#creditdata$eduLimit<-creditdata$LIMIT_BAL*creditdata$EDUCATION1 | |
#creditdata$Billamt1edu<-creditdata$BILL_AMT1*creditdata$EDUCATION1 | |
#creditdata$Billamt2edu<-creditdata$BILL_AMT2*creditdata$EDUCATION1 | |
#creditdata$Billamt3edu<-creditdata$BILL_AMT3*creditdata$EDUCATION1 | |
#creditdata$Billamt4edu<-creditdata$BILL_AMT4*creditdata$EDUCATION1 | |
#creditdata$Billamt5edu<-creditdata$BILL_AMT5*creditdata$EDUCATION1 | |
#creditdata$Billamt6edu<-creditdata$BILL_AMT6*creditdata$EDUCATION1 | |
#creditdata$PayAmt1edu<-creditdata$PAY_AMT1*creditdata$EDUCATION1 | |
#creditdata$PayAmt2edu<-creditdata$PAY_AMT2*creditdata$EDUCATION1 | |
#creditdata$PayAmt3edu<-creditdata$PAY_AMT3*creditdata$EDUCATION1 | |
#creditdata$PayAmt4edu<-creditdata$PAY_AMT4*creditdata$EDUCATION1 | |
#creditdata$PayAmt5edu<-creditdata$PAY_AMT5*creditdata$EDUCATION1 | |
#creditdata$PayAmt6edu<-creditdata$PAY_AMT6*creditdata$EDUCATION1 | |
######################feature engineering for MARRIAGE | |
#creditdata$MARRIAGE1<-as.numeric(creditdata$MARRIAGE) | |
#creditdata$MarriageLimit<-creditdata$LIMIT_BAL*creditdata$MARRIAGE1 | |
#creditdata$Billamt1mARRIAGE<-creditdata$BILL_AMT1*creditdata$MARRIAGE1 | |
#creditdata$Billamt2mARRIAGE<-creditdata$BILL_AMT2*creditdata$MARRIAGE1 | |
#creditdata$Billamt3mARRIAGE<-creditdata$BILL_AMT3*creditdata$MARRIAGE1 | |
#creditdata$Billamt4mARRIAGE<-creditdata$BILL_AMT4*creditdata$MARRIAGE1 | |
#creditdata$Billamt5mARRIAGE<-creditdata$BILL_AMT5*creditdata$MARRIAGE1 | |
#creditdata$Billamt6mARRIAGE<-creditdata$BILL_AMT6*creditdata$MARRIAGE1 | |
#creditdata$PayAmt1MARRIAGE<-creditdata$PAY_AMT1*creditdata$MARRIAGE1 | |
#creditdata$PayAmt2MARRIAGE<-creditdata$PAY_AMT2creditdata$MARRIAGE1 | |
#creditdata$PayAmt3MARRIAGE<-creditdata$PAY_AMT3*creditdata$MARRIAGE1 | |
#creditdata$PayAmt4MARRIAGE<-creditdata$PAY_AMT4*creditdata$MARRIAGE1 | |
#creditdata$PayAmt5MARRIAGE<-creditdata$PAY_AMT5*creditdata$MARRIAGE1 | |
#creditdata$PayAmt6MARRIAGE<-creditdata$PAY_AMT6*creditdata$MARRIAGE1 | |
#######RUN THIS | |
creditdata$Month1<-(creditdata$BILL_AMT1-creditdata$BILL_AMT2) | |
creditdata$Month2<-(creditdata$BILL_AMT2-creditdata$BILL_AMT3) | |
creditdata$Month3<-(creditdata$BILL_AMT3-creditdata$BILL_AMT4) | |
creditdata$Month4<-(creditdata$BILL_AMT4-creditdata$BILL_AMT5) | |
creditdata$Month5<-(creditdata$BILL_AMT5-creditdata$BILL_AMT6) | |
creditdata$DiffPayAmt1<-(creditdata$PAY_AMT1-creditdata$PAY_AMT2) | |
creditdata$DiffPayAmt2<-(creditdata$PAY_AMT2-creditdata$PAY_AMT3) | |
creditdata$DiffPayAmt3<-(creditdata$PAY_AMT3-creditdata$PAY_AMT4) | |
creditdata$DiffPayAmt4<-(creditdata$PAY_AMT4-creditdata$PAY_AMT5) | |
creditdata$DiffPayAmt5<-(creditdata$PAY_AMT5-creditdata$PAY_AMT6) | |
str(creditdata) | |
creditdata$Month1BalanceProp<-max(creditdata$BILL_AMT1,0)/creditdata$LIMIT_BAL | |
creditdata$Month2BalanceProp<-max(creditdata$BILL_AMT2,0)/creditdata$LIMIT_BAL | |
creditdata$Month3BalanceProp<-max(creditdata$BILL_AMT3,0)/creditdata$LIMIT_BAL | |
creditdata$Month4BalanceProp<-max(creditdata$BILL_AMT4,0)/creditdata$LIMIT_BAL | |
creditdata$Month5BalanceProp<-max(creditdata$BILL_AMT5,0)/creditdata$LIMIT_BAL | |
creditdata$Month6BalanceProp<-max(creditdata$BILL_AMT6,0)/creditdata$LIMIT_BAL | |
creditdata$AggregateLimBalance<-max(creditdata$BILL_AMT1+creditdata$BILL_AMT2+ | |
creditdata$BILL_AMT3+creditdata$BILL_AMT4+ | |
creditdata$BILL_AMT5+creditdata$BILL_AMT6,0)/creditdata$LIMIT_BAL | |
#####RUN THIS | |
creditdata$PayAmtBalance1<-ifelse(creditdata$BILL_AMT1<=0,1,(creditdata$PAY_AMT1/creditdata$BILL_AMT1)) | |
creditdata$PayAmtBalance2<-ifelse(creditdata$BILL_AMT2<=0,1,(creditdata$PAY_AMT2/creditdata$BILL_AMT2)) | |
creditdata$PayAmtBalance3<-ifelse(creditdata$BILL_AMT3<=0,1,(creditdata$PAY_AMT3/creditdata$BILL_AMT3)) | |
creditdata$PayAmtBalance4<-ifelse(creditdata$BILL_AMT4<=0,1,(creditdata$PAY_AMT4/creditdata$BILL_AMT4)) | |
creditdata$PayAmtBalance5<-ifelse(creditdata$BILL_AMT5<=0,1,(creditdata$PAY_AMT5/creditdata$BILL_AMT5)) | |
creditdata$PayAmtBalance6<-ifelse(creditdata$BILL_AMT6<=0,1,(creditdata$PAY_AMT6/creditdata$BILL_AMT6)) | |
#####RUN THIS | |
creditdata$TotalBalancedue<-creditdata$BILL_AMT1+creditdata$BILL_AMT2+creditdata$BILL_AMT3+creditdata$BILL_AMT4+ | |
creditdata$BILL_AMT5+creditdata$BILL_AMT6 | |
##########RUN THIS | |
creditdata$PayAmtdue<-creditdata$PAY_AMT1+creditdata$PAY_AMT2+creditdata$PAY_AMT3+creditdata$PAY_AMT4+ | |
creditdata$PAY_AMT5+creditdata$PAY_AMT6 | |
#######RUN THIS | |
creditdata$Difference<-creditdata$TotalBalancedue-creditdata$PayAmtdue | |
creditdata$Difference1<-creditdata$LIMIT_BAL-creditdata$TotalBalancedue | |
creditdata$Difference2<-creditdata$LIMIT_BAL-creditdata$PayAmtdue | |
creditdata$TWOMONTHBILLAMNT1<-creditdata$BILL_AMT1+creditdata$BILL_AMT2 | |
creditdata$TWOMONTHBILLAMNT2<-creditdata$BILL_AMT3+creditdata$BILL_AMT4 | |
creditdata$TWOMONTHBILLAMNT3<-creditdata$BILL_AMT5+creditdata$BILL_AMT6 | |
creditdata$TWOMONTHpayAMNT1<-creditdata$PAY_AMT1+creditdata$PAY_AMT2 | |
creditdata$TWOMONTHpayAMNT2<-creditdata$PAY_AMT3+creditdata$PAY_AMT4 | |
creditdata$TWOMONTHpayAMNT3<-creditdata$PAY_AMT5+creditdata$PAY_AMT6 | |
creditdata$D1<-creditdata$BILL_AMT1-creditdata$PAY_AMT1 | |
creditdata$D2<-creditdata$BILL_AMT2-creditdata$PAY_AMT2 | |
creditdata$D3<-creditdata$BILL_AMT3-creditdata$PAY_AMT3 | |
creditdata$D4<-creditdata$BILL_AMT4-creditdata$PAY_AMT4 | |
creditdata$D5<-creditdata$BILL_AMT5-creditdata$PAY_AMT5 | |
creditdata$D6<-creditdata$BILL_AMT6-creditdata$PAY_AMT6 | |
creditdata$newdiff1<-creditdata$BILL_AMT1-creditdata$BILL_AMT3 | |
creditdata$newdiff2<-creditdata$BILL_AMT3-creditdata$BILL_AMT5 | |
creditdata$newdiff3<-creditdata$BILL_AMT2-creditdata$BILL_AMT4 | |
creditdata$newdiff4<-creditdata$BILL_AMT4-creditdata$BILL_AMT6 | |
creditdata$newpaydeiff1<-creditdata$PAY_AMT1-creditdata$PAY_AMT3 | |
creditdata$newpaydeiff2<-creditdata$PAY_AMT3-creditdata$PAY_AMT5 | |
creditdata$newpaydeiff3<-creditdata$PAY_AMT2-creditdata$PAY_AMT4 | |
creditdata$newpaydeiff2<-creditdata$PAY_AMT4-creditdata$PAY_AMT6 | |
#creditdata$extremedeiff1<-creditdata$BILL_AMT1-creditdata$BILL_AMT6 | |
#creditdata$extremedeiff2<-creditdata$BILL_AMT1-creditdata$BILL_AMT5 | |
#creditdata$extremedeiff3<-creditdata$BILL_AMT1-creditdata$BILL_AMT4 | |
#creditdata$extremedeiff4<-creditdata$BILL_AMT1-creditdata$BILL_AMT3 | |
#creditdata$extremedeiff5<-creditdata$BILL_AMT2-creditdata$BILL_AMT6 | |
#creditdata$extremedeiff6<-creditdata$BILL_AMT2-creditdata$BILL_AMT5 | |
#creditdata$extremedeiff7<-creditdata$BILL_AMT2-creditdata$BILL_AMT4 | |
#creditdata$extremedeiff8<-creditdata$BILL_AMT3-creditdata$BILL_AMT6 | |
#creditdata$extremedeiff9<-creditdata$BILL_AMT3-creditdata$BILL_AMT5 | |
#creditdata$extremedeiff10<-creditdata$BILL_AMT4-creditdata$BILL_AMT6 | |
#creditdata$extremediffpaydue<-creditdata$PAY_AMT1-creditdata$PAY_AMT6 | |
#creditdata$extremediffpaydue2<-creditdata$PAY_AMT1-creditdata$PAY_AMT5 | |
#creditdata$extremediffpaydue3<-creditdata$PAY_AMT1-creditdata$PAY_AMT4 | |
#creditdata$extremediffpaydue4<-creditdata$PAY_AMT1-creditdata$PAY_AMT3 | |
#creditdata$extremediffpaydue10<-creditdata$PAY_AMT2-creditdata$PAY_AMT6 | |
#creditdata$extremediffpaydue5<-creditdata$PAY_AMT2-creditdata$PAY_AMT5 | |
#creditdata$extremediffpaydue6<-creditdata$PAY_AMT2-creditdata$PAY_AMT4 | |
#creditdata$extremediffpaydue7<-creditdata$PAY_AMT3-creditdata$PAY_AMT6 | |
#creditdata$extremediffpaydue8<-creditdata$PAY_AMT3-creditdata$PAY_AMT5 | |
#creditdata$extremediffpaydue9<-creditdata$PAY_AMT4-creditdata$PAY_AMT6 | |
creditdata$ThreemonthBillAmtDue<-creditdata$BILL_AMT1+creditdata$BILL_AMT2+creditdata$BILL_AMT3 | |
creditdata$ThreemonthBillAmtDue2<-creditdata$BILL_AMT4+creditdata$BILL_AMT5+creditdata$BILL_AMT6 | |
creditdata$ThreemonthPayAmtDue<-creditdata$PAY_AMT1+creditdata$PAY_AMT2+creditdata$PAY_AMT3 | |
creditdata$ThreemonthPayAmtDue2<-creditdata$PAY_AMT4+creditdata$PAY_AMT5+creditdata$PAY_AMT6 | |
###############################CORRECTING PAY_0 | |
#creditdata$PAY_00<-creditdata$PAY_0 | |
#R1<-which(creditdata$PAY_00==4) | |
#R2<-which(creditdata$PAY_00==5) | |
#R3<-which(creditdata$PAY_00==6) | |
#R4<-which(creditdata$PAY_00==7) | |
#R5<-which(creditdata$PAY_00==8) | |
#R6<-which(creditdata$PAY_00==-2) | |
#R7<-which(creditdata$PAY_00==-1) | |
#R8<-which(creditdata$PAY_00==0) | |
#creditdata[R1,"PAY_00"]<-4 | |
#creditdata[R2,"PAY_00"]<-4 | |
#creditdata[R3,"PAY_00"]<-4 | |
#creditdata[R4,"PAY_00"]<-4 | |
#creditdata[R5,"PAY_00"]<-4 | |
#creditdata[R6,"PAY_00"]<-0 | |
#creditdata[R7,"PAY_00"]<-0 | |
#creditdata[R8,"PAY_00"]<-0 | |
###############################CORRECTING PAY_2 | |
creditdata$PAY_02<-creditdata$PAY_2 | |
R1<-which(creditdata$PAY_02==4) | |
R2<-which(creditdata$PAY_02==5) | |
R3<-which(creditdata$PAY_02==6) | |
R4<-which(creditdata$PAY_02==7) | |
R5<-which(creditdata$PAY_02==8) | |
R6<-which(creditdata$PAY_02==-2) | |
R7<-which(creditdata$PAY_02==-1) | |
R8<-which(creditdata$PAY_02==0) | |
creditdata[R1,"PAY_02"]<-4 | |
creditdata[R2,"PAY_02"]<-4 | |
creditdata[R3,"PAY_02"]<-4 | |
creditdata[R4,"PAY_02"]<-4 | |
creditdata[R5,"PAY_02"]<-4 | |
creditdata[R6,"PAY_02"]<-0 | |
creditdata[R7,"PAY_02"]<-0 | |
creditdata[R8,"PAY_02"]<-0 | |
###############################CORRECTING PAY_3 | |
#creditdata$PAY_03<-creditdata$PAY_3 | |
#R1<-which(creditdata$PAY_03==4) | |
#R2<-which(creditdata$PAY_03==5) | |
#R3<-which(creditdata$PAY_03==6) | |
#R4<-which(creditdata$PAY_03==7) | |
#R5<-which(creditdata$PAY_03==8) | |
#R6<-which(creditdata$PAY_03==-2) | |
#R7<-which(creditdata$PAY_03==-1) | |
#R8<-which(creditdata$PAY_03==0) | |
#creditdata[R1,"PAY_03"]<-4 | |
#creditdata[R2,"PAY_03"]<-4 | |
#creditdata[R3,"PAY_03"]<-4 | |
#creditdata[R4,"PAY_03"]<-4 | |
#creditdata[R5,"PAY_03"]<-4 | |
#creditdata[R6,"PAY_03"]<-0 | |
#creditdata[R7,"PAY_03"]<-0 | |
#creditdata[R8,"PAY_03"]<-0 | |
###############################CORRECTING PAY_5 | |
creditdata$PAY_05<-creditdata$PAY_5 | |
R1<-which(creditdata$PAY_05==4) | |
R2<-which(creditdata$PAY_05==5) | |
R3<-which(creditdata$PAY_05==6) | |
R4<-which(creditdata$PAY_05==7) | |
R5<-which(creditdata$PAY_05==8) | |
R6<-which(creditdata$PAY_05==-2) | |
R7<-which(creditdata$PAY_05==-1) | |
R8<-which(creditdata$PAY_05==0) | |
creditdata[R1,"PAY_05"]<-4 | |
creditdata[R2,"PAY_05"]<-4 | |
creditdata[R3,"PAY_05"]<-4 | |
creditdata[R4,"PAY_05"]<-4 | |
creditdata[R5,"PAY_05"]<-4 | |
creditdata[R6,"PAY_05"]<-0 | |
creditdata[R7,"PAY_05"]<-0 | |
creditdata[R8,"PAY_05"]<-0 | |
###############################CORRECTING PAY_6 | |
creditdata$PAY_06<-creditdata$PAY_6 | |
R1<-which(creditdata$PAY_06==4) | |
R2<-which(creditdata$PAY_06==5) | |
R3<-which(creditdata$PAY_06==6) | |
R4<-which(creditdata$PAY_06==7) | |
R5<-which(creditdata$PAY_06==8) | |
R6<-which(creditdata$PAY_06==-2) | |
R7<-which(creditdata$PAY_06==-1) | |
R8<-which(creditdata$PAY_06==0) | |
creditdata[R1,"PAY_06"]<-4 | |
creditdata[R2,"PAY_06"]<-4 | |
creditdata[R3,"PAY_06"]<-4 | |
creditdata[R4,"PAY_06"]<-4 | |
creditdata[R5,"PAY_06"]<-4 | |
creditdata[R6,"PAY_06"]<-0 | |
creditdata[R7,"PAY_06"]<-0 | |
creditdata[R8,"PAY_06"]<-0 | |
############CORRECTING EDUCATION | |
creditdata$EDUCATION2<-creditdata$EDUCATION | |
Row1<-which(creditdata$EDUCATION2==4) | |
Row2<-which(creditdata$EDUCATION2==5) | |
Row3<-which(creditdata$EDUCATION2==6) | |
Row4<-which(creditdata$EDUCATION2==0) | |
creditdata[Row1,"EDUCATION2"]<-0 | |
creditdata[Row2,"EDUCATION2"]<-0 | |
creditdata[Row3,"EDUCATION2"]<-0 | |
creditdata[Row4,"EDUCATION2"]<-0 | |
############CORRECTING MARRIAGE | |
creditdata$Mariage2<-creditdata$MARRIAGE | |
Row1<-which(creditdata$Mariage2==0) | |
creditdata[Row1,"Mariage2"]<-3 | |
str(creditdata) | |
table(creditdata$AGE) | |
creditdata3<-creditdata | |
row1<-which(creditdata3$BILL_AMT1==0 & creditdata3$BILL_AMT2==0&creditdata3$BILL_AMT3==0&creditdata3$BILL_AMT4==0& | |
creditdata3$BILL_AMT5==0&creditdata3$BILL_AMT6==0&creditdata3$default.payment.next.month==1) | |
creditdata3<-creditdata3[-row1,] | |
str(creditdata3) | |
str(creditdata) | |
############club 4, 5 and 6 in education as others and include 0 values | |
########### club 3 in marriage as others | |
sapply(training1, function(x) sum(is.na(x))) | |
hist(creditdata$Month2) | |
###################Creating Data Partition | |
set.seed(77800) | |
inTrain1 <- createDataPartition(y = creditdata3$default.payment.next.month, | |
p = 12500/14837, list = FALSE) | |
training <- creditdata3[ inTrain1,] | |
testing <- creditdata3[ -inTrain1,] | |
str(testing) | |
######################Extreme Gracdient Boosting with feature engineered variables | |
library(gbm) | |
str(training) | |
training$default.payment.next.month<-as.character(training$default.payment.next.month) | |
model_ExtremeGradientBoosting<-gbm(default.payment.next.month ~ ., | |
distribution="bernoulli", | |
data=training, | |
n.trees=1000, | |
interaction.depth = 5, | |
shrinkage = 0.016) | |
summary(model_ExtremeGradientBoosting) | |
gbm_prediction<-predict(model_ExtremeGradientBoosting,testing,n.trees = 1000,type = "response") #Predict classification (for confusion matrix) | |
gbm_prediction2<-as.factor(ifelse(gbm_prediction>0.5,1,0)) | |
confusionMatrix(gbm_prediction2,testing$default.payment.next.month) #Display confusion matrix | |
####ROC Curve | |
gbm_pred_testing <- prediction(gbm_prediction, testing$default.payment.next.month) #Calculate errors | |
gbm_ROC_testing <- performance(gbm_pred_testing,"tpr","fpr") #Create ROC curve data | |
plot(gbm_ROC_testing) #Plot ROC curve | |
plot(gbm_ROC_testing, add=TRUE, col="blue") #For comparison, overlay/add the ROC curve from (A) in red | |
legend("right", legend=c("Without feature engineering","With feature engineering"), col=c("green","blue"), lty=1:2, cex=0.6) | |
####AUC | |
auc.tmp <- performance(gbm_pred_testing,"auc") #Create AUC data | |
GBM_auc_testing <- as.numeric([email protected]) #Calculate AUC | |
GBM_auc_testing #D | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment