HarshSingh16 · October 19, 2018 03:44
diff --git a/Predicitng Customer Default.R b/Predicitng Customer Default.R
 if("pacman" %in% rownames(installed.packages()) == FALSE) {install.packages("pacman")} # Check if you have universal installer package, install if not

 pacman::p_load("caret","ROCR","lift","glmnet","MASS","e1071","readxl") #Check, and if needed install the necessary packages
 ###########LOADING THE CREDIT DATA FILE
 creditdata1<-read_excel(file.choose())
 names(creditdata1)[1]<-"CurrentId"
 str(creditdata1)
 creditdata1$`OLD SYSTEM ID`<-NULL
 ###########LOADING THE NEW APPLICATIONS FILE
 NEWAPPLICATIONS<-read.csv(choose.files())
 ###########THE NEXT 25 LINES JUST CONVERT THE TYPES OF NEWAPPLICATIONS.CSV TO MATCH THAT OF CREDITDATA. PROPER VARIABLE CONVERSIONS HAVE BEEN PERFORMED LATER ONCE THE TWO DATASETS HAVE BEEN COMINED USING RBIND
 names(NEWAPPLICATIONS)[1]<-"CurrentId"
 NEWAPPLICATIONS$LIMIT_BAL<-as.character(NEWAPPLICATIONS$LIMIT_BAL)
 NEWAPPLICATIONS$SEX<-as.numeric(NEWAPPLICATIONS$SEX)
 NEWAPPLICATIONS$EDUCATION<-as.numeric(NEWAPPLICATIONS$EDUCATION)
 NEWAPPLICATIONS$MARRIAGE<-as.numeric(NEWAPPLICATIONS$MARRIAGE)
 NEWAPPLICATIONS$AGE<-as.numeric(NEWAPPLICATIONS$AGE)
 NEWAPPLICATIONS$PAY_0<-as.numeric(NEWAPPLICATIONS$PAY_0)
 NEWAPPLICATIONS$PAY_2<-as.numeric(NEWAPPLICATIONS$PAY_2)
 NEWAPPLICATIONS$PAY_3<-as.numeric(NEWAPPLICATIONS$PAY_3)
 NEWAPPLICATIONS$PAY_4<-as.numeric(NEWAPPLICATIONS$PAY_4)
 NEWAPPLICATIONS$PAY_5<-as.numeric(NEWAPPLICATIONS$PAY_5)
 NEWAPPLICATIONS$PAY_6<-as.numeric(NEWAPPLICATIONS$PAY_6)
 NEWAPPLICATIONS$BILL_AMT1<-as.numeric(NEWAPPLICATIONS$BILL_AMT1)
 NEWAPPLICATIONS$BILL_AMT2<-as.numeric(NEWAPPLICATIONS$BILL_AMT2)
 NEWAPPLICATIONS$BILL_AMT3<-as.numeric(NEWAPPLICATIONS$BILL_AMT3)
 NEWAPPLICATIONS$BILL_AMT4<-as.numeric(NEWAPPLICATIONS$BILL_AMT4)
 NEWAPPLICATIONS$BILL_AMT5<-as.numeric(NEWAPPLICATIONS$BILL_AMT5)
 NEWAPPLICATIONS$BILL_AMT6<-as.numeric(NEWAPPLICATIONS$BILL_AMT6)
 NEWAPPLICATIONS$PAY_AMT1<-as.character(NEWAPPLICATIONS$PAY_AMT1)
 NEWAPPLICATIONS$PAY_AMT2<-as.numeric(NEWAPPLICATIONS$PAY_AMT2)
 NEWAPPLICATIONS$PAY_AMT3<-as.character(NEWAPPLICATIONS$PAY_AMT3)
 NEWAPPLICATIONS$PAY_AMT4<-as.character(NEWAPPLICATIONS$PAY_AMT4)
 NEWAPPLICATIONS$PAY_AMT5<-as.character(NEWAPPLICATIONS$PAY_AMT5)
 NEWAPPLICATIONS$PAY_AMT6<-as.numeric(NEWAPPLICATIONS$PAY_AMT6)

 str(NEWAPPLICATIONS)
 NEWAPPLICATIONS[1,1]<-15001
 NEWAPPLICATIONS[2,1]<-15002
 NEWAPPLICATIONS$default.payment.next.month<-NA
 creditdata<-rbind(creditdata1,NEWAPPLICATIONS)

 str(creditdata)
 str(NEWAPPLICATIONS)
 creditdata$SEX<-as.factor(creditdata$SEX)
 creditdata$EDUCATION<-as.factor(creditdata$EDUCATION)
 creditdata$MARRIAGE<-as.factor(creditdata$MARRIAGE)
 str(creditdata)
 creditdata$PAY_0<-as.factor(creditdata$PAY_0)
 creditdata$PAY_2<-as.factor(creditdata$PAY_2)
 creditdata$PAY_3<-as.factor(creditdata$PAY_3)
 creditdata$PAY_4<-as.factor(creditdata$PAY_4)
 creditdata$PAY_5<-as.factor(creditdata$PAY_5)
 creditdata$PAY_6<-as.factor(creditdata$PAY_6)
 creditdata$LIMIT_BAL<-as.numeric(creditdata$LIMIT_BAL)
 creditdata$BILL_AMT1<-as.numeric(creditdata$BILL_AMT1)
 creditdata$BILL_AMT2<-as.numeric(creditdata$BILL_AMT2)
 creditdata$BILL_AMT3<-as.numeric(creditdata$BILL_AMT3)
 creditdata$BILL_AMT4<-as.numeric(creditdata$BILL_AMT4)
 creditdata$BILL_AMT5<-as.numeric(creditdata$BILL_AMT5)
 creditdata$BILL_AMT6<-as.numeric(creditdata$BILL_AMT6)

 creditdata$PAY_AMT1<-as.numeric(creditdata$PAY_AMT1)
 creditdata$PAY_AMT2<-as.numeric(creditdata$PAY_AMT2)
 creditdata$PAY_AMT3<-as.numeric(creditdata$PAY_AMT3)
 creditdata$PAY_AMT4<-as.numeric(creditdata$PAY_AMT4)
 creditdata$PAY_AMT5<-as.numeric(creditdata$PAY_AMT5)
 creditdata$PAY_AMT6<-as.numeric(creditdata$PAY_AMT6)
 str(creditdata)

 creditdata$AGE<-as.numeric(creditdata$AGE)
 creditdata$default.payment.next.month<-as.factor(creditdata$default.payment.next.month)
 str(creditdata)

 ######CHECKING NA's
 sapply(creditdata, function(x)sum(is.na(x)))


 #####SEPERATING THE OBSERVATIONS TO PREDICT FROM THE REST
 tail(creditdata)
 CREDIT1<-subset(creditdata,CurrentId<=15000)
 CREDIT2<-subset(creditdata,CurrentId>15000)

 ###################Creating Training and Test Data
 set.seed(77850) #set a random number generation seed to ensure that the split is the same everytime
 inTrain <- createDataPartition(y = CREDIT1$default.payment.next.month,
                               p = 12000/15000, list = FALSE)
 training <- CREDIT1[ inTrain,]
 testing <- CREDIT1[ -inTrain,]
 str(testing)

 #########################Training the model using logisitc Regression
 library(glmnet)
 model_logistic<-glm(default.payment.next.month~.-CurrentId-SEX, data=training, family="binomial"(link="logit"))

 summary(model_logistic)

 model_logistic_stepwiseAIC<-model_logistic

 summary(model_logistic_stepwiseAIC) 

 par(mfrow=c(1,4))
 plot(model_logistic_stepwiseAIC) #Error plots: similar nature to lm plots
 par(mfrow=c(1,1))
 str(testing)
 par(mar=c(6,4,4,2))
 ###Finding predicitons: probabilities and classification
 logistic_probabilities<-predict(model_logistic_stepwiseAIC,newdata=testing,type="response") #Predict probabilities
 logistic_classification<-rep("1",2999)
 logistic_classification[logistic_probabilities<0.5]="0" #Predict classification using 0.6073 threshold. Why 0.6073 - that's the average probability of being retained in the data. An alternative code: logistic_classification <- as.integer(logistic_probabilities > mean(testing$Retained.in.2012. == "1"))
 logistic_classification<-as.factor(logistic_classification)

 str(testing)
 str(training)

 ###Confusion matrix  
 confusionMatrix(logistic_classification,testing$default.payment.next.month) #Display confusion matrix

 ####ROC Curve
 logistic_ROC_prediction <- prediction(logistic_probabilities, testing$default.payment.next.month)
 logistic_ROC <- performance(logistic_ROC_prediction,"tpr","fpr") #Create ROC curve data
 plot(logistic_ROC) #Plot ROC curve

 ####AUC (area under curve)
 auc.tmp <- performance(logistic_ROC_prediction,"auc") #Create AUC data
 logistic_auc_testing <- as.numeric([email protected]) #Calculate AUC
 logistic_auc_testing #Display AUC value: 90+% - excellent, 80-90% - very good, 70-80% - good, 60-70% - so so, below 60% - not much value




 ###################################Training the model using Random Forest
 str(training)
 library(randomForest)
 model_forest <- randomForest(default.payment.next.month~ .-CurrentId, data=training, 
                             importance=TRUE,proximity=TRUE,
                             cutoff = c(0.5, 0.5),type="classification") #cutoffs need to be determined for class 0 and class 1. By default 50/50, but need not be those necessarily
 print(model_forest)   
 plot(model_forest)
 importance(model_forest)
 varImpPlot(model_forest)

 ###Finding predicitons: probabilities and classification
 forest_probabilities<-predict(model_forest,newdata=testing,type="prob") #Predict probabilities -- an array with 2 columns: for not retained (class 0) and for retained (class 1)
 forest_classification<-rep("1",2999)
 forest_classification[forest_probabilities[,2]<0.5]="0" #Predict classification using 0.5 threshold. Why 0.5 and not 0.6073? Use the same as in cutoff above
 forest_classification<-as.factor(forest_classification)

 confusionMatrix(forest_classification,testing$default.payment.next.month) #Display confusion matrix. Note, confusion matrix actually displays a better accuracy with threshold of 50%

 #There is also a "shortcut" forest_prediction<-predict(model_forest,newdata=testing, type="response") 
 #But it by default uses threshold of 50%: actually works better (more accuracy) on this data


 ####ROC Curve
 forest_ROC_prediction <- prediction(forest_probabilities[,2], testing$default.payment.next.month) #Calculate errors
 forest_ROC <- performance(forest_ROC_prediction,"tpr","fpr") #Create ROC curve data
 plot(forest_ROC) #Plot ROC curve
 plot(forest_ROC, add=TRUE, col="blue") #For comparison, overlay/add the ROC curve from (A) in red
 legend("right", legend=c("Logistic","Random Forest"), col=c("red","blue"), lty=1:2, cex=0.6)

 ####AUC (area under curve)
 AUC.tmp <- performance(forest_ROC_prediction,"auc") #Create AUC data
 forest_AUC <- as.numeric([email protected]) #Calculate AUC
 forest_AUC #Display AUC value: 90+% - excellent, 80-90% - very good, 70-80% - good, 60-70% - so so, below 60% - not much value


 ######################################### TRAINING THE MODEL USING XGBOOST
 library(xgboost)
 training.x <-model.matrix(default.payment.next.month~ ., data = training)
 testing.x <-model.matrix(default.payment.next.month~ ., data = testing)

 model_XGboost<-xgboost(data = data.matrix(training.x[,-1:-3]), 
                       label = as.numeric(as.character(training$default.payment.next.month)), 
                       eta = 0.1,
                       max_depth = 20, 
                       nround=50, 
                       objective = "binary:logistic")

 XGboost_prediction<-predict(model_XGboost,newdata=testing.x[,-1:-3], type="response") #Predict classification (for confusion matrix)
 XGboost_prediction2<-as.factor(ifelse(XGboost_prediction>0.5,1,0))
 confusionMatrix(XGboost_prediction2,testing$default.payment.next.month) #Display confusion matrix

 ####ROC Curve
 XGboost_pred_testing <- prediction(XGboost_prediction, testing$default.payment.next.month) #Calculate errors
 XGboost_ROC_testing <- performance(XGboost_pred_testing,"tpr","fpr") #Create ROC curve data
 plot(XGboost_ROC_testing) #Plot ROC curve
 plot(XGboost_ROC_testing, add=TRUE, col="green") #For comparison, overlay/add the ROC curve from (A) in red
 legend("right", legend=c("Logistic","Random Forest","XGBoost"), col=c("red","blue","green"), lty=1:2, cex=0.6)
 ####AUC
 auc.tmp <- performance(XGboost_pred_testing,"auc") #Create AUC data
 XGboost_auc_testing <- as.numeric([email protected]) #Calculate AUC
 XGboost_auc_testing #Display AUC value: 90+% - excellent, 80-90% - very good, 70-80% - good, 60-70% - so so, below 60% - not much value



 ######################TRAINING THE MODEL USING GRADIENT BOOSTING
 library(gbm)
 str(training)
 training$default.payment.next.month<-as.character(training$default.payment.next.month)

 model_ExtremeGradientBoosting<-gbm(default.payment.next.month~.-CurrentId,
                                   distribution="bernoulli",
                                   data=training,
                                   n.trees=1000,
                                   interaction.depth = 4,
                                   shrinkage = 0.01)

 summary(model_ExtremeGradientBoosting)
 GBM_prediction<-predict(model_ExtremeGradientBoosting,testing,n.trees = 1000,type = "response") #Predict classification (for confusion matrix)
 GBM_prediction2<-as.factor(ifelse(GBM_prediction>0.5,1,0))
 confusionMatrix(GBM_prediction2,testing$default.payment.next.month) #Display confusion matrix

 ####ROC Curve
 GBM_pred_testing <- prediction(GBM_prediction, testing$default.payment.next.month) #Calculate errors
 GBM_ROC_testing <- performance(GBM_pred_testing,"tpr","fpr") #Create ROC curve data
 plot(GBM_ROC_testing) #Plot ROC curve
 plot(GBM_ROC_testing, add=TRUE, col="green") #For comparison, overlay/add the ROC curve from (A) in red
 legend("right", legend=c("Logistic","Random Forest","XGBoost"), col=c("red","blue","green"), lty=1:2, cex=0.6)
 ####AUC
 auc.tmp <- performance(GBM_pred_testing,"auc") #Create AUC data
 gbm_auc_testing <- as.numeric([email protected]) #Calculate AUC
 gbm_auc_testing ##AUC Score WITHOUT Feature Engineering


 plot(GBM_ROC_testing, add=TRUE, col="green") #For comparison, overlay/add the ROC curve from (A) in red
 legend("right", legend=c("Without feature engineering"), col=c("green"), lty=1:2, cex=0.6)



 #########################PREDICTING THE TWO NEW OBSERVATIONS
 str(creditdata)
 tail(creditdata)
 CREDIT2<-subset(creditdata,CurrentId>15000)
 tail(CREDIT2)

 library(gbm)
 str(training)
 CREDIT1$default.payment.next.month<-as.character(CREDIT1$default.payment.next.month)

 model_ExtremeGradientBoosting<-gbm(default.payment.next.month~.-CurrentId,
                                   distribution="bernoulli",
                                   data=CREDIT1,
                                   n.trees=1000,
                                   interaction.depth = 4,
                                   shrinkage = 0.01)
 summary(model_ExtremeGradientBoosting)

 GBM_prediction<-predict(model_ExtremeGradientBoosting,CREDIT2,n.trees = 1000,type = "response") #Predict classification (for confusion matrix)
 GBM_prediction2<-as.factor(ifelse(GBM_prediction>0.5,1,0))

 CREDIT2


 str(CREDIT2)

 str(creditdata)











 ##################################### PERFORMING FEATURE ENGINEERING

 creditdata<-CREDIT1
 str(creditdata)
 creditdata$SEX<-as.factor(creditdata$SEX)
 creditdata$EDUCATION<-as.factor(creditdata$EDUCATION)
 creditdata$MARRIAGE<-as.factor(creditdata$MARRIAGE)

 creditdata$PAY_0<-as.factor(creditdata$PAY_0)
 creditdata$PAY_2<-as.factor(creditdata$PAY_2)
 creditdata$PAY_3<-as.factor(creditdata$PAY_3)
 creditdata$PAY_4<-as.factor(creditdata$PAY_4)
 creditdata$PAY_5<-as.factor(creditdata$PAY_5)
 creditdata$PAY_6<-as.factor(creditdata$PAY_6)
 creditdata$LIMIT_BAL<-as.numeric(creditdata$LIMIT_BAL)
 creditdata$BILL_AMT1<-as.numeric(creditdata$BILL_AMT1)
 creditdata$BILL_AMT2<-as.numeric(creditdata$BILL_AMT2)
 creditdata$BILL_AMT3<-as.numeric(creditdata$BILL_AMT3)
 creditdata$BILL_AMT4<-as.numeric(creditdata$BILL_AMT4)
 creditdata$BILL_AMT5<-as.numeric(creditdata$BILL_AMT5)
 creditdata$BILL_AMT6<-as.numeric(creditdata$BILL_AMT6)
 creditdata$PAY_AMT1<-as.numeric(creditdata$PAY_AMT1)
 creditdata$PAY_AMT3<-as.numeric(creditdata$PAY_AMT3)
 creditdata$PAY_AMT4<-as.numeric(creditdata$PAY_AMT4)
 creditdata$PAY_AMT5<-as.numeric(creditdata$PAY_AMT5)
 creditdata$AGE<-as.numeric(creditdata$AGE)
 creditdata$default.payment.next.month<-as.factor(creditdata$default.payment.next.month)
 str(creditdata)
 table(creditdata$AGE)
 sapply(creditdata, function(x)sum(is.na(x)))

 names(creditdata)[1]<-"CurrentId"

 summary(creditdata)
 str(creditdata)
 ######################feature engineering for sex
 #creditdata$SEX1<-as.numeric(creditdata$SEX)
 #creditdata$SexLimit<-creditdata$LIMIT_BAL*creditdata$SEX1
 #creditdata$Billamt1sex<-creditdata$BILL_AMT1*creditdata$SEX1
 #creditdata$Billamt2sex<-creditdata$BILL_AMT2*creditdata$SEX1
 #creditdata$Billamt3sex<-creditdata$BILL_AMT3*creditdata$SEX1
 #creditdata$Billamt4sex<-creditdata$BILL_AMT4*creditdata$SEX1
 #creditdata$Billamt5sex<-creditdata$BILL_AMT5*creditdata$SEX1
 #creditdata$Billamt6sex<-creditdata$BILL_AMT6*creditdata$SEX1

 #creditdata$PayAmt1sex<-creditdata$PAY_AMT1*creditdata$SEX1
 #creditdata$PayAmt2sex<-creditdata$PAY_AMT2*creditdata$SEX1
 #creditdata$PayAmt3sex<-creditdata$PAY_AMT3*creditdata$SEX1
 #creditdata$PayAmt4sex<-creditdata$PAY_AMT4*creditdata$SEX1
 #creditdata$PayAmt5sex<-creditdata$PAY_AMT5*creditdata$SEX1
 #creditdata$PayAmt6sex<-creditdata$PAY_AMT6*creditdata$SEX1

 ######################feature engineering for education
 #########creditdata$EDUCATION2<-creditdata$EDUCATION

 ##########creditdata$EDUCATION2<-as.character(creditdata$EDUCATION2)
 ###########creditdata[which(creditdata$EDUCATION2==0),"EDUCATION2"]<-4
 #creditdata[which(creditdata$EDUCATION2==4),"EDUCATION2"]<-4
 #creditdata[which(creditdata$EDUCATION2==5),"EDUCATION2"]<-4
 #creditdata[which(creditdata$EDUCATION2==6),"EDUCATION2"]<-4
 #creditdata$EDUCATION2<-as.factor(creditdata$EDUCATION2)

 #creditdata$EDUCATION1<-as.numeric(creditdata$EDUCATION)
 #creditdata$eduLimit<-creditdata$LIMIT_BAL*creditdata$EDUCATION1
 #creditdata$Billamt1edu<-creditdata$BILL_AMT1*creditdata$EDUCATION1
 #creditdata$Billamt2edu<-creditdata$BILL_AMT2*creditdata$EDUCATION1
 #creditdata$Billamt3edu<-creditdata$BILL_AMT3*creditdata$EDUCATION1
 #creditdata$Billamt4edu<-creditdata$BILL_AMT4*creditdata$EDUCATION1
 #creditdata$Billamt5edu<-creditdata$BILL_AMT5*creditdata$EDUCATION1
 #creditdata$Billamt6edu<-creditdata$BILL_AMT6*creditdata$EDUCATION1

 #creditdata$PayAmt1edu<-creditdata$PAY_AMT1*creditdata$EDUCATION1
 #creditdata$PayAmt2edu<-creditdata$PAY_AMT2*creditdata$EDUCATION1
 #creditdata$PayAmt3edu<-creditdata$PAY_AMT3*creditdata$EDUCATION1
 #creditdata$PayAmt4edu<-creditdata$PAY_AMT4*creditdata$EDUCATION1
 #creditdata$PayAmt5edu<-creditdata$PAY_AMT5*creditdata$EDUCATION1
 #creditdata$PayAmt6edu<-creditdata$PAY_AMT6*creditdata$EDUCATION1


 ######################feature engineering for MARRIAGE
 #creditdata$MARRIAGE1<-as.numeric(creditdata$MARRIAGE)
 #creditdata$MarriageLimit<-creditdata$LIMIT_BAL*creditdata$MARRIAGE1
 #creditdata$Billamt1mARRIAGE<-creditdata$BILL_AMT1*creditdata$MARRIAGE1
 #creditdata$Billamt2mARRIAGE<-creditdata$BILL_AMT2*creditdata$MARRIAGE1
 #creditdata$Billamt3mARRIAGE<-creditdata$BILL_AMT3*creditdata$MARRIAGE1
 #creditdata$Billamt4mARRIAGE<-creditdata$BILL_AMT4*creditdata$MARRIAGE1
 #creditdata$Billamt5mARRIAGE<-creditdata$BILL_AMT5*creditdata$MARRIAGE1
 #creditdata$Billamt6mARRIAGE<-creditdata$BILL_AMT6*creditdata$MARRIAGE1

 #creditdata$PayAmt1MARRIAGE<-creditdata$PAY_AMT1*creditdata$MARRIAGE1
 #creditdata$PayAmt2MARRIAGE<-creditdata$PAY_AMT2creditdata$MARRIAGE1
 #creditdata$PayAmt3MARRIAGE<-creditdata$PAY_AMT3*creditdata$MARRIAGE1
 #creditdata$PayAmt4MARRIAGE<-creditdata$PAY_AMT4*creditdata$MARRIAGE1
 #creditdata$PayAmt5MARRIAGE<-creditdata$PAY_AMT5*creditdata$MARRIAGE1
 #creditdata$PayAmt6MARRIAGE<-creditdata$PAY_AMT6*creditdata$MARRIAGE1

 #######RUN THIS
 creditdata$Month1<-(creditdata$BILL_AMT1-creditdata$BILL_AMT2)
 creditdata$Month2<-(creditdata$BILL_AMT2-creditdata$BILL_AMT3)
 creditdata$Month3<-(creditdata$BILL_AMT3-creditdata$BILL_AMT4)
 creditdata$Month4<-(creditdata$BILL_AMT4-creditdata$BILL_AMT5)
 creditdata$Month5<-(creditdata$BILL_AMT5-creditdata$BILL_AMT6)

 creditdata$DiffPayAmt1<-(creditdata$PAY_AMT1-creditdata$PAY_AMT2)
 creditdata$DiffPayAmt2<-(creditdata$PAY_AMT2-creditdata$PAY_AMT3)
 creditdata$DiffPayAmt3<-(creditdata$PAY_AMT3-creditdata$PAY_AMT4)
 creditdata$DiffPayAmt4<-(creditdata$PAY_AMT4-creditdata$PAY_AMT5)
 creditdata$DiffPayAmt5<-(creditdata$PAY_AMT5-creditdata$PAY_AMT6)



 str(creditdata)
 creditdata$Month1BalanceProp<-max(creditdata$BILL_AMT1,0)/creditdata$LIMIT_BAL
 creditdata$Month2BalanceProp<-max(creditdata$BILL_AMT2,0)/creditdata$LIMIT_BAL
 creditdata$Month3BalanceProp<-max(creditdata$BILL_AMT3,0)/creditdata$LIMIT_BAL
 creditdata$Month4BalanceProp<-max(creditdata$BILL_AMT4,0)/creditdata$LIMIT_BAL
 creditdata$Month5BalanceProp<-max(creditdata$BILL_AMT5,0)/creditdata$LIMIT_BAL
 creditdata$Month6BalanceProp<-max(creditdata$BILL_AMT6,0)/creditdata$LIMIT_BAL
 creditdata$AggregateLimBalance<-max(creditdata$BILL_AMT1+creditdata$BILL_AMT2+
                                      creditdata$BILL_AMT3+creditdata$BILL_AMT4+
                                      creditdata$BILL_AMT5+creditdata$BILL_AMT6,0)/creditdata$LIMIT_BAL

 #####RUN THIS
 creditdata$PayAmtBalance1<-ifelse(creditdata$BILL_AMT1<=0,1,(creditdata$PAY_AMT1/creditdata$BILL_AMT1))
 creditdata$PayAmtBalance2<-ifelse(creditdata$BILL_AMT2<=0,1,(creditdata$PAY_AMT2/creditdata$BILL_AMT2))
 creditdata$PayAmtBalance3<-ifelse(creditdata$BILL_AMT3<=0,1,(creditdata$PAY_AMT3/creditdata$BILL_AMT3))
 creditdata$PayAmtBalance4<-ifelse(creditdata$BILL_AMT4<=0,1,(creditdata$PAY_AMT4/creditdata$BILL_AMT4))
 creditdata$PayAmtBalance5<-ifelse(creditdata$BILL_AMT5<=0,1,(creditdata$PAY_AMT5/creditdata$BILL_AMT5))
 creditdata$PayAmtBalance6<-ifelse(creditdata$BILL_AMT6<=0,1,(creditdata$PAY_AMT6/creditdata$BILL_AMT6))
 #####RUN THIS
 creditdata$TotalBalancedue<-creditdata$BILL_AMT1+creditdata$BILL_AMT2+creditdata$BILL_AMT3+creditdata$BILL_AMT4+
  creditdata$BILL_AMT5+creditdata$BILL_AMT6
 ##########RUN THIS
 creditdata$PayAmtdue<-creditdata$PAY_AMT1+creditdata$PAY_AMT2+creditdata$PAY_AMT3+creditdata$PAY_AMT4+
  creditdata$PAY_AMT5+creditdata$PAY_AMT6
 #######RUN THIS
 creditdata$Difference<-creditdata$TotalBalancedue-creditdata$PayAmtdue
 creditdata$Difference1<-creditdata$LIMIT_BAL-creditdata$TotalBalancedue
 creditdata$Difference2<-creditdata$LIMIT_BAL-creditdata$PayAmtdue
 creditdata$TWOMONTHBILLAMNT1<-creditdata$BILL_AMT1+creditdata$BILL_AMT2
 creditdata$TWOMONTHBILLAMNT2<-creditdata$BILL_AMT3+creditdata$BILL_AMT4
 creditdata$TWOMONTHBILLAMNT3<-creditdata$BILL_AMT5+creditdata$BILL_AMT6
 creditdata$TWOMONTHpayAMNT1<-creditdata$PAY_AMT1+creditdata$PAY_AMT2
 creditdata$TWOMONTHpayAMNT2<-creditdata$PAY_AMT3+creditdata$PAY_AMT4
 creditdata$TWOMONTHpayAMNT3<-creditdata$PAY_AMT5+creditdata$PAY_AMT6

 creditdata$D1<-creditdata$BILL_AMT1-creditdata$PAY_AMT1
 creditdata$D2<-creditdata$BILL_AMT2-creditdata$PAY_AMT2
 creditdata$D3<-creditdata$BILL_AMT3-creditdata$PAY_AMT3
 creditdata$D4<-creditdata$BILL_AMT4-creditdata$PAY_AMT4
 creditdata$D5<-creditdata$BILL_AMT5-creditdata$PAY_AMT5
 creditdata$D6<-creditdata$BILL_AMT6-creditdata$PAY_AMT6


 creditdata$newdiff1<-creditdata$BILL_AMT1-creditdata$BILL_AMT3
 creditdata$newdiff2<-creditdata$BILL_AMT3-creditdata$BILL_AMT5
 creditdata$newdiff3<-creditdata$BILL_AMT2-creditdata$BILL_AMT4
 creditdata$newdiff4<-creditdata$BILL_AMT4-creditdata$BILL_AMT6


 creditdata$newpaydeiff1<-creditdata$PAY_AMT1-creditdata$PAY_AMT3
 creditdata$newpaydeiff2<-creditdata$PAY_AMT3-creditdata$PAY_AMT5
 creditdata$newpaydeiff3<-creditdata$PAY_AMT2-creditdata$PAY_AMT4
 creditdata$newpaydeiff2<-creditdata$PAY_AMT4-creditdata$PAY_AMT6


 #creditdata$extremedeiff1<-creditdata$BILL_AMT1-creditdata$BILL_AMT6
 #creditdata$extremedeiff2<-creditdata$BILL_AMT1-creditdata$BILL_AMT5
 #creditdata$extremedeiff3<-creditdata$BILL_AMT1-creditdata$BILL_AMT4
 #creditdata$extremedeiff4<-creditdata$BILL_AMT1-creditdata$BILL_AMT3
 #creditdata$extremedeiff5<-creditdata$BILL_AMT2-creditdata$BILL_AMT6
 #creditdata$extremedeiff6<-creditdata$BILL_AMT2-creditdata$BILL_AMT5
 #creditdata$extremedeiff7<-creditdata$BILL_AMT2-creditdata$BILL_AMT4
 #creditdata$extremedeiff8<-creditdata$BILL_AMT3-creditdata$BILL_AMT6
 #creditdata$extremedeiff9<-creditdata$BILL_AMT3-creditdata$BILL_AMT5
 #creditdata$extremedeiff10<-creditdata$BILL_AMT4-creditdata$BILL_AMT6





 #creditdata$extremediffpaydue<-creditdata$PAY_AMT1-creditdata$PAY_AMT6
 #creditdata$extremediffpaydue2<-creditdata$PAY_AMT1-creditdata$PAY_AMT5
 #creditdata$extremediffpaydue3<-creditdata$PAY_AMT1-creditdata$PAY_AMT4
 #creditdata$extremediffpaydue4<-creditdata$PAY_AMT1-creditdata$PAY_AMT3
 #creditdata$extremediffpaydue10<-creditdata$PAY_AMT2-creditdata$PAY_AMT6
 #creditdata$extremediffpaydue5<-creditdata$PAY_AMT2-creditdata$PAY_AMT5
 #creditdata$extremediffpaydue6<-creditdata$PAY_AMT2-creditdata$PAY_AMT4
 #creditdata$extremediffpaydue7<-creditdata$PAY_AMT3-creditdata$PAY_AMT6
 #creditdata$extremediffpaydue8<-creditdata$PAY_AMT3-creditdata$PAY_AMT5
 #creditdata$extremediffpaydue9<-creditdata$PAY_AMT4-creditdata$PAY_AMT6


 creditdata$ThreemonthBillAmtDue<-creditdata$BILL_AMT1+creditdata$BILL_AMT2+creditdata$BILL_AMT3
 creditdata$ThreemonthBillAmtDue2<-creditdata$BILL_AMT4+creditdata$BILL_AMT5+creditdata$BILL_AMT6

 creditdata$ThreemonthPayAmtDue<-creditdata$PAY_AMT1+creditdata$PAY_AMT2+creditdata$PAY_AMT3
 creditdata$ThreemonthPayAmtDue2<-creditdata$PAY_AMT4+creditdata$PAY_AMT5+creditdata$PAY_AMT6



 ###############################CORRECTING PAY_0
 #creditdata$PAY_00<-creditdata$PAY_0
 #R1<-which(creditdata$PAY_00==4)
 #R2<-which(creditdata$PAY_00==5)
 #R3<-which(creditdata$PAY_00==6)
 #R4<-which(creditdata$PAY_00==7)
 #R5<-which(creditdata$PAY_00==8)
 #R6<-which(creditdata$PAY_00==-2)
 #R7<-which(creditdata$PAY_00==-1)
 #R8<-which(creditdata$PAY_00==0)


 #creditdata[R1,"PAY_00"]<-4
 #creditdata[R2,"PAY_00"]<-4
 #creditdata[R3,"PAY_00"]<-4
 #creditdata[R4,"PAY_00"]<-4
 #creditdata[R5,"PAY_00"]<-4
 #creditdata[R6,"PAY_00"]<-0
 #creditdata[R7,"PAY_00"]<-0
 #creditdata[R8,"PAY_00"]<-0


 ###############################CORRECTING PAY_2
 creditdata$PAY_02<-creditdata$PAY_2
 R1<-which(creditdata$PAY_02==4)
 R2<-which(creditdata$PAY_02==5)
 R3<-which(creditdata$PAY_02==6)
 R4<-which(creditdata$PAY_02==7)
 R5<-which(creditdata$PAY_02==8)
 R6<-which(creditdata$PAY_02==-2)
 R7<-which(creditdata$PAY_02==-1)
 R8<-which(creditdata$PAY_02==0)


 creditdata[R1,"PAY_02"]<-4
 creditdata[R2,"PAY_02"]<-4
 creditdata[R3,"PAY_02"]<-4
 creditdata[R4,"PAY_02"]<-4
 creditdata[R5,"PAY_02"]<-4
 creditdata[R6,"PAY_02"]<-0
 creditdata[R7,"PAY_02"]<-0
 creditdata[R8,"PAY_02"]<-0

 ###############################CORRECTING PAY_3
 #creditdata$PAY_03<-creditdata$PAY_3
 #R1<-which(creditdata$PAY_03==4)
 #R2<-which(creditdata$PAY_03==5)
 #R3<-which(creditdata$PAY_03==6)
 #R4<-which(creditdata$PAY_03==7)
 #R5<-which(creditdata$PAY_03==8)
 #R6<-which(creditdata$PAY_03==-2)
 #R7<-which(creditdata$PAY_03==-1)
 #R8<-which(creditdata$PAY_03==0)


 #creditdata[R1,"PAY_03"]<-4
 #creditdata[R2,"PAY_03"]<-4
 #creditdata[R3,"PAY_03"]<-4
 #creditdata[R4,"PAY_03"]<-4
 #creditdata[R5,"PAY_03"]<-4
 #creditdata[R6,"PAY_03"]<-0
 #creditdata[R7,"PAY_03"]<-0
 #creditdata[R8,"PAY_03"]<-0

 ###############################CORRECTING PAY_5
 creditdata$PAY_05<-creditdata$PAY_5
 R1<-which(creditdata$PAY_05==4)
 R2<-which(creditdata$PAY_05==5)
 R3<-which(creditdata$PAY_05==6)
 R4<-which(creditdata$PAY_05==7)
 R5<-which(creditdata$PAY_05==8)
 R6<-which(creditdata$PAY_05==-2)
 R7<-which(creditdata$PAY_05==-1)
 R8<-which(creditdata$PAY_05==0)


 creditdata[R1,"PAY_05"]<-4
 creditdata[R2,"PAY_05"]<-4
 creditdata[R3,"PAY_05"]<-4
 creditdata[R4,"PAY_05"]<-4
 creditdata[R5,"PAY_05"]<-4
 creditdata[R6,"PAY_05"]<-0
 creditdata[R7,"PAY_05"]<-0
 creditdata[R8,"PAY_05"]<-0



 ###############################CORRECTING PAY_6
 creditdata$PAY_06<-creditdata$PAY_6
 R1<-which(creditdata$PAY_06==4)
 R2<-which(creditdata$PAY_06==5)
 R3<-which(creditdata$PAY_06==6)
 R4<-which(creditdata$PAY_06==7)
 R5<-which(creditdata$PAY_06==8)
 R6<-which(creditdata$PAY_06==-2)
 R7<-which(creditdata$PAY_06==-1)
 R8<-which(creditdata$PAY_06==0)


 creditdata[R1,"PAY_06"]<-4
 creditdata[R2,"PAY_06"]<-4
 creditdata[R3,"PAY_06"]<-4
 creditdata[R4,"PAY_06"]<-4
 creditdata[R5,"PAY_06"]<-4
 creditdata[R6,"PAY_06"]<-0
 creditdata[R7,"PAY_06"]<-0
 creditdata[R8,"PAY_06"]<-0


 ############CORRECTING EDUCATION
 creditdata$EDUCATION2<-creditdata$EDUCATION
 Row1<-which(creditdata$EDUCATION2==4)
 Row2<-which(creditdata$EDUCATION2==5)
 Row3<-which(creditdata$EDUCATION2==6)
 Row4<-which(creditdata$EDUCATION2==0)

 creditdata[Row1,"EDUCATION2"]<-0
 creditdata[Row2,"EDUCATION2"]<-0
 creditdata[Row3,"EDUCATION2"]<-0
 creditdata[Row4,"EDUCATION2"]<-0


 ############CORRECTING MARRIAGE
 creditdata$Mariage2<-creditdata$MARRIAGE
 Row1<-which(creditdata$Mariage2==0)

 creditdata[Row1,"Mariage2"]<-3

 str(creditdata)
 table(creditdata$AGE)



 creditdata3<-creditdata

 row1<-which(creditdata3$BILL_AMT1==0 & creditdata3$BILL_AMT2==0&creditdata3$BILL_AMT3==0&creditdata3$BILL_AMT4==0&
              creditdata3$BILL_AMT5==0&creditdata3$BILL_AMT6==0&creditdata3$default.payment.next.month==1)


 creditdata3<-creditdata3[-row1,]
 str(creditdata3)
 str(creditdata)
 ############club 4, 5 and 6 in education as others and include 0 values
 ########### club 3 in marriage as others 






 sapply(training1, function(x) sum(is.na(x)))









 hist(creditdata$Month2)





 ###################Creating Data Partition
 set.seed(77800)
 inTrain1 <- createDataPartition(y = creditdata3$default.payment.next.month,
                                p = 12500/14837, list = FALSE)
 training <- creditdata3[ inTrain1,]
 testing <- creditdata3[ -inTrain1,]

 str(testing)

 ######################Extreme Gracdient Boosting with feature engineered variables
 library(gbm)
 str(training)
 training$default.payment.next.month<-as.character(training$default.payment.next.month)



 model_ExtremeGradientBoosting<-gbm(default.payment.next.month ~ .,
                                   distribution="bernoulli",
                                   data=training,
                                   n.trees=1000,
                                   interaction.depth = 5,
                                   shrinkage = 0.016)



 summary(model_ExtremeGradientBoosting)
 gbm_prediction<-predict(model_ExtremeGradientBoosting,testing,n.trees = 1000,type = "response") #Predict classification (for confusion matrix)
 gbm_prediction2<-as.factor(ifelse(gbm_prediction>0.5,1,0))
 confusionMatrix(gbm_prediction2,testing$default.payment.next.month) #Display confusion matrix

 ####ROC Curve
 gbm_pred_testing <- prediction(gbm_prediction, testing$default.payment.next.month) #Calculate errors
 gbm_ROC_testing <- performance(gbm_pred_testing,"tpr","fpr") #Create ROC curve data
 plot(gbm_ROC_testing) #Plot ROC curve
 plot(gbm_ROC_testing, add=TRUE, col="blue") #For comparison, overlay/add the ROC curve from (A) in red
 legend("right", legend=c("Without feature engineering","With feature engineering"), col=c("green","blue"), lty=1:2, cex=0.6)
 ####AUC
 auc.tmp <- performance(gbm_pred_testing,"auc") #Create AUC data
 GBM_auc_testing <- as.numeric([email protected]) #Calculate AUC
 GBM_auc_testing #D