Created
October 20, 2017 13:53
-
-
Save vrajesh26/a1f1af6d83a2e9eab1ec19c253a0b0ec to your computer and use it in GitHub Desktop.
Identify the customer segments that are eligible for a loan.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ---- Load data and first look ----
# Read the training and test CSVs. Empty strings, single spaces and the literal
# "NA" are treated as missing. stringsAsFactors = TRUE is set explicitly
# because later steps rely on factor columns (levels(), assigning level names,
# tree/forest models), while R >= 4.0 reads strings as character by default.
train <- read.csv(
  "D:/loan prediction/loan_train.csv",
  na.strings = c("", " ", "NA"),
  stringsAsFactors = TRUE
)
test <- read.csv(
  "D:/loan prediction/loan_test.csv",
  na.strings = c("", " ", "NA"),
  stringsAsFactors = TRUE
)

# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(train)
}

# Missing values per column in each set, and the size of the training set.
colSums(is.na(train))
colSums(is.na(test))
dim(train)

# Per-column summaries for both sets (the second call originally repeated
# `train`; `test` mirrors the train/test pair of colSums() calls above).
library(mlr)
summarizeColumns(train)
summarizeColumns(test)
# ---- Exploratory analysis ----
# For each predictor: a plot on the training set filled by Loan_Status, the
# same plot on the test set, and (for categorical predictors) a chi-squared
# test of the predictor against Loan_Status on the training set.
library("ggplot2")

# Target distribution.
ggplot(data = train, mapping = aes(Loan_Status, fill = Loan_Status)) +
  geom_bar()
prop.table(table(train$Loan_Status))

# Gender
ggplot(data = train, mapping = aes(Gender, fill = Loan_Status)) + geom_bar()
ggplot(data = test, mapping = aes(Gender)) + geom_bar()
gl <- table(train$Gender, train$Loan_Status)
chisq.test(gl)

# Married
ggplot(data = train, mapping = aes(Married, fill = Loan_Status)) + geom_bar()
ggplot(data = test, mapping = aes(Married)) + geom_bar()
ml <- table(train$Married, train$Loan_Status)
chisq.test(ml)

# Dependents
ggplot(data = train, mapping = aes(Dependents, fill = Loan_Status)) +
  geom_bar()
ggplot(data = test, mapping = aes(Dependents)) + geom_bar()
levels(train$Dependents)
dl <- table(train$Dependents, train$Loan_Status)
chisq.test(dl)

# Education
ggplot(data = train, mapping = aes(Education, fill = Loan_Status)) +
  geom_bar()
ggplot(data = test, mapping = aes(Education)) + geom_bar()
el <- table(train$Education, train$Loan_Status)
chisq.test(el)

# Self_Employed
ggplot(data = train, mapping = aes(Self_Employed, fill = Loan_Status)) +
  geom_bar()
ggplot(data = test, mapping = aes(Self_Employed)) + geom_bar()
sl <- table(train$Self_Employed, train$Loan_Status)
chisq.test(sl)

# Numeric predictors. bins = 30 is geom_histogram()'s default; spelling it out
# keeps the plots unchanged and avoids the "pick a better binwidth" message.
ggplot(data = train, mapping = aes(ApplicantIncome, fill = Loan_Status)) +
  geom_histogram(bins = 30)
ggplot(data = test, mapping = aes(ApplicantIncome)) +
  geom_histogram(bins = 30)
ggplot(data = train, mapping = aes(CoapplicantIncome, fill = Loan_Status)) +
  geom_histogram(bins = 30)
ggplot(data = test, mapping = aes(CoapplicantIncome)) +
  geom_histogram(bins = 30)
ggplot(data = train, mapping = aes(LoanAmount, fill = Loan_Status)) +
  geom_histogram(bins = 30)
ggplot(data = test, mapping = aes(LoanAmount)) +
  geom_histogram(bins = 30)
summary(train$Loan_Amount_Term)
ggplot(data = train, mapping = aes(Loan_Amount_Term, fill = Loan_Status)) +
  geom_histogram(bins = 30)

# Credit_History is converted to a factor in both sets so it is plotted and
# tested as a categorical variable.
summary(train$Credit_History)
train$Credit_History <- as.factor(train$Credit_History)
test$Credit_History <- as.factor(test$Credit_History)
levels(train$Credit_History)
ggplot(data = train, mapping = aes(Credit_History, fill = Loan_Status)) +
  geom_bar()
ggplot(data = test, mapping = aes(Credit_History)) + geom_bar()
cl <- table(train$Credit_History, train$Loan_Status)
chisq.test(cl)

# Property_Area
ggplot(data = train, mapping = aes(Property_Area, fill = Loan_Status)) +
  geom_bar()
ggplot(data = test, mapping = aes(Property_Area)) + geom_bar()
pl <- table(train$Property_Area, train$Loan_Status)
chisq.test(pl)
# ---- Combine train and test, then impute missing values ----
summary(is.na(train))

# Stack columns 2:12 of both sets (training rows first) so that every
# imputation below is applied to train and test alike.
loan <- rbind(train[, 2:12], test[, 2:12])
colSums(is.na(loan))

#Feature
loan$TotalIncome <- loan$ApplicantIncome + loan$CoapplicantIncome

# Married: missing with zero co-applicant income -> "No"; any other missing
# value -> "Yes".
loan$Married[is.na(loan$Married) & loan$CoapplicantIncome == 0] <- "No"
loan$Married[is.na(loan$Married)] <- "Yes"

# Rows missing both Gender and Dependents: show them, then set Gender "Male".
loan[is.na(loan$Gender) & is.na(loan$Dependents), ]
loan$Gender[is.na(loan$Gender) & is.na(loan$Dependents)] <- "Male"

# Dependents: unmarried applicants with a missing value get "0".
loan$Dependents[is.na(loan$Dependents) & loan$Married == "No"] <- "0"

# Dependents for married men: predicted with a classification tree fitted on
# the married men whose Dependents value is known.
# which() is required here: Gender still contains NAs, and subsetting rows with
# a logical vector that contains NA inserts all-NA rows, which would end up in
# mmtest and misalign the predictions with the rows being filled.
mm <- loan[which(loan$Gender == "Male" & loan$Married == "Yes"),
           c(3, 6:9, 11)]
mmtrain <- mm[!is.na(mm$Dependents), ]
mmtest <- mm[is.na(mm$Dependents), ]
library("rpart")
library("rpart.plot")  # provides rpart.plot(), which was called without being loaded
depFit <- rpart(data = mmtrain, Dependents ~ ., xval = 3)
rpart.plot(depFit)
dep_missing <- which(
  is.na(loan$Dependents) & loan$Gender == "Male" & loan$Married == "Yes"
)
if (length(dep_missing) > 0) {
  # as.character() so the predicted class labels are assigned by name whether
  # the column is a factor or a character vector.
  loan$Dependents[dep_missing] <-
    as.character(predict(depFit, newdata = mmtest, type = "class"))
}

# Gender: remaining missing values are predicted with a classification tree
# fitted on columns 1:7 of the rows where Gender is known.
gtrain <- loan[!is.na(loan$Gender), 1:7]
gtest <- loan[is.na(loan$Gender), 1:7]
genFit <- rpart(data = gtrain, Gender ~ ., xval = 3)
rpart.plot(genFit)
if (nrow(gtest) > 0) {
  loan$Gender[is.na(loan$Gender)] <-
    as.character(predict(genFit, gtest, type = "class"))
}

# Self_Employed: missing values are set to "No"
# (presumably the most frequent value in the table printed here).
table(loan$Self_Employed)
loan$Self_Employed[is.na(loan$Self_Employed)] <- "No"

# Credit_History: missing values become their own category, coded 2.
library(car)
loan$Credit_History <- car::recode(loan$Credit_History, "NA=2")

# LoanAmount: missing values are predicted by a (default, gaussian) glm fitted
# on rows with a known LoanAmount below 500, using columns 1:8 and 10.
ltrain <- loan[!is.na(loan$LoanAmount) & loan$LoanAmount < 500, c(1:8, 10)]
ltest <- loan[is.na(loan$LoanAmount), c(1:8, 10)]
loanFit <- glm(data = ltrain, LoanAmount ~ ., na.action = na.exclude)
if (nrow(ltest) > 0) {
  loan$LoanAmount[is.na(loan$LoanAmount)] <- predict(loanFit, newdata = ltest)
}

# Loan_Amount_Term: treated as a factor; missing values are set to "360".
loan$Loan_Amount_Term <- as.factor(loan$Loan_Amount_Term)
loan$Loan_Amount_Term[is.na(loan$Loan_Amount_Term)] <- "360"
# ---- Derived features, correlation checks and train/test split ----
# Dependents as a number: the open-ended "3+" category is counted as 3.
numDependents <- car::recode(loan$Dependents, "'3+'='3' ")
numDependents <- as.numeric(as.character(numDependents))

# Family size: dependents + applicant, plus one more person when there is
# co-applicant income or the applicant is married.
# Married holds "Yes"/"No" (the values this script assigns when imputing it),
# so the original comparison with "Y" could never be TRUE.
loan$FamilySize <- ifelse(
  loan$CoapplicantIncome > 0 | loan$Married == "Yes",
  numDependents + 2,
  numDependents + 1
)

loan$IncomePC <- loan$TotalIncome / loan$FamilySize
loan$LoanAmountByTotInc <- loan$LoanAmount / loan$TotalIncome
loan$LoanAmountPC <- loan$LoanAmount / loan$IncomePC

# Loan_Amount_Term is a factor; go through its labels to get numeric values
# for the division, leaving the factor column itself untouched.
term_num <- as.numeric(as.character(loan$Loan_Amount_Term))
loan$LoanPerMonth <- loan$LoanAmount / term_num

# The column name was originally misspelled ("LoanPerMOnthByTotInc"), so the
# cor() call below read a non-existent column.
loan$LoanPerMonthByTotInc <- loan$LoanPerMonth / loan$TotalIncome

# NOTE(review): by analogy with LoanAmountPC (LoanAmount / IncomePC) this was
# possibly meant to divide by IncomePC rather than LoanAmountPC. Left as
# written because it changes a model feature -- confirm intent.
loan$LoanPerMonthPC <- loan$LoanPerMonth / loan$LoanAmountPC

# Compare the two per-month features, then drop LoanPerMonthByTotInc
# (column 18 of loan at this point, the one the original removed by position).
cor(loan$LoanPerMonthByTotInc, loan$LoanPerMonth)
loan$LoanPerMonthByTotInc <- NULL

nrow(loan)
nrow(train)
nrow(test)

# Split the combined frame back into its parts: training rows come first
# (rbind order), test rows follow. Row counts are taken from train/test
# instead of the hard-coded 614 / 981.
n_train <- nrow(train)
newtrain <- cbind(loan[seq_len(n_train), ], Loan_Status = train$Loan_Status)
newtest <- loan[n_train + seq_len(nrow(test)), ]

# Correlation matrix of the numeric training columns. This ran before
# newtrain existed in the original, so it is placed after the split.
cor(newtrain[, vapply(newtrain, is.numeric, logical(1))])

# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(newtrain)
}
# ---- Model training and prediction ----
library("caret")

# 5-fold cross-validation, shared by both models (it was defined twice with
# identical settings).
cntrl <- caret::trainControl(method = "repeatedcv", number = 5)

# Predictors: every column of newtrain except 6, 17 and 19. Column 19 is
# Loan_Status (the outcome) and 17 is LoanPerMonth; column 6 is presumably
# ApplicantIncome -- verify against the CSV column order.
predictors <- newtrain[, -c(6, 17, 19)]
outcome <- newtrain[, 19]

# caret::train is qualified explicitly: `train` is also the name of the
# training data frame, and mlr (attached earlier) exports its own train().
rp <- caret::train(
  x = predictors, y = outcome, method = "rpart", trControl = cntrl
)
rf1 <- caret::train(
  x = predictors, y = outcome, method = "rf", trControl = cntrl
)

# Predict the test set with the random forest and write Loan_ID / Loan_Status
# pairs. row.names = FALSE keeps the file to exactly those two columns rather
# than adding an unnamed row-index column.
pred_rf <- predict(rf1, newdata = newtest)
adg <- data.frame(Loan_ID = test[, 1], Loan_Status = pred_rf)
write.csv(adg, file = "loan_output.csv", row.names = FALSE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment