Skip to content

Instantly share code, notes, and snippets.

@BioSciEconomist
Created August 2, 2018 16:38
Show Gist options
  • Save BioSciEconomist/eb78c3da13809ad13dbb6160af8d52ce to your computer and use it in GitHub Desktop.
Save BioSciEconomist/eb78c3da13809ad13dbb6160af8d52ce to your computer and use it in GitHub Desktop.
Example Predictive Modeling Project in R
# ------------------------------------------------------------------
# |PROGRAM NAME: ex banking model
# |DATE: 4/5/18
# |CREATED BY: MATT BOGARD
# |UPDATED: 5/8/18
# |PROJECT FILE: C:\Users\mtb2901\Documents\Tools and References\R References
# |----------------------------------------------------------------
# | PURPOSE: predictive modeling example
# |------------------------------------------------------------------
library(dplyr)
library(ggplot2)
# read data
banking <- read.csv("/Users/amandabogard/Google Drive/R Training/bank_marketing.csv", header = TRUE) # https://archive.ics.uci.edu/ml/datasets/bank+marketing
# create binary target
banking$target <- ifelse(banking$y =="no",0,1)
# create toy scoring set
randomSample = function(df,n) {
return (df[sample(nrow(df), n),])
}
banking_score_data <- randomSample(banking,25000)
# drop target fields
banking_score_data$target <- NULL
# add ID
banking_score_data$ID <- seq(1:nrow(banking_score_data))
# export to csv
write.csv(banking_score_data, file = "bank_mkt_score.csv", row.names = FALSE)
dim(banking_score_data)
View(banking_score_data[,17:18])
view(banking)
#-----------------------------------------------
# exploratory analysis
#----------------------------------------------
summary(banking)
# look at numeric variables
is.fact <- sapply(banking, is.factor)
tmp <- banking[,!is.fact]
vars <- names(tmp)
# vars: "age" "balance" "day" "duration" "campaign" "pdays" "previous"
by(banking$age, banking$y,mean)
by(banking$balance, banking$y,mean)
by(banking$day, banking$y,mean)
by(banking$duration, banking$y,mean)
by(banking$campaign, banking$y,mean)
by(banking$previous, banking$y,mean)
by(banking$pdays, banking$y,mean)
# Load the gmodels package
library(gmodels)
# Call CrossTable()
CrossTable(banking$y)
# look at factor variables
is.fact <- sapply(banking, is.factor)
tmp <- banking[,!is.fact]
vars <- names(tmp)
# "job" "marital" "education" "default" "housing" "loan" "contact" "month" "poutcome"
CrossTable(banking$job,banking$y,prop.r = TRUE, prop.c =FALSE, prop.t = FALSE, prop.chisq = FALSE)
CrossTable(banking$marital,banking$y,prop.r = TRUE, prop.c =FALSE, prop.t = FALSE, prop.chisq = FALSE)
CrossTable(banking$education,banking$y,prop.r = TRUE, prop.c =FALSE, prop.t = FALSE, prop.chisq = FALSE)
CrossTable(banking$default,banking$y,prop.r = TRUE, prop.c =FALSE, prop.t = FALSE, prop.chisq = FALSE)
CrossTable(banking$housing,banking$y,prop.r = TRUE, prop.c =FALSE, prop.t = FALSE, prop.chisq = FALSE)
# using dplyr with binary target
banking%>%
group_by(job) %>% # group by factor
summarize(mean = mean(target)) # mean of target
#-----------------------------------------------
# data visualization
#----------------------------------------------
# simple bar chart
# "job" "marital" "education" "default" "housing" "loan" "contact" "month" "poutcome"
### categorical variables
# Simple Bar Plot
xvar <- banking$education
counts <- table(xvar)
barplot(counts, main="Car Distribution",
xlab="Number of Gears")
# Grouped Bar Plot by target
xvar <- banking$education
counts <- table(banking$target, xvar)
barplot(counts, main="Car Distribution by Gears and VS",
xlab="Number of Gears", col=c("darkblue","red"),
legend = rownames(counts), beside=TRUE)
# bar chart stacked by target
xvar <- banking$education
# Stacked Bar Plot with Colors and Legend
counts <- table(banking$target, xvar)
barplot(counts, main="Car Distribution by Gears and VS",
xlab="Number of Gears", col=c("darkblue","red"),
legend = rownames(counts))
### numerical variables
# look at numeric x's by target - boxplot;
# vars: "age" "balance" "day" "duration" "campaign" "pdays" "previous"
# distributions - histograms
hist(banking$age)
hist(banking$balance)
hist(banking$day)
hist(banking$duration)
hist(banking$campaign)
hist(banking$pdays)
hist(banking$previous)
# Boxplot by target
xvar <- banking$age
boxplot(xvar~target,data=banking, main="Car Milage Data",
xlab="Number of Cylinders", ylab="Miles Per Gallon")
# correlations between x's
# Simple Scatterplot
plot(banking$age, banking$day, main="Scatterplot Example",
xlab="Car Weight ", ylab="Miles Per Gallon ", pch=19)
# Basic Scatterplot Matrix
pairs(~age+balance+day+duration+campaign+pdays,data=banking,
main="Simple Scatterplot Matrix")
# brushed scatterplot using ggplot2
ggplot(banking,aes(x=age,y=duration, shape = y,color = y)) + geom_point()
# histograms by target
par(mfrow=c(1,2))
hist(banking$age[banking$y=="no"])
hist(banking$age[banking$y=="yes"])
dev.off() # reset par
# overlapping histograms by target
library(ggplot2)
ggplot(banking, aes(x=age, fill=y)) +
geom_histogram(binwidth=.5, alpha=.5, position="identity")
# overlapping densities
ggplot(banking, aes(x=age, colour=y)) + geom_density()
ggplot(banking, aes(x=age, fill=y)) + geom_density(alpha=.3)
### combining plots like a dashboard
par(mfrow=c(2,2))
# target distribution
counts <- table(banking$y)
barplot(counts, main="Target Distribution",
xlab="Decision", col=c("darkblue","red"))
# grouped bar
xvar <- banking$education
counts <- table(banking$target, xvar)
barplot(counts, main="Target Distribution by Education",
xlab="Education Level", col=c("darkblue","red"))
#histograms by age
hist(banking$age[banking$y=="no"], xlab = "Age", col = "darkblue", main = "Decision: No")
hist(banking$age[banking$y=="yes"], xlab = "Age", col = "red", main = "Decision: Yes")
dev.off()
#-------------------------------------------
# data partitioning
#-------------------------------------------
set.seed(567)
# Store row numbers for 80% training set: index_train
index_train <- sample(1:nrow(banking), .8* nrow(banking))
# Create training set: training_set
training_set <- banking[index_train, ]
# Create test set: test_set
test_set <- banking[-index_train, ]
# check distributions
mean(banking$target)
mean(training_set$target)
mean(test_set$target)
#-------------------------------------------
# logistic regression
#-------------------------------------------
names(banking)
# create model formula
(fmla <- target ~ age + job + marital + education + default + housing + loan + contact + day + month + duration +
campaign + pdays + previous + poutcome)
logistic_full<- glm(fmla, family = "binomial", data = training_set)
summary(logistic_full)
predictions_all_full <- predict(logistic_full, newdata = test_set, type = "response")
print(data.frame(predictions_all_full))
length(predictions_all_full)
#-------------------------------------------
# decision tree
#-------------------------------------------
library(rpart)
# Include rpart.control to relax the complexity parameter to 0.001.
tree_full_default <- rpart(fmla, method = "class",
data = training_set, control = rpart.control(cp = 0.001))
summary(tree_full_default)
# Plot the decision tree
plot(tree_full_default, uniform = TRUE)
# Add labels to the decision tree
text(tree_full_default)
### pruning the tree
# Plot the cross-validated error rate as a function of the complexity parameter
plotcp(tree_full_default)
# Use printcp() to identify for which complexity parameter the cross-validated error rate is minimized.
printcp(tree_full_default)
# Create an index for of the row with the minimum xerror
index <- which.min(tree_full_default$cptable[ , "xerror"])
# Create tree_min
tree_min <- tree_full_default$cptable[index, "CP"]
# Prune the tree using tree_min
ptree <- prune(tree_full_default, cp = tree_min)
summary(ptree)
# Plot the decision tree
plot(ptree, uniform = TRUE)
# Add labels to the decision tree
text(tree_full_default)
### tree with additional options
# set a seed and run the code to obtain a tree using weights, minsplit and minbucket
set.seed(567)
tree_full_mod <- rpart(fmla, method = "class",
data = training_set,
control = rpart.control(minsplit = 5, minbucket = 300, cp = 0.001))
summary(tree_full_mod)
# Plot the decision tree
plot(tree_full_mod, uniform = TRUE)
# Add labels to the decision tree
text(tree_full_mod)
### pruning the tree
# Use printcp() to identify for which complexity parameter the cross-validated error rate is minimized.
printcp(tree_full_mod)
# Create an index for of the row with the minimum xerror
index <- which.min(tree_full_mod$cptable[ , "xerror"])
# Create tree_min
tree_min <- tree_full_mod$cptable[index, "CP"]
# Prune the tree using tree_min
ptree_mod <- prune(tree_full_mod, cp = tree_min)
summary(ptree_mod)
# Plot the decision tree
plot(ptree_mod, uniform = TRUE)
# Add labels to the decision tree
text(ptree_mod)
### model predictions
pred_ptree_mod <- predict(ptree_mod, newdata = test_set, type = "prob")
dim(pred_ptree_mod)
pred_ptree_mod[,2]
#-----------------------------------------
# randomForest
#----------------------------------------
# reference: http://trevorstephens.com/kaggle-titanic-tutorial/r-part-5-random-forests/
library(randomForest)
set.seed(567)
random_forest <- randomForest(as.factor(target) ~ age + job + marital + education + default + housing + loan + contact + day + month + duration +
campaign + pdays + previous + poutcome,
data=training_set,
importance=TRUE,
ntree=2000)
summary(random_forest)
varImpPlot(random_forest)
# model predictions
pred_rforest <- predict(random_forest,test_set, type = "prob")
# add scores to test data
test_set$rf_y <- pred_rforest[,2]
# add new ID field
test_set$newID <- seq(1:9043)
# export scored data to csv for visualization in tableau
write.csv(test_set, file = "bank_mkt_scored.csv")
#-------------------------------------
# model evaluation
#------------------------------------
### comparing regression models with ROC curves
# Load the pROC-package
library(pROC)
# Construct the objects containing ROC-information
ROC_predictions_all_full <- roc(test_set$target, predictions_all_full)
ROC_pred_ptree_mod <- roc(test_set$target, pred_ptree_mod[,2])
ROC_pred_rforest <- roc(test_set$target, pred_rforest[,2])
# Compute the AUCs
auc(ROC_predictions_all_full)
auc(ROC_pred_ptree_mod)
auc(ROC_pred_rforest)
# Draw all ROCs on one plot
par(mar=c(1,1,1,1))
plot(ROC_predictions_all_full)
lines(ROC_pred_ptree_mod, col = "blue")
lines(ROC_pred_rforest, col = "red")
# export scored test data from RF for exploration in tableau
#--------------------------------------------------
# explore test data scored results
#-------------------------------------------------
CrossTable(test_set$housing,test_set$y,prop.r = TRUE, prop.c =FALSE, prop.t = FALSE, prop.chisq = FALSE)
### get all factor variables
is.fact <- sapply(test_set, is.factor)
tmp <- test_set[,is.fact]
names(tmp)
# using dplyr with binary target look at distribution of top predictors
# duration month day contact housing poutcome age
# factors
test_set%>%
group_by(housing) %>% # group by factor
summarize(mean = mean(target)) # mean of target
# numerics
test_set%>%
group_by(target) %>% # group by target
summarize(mean = mean(duration)) # mean of xvar
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment