BioSciEconomist · August 2, 2018 16:38
diff --git a/ex banking model.R b/ex banking model.R
 #  ------------------------------------------------------------------
 #  |PROGRAM NAME: ex banking model
 #  |DATE: 4/5/18
 #  |CREATED BY: MATT BOGARD 
 #  |UPDATED: 5/8/18
 #  |PROJECT FILE: C:\Users\mtb2901\Documents\Tools and References\R References     
 #  |----------------------------------------------------------------
 #  | PURPOSE: predictive modeling example 
 #  |------------------------------------------------------------------

 library(dplyr)
 library(ggplot2)

 # read data
 banking <- read.csv("/Users/amandabogard/Google Drive/R Training/bank_marketing.csv", header = TRUE) # https://archive.ics.uci.edu/ml/datasets/bank+marketing

 # create binary target
 banking$target <- ifelse(banking$y =="no",0,1)

 # create toy scoring set 
 randomSample = function(df,n) { 
  return (df[sample(nrow(df), n),])
 }

 banking_score_data <- randomSample(banking,25000)

 # drop target fields
 banking_score_data$target <- NULL

 # add ID
 banking_score_data$ID <- seq(1:nrow(banking_score_data))

 # export to csv
 write.csv(banking_score_data, file = "bank_mkt_score.csv", row.names = FALSE)

 dim(banking_score_data)
 View(banking_score_data[,17:18])
 view(banking)

 #-----------------------------------------------
 # exploratory analysis
 #----------------------------------------------

 summary(banking)

 # look at numeric variables
 is.fact <- sapply(banking, is.factor)
 tmp <- banking[,!is.fact]
 vars <- names(tmp)

 # vars: "age"      "balance"  "day"      "duration" "campaign" "pdays"    "previous"

 by(banking$age, banking$y,mean)
 by(banking$balance, banking$y,mean)
 by(banking$day, banking$y,mean)
 by(banking$duration, banking$y,mean)
 by(banking$campaign, banking$y,mean)
 by(banking$previous, banking$y,mean)
 by(banking$pdays, banking$y,mean)

 # Load the gmodels package 
 library(gmodels)

 # Call CrossTable() 
 CrossTable(banking$y)


 # look at factor variables
 is.fact <- sapply(banking, is.factor)
 tmp <- banking[,!is.fact]
 vars <- names(tmp)

 # "job"       "marital"   "education" "default"   "housing"   "loan"      "contact"   "month"     "poutcome" 
 CrossTable(banking$job,banking$y,prop.r = TRUE, prop.c =FALSE, prop.t = FALSE, prop.chisq = FALSE)
 CrossTable(banking$marital,banking$y,prop.r = TRUE, prop.c =FALSE, prop.t = FALSE, prop.chisq = FALSE)
 CrossTable(banking$education,banking$y,prop.r = TRUE, prop.c =FALSE, prop.t = FALSE, prop.chisq = FALSE)
 CrossTable(banking$default,banking$y,prop.r = TRUE, prop.c =FALSE, prop.t = FALSE, prop.chisq = FALSE)
 CrossTable(banking$housing,banking$y,prop.r = TRUE, prop.c =FALSE, prop.t = FALSE, prop.chisq = FALSE)


 # using dplyr with binary target
 banking%>% 
  group_by(job) %>%     # group by factor
  summarize(mean = mean(target))  # mean of target

 #-----------------------------------------------
 # data visualization
 #----------------------------------------------

 # simple bar chart
 # "job"       "marital"   "education" "default"   "housing"   "loan"      "contact"   "month"     "poutcome" 

 ### categorical variables

 # Simple Bar Plot
 xvar <- banking$education
 counts <- table(xvar)
 barplot(counts, main="Car Distribution", 
        xlab="Number of Gears")


 # Grouped Bar Plot by target
 xvar <- banking$education
 counts <- table(banking$target, xvar)
 barplot(counts, main="Car Distribution by Gears and VS",
        xlab="Number of Gears", col=c("darkblue","red"),
        legend = rownames(counts), beside=TRUE)

 # bar chart stacked by target

 xvar <- banking$education

 # Stacked Bar Plot with Colors and Legend
 counts <- table(banking$target, xvar)
 barplot(counts, main="Car Distribution by Gears and VS",
        xlab="Number of Gears", col=c("darkblue","red"),
        legend = rownames(counts))

 ### numerical variables

 # look at numeric x's by target - boxplot;
 # vars: "age"      "balance"  "day"      "duration" "campaign" "pdays"    "previous"


 # distributions - histograms
 hist(banking$age)
 hist(banking$balance)
 hist(banking$day)
 hist(banking$duration)
 hist(banking$campaign)
 hist(banking$pdays)
 hist(banking$previous)

 # Boxplot by target
 xvar <- banking$age
 boxplot(xvar~target,data=banking, main="Car Milage Data", 
        xlab="Number of Cylinders", ylab="Miles Per Gallon")

 # correlations between x's

 # Simple Scatterplot

 plot(banking$age, banking$day, main="Scatterplot Example", 
     xlab="Car Weight ", ylab="Miles Per Gallon ", pch=19)


 # Basic Scatterplot Matrix
 pairs(~age+balance+day+duration+campaign+pdays,data=banking, 
      main="Simple Scatterplot Matrix")

 # brushed scatterplot using ggplot2
 ggplot(banking,aes(x=age,y=duration, shape = y,color = y)) + geom_point()

 # histograms by target
 par(mfrow=c(1,2))
 hist(banking$age[banking$y=="no"])
 hist(banking$age[banking$y=="yes"])
 dev.off() # reset par

 # overlapping histograms by target
 library(ggplot2)

 ggplot(banking, aes(x=age, fill=y)) +
  geom_histogram(binwidth=.5, alpha=.5, position="identity")

 # overlapping densities
 ggplot(banking, aes(x=age, colour=y)) + geom_density()

 ggplot(banking, aes(x=age, fill=y)) + geom_density(alpha=.3)


 ### combining plots like a dashboard

 par(mfrow=c(2,2))

 # target distribution
 counts <- table(banking$y)
 barplot(counts, main="Target Distribution", 
        xlab="Decision", col=c("darkblue","red"))

 # grouped bar
 xvar <- banking$education
 counts <- table(banking$target, xvar)
 barplot(counts, main="Target Distribution by Education",
        xlab="Education Level", col=c("darkblue","red"))

 #histograms by age
 hist(banking$age[banking$y=="no"], xlab = "Age", col = "darkblue", main = "Decision: No")
 hist(banking$age[banking$y=="yes"], xlab = "Age", col = "red", main = "Decision: Yes")

 dev.off()


 #-------------------------------------------
 # data partitioning
 #-------------------------------------------

 set.seed(567)

 # Store row numbers for 80% training set: index_train
 index_train <- sample(1:nrow(banking), .8* nrow(banking))

 # Create training set: training_set
 training_set <- banking[index_train, ]

 # Create test set: test_set
 test_set <- banking[-index_train, ]

 # check distributions
 mean(banking$target)
 mean(training_set$target)
 mean(test_set$target)


 #-------------------------------------------
 # logistic regression
 #-------------------------------------------

 names(banking)

 # create model formula

 (fmla <- target ~ age + job + marital + education + default + housing + loan + contact + day + month + duration + 
    campaign + pdays + previous + poutcome)

 logistic_full<- glm(fmla, family = "binomial", data = training_set)

 summary(logistic_full)

 predictions_all_full <- predict(logistic_full, newdata = test_set, type = "response")

 print(data.frame(predictions_all_full))

 length(predictions_all_full)

 #-------------------------------------------
 # decision tree
 #-------------------------------------------

 library(rpart)

 # Include rpart.control to relax the complexity parameter to 0.001.
 tree_full_default <- rpart(fmla, method = "class",
                          data =  training_set, control = rpart.control(cp = 0.001))

 summary(tree_full_default)

 # Plot the decision tree
 plot(tree_full_default, uniform = TRUE)

 # Add labels to the decision tree
 text(tree_full_default)

 ### pruning the tree 

 # Plot the cross-validated error rate as a function of the complexity parameter
 plotcp(tree_full_default)

 # Use printcp() to identify for which complexity parameter the cross-validated error rate is minimized.
 printcp(tree_full_default)

 # Create an index for of the row with the minimum xerror
 index <- which.min(tree_full_default$cptable[ , "xerror"])

 # Create tree_min
 tree_min <- tree_full_default$cptable[index, "CP"]

 # Prune the tree using tree_min
 ptree <- prune(tree_full_default, cp = tree_min)

 summary(ptree)

 # Plot the decision tree
 plot(ptree, uniform = TRUE)

 # Add labels to the decision tree
 text(tree_full_default)


 ### tree with additional options

 # set a seed and run the code to obtain a tree using weights, minsplit and minbucket
 set.seed(567)
 tree_full_mod <- rpart(fmla, method = "class",
                      data = training_set,
                      control = rpart.control(minsplit = 5, minbucket = 300, cp = 0.001))

 summary(tree_full_mod)


 # Plot the decision tree
 plot(tree_full_mod, uniform = TRUE)

 # Add labels to the decision tree
 text(tree_full_mod)

 ### pruning the tree 

 # Use printcp() to identify for which complexity parameter the cross-validated error rate is minimized.
 printcp(tree_full_mod)

 # Create an index for of the row with the minimum xerror
 index <- which.min(tree_full_mod$cptable[ , "xerror"])

 # Create tree_min
 tree_min <- tree_full_mod$cptable[index, "CP"]

 #  Prune the tree using tree_min
 ptree_mod <- prune(tree_full_mod, cp = tree_min)

 summary(ptree_mod)

 # Plot the decision tree
 plot(ptree_mod, uniform = TRUE)

 # Add labels to the decision tree
 text(ptree_mod)


 ### model predictions

 pred_ptree_mod <- predict(ptree_mod, newdata = test_set,  type = "prob")

 dim(pred_ptree_mod)

 pred_ptree_mod[,2]

 #-----------------------------------------
 # randomForest
 #----------------------------------------

 # reference: http://trevorstephens.com/kaggle-titanic-tutorial/r-part-5-random-forests/ 

 library(randomForest)

 set.seed(567)

 random_forest <- randomForest(as.factor(target) ~ age + job + marital + education + default + housing + loan + contact + day + month + duration + 
                          campaign + pdays + previous + poutcome,
                    data=training_set, 
                    importance=TRUE, 
                    ntree=2000)

 summary(random_forest)
 varImpPlot(random_forest)

 # model predictions
 pred_rforest <- predict(random_forest,test_set, type = "prob")

 # add scores to test data
 test_set$rf_y <- pred_rforest[,2]

 # add new ID field

 test_set$newID <- seq(1:9043)

 # export scored data to csv for visualization in tableau

 write.csv(test_set, file = "bank_mkt_scored.csv")


 #-------------------------------------
 # model evaluation
 #------------------------------------

 ### comparing regression models with ROC curves

 # Load the pROC-package
 library(pROC)

 # Construct the objects containing ROC-information
 ROC_predictions_all_full <- roc(test_set$target, predictions_all_full)
 ROC_pred_ptree_mod <- roc(test_set$target, pred_ptree_mod[,2])
 ROC_pred_rforest <- roc(test_set$target, pred_rforest[,2])


 # Compute the AUCs
 auc(ROC_predictions_all_full)
 auc(ROC_pred_ptree_mod)
 auc(ROC_pred_rforest)

 # Draw all ROCs on one plot

 par(mar=c(1,1,1,1))

 plot(ROC_predictions_all_full)
 lines(ROC_pred_ptree_mod, col = "blue")
 lines(ROC_pred_rforest, col = "red")

 # export scored test data from RF for exploration in tableau

 #--------------------------------------------------
 # explore test data scored results
 #-------------------------------------------------

 CrossTable(test_set$housing,test_set$y,prop.r = TRUE, prop.c =FALSE, prop.t = FALSE, prop.chisq = FALSE)


 ### get all factor variables
 is.fact <- sapply(test_set, is.factor)
 tmp <- test_set[,is.fact]
 names(tmp)

 # using dplyr with binary target look at distribution of top predictors 
 # duration month day contact housing poutcome age 

 # factors
 test_set%>% 
  group_by(housing) %>%     # group by factor
  summarize(mean = mean(target))  # mean of target 

 # numerics
 test_set%>% 
  group_by(target) %>%     # group by target
  summarize(mean = mean(duration))  # mean of xvar
	# ------------------------------------------------------------------
	# \|PROGRAM NAME: ex banking model
	# \|DATE: 4/5/18
	# \|CREATED BY: MATT BOGARD
	# \|UPDATED: 5/8/18
	# \|PROJECT FILE: C:\Users\mtb2901\Documents\Tools and References\R References
	# \|----------------------------------------------------------------
	# \| PURPOSE: predictive modeling example
	# \|------------------------------------------------------------------

	library(dplyr)
	library(ggplot2)

	# read data
	banking <- read.csv("/Users/amandabogard/Google Drive/R Training/bank_marketing.csv", header = TRUE) # https://archive.ics.uci.edu/ml/datasets/bank+marketing

	# create binary target
	banking$target <- ifelse(banking$y =="no",0,1)

	# create toy scoring set
	randomSample = function(df,n) {
	return (df[sample(nrow(df), n),])
	}

	banking_score_data <- randomSample(banking,25000)

	# drop target fields
	banking_score_data$target <- NULL

	# add ID
	banking_score_data$ID <- seq(1:nrow(banking_score_data))

	# export to csv
	write.csv(banking_score_data, file = "bank_mkt_score.csv", row.names = FALSE)

	dim(banking_score_data)
	View(banking_score_data[,17:18])
	view(banking)

	#-----------------------------------------------
	# exploratory analysis
	#----------------------------------------------

	summary(banking)

	# look at numeric variables
	is.fact <- sapply(banking, is.factor)
	tmp <- banking[,!is.fact]
	vars <- names(tmp)

	# vars: "age" "balance" "day" "duration" "campaign" "pdays" "previous"

	by(banking$age, banking$y,mean)
	by(banking$balance, banking$y,mean)
	by(banking$day, banking$y,mean)
	by(banking$duration, banking$y,mean)
	by(banking$campaign, banking$y,mean)
	by(banking$previous, banking$y,mean)
	by(banking$pdays, banking$y,mean)

	# Load the gmodels package
	library(gmodels)

	# Call CrossTable()
	CrossTable(banking$y)


	# look at factor variables
	is.fact <- sapply(banking, is.factor)
	tmp <- banking[,!is.fact]
	vars <- names(tmp)

	# "job" "marital" "education" "default" "housing" "loan" "contact" "month" "poutcome"
	CrossTable(banking$job,banking$y,prop.r = TRUE, prop.c =FALSE, prop.t = FALSE, prop.chisq = FALSE)
	CrossTable(banking$marital,banking$y,prop.r = TRUE, prop.c =FALSE, prop.t = FALSE, prop.chisq = FALSE)
	CrossTable(banking$education,banking$y,prop.r = TRUE, prop.c =FALSE, prop.t = FALSE, prop.chisq = FALSE)
	CrossTable(banking$default,banking$y,prop.r = TRUE, prop.c =FALSE, prop.t = FALSE, prop.chisq = FALSE)
	CrossTable(banking$housing,banking$y,prop.r = TRUE, prop.c =FALSE, prop.t = FALSE, prop.chisq = FALSE)


	# using dplyr with binary target
	banking%>%
	group_by(job) %>% # group by factor
	summarize(mean = mean(target)) # mean of target

	#-----------------------------------------------
	# data visualization
	#----------------------------------------------

	# simple bar chart
	# "job" "marital" "education" "default" "housing" "loan" "contact" "month" "poutcome"

	### categorical variables

	# Simple Bar Plot
	xvar <- banking$education
	counts <- table(xvar)
	barplot(counts, main="Car Distribution",
	xlab="Number of Gears")


	# Grouped Bar Plot by target
	xvar <- banking$education
	counts <- table(banking$target, xvar)
	barplot(counts, main="Car Distribution by Gears and VS",
	xlab="Number of Gears", col=c("darkblue","red"),
	legend = rownames(counts), beside=TRUE)

	# bar chart stacked by target

	xvar <- banking$education

	# Stacked Bar Plot with Colors and Legend
	counts <- table(banking$target, xvar)
	barplot(counts, main="Car Distribution by Gears and VS",
	xlab="Number of Gears", col=c("darkblue","red"),
	legend = rownames(counts))

	### numerical variables

	# look at numeric x's by target - boxplot;
	# vars: "age" "balance" "day" "duration" "campaign" "pdays" "previous"


	# distributions - histograms
	hist(banking$age)
	hist(banking$balance)
	hist(banking$day)
	hist(banking$duration)
	hist(banking$campaign)
	hist(banking$pdays)
	hist(banking$previous)

	# Boxplot by target
	xvar <- banking$age
	boxplot(xvar~target,data=banking, main="Car Milage Data",
	xlab="Number of Cylinders", ylab="Miles Per Gallon")

	# correlations between x's

	# Simple Scatterplot

	plot(banking$age, banking$day, main="Scatterplot Example",
	xlab="Car Weight ", ylab="Miles Per Gallon ", pch=19)


	# Basic Scatterplot Matrix
	pairs(~age+balance+day+duration+campaign+pdays,data=banking,
	main="Simple Scatterplot Matrix")

	# brushed scatterplot using ggplot2
	ggplot(banking,aes(x=age,y=duration, shape = y,color = y)) + geom_point()

	# histograms by target
	par(mfrow=c(1,2))
	hist(banking$age[banking$y=="no"])
	hist(banking$age[banking$y=="yes"])
	dev.off() # reset par

	# overlapping histograms by target
	library(ggplot2)

	ggplot(banking, aes(x=age, fill=y)) +
	geom_histogram(binwidth=.5, alpha=.5, position="identity")

	# overlapping densities
	ggplot(banking, aes(x=age, colour=y)) + geom_density()

	ggplot(banking, aes(x=age, fill=y)) + geom_density(alpha=.3)


	### combining plots like a dashboard

	par(mfrow=c(2,2))

	# target distribution
	counts <- table(banking$y)
	barplot(counts, main="Target Distribution",
	xlab="Decision", col=c("darkblue","red"))

	# grouped bar
	xvar <- banking$education
	counts <- table(banking$target, xvar)
	barplot(counts, main="Target Distribution by Education",
	xlab="Education Level", col=c("darkblue","red"))

	#histograms by age
	hist(banking$age[banking$y=="no"], xlab = "Age", col = "darkblue", main = "Decision: No")
	hist(banking$age[banking$y=="yes"], xlab = "Age", col = "red", main = "Decision: Yes")

	dev.off()


	#-------------------------------------------
	# data partitioning
	#-------------------------------------------

	set.seed(567)

	# Store row numbers for 80% training set: index_train
	index_train <- sample(1:nrow(banking), .8* nrow(banking))

	# Create training set: training_set
	training_set <- banking[index_train, ]

	# Create test set: test_set
	test_set <- banking[-index_train, ]

	# check distributions
	mean(banking$target)
	mean(training_set$target)
	mean(test_set$target)


	#-------------------------------------------
	# logistic regression
	#-------------------------------------------

	names(banking)

	# create model formula

	(fmla <- target ~ age + job + marital + education + default + housing + loan + contact + day + month + duration +
	campaign + pdays + previous + poutcome)

	logistic_full<- glm(fmla, family = "binomial", data = training_set)

	summary(logistic_full)

	predictions_all_full <- predict(logistic_full, newdata = test_set, type = "response")

	print(data.frame(predictions_all_full))

	length(predictions_all_full)

	#-------------------------------------------
	# decision tree
	#-------------------------------------------

	library(rpart)

	# Include rpart.control to relax the complexity parameter to 0.001.
	tree_full_default <- rpart(fmla, method = "class",
	data = training_set, control = rpart.control(cp = 0.001))

	summary(tree_full_default)

	# Plot the decision tree
	plot(tree_full_default, uniform = TRUE)

	# Add labels to the decision tree
	text(tree_full_default)

	### pruning the tree

	# Plot the cross-validated error rate as a function of the complexity parameter
	plotcp(tree_full_default)

	# Use printcp() to identify for which complexity parameter the cross-validated error rate is minimized.
	printcp(tree_full_default)

	# Create an index for of the row with the minimum xerror
	index <- which.min(tree_full_default$cptable[ , "xerror"])

	# Create tree_min
	tree_min <- tree_full_default$cptable[index, "CP"]

	# Prune the tree using tree_min
	ptree <- prune(tree_full_default, cp = tree_min)

	summary(ptree)

	# Plot the decision tree
	plot(ptree, uniform = TRUE)

	# Add labels to the decision tree
	text(tree_full_default)


	### tree with additional options

	# set a seed and run the code to obtain a tree using weights, minsplit and minbucket
	set.seed(567)
	tree_full_mod <- rpart(fmla, method = "class",
	data = training_set,
	control = rpart.control(minsplit = 5, minbucket = 300, cp = 0.001))

	summary(tree_full_mod)


	# Plot the decision tree
	plot(tree_full_mod, uniform = TRUE)

	# Add labels to the decision tree
	text(tree_full_mod)

	### pruning the tree

	# Use printcp() to identify for which complexity parameter the cross-validated error rate is minimized.
	printcp(tree_full_mod)

	# Create an index for of the row with the minimum xerror
	index <- which.min(tree_full_mod$cptable[ , "xerror"])

	# Create tree_min
	tree_min <- tree_full_mod$cptable[index, "CP"]

	# Prune the tree using tree_min
	ptree_mod <- prune(tree_full_mod, cp = tree_min)

	summary(ptree_mod)

	# Plot the decision tree
	plot(ptree_mod, uniform = TRUE)

	# Add labels to the decision tree
	text(ptree_mod)


	### model predictions

	pred_ptree_mod <- predict(ptree_mod, newdata = test_set, type = "prob")

	dim(pred_ptree_mod)

	pred_ptree_mod[,2]

	#-----------------------------------------
	# randomForest
	#----------------------------------------

	# reference: http://trevorstephens.com/kaggle-titanic-tutorial/r-part-5-random-forests/

	library(randomForest)

	set.seed(567)

	random_forest <- randomForest(as.factor(target) ~ age + job + marital + education + default + housing + loan + contact + day + month + duration +
	campaign + pdays + previous + poutcome,
	data=training_set,
	importance=TRUE,
	ntree=2000)

	summary(random_forest)
	varImpPlot(random_forest)

	# model predictions
	pred_rforest <- predict(random_forest,test_set, type = "prob")

	# add scores to test data
	test_set$rf_y <- pred_rforest[,2]

	# add new ID field

	test_set$newID <- seq(1:9043)

	# export scored data to csv for visualization in tableau

	write.csv(test_set, file = "bank_mkt_scored.csv")


	#-------------------------------------
	# model evaluation
	#------------------------------------

	### comparing regression models with ROC curves

	# Load the pROC-package
	library(pROC)

	# Construct the objects containing ROC-information
	ROC_predictions_all_full <- roc(test_set$target, predictions_all_full)
	ROC_pred_ptree_mod <- roc(test_set$target, pred_ptree_mod[,2])
	ROC_pred_rforest <- roc(test_set$target, pred_rforest[,2])


	# Compute the AUCs
	auc(ROC_predictions_all_full)
	auc(ROC_pred_ptree_mod)
	auc(ROC_pred_rforest)

	# Draw all ROCs on one plot

	par(mar=c(1,1,1,1))

	plot(ROC_predictions_all_full)
	lines(ROC_pred_ptree_mod, col = "blue")
	lines(ROC_pred_rforest, col = "red")

	# export scored test data from RF for exploration in tableau

	#--------------------------------------------------
	# explore test data scored results
	#-------------------------------------------------

	CrossTable(test_set$housing,test_set$y,prop.r = TRUE, prop.c =FALSE, prop.t = FALSE, prop.chisq = FALSE)


	### get all factor variables
	is.fact <- sapply(test_set, is.factor)
	tmp <- test_set[,is.fact]
	names(tmp)

	# using dplyr with binary target look at distribution of top predictors
	# duration month day contact housing poutcome age

	# factors
	test_set%>%
	group_by(housing) %>% # group by factor
	summarize(mean = mean(target)) # mean of target

	# numerics
	test_set%>%
	group_by(target) %>% # group by target
	summarize(mean = mean(duration)) # mean of xvar