Created
August 2, 2018 16:38
-
-
Save BioSciEconomist/eb78c3da13809ad13dbb6160af8d52ce to your computer and use it in GitHub Desktop.
Example Predictive Modeling Project in R
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # ------------------------------------------------------------------ | |
| # |PROGRAM NAME: ex banking model | |
| # |DATE: 4/5/18 | |
| # |CREATED BY: MATT BOGARD | |
| # |UPDATED: 5/8/18 | |
| # |PROJECT FILE: C:\Users\mtb2901\Documents\Tools and References\R References | |
| # |---------------------------------------------------------------- | |
| # | PURPOSE: predictive modeling example | |
| # |------------------------------------------------------------------ | |
| library(dplyr) | |
| library(ggplot2) | |
| # read data | |
| banking <- read.csv("/Users/amandabogard/Google Drive/R Training/bank_marketing.csv", header = TRUE) # https://archive.ics.uci.edu/ml/datasets/bank+marketing | |
| # create binary target | |
| banking$target <- ifelse(banking$y =="no",0,1) | |
| # create toy scoring set | |
| randomSample = function(df,n) { | |
| return (df[sample(nrow(df), n),]) | |
| } | |
| banking_score_data <- randomSample(banking,25000) | |
| # drop target fields | |
| banking_score_data$target <- NULL | |
| # add ID | |
| banking_score_data$ID <- seq(1:nrow(banking_score_data)) | |
| # export to csv | |
| write.csv(banking_score_data, file = "bank_mkt_score.csv", row.names = FALSE) | |
| dim(banking_score_data) | |
| View(banking_score_data[,17:18]) | |
| view(banking) | |
| #----------------------------------------------- | |
| # exploratory analysis | |
| #---------------------------------------------- | |
| summary(banking) | |
| # look at numeric variables | |
| is.fact <- sapply(banking, is.factor) | |
| tmp <- banking[,!is.fact] | |
| vars <- names(tmp) | |
| # vars: "age" "balance" "day" "duration" "campaign" "pdays" "previous" | |
| by(banking$age, banking$y,mean) | |
| by(banking$balance, banking$y,mean) | |
| by(banking$day, banking$y,mean) | |
| by(banking$duration, banking$y,mean) | |
| by(banking$campaign, banking$y,mean) | |
| by(banking$previous, banking$y,mean) | |
| by(banking$pdays, banking$y,mean) | |
| # Load the gmodels package | |
| library(gmodels) | |
| # Call CrossTable() | |
| CrossTable(banking$y) | |
| # look at factor variables | |
| is.fact <- sapply(banking, is.factor) | |
| tmp <- banking[,!is.fact] | |
| vars <- names(tmp) | |
| # "job" "marital" "education" "default" "housing" "loan" "contact" "month" "poutcome" | |
| CrossTable(banking$job,banking$y,prop.r = TRUE, prop.c =FALSE, prop.t = FALSE, prop.chisq = FALSE) | |
| CrossTable(banking$marital,banking$y,prop.r = TRUE, prop.c =FALSE, prop.t = FALSE, prop.chisq = FALSE) | |
| CrossTable(banking$education,banking$y,prop.r = TRUE, prop.c =FALSE, prop.t = FALSE, prop.chisq = FALSE) | |
| CrossTable(banking$default,banking$y,prop.r = TRUE, prop.c =FALSE, prop.t = FALSE, prop.chisq = FALSE) | |
| CrossTable(banking$housing,banking$y,prop.r = TRUE, prop.c =FALSE, prop.t = FALSE, prop.chisq = FALSE) | |
| # using dplyr with binary target | |
| banking%>% | |
| group_by(job) %>% # group by factor | |
| summarize(mean = mean(target)) # mean of target | |
| #----------------------------------------------- | |
| # data visualization | |
| #---------------------------------------------- | |
| # simple bar chart | |
| # "job" "marital" "education" "default" "housing" "loan" "contact" "month" "poutcome" | |
| ### categorical variables | |
| # Simple Bar Plot | |
| xvar <- banking$education | |
| counts <- table(xvar) | |
| barplot(counts, main="Car Distribution", | |
| xlab="Number of Gears") | |
| # Grouped Bar Plot by target | |
| xvar <- banking$education | |
| counts <- table(banking$target, xvar) | |
| barplot(counts, main="Car Distribution by Gears and VS", | |
| xlab="Number of Gears", col=c("darkblue","red"), | |
| legend = rownames(counts), beside=TRUE) | |
| # bar chart stacked by target | |
| xvar <- banking$education | |
| # Stacked Bar Plot with Colors and Legend | |
| counts <- table(banking$target, xvar) | |
| barplot(counts, main="Car Distribution by Gears and VS", | |
| xlab="Number of Gears", col=c("darkblue","red"), | |
| legend = rownames(counts)) | |
| ### numerical variables | |
| # look at numeric x's by target - boxplot; | |
| # vars: "age" "balance" "day" "duration" "campaign" "pdays" "previous" | |
| # distributions - histograms | |
| hist(banking$age) | |
| hist(banking$balance) | |
| hist(banking$day) | |
| hist(banking$duration) | |
| hist(banking$campaign) | |
| hist(banking$pdays) | |
| hist(banking$previous) | |
| # Boxplot by target | |
| xvar <- banking$age | |
| boxplot(xvar~target,data=banking, main="Car Milage Data", | |
| xlab="Number of Cylinders", ylab="Miles Per Gallon") | |
| # correlations between x's | |
| # Simple Scatterplot | |
| plot(banking$age, banking$day, main="Scatterplot Example", | |
| xlab="Car Weight ", ylab="Miles Per Gallon ", pch=19) | |
| # Basic Scatterplot Matrix | |
| pairs(~age+balance+day+duration+campaign+pdays,data=banking, | |
| main="Simple Scatterplot Matrix") | |
| # brushed scatterplot using ggplot2 | |
| ggplot(banking,aes(x=age,y=duration, shape = y,color = y)) + geom_point() | |
| # histograms by target | |
| par(mfrow=c(1,2)) | |
| hist(banking$age[banking$y=="no"]) | |
| hist(banking$age[banking$y=="yes"]) | |
| dev.off() # reset par | |
| # overlapping histograms by target | |
| library(ggplot2) | |
| ggplot(banking, aes(x=age, fill=y)) + | |
| geom_histogram(binwidth=.5, alpha=.5, position="identity") | |
| # overlapping densities | |
| ggplot(banking, aes(x=age, colour=y)) + geom_density() | |
| ggplot(banking, aes(x=age, fill=y)) + geom_density(alpha=.3) | |
| ### combining plots like a dashboard | |
| par(mfrow=c(2,2)) | |
| # target distribution | |
| counts <- table(banking$y) | |
| barplot(counts, main="Target Distribution", | |
| xlab="Decision", col=c("darkblue","red")) | |
| # grouped bar | |
| xvar <- banking$education | |
| counts <- table(banking$target, xvar) | |
| barplot(counts, main="Target Distribution by Education", | |
| xlab="Education Level", col=c("darkblue","red")) | |
| #histograms by age | |
| hist(banking$age[banking$y=="no"], xlab = "Age", col = "darkblue", main = "Decision: No") | |
| hist(banking$age[banking$y=="yes"], xlab = "Age", col = "red", main = "Decision: Yes") | |
| dev.off() | |
| #------------------------------------------- | |
| # data partitioning | |
| #------------------------------------------- | |
| set.seed(567) | |
| # Store row numbers for 80% training set: index_train | |
| index_train <- sample(1:nrow(banking), .8* nrow(banking)) | |
| # Create training set: training_set | |
| training_set <- banking[index_train, ] | |
| # Create test set: test_set | |
| test_set <- banking[-index_train, ] | |
| # check distributions | |
| mean(banking$target) | |
| mean(training_set$target) | |
| mean(test_set$target) | |
| #------------------------------------------- | |
| # logistic regression | |
| #------------------------------------------- | |
| names(banking) | |
| # create model formula | |
| (fmla <- target ~ age + job + marital + education + default + housing + loan + contact + day + month + duration + | |
| campaign + pdays + previous + poutcome) | |
| logistic_full<- glm(fmla, family = "binomial", data = training_set) | |
| summary(logistic_full) | |
| predictions_all_full <- predict(logistic_full, newdata = test_set, type = "response") | |
| print(data.frame(predictions_all_full)) | |
| length(predictions_all_full) | |
| #------------------------------------------- | |
| # decision tree | |
| #------------------------------------------- | |
| library(rpart) | |
| # Include rpart.control to relax the complexity parameter to 0.001. | |
| tree_full_default <- rpart(fmla, method = "class", | |
| data = training_set, control = rpart.control(cp = 0.001)) | |
| summary(tree_full_default) | |
| # Plot the decision tree | |
| plot(tree_full_default, uniform = TRUE) | |
| # Add labels to the decision tree | |
| text(tree_full_default) | |
| ### pruning the tree | |
| # Plot the cross-validated error rate as a function of the complexity parameter | |
| plotcp(tree_full_default) | |
| # Use printcp() to identify for which complexity parameter the cross-validated error rate is minimized. | |
| printcp(tree_full_default) | |
| # Create an index for of the row with the minimum xerror | |
| index <- which.min(tree_full_default$cptable[ , "xerror"]) | |
| # Create tree_min | |
| tree_min <- tree_full_default$cptable[index, "CP"] | |
| # Prune the tree using tree_min | |
| ptree <- prune(tree_full_default, cp = tree_min) | |
| summary(ptree) | |
| # Plot the decision tree | |
| plot(ptree, uniform = TRUE) | |
| # Add labels to the decision tree | |
| text(tree_full_default) | |
| ### tree with additional options | |
| # set a seed and run the code to obtain a tree using weights, minsplit and minbucket | |
| set.seed(567) | |
| tree_full_mod <- rpart(fmla, method = "class", | |
| data = training_set, | |
| control = rpart.control(minsplit = 5, minbucket = 300, cp = 0.001)) | |
| summary(tree_full_mod) | |
| # Plot the decision tree | |
| plot(tree_full_mod, uniform = TRUE) | |
| # Add labels to the decision tree | |
| text(tree_full_mod) | |
| ### pruning the tree | |
| # Use printcp() to identify for which complexity parameter the cross-validated error rate is minimized. | |
| printcp(tree_full_mod) | |
| # Create an index for of the row with the minimum xerror | |
| index <- which.min(tree_full_mod$cptable[ , "xerror"]) | |
| # Create tree_min | |
| tree_min <- tree_full_mod$cptable[index, "CP"] | |
| # Prune the tree using tree_min | |
| ptree_mod <- prune(tree_full_mod, cp = tree_min) | |
| summary(ptree_mod) | |
| # Plot the decision tree | |
| plot(ptree_mod, uniform = TRUE) | |
| # Add labels to the decision tree | |
| text(ptree_mod) | |
| ### model predictions | |
| pred_ptree_mod <- predict(ptree_mod, newdata = test_set, type = "prob") | |
| dim(pred_ptree_mod) | |
| pred_ptree_mod[,2] | |
| #----------------------------------------- | |
| # randomForest | |
| #---------------------------------------- | |
| # reference: http://trevorstephens.com/kaggle-titanic-tutorial/r-part-5-random-forests/ | |
| library(randomForest) | |
| set.seed(567) | |
| random_forest <- randomForest(as.factor(target) ~ age + job + marital + education + default + housing + loan + contact + day + month + duration + | |
| campaign + pdays + previous + poutcome, | |
| data=training_set, | |
| importance=TRUE, | |
| ntree=2000) | |
| summary(random_forest) | |
| varImpPlot(random_forest) | |
| # model predictions | |
| pred_rforest <- predict(random_forest,test_set, type = "prob") | |
| # add scores to test data | |
| test_set$rf_y <- pred_rforest[,2] | |
| # add new ID field | |
| test_set$newID <- seq(1:9043) | |
| # export scored data to csv for visualization in tableau | |
| write.csv(test_set, file = "bank_mkt_scored.csv") | |
| #------------------------------------- | |
| # model evaluation | |
| #------------------------------------ | |
| ### comparing regression models with ROC curves | |
| # Load the pROC-package | |
| library(pROC) | |
| # Construct the objects containing ROC-information | |
| ROC_predictions_all_full <- roc(test_set$target, predictions_all_full) | |
| ROC_pred_ptree_mod <- roc(test_set$target, pred_ptree_mod[,2]) | |
| ROC_pred_rforest <- roc(test_set$target, pred_rforest[,2]) | |
| # Compute the AUCs | |
| auc(ROC_predictions_all_full) | |
| auc(ROC_pred_ptree_mod) | |
| auc(ROC_pred_rforest) | |
| # Draw all ROCs on one plot | |
| par(mar=c(1,1,1,1)) | |
| plot(ROC_predictions_all_full) | |
| lines(ROC_pred_ptree_mod, col = "blue") | |
| lines(ROC_pred_rforest, col = "red") | |
| # export scored test data from RF for exploration in tableau | |
| #-------------------------------------------------- | |
| # explore test data scored results | |
| #------------------------------------------------- | |
| CrossTable(test_set$housing,test_set$y,prop.r = TRUE, prop.c =FALSE, prop.t = FALSE, prop.chisq = FALSE) | |
| ### get all factor variables | |
| is.fact <- sapply(test_set, is.factor) | |
| tmp <- test_set[,is.fact] | |
| names(tmp) | |
| # using dplyr with binary target look at distribution of top predictors | |
| # duration month day contact housing poutcome age | |
| # factors | |
| test_set%>% | |
| group_by(housing) %>% # group by factor | |
| summarize(mean = mean(target)) # mean of target | |
| # numerics | |
| test_set%>% | |
| group_by(target) %>% # group by target | |
| summarize(mean = mean(duration)) # mean of xvar |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment