Skip to content

Instantly share code, notes, and snippets.

@ethen8181
Last active December 8, 2016 16:28
Show Gist options
  • Save ethen8181/fb46e587413eec79f27d05d2e7d54fcf to your computer and use it in GitHub Desktop.
Save ethen8181/fb46e587413eec79f27d05d2e7d54fcf to your computer and use it in GitHub Desktop.
test
library(MASS)
library(ROCR)
library(caret)
library(glmnet)
library(magrittr)
library(data.table)
library(doParallel)
setwd('/Users/ethen/Desktop/predictive-project')
# ---------------------------------------------------------------------------------
# preprocessing
path_donation <- 'donation data.csv'
path_code <- 'dmef1code.csv'
path_state_mapping <- 'states.csv'
# main dataset, remove the id column
donation <- fread(path_donation, stringsAsFactors = TRUE)
donation[ , ID := NULL ]
# delete the last row, since it has a questionable '*' CODETYPE
code <- fread(path_code, stringsAsFactors = TRUE)
code <- code[-nrow(code), ]
code[ , CODETYPE := factor( CODETYPE, levels = c('A', 'B', 'C', 'D', 'M') ) ]
# table that maps the states into regions
# https://drive.google.com/open?id=0B5-Q31Rgtkixb0t3MkNTNXI2djA
states <- fread(path_state_mapping, stringsAsFactors = TRUE)
preprocessing <- function(donation, code, states) {
# general preprocessing steps for the donation data
# for people that have donated only once, their CNDOL2 and
# CNCOD3 will be missing (it indicates the previous donation),
# the same goes for SLCOD2 and SLCOD3;
# match CNCOD (Latest Contribution Code),
# SLCOD (Latest Solicitation Code) to the five types in the code file
cols_mapping <- c('CNCOD1', 'CNCOD2', 'CNCOD3',
'SLCOD1', 'SLCOD2', 'SLCOD3')
donation[ , (cols_mapping) := lapply( .SD, function(col) {
code$CODETYPE[ match(col, code$CODE) ]
}), .SDcols = cols_mapping ]
# for the CNCOD and SLCOD replace the one
# that has NA value, with a 'None' category
for(j in cols_mapping) {
na_rows <- which( is.na(donation[[j]]) )
set(donation, i = na_rows, j = j, value = 'None')
}
# drop all the CNDAT (Latest Contribution Date)
# because we can use CNMON (months since latest
# contribution) instead
cols_drop <- c('CNDAT1', 'CNDAT2', 'CNDAT3')
donation[ , (cols_drop) := NULL ]
# if he/she donated once, then the CNDOL2
# (2nd Latest Contribution) should be 0
# the same applies for CNDOL3 (3rd Latest Contribution)
donation[ CNTMLIF == 1, CNDOL2 := 0 ]
donation[ CNTMLIF == 1, CNDOL3 := 0 ]
donation[ CNTMLIF == 2, CNDOL3 := 0 ]
# merge state into regions
# 1. New england: Connecticut, Maine, Massachusetts, New Hampshire, Rhode Island and Vermont
# 2. Mideast: Delaware, District of Columbia, Maryland, New Jersey, New York, and Pennsylvania
# 3. Great Lakes: Illinois, Indiana, Michigan, Ohio, and Wisconsin
# 4. Plains: Iowa, Kansas, Minnesota, Missouri, Nebraska, North Dakota, and South Dakota
# 5. Southeast: Alabama, Arkansas, Florida, Georgia, Kentucky, Louisiana, Mississippi,
# North Carolina, South Carolina, Tennessee, Virginia, and West Virginia
# 6. Southwest: Arizona, New Mexico, Oklahoma, and Texas
# 7. Rocky Mountain: Colorado, Idaho, Montana, Utah, and Wyoming
# 8. Far West: Alaska, California, Hawaii, Nevada, Oregon, and Washington
donation[ , region := states$region[ match(STATCODE, states$statecode) ] ]
donation[ , STATCODE := NULL ]
donation[ is.na(region), region := 'Other' ]
# if both CNMON2 and CNMON3 are both missing, meaning that
# they have only donated once, fill it with CNMON1; the same idea
# can be extended to CNMON3
donated_once <- with( donation, is.na(CNMON2) & is.na(CNMON3) )
donation[ donated_once, CNMON2 := CNMON1 ]
donation[ donated_once, CNMON3 := CNMON1 ]
donated_twice <- is.na(donation$CNMON3)
donation[ donated_twice, CNMON3 := CNMON2 ]
# outliers exists amongst these columns (there are
# people that donated a lot more than others); log
# these columns to make the numbers more normally distributed
donation[ , `:=`( CNDOL1 = log(CNDOL1),
CNTRLIF = log(CNTRLIF),
CONLARG = log(CONLARG) ) ]
# binary response for the logistic regression to see
# whether that person donated or not
donation[ , DONATED := as.factor( ifelse(TARGDOL > 0, 'YES', 'NO') ) ]
return(donation)
}
donation <- preprocessing(donation, code, states)
# train/test split
# training and test sets in the ratio of 2:1 by putting every
# 3rd observation in the test set and all remaining observations in
# the training set
test_index <- seq(3, nrow(donation), by = 3)
X_train <- donation[-test_index]
X_test <- donation[test_index]
# remove the obvious outliers from the training set
X_train <- X_train[ TARGDOL != 1500, ]
remove_correlations <- function(X_train, X_test) {
# identify multi-collinearity using variance inflation factor (vif)
# and remove them from both the train and test set;
# conventionally, a vif score larger than 10 should be removed,
# after this procedure three columns including SLCOD2, CNCOD2, CNTRLIF
# and CNMON3 were removed
# assign a random response column to start the process
# and keep removing multi-collinearity until there is none
multi_cor_cols <- 'TARGDOL'
while( length(multi_cor_cols) > 0 ) {
model_lm <- lm(TARGDOL ~.-DONATED, data = X_train)
multi_cor <- car::vif(model_lm)[ , 'GVIF']
max_col <- names( which.max(multi_cor) )
if( multi_cor[[max_col]] > 10 ) {
multi_cor_cols <- max_col
print(multi_cor_cols)
X_train[ , (multi_cor_cols) := NULL ]
X_test[ , (multi_cor_cols) := NULL ]
} else {
multi_cor_cols <- NULL
}
}
removed_cor <- list(train = X_train, test = X_test)
return(removed_cor)
}
removed_cor <- remove_correlations(X_train, X_test)
X_train <- removed_cor$train
X_test <- removed_cor$test
create_interactions <- function(X_train, X_test) {
# create interaction terms across all possible pairs of numeric columns;
# (numeric columns include class of integer and numeric)
# this includes creating, 2nd polynomial degree columns (squaring the column value)
num_cols <- !vapply( X_train, class, character(1) ) %in% 'factor'
num_name <- colnames(X_train)[num_cols]
num_name <- setdiff(num_name, 'TARGDOL')
num_len <- length(num_name)
for(i in 1:num_len) {
col1 <- num_name[i]
for(j in i:num_len) {
col2 <- num_name[j]
col_name <- paste0(col1, '_TIMES_', col2)
X_train[ , (col_name) := X_train[[col1]] * X_train[[col2]] ]
X_test[ , (col_name) := X_test[[col1]] * X_test[[col2]] ]
}
}
created_inter <- list(train = X_train, test = X_test)
return(created_inter)
}
# after creating interaction term, we ended up with a total of
# 96 features. The reference level for the factor columns are
# B for SEX, FarWest for region, A for CNCOD1, CNCOD3, SLCOD1 and SLCOD3
created_inter <- create_interactions(X_train, X_test)
X_train <- created_inter$train
X_test <- created_inter$test
# ---------------------------------------------------------------------------------
# model training
# set the cross validation number for all models
nfolds <- 10
# shuffle the row observations of the training data
# for potential model's stability
set.seed(1234)
X_train <- X_train[ sample( nrow(X_train) ) ]
# use all possible - 1 cores to train the lasso/ridge,
# remember to stop the created cluster afterwards
registerDoParallel(cores = detectCores() - 1)
# specify parameters to search for: alpha = 1 is lasso, 0 is lasso
# and a fix range of lambda values
glmnet_grid <- expand.grid( alpha = c(0, 1),
lambda = c(0, 0.01, 0.05, 0.1, 0.5, 0.6) )
# define training control (10-fold cross validation and MSE evaluation metric)
# and to use ROC to determine the best model for a binary output/reponse;
# we need to add classProbs = TRUE and twoClassSummary in the trainControl
control_class <- trainControl(method = 'cv', number = nfolds, classProbs = TRUE,
summaryFunction = twoClassSummary)
glmnet_class <- train(
DONATED ~.-TARGDOL,
data = X_train,
method = 'glmnet',
metric = 'ROC',
trControl = control_class,
tuneGrid = glmnet_grid
)
# predict TARGDOL
# remember to remove the columns that are related to
# whether or not they made a donation (DONATED)
X_train_donated <- X_train[ TARGDOL > 0, ]
X_train_donated[ , DONATED := NULL ]
# for the regression model, logging the response lead to a inferior result
control_reg <- trainControl(method = 'cv', number = nfolds)
glmnet_reg <- train(
TARGDOL ~.,
data = X_train_donated,
method = 'glmnet',
trControl = control_reg,
tuneGrid = glmnet_grid
)
stopImplicitCluster()
# multiply the probability of donating and the amount that
# they will donate to get the expected value;
# then obtain the top 1000 donators to see the actual amount
get_expected_score <- function(X_test, y_pred_proba, y_pred) {
expected <- y_pred_proba * y_pred
sorted_index <- order(expected, decreasing = TRUE)[1:1000]
expected_score <- sum(X_test[ sorted_index, ][['TARGDOL']])
return(expected_score)
}
# lasso/ridge (for both classification and regression model)
# obtaining a score around 10664.84
y_pred <- predict(glmnet_reg, newdata = X_test)
y_pred_proba <- predict(glmnet_class, newdata = X_test, type = 'prob')[['YES']]
get_expected_score(X_test, y_pred_proba, y_pred)
# 10698.73
model_lm <- lm(TARGDOL ~., data = X_train_donated)
y_pred <- predict(model_lm, newdata = X_test)
get_expected_score(X_test, y_pred_proba, y_pred)
# 10822
model_rlm <- rlm(TARGDOL ~., data = X_train_donated, maxit = 50)
y_pred <- predict(model_rlm, newdata = X_test)
get_expected_score(X_test, y_pred_proba, y_pred)
# what the optimal amount looks like
# score around 23000
setorder(X_test, -TARGDOL)
sum( X_test[1:1000, ][['TARGDOL']] )
# -----------------------------------------------------------------------------
# advanced model training
# only random forest will be explored
library(h2o)
h2o.init(nthreads = -1)
# classification model (random forest)
# specify the response and predictor columns
# and the evaluation metric (auc)
X_train_h2o <- as.h2o(X_train)
X_test_h2o <- as.h2o(X_test)
y <- 'DONATED'
x <- setdiff( colnames(X_train), c('TARGDOL', y) )
grid_id_rf <- 'rs_rf'
stopping_metric <- 'AUC'
# the random search criteria and number trees
# is applicable for all tree models
ntrees <- 300
stopping_tolerance <- 0.05
stopping_rounds <- 5
search_criteria <- list(
strategy = 'RandomDiscrete',
stopping_metric = stopping_metric,
stopping_tolerance = stopping_tolerance,
stopping_rounds = stopping_rounds
)
# random forest's random search hyperparameters
# max depth and sample rate (row sample rate per tree)
max_depth_opt <- 4:15
sample_rate_opt <- seq(from = 0.8, to = 0.95, by = 0.05)
rf_params_opt <- list(
max_depth = max_depth_opt,
sample_rate = sample_rate_opt
)
rf_class <- h2o.grid(
algorithm = 'randomForest',
grid_id = grid_id_rf,
x = x,
y = y,
training_frame = X_train_h2o,
nfolds = nfolds,
ntrees = ntrees,
balance_classes = TRUE,
stopping_metric = stopping_metric,
stopping_tolerance = stopping_tolerance,
stopping_rounds = stopping_rounds,
hyper_params = rf_params_opt,
search_criteria = search_criteria
)
# sort the model by the specified evaluation metric
# and obtain the top one (the best model)
rf_class_sorted <- h2o.getGrid(
grid_id = grid_id_rf,
sort_by = 'auc',
decreasing = TRUE
)
rf_class_best_model <- h2o.getModel(rf_class_sorted@model_ids[[1]])
rf_class_y_pred_proba <- h2o.predict(rf_class_best_model, X_test_h2o)[['YES']] %>%
as.data.table()
# stacking random forest and logistic regression for classification
# and lasso/ridge for regression; the best stacking proportion is
# found through trial and error
# obtaining a score of 10,904
y_pred <- predict(glmnet_reg, newdata = X_test)
y_pred_proba_stack1 <- rf_class_y_pred_proba[['YES']] * 0.21 + y_pred_proba * 0.79
get_expected_score(X_test, y_pred_proba_stack, y_pred)
y_pred <- predict(model_lm, newdata = X_test)
y_pred_proba_stack2 <- rf_class_y_pred_proba[['YES']] * 0.25 + y_pred_proba * 0.75
get_expected_score(X_test, y_pred_proba_stack, y_pred)
y_pred <- predict(model_rlm, newdata = X_test)
y_pred_proba_stack3 <- rf_class_y_pred_proba[['YES']] * 0.3 + y_pred_proba * 0.77
get_expected_score(X_test, y_pred_proba_stack, y_pred)
dt_var_imp <- h2o.varimp(rf_class_best_model)[ , c('variable', 'percentage') ] %>%
as.data.table()
dt_var_imp
h2o.shutdown(prompt = FALSE)
# ---------------------------------------------------------------------------------
# model validation
# linear/robust linear regression's performance
summary(model_lm)$r.squared
summary(model_rlm)
# accessing lasso/ridge's performance coefficients
confusionMatrix(glmnet_class)
glmnet_class$results
glmnet_class$bestTune
glmnet_reg$results
glmnet_reg$bestTune
# obtain auc score
y_predictions <- list(y_pred_proba, y_pred_proba_stack1,
y_pred_proba_stack2, y_pred_proba_stack1)
auc_scores <- lapply( y_predictions, function(y_proba) {
rocr_pred <- ROCR::prediction(y_proba, X_test$DONATED)
auc <- ROCR::performance(rocr_pred, 'auc')@y.values[[1]]
return(auc)
})
auc_scores
# obtain lasso/ridge's estimated coefficients
# we'll will have to select lambda value to get the optimal coefficients,
# and then we only selected the ones that were not shrunk to 0 by the model
coef_class <- coef(glmnet_class$finalModel, s = glmnet_class$bestTune$lambda) %>%
as.matrix()
coef_class <- data.table( names = row.names(coef_class), estimates = coef_class[, 1] )
sum( coef_class[['estimates']] != 0 )
coef_class <- coef_class[ estimates != 0, ]
coef_class
# regression's coefficients
coef_reg <- coef(glmnet_reg$finalModel, s = glmnet_reg$bestTune$lambda) %>%
as.matrix()
coef_reg <- data.table( names = row.names(coef_reg), estimates = coef_reg[, 1] )
sum( coef_reg[['estimates']] != 0 )
coef_reg <- coef_reg[ estimates != 0, ]
coef_reg
# ---------------------------------------------------------------------------------
# visualization
library(ggplot2)
library(ggfortify)
library(gridExtra)
# remove the 0 contribution and the obvious outlier prior to visualization
has_donated <- donation[ TARGDOL > 0 & TARGDOL < 1500, ]
# distribution of TARGDOL
donation_TARGDOL <- donation[ TARGDOL < 1500, .(TARGDOL, RemoveZero = 'KeptZero') ]
has_donated_TARGDOL <- has_donated[ , .(TARGDOL, RemoveZero = 'RemovedZero') ]
distribution_TARGDOL <- rbind(donation_TARGDOL, has_donated_TARGDOL)
ggplot( distribution_TARGDOL, aes(TARGDOL, color = RemoveZero) ) +
geom_density() + theme_bw() +
labs(title = 'Distribution of TARGDOL') +
guides(color = FALSE) +
facet_grid(~ RemoveZero)
ggsave('distribution_TARGDOL.png')
# Percentage of Donors vs. Non-Donors
dt_donated <- data.table( table(donation$DONATED) / nrow(donation) )
setnames( dt_donated, c('donated', 'frequency') )
ggplot( dt_donated, aes(donated, frequency, fill = donated) ) +
geom_col() + theme_bw() +
labs(title = 'Percentage of Donors vs. Non-Donors')
ggsave('donors_and_non_donors.png')
# boxplot of the donated amount between the different
# solicitation and contribution code
slcod1 <- ggplot(has_donated, aes(SLCOD1, TARGDOL, fill = SLCOD1) ) +
geom_boxplot() + theme_bw() +
labs(title = 'TARGDOL > 0 vs. SLCOD1') +
ylim(0, 100)
cncod1 <- ggplot(has_donated, aes(CNCOD1, TARGDOL, fill = CNCOD1) ) +
geom_boxplot() + theme_bw() +
labs(title = 'TARGDOL > 0 vs. CNCOD1') +
ylim(0, 100)
g <- arrangeGrob(slcod1, cncod1, nrow = 2)
ggsave('donated_amount_sl_and_cn_code.png', g)
# see if CNDOL1 (the latest contribution amount) if
# correlated with TARGDOL
ggplot(has_donated, aes(CNDOL1, log(TARGDOL) ) ) +
geom_point() + theme_bw() +
labs(x = 'Log(CNDOL1)', title = 'Log(TARGDOL > 0) vs. Log(CNDOL1)')
ggsave('targdol_cndol.png')
# Proportion of donors in each SLCOD1 category
dt_SLCOD_prop <- data.table( table(has_donated$SLCOD1) / table(donation$SLCOD1) )
setnames( dt_SLCOD_prop, c('SLCOD1', 'Proportion') )
ggplot( dt_SLCOD_prop, aes(SLCOD1, Proportion, fill = SLCOD1) ) +
geom_col() + theme_bw() +
labs(y = 'Proportion of donors',
title = 'Proportion of donors in each SLCOD1 category')
ggsave('donors_proportion_sl_code.png')
# linear model's diagnostic plot
png('lm_diagnostics.png', width = 700, height = 700)
autoplot(model_lm, which = 1:6, ncol = 3, label.size = 3) +
theme_bw()
dev.off()
@kimih0223
Copy link

with(donation,pairs(data.frame(TARGDOL, CNDOL1, CNTRLIF, CONLARG, CONTRFST)))

@mpastor2017
Copy link

predicting

shuffle the row observations of the training data

set.seed(1234)
X_train <- X_train[ sample( nrow(X_train) ) ]

Baseline

##Binomial logistic

##Removing Outlier
X_train <- X_train[TARGDOL < 1500,]

fit <- glm(DONATED ~ . - TARGDOL - CONTRFST, family=binomial, data = X_train)

##Multiple Linear Regression
X_train2 <- X_train[TARGDOL > 0,]
X_train2$DONATED <- NULL

##CONLARG and CNTRLIF collinearity
mult_fit <- lm(TARGDOL ~ .-CONLARG - CNTRLIF, data = X_train2)

summary(mult_fit)
vif(mult_fit)

predicted <- predict(fit, X_test, type="response")
predicted2 <- predict(mult_fit, X_test)
X_test$expected <- predicted * predicted2
X_test2 <- X_test[order(expected, decreasing = TRUE),]
targdol1000 <- sum(head(X_test2$TARGDOL,1000))
expected1000 <- sum(head(X_test2$expected,1000))
targdol1000

Try changing codes to most recent/most frequent solicitation/contrbution
With Outlier Robust is best with 10096.23

##Robust
robust_fit <- rlm(TARGDOL ~ .-CONLARG + CNDOL1:CNTMLIF- CNTRLIF - CNCOD1 - CNCOD2 - CNCOD3 - SLCOD1 - SLCOD2 - SLCOD3, data = X_train2)

summary(robust_fit)
vif(robust_fit)

predicted <- predict(fit, X_test, type="response")
predicted2 <- predict(robust_fit, X_test)
X_test$expected <- predicted * predicted2
X_test2 <- X_test[order(expected, decreasing = TRUE),]
targdol1000 <- sum(head(X_test2$TARGDOL,1000))
expected1000 <- sum(head(X_test2$expected,1000))
targdol1000

@mpastor2017
Copy link

I just ran multiple Regression:
mult_fit <- lm(TARGDOL ~ ., data = X_train_donated)

Removing the 1500 for both the classification and regression

@aronliu
Copy link

aronliu commented Dec 7, 2016

groups <- cut(x=donation$TARGDOL, breaks=c(-1,seq(from=0, to=ceiling(max(X_train$TARGDOL)), by = 5)))
bygroup <- tapply(donation$TARGDOL, groups, length)
names(bygroup)[1] <- "[0]"
barplot(bygroup, xlab="Amount", ylab="Frequency", main="Distribution of TARGDOL")

groups2 <- ifelse(donation$TARGDOL==0,"Non-Donor","Donor")
bygroup2 <- tapply(donation$TARGDOL, groups2, length)
barplot(bygroup2, ylab="Frequency", main="Number of Donors vs. Non-Donors")

donation$CNCOD1 <- droplevels(donation$CNCOD1)
boxplot(TARGDOL~CNCOD1,data=donation[TARGDOL>0,],ylim=c(0,40), xlab="CNCOD1", ylab="TARGDOL", main="TARGDOL vs. CNCOD1 (TARGDOL > 0)")
boxplot(TARGDOL~SLCOD1,data=donation[TARGDOL>0,],ylim=c(0,40), xlab="SLCOD1", ylab="TARGDOL", main="TARGDOL vs. SLCOD1 (TARGDOL > 0)")

plot(donation$CNDOL1, donation$TARGDOL, xlab="Log(CNDOL1)", ylab="TARGDOL", main="TARGDOL vs. Log(CNDOL1)")
plot(donation[TARGDOL>0,]$CNDOL1, donation[TARGDOL>0,]$TARGDOL, xlab="Log(CNDOL1)", ylab="TARGDOL", main="TARGDOL vs. Log(CNDOL1) (TARGDOL > 0)")

donors <- ifelse(donation$TARGDOL==0,0,1)
total <- aggregate(donors ~ donation$SLCOD1, FUN=length)
donated <- aggregate(donors ~ donation$SLCOD1, FUN=sum)
percent <- donated$donors/total$donors
names(percent) <- c("A", "B", "C", "D", "M")
barplot(percent, xlab="SLCOD1", ylab="Proportion of Donors", main="Proportion of Donors in each SLCOD1 Category")

total <- total$donors
names(total) <- c("A", "B", "C", "D", "M")
t <- barplot(total, ylim=c(0,85000), xlab="SLCOD1", ylab="Frequency", main="Amount of People Classified Into Each SLCOD1 Category")
text(x = t, y = total, label = total, pos = 3, cex = 0.8, col = "red")

plot(donation[TARGDOL>0,]$CONLARG, donation[TARGDOL>0,]$TARGDOL, xlab="Log(CONLARG)", ylab="TARGDOL", main="TARGDOL vs. Log(CONLARG) (TARGDOL > 0)")
plot(donation[TARGDOL>0,]$CONTRFST, donation[TARGDOL>0,]$TARGDOL, xlab="CNTRFST", ylab="TARGDOL", main="TARGDOL vs. CNTRFST (TARGDOL > 0)")
plot(donation[TARGDOL>0,]$CNTMLIF, donation[TARGDOL>0,]$TARGDOL, xlab="CNTMLIF", ylab="TARGDOL", main="TARGDOL vs. CNTMLIF (TARGDOL > 0)")

P <- ecdf(donation$CNMON1)
plot(P, xlab="CNMON1", ylab="Cumulative Proportion", main="Cumulative Proportion of CNMON1")

P2 <- ecdf(donation$CNMONL)
plot(P2, xlab="CNMONL", ylab="Cumulative Proportion", main="Cumulative Proportion of CNMONL")

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment