ethen8181 · December 8, 2016 16:28 · mpastor2017 · Nov 30, 2016 · mpastor2017 · Dec 7, 2016
diff --git a/predictive.R b/predictive.R
 library(MASS)
 library(ROCR)
 library(caret)
 library(glmnet)
 library(magrittr)
 library(data.table)
 library(doParallel)
 setwd('/Users/ethen/Desktop/predictive-project')


 # ---------------------------------------------------------------------------------
 # preprocessing
 path_donation <- 'donation data.csv'
 path_code <- 'dmef1code.csv'
 path_state_mapping <- 'states.csv'

 # main dataset, remove the id column
 donation <- fread(path_donation, stringsAsFactors = TRUE)
 donation[ , ID := NULL ]

 # delete the last row, since it has a questionable '*' CODETYPE
 code <- fread(path_code, stringsAsFactors = TRUE)
 code <- code[-nrow(code), ]
 code[ , CODETYPE := factor( CODETYPE, levels = c('A', 'B', 'C', 'D', 'M') ) ]

 # table that maps the states into regions
 # https://drive.google.com/open?id=0B5-Q31Rgtkixb0t3MkNTNXI2djA
 states <- fread(path_state_mapping, stringsAsFactors = TRUE)

 preprocessing <- function(donation, code, states) {
 	# general preprocessing steps for the donation data

 	# for people that have donated only once, their CNDOL2 and
 	# CNCOD3 will be missing (it indicates the previous donation),
 	# the same goes for SLCOD2 and SLCOD3;
 	# match CNCOD (Latest Contribution Code),
 	# SLCOD (Latest Solicitation Code) to the five types in the code file
 	cols_mapping <- c('CNCOD1', 'CNCOD2', 'CNCOD3', 
 	  				  'SLCOD1', 'SLCOD2', 'SLCOD3')

 	donation[ , (cols_mapping) := lapply( .SD, function(col) {
 		code$CODETYPE[ match(col, code$CODE) ]
 	}), .SDcols = cols_mapping ]

 	# for the CNCOD and SLCOD replace the one
 	# that has NA value, with a 'None' category
 	for(j in cols_mapping) {
 		na_rows <- which( is.na(donation[[j]]) )
 		set(donation, i = na_rows, j = j, value = 'None')
 	}

 	# drop all the CNDAT (Latest Contribution Date)
 	# because we can use CNMON (months since latest 
 	# contribution) instead
 	cols_drop <- c('CNDAT1', 'CNDAT2', 'CNDAT3')
 	donation[ , (cols_drop) := NULL ]

 	# if he/she donated once, then the CNDOL2
 	# (2nd Latest Contribution) should be 0
 	# the same applies for CNDOL3 (3rd Latest Contribution)
 	donation[ CNTMLIF == 1, CNDOL2 := 0 ]
 	donation[ CNTMLIF == 1, CNDOL3 := 0 ]
 	donation[ CNTMLIF == 2, CNDOL3 := 0 ]

 	# merge state into regions
 	# 1. New england: Connecticut, Maine, Massachusetts, New Hampshire, Rhode Island and Vermont
 	# 2. Mideast: Delaware, District of Columbia, Maryland, New Jersey, New York, and Pennsylvania
 	# 3. Great Lakes: Illinois, Indiana, Michigan, Ohio, and Wisconsin
 	# 4. Plains: Iowa, Kansas, Minnesota, Missouri, Nebraska, North Dakota, and South Dakota
 	# 5. Southeast: Alabama, Arkansas, Florida, Georgia, Kentucky, Louisiana, Mississippi, 
 	#    North Carolina, South Carolina, Tennessee, Virginia, and West Virginia
 	# 6. Southwest: Arizona, New Mexico, Oklahoma, and Texas
 	# 7. Rocky Mountain: Colorado, Idaho, Montana, Utah, and Wyoming
 	# 8. Far West: Alaska, California, Hawaii, Nevada, Oregon, and Washington
 	donation[ , region := states$region[ match(STATCODE, states$statecode) ] ]
 	donation[ , STATCODE := NULL ]
 	donation[ is.na(region), region := 'Other' ]

 	# if both CNMON2 and CNMON3 are both missing, meaning that
 	# they have only donated once, fill it with CNMON1; the same idea
 	# can be extended to CNMON3
 	donated_once <- with( donation, is.na(CNMON2) & is.na(CNMON3) )
 	donation[ donated_once, CNMON2 := CNMON1 ]
 	donation[ donated_once, CNMON3 := CNMON1 ]
 	donated_twice <- is.na(donation$CNMON3)
 	donation[ donated_twice, CNMON3 := CNMON2 ]

 	# outliers exists amongst these columns (there are
 	# people that donated a lot more than others); log
 	# these columns to make the numbers more normally distributed
 	donation[ , `:=`( CNDOL1  = log(CNDOL1),
 					  CNTRLIF = log(CNTRLIF),
 					  CONLARG = log(CONLARG) ) ]

 	# binary response for the logistic regression to see
 	# whether that person donated or not
 	donation[ , DONATED := as.factor( ifelse(TARGDOL > 0, 'YES', 'NO') ) ]
 	return(donation)
 }
 donation <- preprocessing(donation, code, states)

 # train/test split
 # training and test sets in the ratio of 2:1 by putting every 
 # 3rd observation in the test set and all remaining observations in 
 # the training set
 test_index <- seq(3, nrow(donation), by = 3)
 X_train <- donation[-test_index]
 X_test  <- donation[test_index]

 # remove the obvious outliers from the training set
 X_train <- X_train[ TARGDOL != 1500, ]

 remove_correlations <- function(X_train, X_test) {
 	# identify multi-collinearity using variance inflation factor (vif)
 	# and remove them from both the train and test set;
 	# conventionally, a vif score larger than 10 should be removed, 
 	# after this procedure three columns including SLCOD2, CNCOD2, CNTRLIF
 	# and CNMON3 were removed

 	# assign a random response column to start the process
 	# and keep removing multi-collinearity until there is none
 	multi_cor_cols <- 'TARGDOL'
 	while( length(multi_cor_cols) > 0 ) {
 		model_lm <- lm(TARGDOL ~.-DONATED, data = X_train)
 		multi_cor <- car::vif(model_lm)[ , 'GVIF']
 		max_col <- names( which.max(multi_cor) )
 		
 		if( multi_cor[[max_col]] > 10 ) {
 			multi_cor_cols <- max_col
 			print(multi_cor_cols)
 			X_train[ , (multi_cor_cols) := NULL ]
 			X_test[  , (multi_cor_cols) := NULL ]
 		} else {
 			multi_cor_cols <- NULL
 		}
 	}	
 	removed_cor <- list(train = X_train, test = X_test)
 	return(removed_cor)
 }
 removed_cor <- remove_correlations(X_train, X_test)
 X_train <- removed_cor$train
 X_test <- removed_cor$test

 create_interactions <- function(X_train, X_test) {
 	# create interaction terms across all possible pairs of numeric columns;
 	# (numeric columns include class of integer and numeric)
 	# this includes creating, 2nd polynomial degree columns (squaring the column value)
 	num_cols <- !vapply( X_train, class, character(1) ) %in% 'factor'
 	num_name <- colnames(X_train)[num_cols]
 	num_name <- setdiff(num_name, 'TARGDOL')
 	num_len  <- length(num_name)	
 	
 	for(i in 1:num_len) {
 		col1 <- num_name[i]	
 		for(j in i:num_len) {
 			col2 <- num_name[j]
 			col_name <- paste0(col1, '_TIMES_', col2)
 			X_train[ , (col_name) := X_train[[col1]] * X_train[[col2]] ]
 			X_test[  , (col_name) := X_test[[col1]] * X_test[[col2]] ]
 		}
 	}
 	created_inter <- list(train = X_train, test = X_test)
 	return(created_inter)
 }

 # after creating interaction term, we ended up with a total of
 # 96 features. The reference level for the factor columns are
 # B for SEX, FarWest for region, A for CNCOD1, CNCOD3, SLCOD1 and SLCOD3
 created_inter <- create_interactions(X_train, X_test)
 X_train <- created_inter$train
 X_test <- created_inter$test

 # ---------------------------------------------------------------------------------
 # model training

 # set the cross validation number for all models
 nfolds <- 10

 # shuffle the row observations of the training data
 # for potential model's stability
 set.seed(1234)
 X_train <- X_train[ sample( nrow(X_train) ) ]

 # use all possible - 1 cores to train the lasso/ridge,
 # remember to stop the created cluster afterwards
 registerDoParallel(cores = detectCores() - 1)

 # specify parameters to search for: alpha = 1 is lasso, 0 is lasso 
 # and a fix range of lambda values
 glmnet_grid <- expand.grid( alpha = c(0, 1), 
 							lambda = c(0, 0.01, 0.05, 0.1, 0.5, 0.6) )

 # define training control (10-fold cross validation and MSE evaluation metric) 
 # and to use ROC to determine the best model for a binary output/reponse;
 # we need to add classProbs = TRUE and twoClassSummary in the trainControl
 control_class <- trainControl(method = 'cv', number = nfolds, classProbs = TRUE,
 							  summaryFunction = twoClassSummary)
 glmnet_class <- train(
 	DONATED ~.-TARGDOL, 
 	data = X_train,
 	method = 'glmnet',
 	metric = 'ROC',
 	trControl = control_class,
 	tuneGrid = glmnet_grid
 )

 # predict TARGDOL
 # remember to remove the columns that are related to
 # whether or not they made a donation (DONATED)
 X_train_donated <- X_train[ TARGDOL > 0, ]
 X_train_donated[ , DONATED := NULL ]

 # for the regression model, logging the response lead to a inferior result
 control_reg <- trainControl(method = 'cv', number = nfolds)
 glmnet_reg <- train(
 	TARGDOL ~., 
 	data = X_train_donated, 
 	method = 'glmnet', 
 	trControl = control_reg, 
 	tuneGrid = glmnet_grid
 )
 stopImplicitCluster()

 # multiply the probability of donating and the amount that
 # they will donate to get the expected value;
 # then obtain the top 1000 donators to see the actual amount
 get_expected_score <- function(X_test, y_pred_proba, y_pred) {
 	expected <- y_pred_proba * y_pred
 	sorted_index <- order(expected, decreasing = TRUE)[1:1000]
 	expected_score <- sum(X_test[ sorted_index, ][['TARGDOL']])
 	return(expected_score)
 }

 # lasso/ridge (for both classification and regression model)
 # obtaining a score around 10664.84
 y_pred <- predict(glmnet_reg, newdata = X_test)
 y_pred_proba <- predict(glmnet_class, newdata = X_test, type = 'prob')[['YES']]
 get_expected_score(X_test, y_pred_proba, y_pred)

 # 10698.73
 model_lm <- lm(TARGDOL ~., data = X_train_donated)
 y_pred <- predict(model_lm, newdata = X_test)
 get_expected_score(X_test, y_pred_proba, y_pred)

 # 10822
 model_rlm <- rlm(TARGDOL ~., data = X_train_donated, maxit = 50)
 y_pred <- predict(model_rlm, newdata = X_test)
 get_expected_score(X_test, y_pred_proba, y_pred)

 # what the optimal amount looks like
 # score around 23000
 setorder(X_test, -TARGDOL)
 sum( X_test[1:1000, ][['TARGDOL']] )


 # -----------------------------------------------------------------------------
 # advanced model training
 # only random forest will be explored
 library(h2o)
 h2o.init(nthreads = -1)

 # classification model (random forest)
 # specify the response and predictor columns
 # and the evaluation metric (auc)
 X_train_h2o <- as.h2o(X_train)
 X_test_h2o <- as.h2o(X_test)
 y <- 'DONATED'
 x <- setdiff( colnames(X_train), c('TARGDOL', y) )
 grid_id_rf <- 'rs_rf'
 stopping_metric <- 'AUC'

 # the random search criteria and number trees
 # is applicable for all tree models
 ntrees <- 300
 stopping_tolerance <- 0.05
 stopping_rounds <- 5
 search_criteria <- list(
 	strategy = 'RandomDiscrete', 
 	stopping_metric = stopping_metric, 
 	stopping_tolerance = stopping_tolerance, 
 	stopping_rounds = stopping_rounds
 )

 # random forest's random search hyperparameters
 # max depth and sample rate (row sample rate per tree)
 max_depth_opt <- 4:15
 sample_rate_opt <- seq(from = 0.8, to = 0.95, by = 0.05)
 rf_params_opt <- list(
 	max_depth = max_depth_opt, 
 	sample_rate = sample_rate_opt
 )

 rf_class <- h2o.grid(
 	algorithm = 'randomForest',
 	grid_id = grid_id_rf,
 	x = x, 
 	y = y,
 	training_frame = X_train_h2o,
 	nfolds = nfolds,
 	ntrees = ntrees,
 	balance_classes = TRUE,
 	stopping_metric = stopping_metric, 
 	stopping_tolerance = stopping_tolerance, 
 	stopping_rounds = stopping_rounds,
 	hyper_params = rf_params_opt,
 	search_criteria = search_criteria
 )

 # sort the model by the specified evaluation metric
 # and obtain the top one (the best model)
 rf_class_sorted <- h2o.getGrid(
 	grid_id = grid_id_rf, 
 	sort_by = 'auc', 
 	decreasing = TRUE
 )
 rf_class_best_model <- h2o.getModel(rf_class_sorted@model_ids[[1]])
 rf_class_y_pred_proba <- h2o.predict(rf_class_best_model, X_test_h2o)[['YES']] %>% 
 					  	 as.data.table()

 # stacking random forest and logistic regression for classification
 # and lasso/ridge for regression; the best stacking proportion is
 # found through trial and error
 # obtaining a score of 10,904
 y_pred <- predict(glmnet_reg, newdata = X_test)
 y_pred_proba_stack1 <- rf_class_y_pred_proba[['YES']] * 0.21 + y_pred_proba * 0.79
 get_expected_score(X_test, y_pred_proba_stack, y_pred)

 y_pred <- predict(model_lm, newdata = X_test)
 y_pred_proba_stack2 <- rf_class_y_pred_proba[['YES']] * 0.25 + y_pred_proba * 0.75
 get_expected_score(X_test, y_pred_proba_stack, y_pred)

 y_pred <- predict(model_rlm, newdata = X_test)
 y_pred_proba_stack3 <- rf_class_y_pred_proba[['YES']] * 0.3 + y_pred_proba * 0.77
 get_expected_score(X_test, y_pred_proba_stack, y_pred)

 dt_var_imp <- h2o.varimp(rf_class_best_model)[ , c('variable', 'percentage') ] %>%
 			  as.data.table()
 dt_var_imp

 h2o.shutdown(prompt = FALSE)

 # ---------------------------------------------------------------------------------
 # model validation

 # linear/robust linear regression's performance
 summary(model_lm)$r.squared
 summary(model_rlm)

 # accessing lasso/ridge's performance coefficients
 confusionMatrix(glmnet_class)
 glmnet_class$results
 glmnet_class$bestTune
 glmnet_reg$results
 glmnet_reg$bestTune

 # obtain auc score
 y_predictions <- list(y_pred_proba, y_pred_proba_stack1, 
 				   	  y_pred_proba_stack2, y_pred_proba_stack1)

 auc_scores <- lapply( y_predictions, function(y_proba) {
 	rocr_pred <- ROCR::prediction(y_proba, X_test$DONATED)
 	auc <- ROCR::performance(rocr_pred, 'auc')@y.values[[1]]
 	return(auc)
 })
 auc_scores

 # obtain lasso/ridge's estimated coefficients
 # we'll will have to select lambda value to get the optimal coefficients,
 # and then we only selected the ones that were not shrunk to 0 by the model
 coef_class <- coef(glmnet_class$finalModel, s = glmnet_class$bestTune$lambda) %>%
 			  as.matrix()
 coef_class <- data.table( names = row.names(coef_class), estimates = coef_class[, 1] )
 sum( coef_class[['estimates']] != 0 )
 coef_class <- coef_class[ estimates != 0, ]
 coef_class

 # regression's coefficients
 coef_reg <- coef(glmnet_reg$finalModel, s = glmnet_reg$bestTune$lambda) %>%
 			as.matrix()
 coef_reg <- data.table( names = row.names(coef_reg), estimates = coef_reg[, 1] )
 sum( coef_reg[['estimates']] != 0 )
 coef_reg <- coef_reg[ estimates != 0, ]
 coef_reg


 # ---------------------------------------------------------------------------------
 # visualization
 library(ggplot2)
 library(ggfortify)
 library(gridExtra)

 # remove the 0 contribution and the obvious outlier prior to visualization
 has_donated <- donation[ TARGDOL > 0 & TARGDOL < 1500, ]

 # distribution of TARGDOL
 donation_TARGDOL <- donation[ TARGDOL < 1500, .(TARGDOL, RemoveZero = 'KeptZero') ]
 has_donated_TARGDOL <- has_donated[ , .(TARGDOL, RemoveZero = 'RemovedZero') ]
 distribution_TARGDOL <- rbind(donation_TARGDOL, has_donated_TARGDOL)

 ggplot( distribution_TARGDOL, aes(TARGDOL, color = RemoveZero) ) +
 geom_density() + theme_bw() +
 labs(title = 'Distribution of TARGDOL') +
 guides(color = FALSE) +
 facet_grid(~ RemoveZero)
 ggsave('distribution_TARGDOL.png')

 # Percentage of Donors vs. Non-Donors
 dt_donated <- data.table( table(donation$DONATED) / nrow(donation) )
 setnames( dt_donated, c('donated', 'frequency') )

 ggplot( dt_donated, aes(donated, frequency, fill = donated) ) +
 geom_col() + theme_bw() + 
 labs(title = 'Percentage of Donors vs. Non-Donors')
 ggsave('donors_and_non_donors.png')

 # boxplot of the donated amount between the different 
 # solicitation and contribution code
 slcod1 <- ggplot(has_donated, aes(SLCOD1, TARGDOL, fill = SLCOD1) ) +
 		  geom_boxplot() + theme_bw() +
 		  labs(title = 'TARGDOL > 0 vs. SLCOD1') +
 		  ylim(0, 100)

 cncod1 <- ggplot(has_donated, aes(CNCOD1, TARGDOL, fill = CNCOD1) ) +
 		  geom_boxplot() + theme_bw() +
 		  labs(title = 'TARGDOL > 0 vs. CNCOD1') +
 		  ylim(0, 100)

 g <- arrangeGrob(slcod1, cncod1, nrow = 2)
 ggsave('donated_amount_sl_and_cn_code.png', g)

 # see if CNDOL1 (the latest contribution amount) if
 # correlated with TARGDOL
 ggplot(has_donated, aes(CNDOL1, log(TARGDOL) ) ) +
 geom_point() + theme_bw() +
 labs(x = 'Log(CNDOL1)', title = 'Log(TARGDOL > 0) vs. Log(CNDOL1)')
 ggsave('targdol_cndol.png')

 # Proportion of donors in each SLCOD1 category
 dt_SLCOD_prop <- data.table( table(has_donated$SLCOD1) / table(donation$SLCOD1) )
 setnames( dt_SLCOD_prop, c('SLCOD1', 'Proportion') )

 ggplot( dt_SLCOD_prop, aes(SLCOD1, Proportion, fill = SLCOD1) ) +
 geom_col() + theme_bw() +
 labs(y = 'Proportion of donors', 
 	 title = 'Proportion of donors in each SLCOD1 category')
 ggsave('donors_proportion_sl_code.png')


 # linear model's diagnostic plot
 png('lm_diagnostics.png', width = 700, height = 700)

 autoplot(model_lm, which = 1:6, ncol = 3, label.size = 3) +
 theme_bw()

 dev.off()
	library(MASS)
	library(ROCR)
	library(caret)
	library(glmnet)
	library(magrittr)
	library(data.table)
	library(doParallel)
	setwd('/Users/ethen/Desktop/predictive-project')


	# ---------------------------------------------------------------------------------
	# preprocessing
	path_donation <- 'donation data.csv'
	path_code <- 'dmef1code.csv'
	path_state_mapping <- 'states.csv'

	# main dataset, remove the id column
	donation <- fread(path_donation, stringsAsFactors = TRUE)
	donation[ , ID := NULL ]

	# delete the last row, since it has a questionable '*' CODETYPE
	code <- fread(path_code, stringsAsFactors = TRUE)
	code <- code[-nrow(code), ]
	code[ , CODETYPE := factor( CODETYPE, levels = c('A', 'B', 'C', 'D', 'M') ) ]

	# table that maps the states into regions
	# https://drive.google.com/open?id=0B5-Q31Rgtkixb0t3MkNTNXI2djA
	states <- fread(path_state_mapping, stringsAsFactors = TRUE)

	preprocessing <- function(donation, code, states) {
	# general preprocessing steps for the donation data

	# for people that have donated only once, their CNDOL2 and
	# CNCOD3 will be missing (it indicates the previous donation),
	# the same goes for SLCOD2 and SLCOD3;
	# match CNCOD (Latest Contribution Code),
	# SLCOD (Latest Solicitation Code) to the five types in the code file
	cols_mapping <- c('CNCOD1', 'CNCOD2', 'CNCOD3',
	'SLCOD1', 'SLCOD2', 'SLCOD3')

	donation[ , (cols_mapping) := lapply( .SD, function(col) {
	code$CODETYPE[ match(col, code$CODE) ]
	}), .SDcols = cols_mapping ]

	# for the CNCOD and SLCOD replace the one
	# that has NA value, with a 'None' category
	for(j in cols_mapping) {
	na_rows <- which( is.na(donation[[j]]) )
	set(donation, i = na_rows, j = j, value = 'None')
	}

	# drop all the CNDAT (Latest Contribution Date)
	# because we can use CNMON (months since latest
	# contribution) instead
	cols_drop <- c('CNDAT1', 'CNDAT2', 'CNDAT3')
	donation[ , (cols_drop) := NULL ]

	# if he/she donated once, then the CNDOL2
	# (2nd Latest Contribution) should be 0
	# the same applies for CNDOL3 (3rd Latest Contribution)
	donation[ CNTMLIF == 1, CNDOL2 := 0 ]
	donation[ CNTMLIF == 1, CNDOL3 := 0 ]
	donation[ CNTMLIF == 2, CNDOL3 := 0 ]

	# merge state into regions
	# 1. New england: Connecticut, Maine, Massachusetts, New Hampshire, Rhode Island and Vermont
	# 2. Mideast: Delaware, District of Columbia, Maryland, New Jersey, New York, and Pennsylvania
	# 3. Great Lakes: Illinois, Indiana, Michigan, Ohio, and Wisconsin
	# 4. Plains: Iowa, Kansas, Minnesota, Missouri, Nebraska, North Dakota, and South Dakota
	# 5. Southeast: Alabama, Arkansas, Florida, Georgia, Kentucky, Louisiana, Mississippi,
	# North Carolina, South Carolina, Tennessee, Virginia, and West Virginia
	# 6. Southwest: Arizona, New Mexico, Oklahoma, and Texas
	# 7. Rocky Mountain: Colorado, Idaho, Montana, Utah, and Wyoming
	# 8. Far West: Alaska, California, Hawaii, Nevada, Oregon, and Washington
	donation[ , region := states$region[ match(STATCODE, states$statecode) ] ]
	donation[ , STATCODE := NULL ]
	donation[ is.na(region), region := 'Other' ]

	# if both CNMON2 and CNMON3 are both missing, meaning that
	# they have only donated once, fill it with CNMON1; the same idea
	# can be extended to CNMON3
	donated_once <- with( donation, is.na(CNMON2) & is.na(CNMON3) )
	donation[ donated_once, CNMON2 := CNMON1 ]
	donation[ donated_once, CNMON3 := CNMON1 ]
	donated_twice <- is.na(donation$CNMON3)
	donation[ donated_twice, CNMON3 := CNMON2 ]

	# outliers exists amongst these columns (there are
	# people that donated a lot more than others); log
	# these columns to make the numbers more normally distributed
	donation[ , `:=`( CNDOL1 = log(CNDOL1),
	CNTRLIF = log(CNTRLIF),
	CONLARG = log(CONLARG) ) ]

	# binary response for the logistic regression to see
	# whether that person donated or not
	donation[ , DONATED := as.factor( ifelse(TARGDOL > 0, 'YES', 'NO') ) ]
	return(donation)
	}
	donation <- preprocessing(donation, code, states)

	# train/test split
	# training and test sets in the ratio of 2:1 by putting every
	# 3rd observation in the test set and all remaining observations in
	# the training set
	test_index <- seq(3, nrow(donation), by = 3)
	X_train <- donation[-test_index]
	X_test <- donation[test_index]

	# remove the obvious outliers from the training set
	X_train <- X_train[ TARGDOL != 1500, ]

	remove_correlations <- function(X_train, X_test) {
	# identify multi-collinearity using variance inflation factor (vif)
	# and remove them from both the train and test set;
	# conventionally, a vif score larger than 10 should be removed,
	# after this procedure three columns including SLCOD2, CNCOD2, CNTRLIF
	# and CNMON3 were removed

	# assign a random response column to start the process
	# and keep removing multi-collinearity until there is none
	multi_cor_cols <- 'TARGDOL'
	while( length(multi_cor_cols) > 0 ) {
	model_lm <- lm(TARGDOL ~.-DONATED, data = X_train)
	multi_cor <- car::vif(model_lm)[ , 'GVIF']
	max_col <- names( which.max(multi_cor) )

	if( multi_cor[[max_col]] > 10 ) {
	multi_cor_cols <- max_col
	print(multi_cor_cols)
	X_train[ , (multi_cor_cols) := NULL ]
	X_test[ , (multi_cor_cols) := NULL ]
	} else {
	multi_cor_cols <- NULL
	}
	}
	removed_cor <- list(train = X_train, test = X_test)
	return(removed_cor)
	}
	removed_cor <- remove_correlations(X_train, X_test)
	X_train <- removed_cor$train
	X_test <- removed_cor$test

	create_interactions <- function(X_train, X_test) {
	# create interaction terms across all possible pairs of numeric columns;
	# (numeric columns include class of integer and numeric)
	# this includes creating, 2nd polynomial degree columns (squaring the column value)
	num_cols <- !vapply( X_train, class, character(1) ) %in% 'factor'
	num_name <- colnames(X_train)[num_cols]
	num_name <- setdiff(num_name, 'TARGDOL')
	num_len <- length(num_name)

	for(i in 1:num_len) {
	col1 <- num_name[i]
	for(j in i:num_len) {
	col2 <- num_name[j]
	col_name <- paste0(col1, '_TIMES_', col2)
	X_train[ , (col_name) := X_train[[col1]] * X_train[[col2]] ]
	X_test[ , (col_name) := X_test[[col1]] * X_test[[col2]] ]
	}
	}
	created_inter <- list(train = X_train, test = X_test)
	return(created_inter)
	}

	# after creating interaction term, we ended up with a total of
	# 96 features. The reference level for the factor columns are
	# B for SEX, FarWest for region, A for CNCOD1, CNCOD3, SLCOD1 and SLCOD3
	created_inter <- create_interactions(X_train, X_test)
	X_train <- created_inter$train
	X_test <- created_inter$test

	# ---------------------------------------------------------------------------------
	# model training

	# set the cross validation number for all models
	nfolds <- 10

	# shuffle the row observations of the training data
	# for potential model's stability
	set.seed(1234)
	X_train <- X_train[ sample( nrow(X_train) ) ]

	# use all possible - 1 cores to train the lasso/ridge,
	# remember to stop the created cluster afterwards
	registerDoParallel(cores = detectCores() - 1)

	# specify parameters to search for: alpha = 1 is lasso, 0 is lasso
	# and a fix range of lambda values
	glmnet_grid <- expand.grid( alpha = c(0, 1),
	lambda = c(0, 0.01, 0.05, 0.1, 0.5, 0.6) )

	# define training control (10-fold cross validation and MSE evaluation metric)
	# and to use ROC to determine the best model for a binary output/reponse;
	# we need to add classProbs = TRUE and twoClassSummary in the trainControl
	control_class <- trainControl(method = 'cv', number = nfolds, classProbs = TRUE,
	summaryFunction = twoClassSummary)
	glmnet_class <- train(
	DONATED ~.-TARGDOL,
	data = X_train,
	method = 'glmnet',
	metric = 'ROC',
	trControl = control_class,
	tuneGrid = glmnet_grid
	)

	# predict TARGDOL
	# remember to remove the columns that are related to
	# whether or not they made a donation (DONATED)
	X_train_donated <- X_train[ TARGDOL > 0, ]
	X_train_donated[ , DONATED := NULL ]

	# for the regression model, logging the response lead to a inferior result
	control_reg <- trainControl(method = 'cv', number = nfolds)
	glmnet_reg <- train(
	TARGDOL ~.,
	data = X_train_donated,
	method = 'glmnet',
	trControl = control_reg,
	tuneGrid = glmnet_grid
	)
	stopImplicitCluster()

	# multiply the probability of donating and the amount that
	# they will donate to get the expected value;
	# then obtain the top 1000 donators to see the actual amount
	get_expected_score <- function(X_test, y_pred_proba, y_pred) {
	expected <- y_pred_proba * y_pred
	sorted_index <- order(expected, decreasing = TRUE)[1:1000]
	expected_score <- sum(X_test[ sorted_index, ][['TARGDOL']])
	return(expected_score)
	}

	# lasso/ridge (for both classification and regression model)
	# obtaining a score around 10664.84
	y_pred <- predict(glmnet_reg, newdata = X_test)
	y_pred_proba <- predict(glmnet_class, newdata = X_test, type = 'prob')[['YES']]
	get_expected_score(X_test, y_pred_proba, y_pred)

	# 10698.73
	model_lm <- lm(TARGDOL ~., data = X_train_donated)
	y_pred <- predict(model_lm, newdata = X_test)
	get_expected_score(X_test, y_pred_proba, y_pred)

	# 10822
	model_rlm <- rlm(TARGDOL ~., data = X_train_donated, maxit = 50)
	y_pred <- predict(model_rlm, newdata = X_test)
	get_expected_score(X_test, y_pred_proba, y_pred)

	# what the optimal amount looks like
	# score around 23000
	setorder(X_test, -TARGDOL)
	sum( X_test[1:1000, ][['TARGDOL']] )


	# -----------------------------------------------------------------------------
	# advanced model training
	# only random forest will be explored
	library(h2o)
	h2o.init(nthreads = -1)

	# classification model (random forest)
	# specify the response and predictor columns
	# and the evaluation metric (auc)
	X_train_h2o <- as.h2o(X_train)
	X_test_h2o <- as.h2o(X_test)
	y <- 'DONATED'
	x <- setdiff( colnames(X_train), c('TARGDOL', y) )
	grid_id_rf <- 'rs_rf'
	stopping_metric <- 'AUC'

	# the random search criteria and number trees
	# is applicable for all tree models
	ntrees <- 300
	stopping_tolerance <- 0.05
	stopping_rounds <- 5
	search_criteria <- list(
	strategy = 'RandomDiscrete',
	stopping_metric = stopping_metric,
	stopping_tolerance = stopping_tolerance,
	stopping_rounds = stopping_rounds
	)

	# random forest's random search hyperparameters
	# max depth and sample rate (row sample rate per tree)
	max_depth_opt <- 4:15
	sample_rate_opt <- seq(from = 0.8, to = 0.95, by = 0.05)
	rf_params_opt <- list(
	max_depth = max_depth_opt,
	sample_rate = sample_rate_opt
	)

	rf_class <- h2o.grid(
	algorithm = 'randomForest',
	grid_id = grid_id_rf,
	x = x,
	y = y,
	training_frame = X_train_h2o,
	nfolds = nfolds,
	ntrees = ntrees,
	balance_classes = TRUE,
	stopping_metric = stopping_metric,
	stopping_tolerance = stopping_tolerance,
	stopping_rounds = stopping_rounds,
	hyper_params = rf_params_opt,
	search_criteria = search_criteria
	)

	# sort the model by the specified evaluation metric
	# and obtain the top one (the best model)
	rf_class_sorted <- h2o.getGrid(
	grid_id = grid_id_rf,
	sort_by = 'auc',
	decreasing = TRUE
	)
	rf_class_best_model <- h2o.getModel(rf_class_sorted@model_ids[[1]])
	rf_class_y_pred_proba <- h2o.predict(rf_class_best_model, X_test_h2o)[['YES']] %>%
	as.data.table()

	# stacking random forest and logistic regression for classification
	# and lasso/ridge for regression; the best stacking proportion is
	# found through trial and error
	# obtaining a score of 10,904
	y_pred <- predict(glmnet_reg, newdata = X_test)
	y_pred_proba_stack1 <- rf_class_y_pred_proba[['YES']] * 0.21 + y_pred_proba * 0.79
	get_expected_score(X_test, y_pred_proba_stack, y_pred)

	y_pred <- predict(model_lm, newdata = X_test)
	y_pred_proba_stack2 <- rf_class_y_pred_proba[['YES']] * 0.25 + y_pred_proba * 0.75
	get_expected_score(X_test, y_pred_proba_stack, y_pred)

	y_pred <- predict(model_rlm, newdata = X_test)
	y_pred_proba_stack3 <- rf_class_y_pred_proba[['YES']] * 0.3 + y_pred_proba * 0.77
	get_expected_score(X_test, y_pred_proba_stack, y_pred)

	dt_var_imp <- h2o.varimp(rf_class_best_model)[ , c('variable', 'percentage') ] %>%
	as.data.table()
	dt_var_imp

	h2o.shutdown(prompt = FALSE)

	# ---------------------------------------------------------------------------------
	# model validation

	# linear/robust linear regression's performance
	summary(model_lm)$r.squared
	summary(model_rlm)

	# accessing lasso/ridge's performance coefficients
	confusionMatrix(glmnet_class)
	glmnet_class$results
	glmnet_class$bestTune
	glmnet_reg$results
	glmnet_reg$bestTune

	# obtain auc score
	y_predictions <- list(y_pred_proba, y_pred_proba_stack1,
	y_pred_proba_stack2, y_pred_proba_stack1)

	auc_scores <- lapply( y_predictions, function(y_proba) {
	rocr_pred <- ROCR::prediction(y_proba, X_test$DONATED)
	auc <- ROCR::performance(rocr_pred, 'auc')@y.values[[1]]
	return(auc)
	})
	auc_scores

	# obtain lasso/ridge's estimated coefficients
	# we'll will have to select lambda value to get the optimal coefficients,
	# and then we only selected the ones that were not shrunk to 0 by the model
	coef_class <- coef(glmnet_class$finalModel, s = glmnet_class$bestTune$lambda) %>%
	as.matrix()
	coef_class <- data.table( names = row.names(coef_class), estimates = coef_class[, 1] )
	sum( coef_class[['estimates']] != 0 )
	coef_class <- coef_class[ estimates != 0, ]
	coef_class

	# regression's coefficients
	coef_reg <- coef(glmnet_reg$finalModel, s = glmnet_reg$bestTune$lambda) %>%
	as.matrix()
	coef_reg <- data.table( names = row.names(coef_reg), estimates = coef_reg[, 1] )
	sum( coef_reg[['estimates']] != 0 )
	coef_reg <- coef_reg[ estimates != 0, ]
	coef_reg


	# ---------------------------------------------------------------------------------
	# visualization
	library(ggplot2)
	library(ggfortify)
	library(gridExtra)

	# remove the 0 contribution and the obvious outlier prior to visualization
	has_donated <- donation[ TARGDOL > 0 & TARGDOL < 1500, ]

	# distribution of TARGDOL
	donation_TARGDOL <- donation[ TARGDOL < 1500, .(TARGDOL, RemoveZero = 'KeptZero') ]
	has_donated_TARGDOL <- has_donated[ , .(TARGDOL, RemoveZero = 'RemovedZero') ]
	distribution_TARGDOL <- rbind(donation_TARGDOL, has_donated_TARGDOL)

	ggplot( distribution_TARGDOL, aes(TARGDOL, color = RemoveZero) ) +
	geom_density() + theme_bw() +
	labs(title = 'Distribution of TARGDOL') +
	guides(color = FALSE) +
	facet_grid(~ RemoveZero)
	ggsave('distribution_TARGDOL.png')

	# Percentage of Donors vs. Non-Donors
	dt_donated <- data.table( table(donation$DONATED) / nrow(donation) )
	setnames( dt_donated, c('donated', 'frequency') )

	ggplot( dt_donated, aes(donated, frequency, fill = donated) ) +
	geom_col() + theme_bw() +
	labs(title = 'Percentage of Donors vs. Non-Donors')
	ggsave('donors_and_non_donors.png')

	# boxplot of the donated amount between the different
	# solicitation and contribution code
	slcod1 <- ggplot(has_donated, aes(SLCOD1, TARGDOL, fill = SLCOD1) ) +
	geom_boxplot() + theme_bw() +
	labs(title = 'TARGDOL > 0 vs. SLCOD1') +
	ylim(0, 100)

	cncod1 <- ggplot(has_donated, aes(CNCOD1, TARGDOL, fill = CNCOD1) ) +
	geom_boxplot() + theme_bw() +
	labs(title = 'TARGDOL > 0 vs. CNCOD1') +
	ylim(0, 100)

	g <- arrangeGrob(slcod1, cncod1, nrow = 2)
	ggsave('donated_amount_sl_and_cn_code.png', g)

	# see if CNDOL1 (the latest contribution amount) if
	# correlated with TARGDOL
	ggplot(has_donated, aes(CNDOL1, log(TARGDOL) ) ) +
	geom_point() + theme_bw() +
	labs(x = 'Log(CNDOL1)', title = 'Log(TARGDOL > 0) vs. Log(CNDOL1)')
	ggsave('targdol_cndol.png')

	# Proportion of donors in each SLCOD1 category
	dt_SLCOD_prop <- data.table( table(has_donated$SLCOD1) / table(donation$SLCOD1) )
	setnames( dt_SLCOD_prop, c('SLCOD1', 'Proportion') )

	ggplot( dt_SLCOD_prop, aes(SLCOD1, Proportion, fill = SLCOD1) ) +
	geom_col() + theme_bw() +
	labs(y = 'Proportion of donors',
	title = 'Proportion of donors in each SLCOD1 category')
	ggsave('donors_proportion_sl_code.png')


	# linear model's diagnostic plot
	png('lm_diagnostics.png', width = 700, height = 700)

	autoplot(model_lm, which = 1:6, ncol = 3, label.size = 3) +
	theme_bw()

	dev.off()