Last active
December 8, 2016 16:28
-
-
Save ethen8181/fb46e587413eec79f27d05d2e7d54fcf to your computer and use it in GitHub Desktop.
test
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(MASS) | |
library(ROCR) | |
library(caret) | |
library(glmnet) | |
library(magrittr) | |
library(data.table) | |
library(doParallel) | |
setwd('/Users/ethen/Desktop/predictive-project') | |
# --------------------------------------------------------------------------------- | |
# preprocessing | |
path_donation <- 'donation data.csv' | |
path_code <- 'dmef1code.csv' | |
path_state_mapping <- 'states.csv' | |
# main dataset, remove the id column | |
donation <- fread(path_donation, stringsAsFactors = TRUE) | |
donation[ , ID := NULL ] | |
# delete the last row, since it has a questionable '*' CODETYPE | |
code <- fread(path_code, stringsAsFactors = TRUE) | |
code <- code[-nrow(code), ] | |
code[ , CODETYPE := factor( CODETYPE, levels = c('A', 'B', 'C', 'D', 'M') ) ] | |
# table that maps the states into regions | |
# https://drive.google.com/open?id=0B5-Q31Rgtkixb0t3MkNTNXI2djA | |
states <- fread(path_state_mapping, stringsAsFactors = TRUE) | |
preprocessing <- function(donation, code, states) { | |
# general preprocessing steps for the donation data | |
# for people that have donated only once, their CNDOL2 and | |
# CNCOD3 will be missing (it indicates the previous donation), | |
# the same goes for SLCOD2 and SLCOD3; | |
# match CNCOD (Latest Contribution Code), | |
# SLCOD (Latest Solicitation Code) to the five types in the code file | |
cols_mapping <- c('CNCOD1', 'CNCOD2', 'CNCOD3', | |
'SLCOD1', 'SLCOD2', 'SLCOD3') | |
donation[ , (cols_mapping) := lapply( .SD, function(col) { | |
code$CODETYPE[ match(col, code$CODE) ] | |
}), .SDcols = cols_mapping ] | |
# for the CNCOD and SLCOD replace the one | |
# that has NA value, with a 'None' category | |
for(j in cols_mapping) { | |
na_rows <- which( is.na(donation[[j]]) ) | |
set(donation, i = na_rows, j = j, value = 'None') | |
} | |
# drop all the CNDAT (Latest Contribution Date) | |
# because we can use CNMON (months since latest | |
# contribution) instead | |
cols_drop <- c('CNDAT1', 'CNDAT2', 'CNDAT3') | |
donation[ , (cols_drop) := NULL ] | |
# if he/she donated once, then the CNDOL2 | |
# (2nd Latest Contribution) should be 0 | |
# the same applies for CNDOL3 (3rd Latest Contribution) | |
donation[ CNTMLIF == 1, CNDOL2 := 0 ] | |
donation[ CNTMLIF == 1, CNDOL3 := 0 ] | |
donation[ CNTMLIF == 2, CNDOL3 := 0 ] | |
# merge state into regions | |
# 1. New england: Connecticut, Maine, Massachusetts, New Hampshire, Rhode Island and Vermont | |
# 2. Mideast: Delaware, District of Columbia, Maryland, New Jersey, New York, and Pennsylvania | |
# 3. Great Lakes: Illinois, Indiana, Michigan, Ohio, and Wisconsin | |
# 4. Plains: Iowa, Kansas, Minnesota, Missouri, Nebraska, North Dakota, and South Dakota | |
# 5. Southeast: Alabama, Arkansas, Florida, Georgia, Kentucky, Louisiana, Mississippi, | |
# North Carolina, South Carolina, Tennessee, Virginia, and West Virginia | |
# 6. Southwest: Arizona, New Mexico, Oklahoma, and Texas | |
# 7. Rocky Mountain: Colorado, Idaho, Montana, Utah, and Wyoming | |
# 8. Far West: Alaska, California, Hawaii, Nevada, Oregon, and Washington | |
donation[ , region := states$region[ match(STATCODE, states$statecode) ] ] | |
donation[ , STATCODE := NULL ] | |
donation[ is.na(region), region := 'Other' ] | |
# if both CNMON2 and CNMON3 are both missing, meaning that | |
# they have only donated once, fill it with CNMON1; the same idea | |
# can be extended to CNMON3 | |
donated_once <- with( donation, is.na(CNMON2) & is.na(CNMON3) ) | |
donation[ donated_once, CNMON2 := CNMON1 ] | |
donation[ donated_once, CNMON3 := CNMON1 ] | |
donated_twice <- is.na(donation$CNMON3) | |
donation[ donated_twice, CNMON3 := CNMON2 ] | |
# outliers exists amongst these columns (there are | |
# people that donated a lot more than others); log | |
# these columns to make the numbers more normally distributed | |
donation[ , `:=`( CNDOL1 = log(CNDOL1), | |
CNTRLIF = log(CNTRLIF), | |
CONLARG = log(CONLARG) ) ] | |
# binary response for the logistic regression to see | |
# whether that person donated or not | |
donation[ , DONATED := as.factor( ifelse(TARGDOL > 0, 'YES', 'NO') ) ] | |
return(donation) | |
} | |
donation <- preprocessing(donation, code, states) | |
# train/test split | |
# training and test sets in the ratio of 2:1 by putting every | |
# 3rd observation in the test set and all remaining observations in | |
# the training set | |
test_index <- seq(3, nrow(donation), by = 3) | |
X_train <- donation[-test_index] | |
X_test <- donation[test_index] | |
# remove the obvious outliers from the training set | |
X_train <- X_train[ TARGDOL != 1500, ] | |
remove_correlations <- function(X_train, X_test) { | |
# identify multi-collinearity using variance inflation factor (vif) | |
# and remove them from both the train and test set; | |
# conventionally, a vif score larger than 10 should be removed, | |
# after this procedure three columns including SLCOD2, CNCOD2, CNTRLIF | |
# and CNMON3 were removed | |
# assign a random response column to start the process | |
# and keep removing multi-collinearity until there is none | |
multi_cor_cols <- 'TARGDOL' | |
while( length(multi_cor_cols) > 0 ) { | |
model_lm <- lm(TARGDOL ~.-DONATED, data = X_train) | |
multi_cor <- car::vif(model_lm)[ , 'GVIF'] | |
max_col <- names( which.max(multi_cor) ) | |
if( multi_cor[[max_col]] > 10 ) { | |
multi_cor_cols <- max_col | |
print(multi_cor_cols) | |
X_train[ , (multi_cor_cols) := NULL ] | |
X_test[ , (multi_cor_cols) := NULL ] | |
} else { | |
multi_cor_cols <- NULL | |
} | |
} | |
removed_cor <- list(train = X_train, test = X_test) | |
return(removed_cor) | |
} | |
removed_cor <- remove_correlations(X_train, X_test) | |
X_train <- removed_cor$train | |
X_test <- removed_cor$test | |
create_interactions <- function(X_train, X_test) { | |
# create interaction terms across all possible pairs of numeric columns; | |
# (numeric columns include class of integer and numeric) | |
# this includes creating, 2nd polynomial degree columns (squaring the column value) | |
num_cols <- !vapply( X_train, class, character(1) ) %in% 'factor' | |
num_name <- colnames(X_train)[num_cols] | |
num_name <- setdiff(num_name, 'TARGDOL') | |
num_len <- length(num_name) | |
for(i in 1:num_len) { | |
col1 <- num_name[i] | |
for(j in i:num_len) { | |
col2 <- num_name[j] | |
col_name <- paste0(col1, '_TIMES_', col2) | |
X_train[ , (col_name) := X_train[[col1]] * X_train[[col2]] ] | |
X_test[ , (col_name) := X_test[[col1]] * X_test[[col2]] ] | |
} | |
} | |
created_inter <- list(train = X_train, test = X_test) | |
return(created_inter) | |
} | |
# after creating interaction term, we ended up with a total of | |
# 96 features. The reference level for the factor columns are | |
# B for SEX, FarWest for region, A for CNCOD1, CNCOD3, SLCOD1 and SLCOD3 | |
created_inter <- create_interactions(X_train, X_test) | |
X_train <- created_inter$train | |
X_test <- created_inter$test | |
# --------------------------------------------------------------------------------- | |
# model training | |
# set the cross validation number for all models | |
nfolds <- 10 | |
# shuffle the row observations of the training data | |
# for potential model's stability | |
set.seed(1234) | |
X_train <- X_train[ sample( nrow(X_train) ) ] | |
# use all possible - 1 cores to train the lasso/ridge, | |
# remember to stop the created cluster afterwards | |
registerDoParallel(cores = detectCores() - 1) | |
# specify parameters to search for: alpha = 1 is lasso, 0 is lasso | |
# and a fix range of lambda values | |
glmnet_grid <- expand.grid( alpha = c(0, 1), | |
lambda = c(0, 0.01, 0.05, 0.1, 0.5, 0.6) ) | |
# define training control (10-fold cross validation and MSE evaluation metric) | |
# and to use ROC to determine the best model for a binary output/reponse; | |
# we need to add classProbs = TRUE and twoClassSummary in the trainControl | |
control_class <- trainControl(method = 'cv', number = nfolds, classProbs = TRUE, | |
summaryFunction = twoClassSummary) | |
glmnet_class <- train( | |
DONATED ~.-TARGDOL, | |
data = X_train, | |
method = 'glmnet', | |
metric = 'ROC', | |
trControl = control_class, | |
tuneGrid = glmnet_grid | |
) | |
# predict TARGDOL | |
# remember to remove the columns that are related to | |
# whether or not they made a donation (DONATED) | |
X_train_donated <- X_train[ TARGDOL > 0, ] | |
X_train_donated[ , DONATED := NULL ] | |
# for the regression model, logging the response lead to a inferior result | |
control_reg <- trainControl(method = 'cv', number = nfolds) | |
glmnet_reg <- train( | |
TARGDOL ~., | |
data = X_train_donated, | |
method = 'glmnet', | |
trControl = control_reg, | |
tuneGrid = glmnet_grid | |
) | |
stopImplicitCluster() | |
# multiply the probability of donating and the amount that | |
# they will donate to get the expected value; | |
# then obtain the top 1000 donators to see the actual amount | |
get_expected_score <- function(X_test, y_pred_proba, y_pred) { | |
expected <- y_pred_proba * y_pred | |
sorted_index <- order(expected, decreasing = TRUE)[1:1000] | |
expected_score <- sum(X_test[ sorted_index, ][['TARGDOL']]) | |
return(expected_score) | |
} | |
# lasso/ridge (for both classification and regression model) | |
# obtaining a score around 10664.84 | |
y_pred <- predict(glmnet_reg, newdata = X_test) | |
y_pred_proba <- predict(glmnet_class, newdata = X_test, type = 'prob')[['YES']] | |
get_expected_score(X_test, y_pred_proba, y_pred) | |
# 10698.73 | |
model_lm <- lm(TARGDOL ~., data = X_train_donated) | |
y_pred <- predict(model_lm, newdata = X_test) | |
get_expected_score(X_test, y_pred_proba, y_pred) | |
# 10822 | |
model_rlm <- rlm(TARGDOL ~., data = X_train_donated, maxit = 50) | |
y_pred <- predict(model_rlm, newdata = X_test) | |
get_expected_score(X_test, y_pred_proba, y_pred) | |
# what the optimal amount looks like | |
# score around 23000 | |
setorder(X_test, -TARGDOL) | |
sum( X_test[1:1000, ][['TARGDOL']] ) | |
# ----------------------------------------------------------------------------- | |
# advanced model training | |
# only random forest will be explored | |
library(h2o) | |
h2o.init(nthreads = -1) | |
# classification model (random forest) | |
# specify the response and predictor columns | |
# and the evaluation metric (auc) | |
X_train_h2o <- as.h2o(X_train) | |
X_test_h2o <- as.h2o(X_test) | |
y <- 'DONATED' | |
x <- setdiff( colnames(X_train), c('TARGDOL', y) ) | |
grid_id_rf <- 'rs_rf' | |
stopping_metric <- 'AUC' | |
# the random search criteria and number trees | |
# is applicable for all tree models | |
ntrees <- 300 | |
stopping_tolerance <- 0.05 | |
stopping_rounds <- 5 | |
search_criteria <- list( | |
strategy = 'RandomDiscrete', | |
stopping_metric = stopping_metric, | |
stopping_tolerance = stopping_tolerance, | |
stopping_rounds = stopping_rounds | |
) | |
# random forest's random search hyperparameters | |
# max depth and sample rate (row sample rate per tree) | |
max_depth_opt <- 4:15 | |
sample_rate_opt <- seq(from = 0.8, to = 0.95, by = 0.05) | |
rf_params_opt <- list( | |
max_depth = max_depth_opt, | |
sample_rate = sample_rate_opt | |
) | |
rf_class <- h2o.grid( | |
algorithm = 'randomForest', | |
grid_id = grid_id_rf, | |
x = x, | |
y = y, | |
training_frame = X_train_h2o, | |
nfolds = nfolds, | |
ntrees = ntrees, | |
balance_classes = TRUE, | |
stopping_metric = stopping_metric, | |
stopping_tolerance = stopping_tolerance, | |
stopping_rounds = stopping_rounds, | |
hyper_params = rf_params_opt, | |
search_criteria = search_criteria | |
) | |
# sort the model by the specified evaluation metric | |
# and obtain the top one (the best model) | |
rf_class_sorted <- h2o.getGrid( | |
grid_id = grid_id_rf, | |
sort_by = 'auc', | |
decreasing = TRUE | |
) | |
rf_class_best_model <- h2o.getModel(rf_class_sorted@model_ids[[1]]) | |
rf_class_y_pred_proba <- h2o.predict(rf_class_best_model, X_test_h2o)[['YES']] %>% | |
as.data.table() | |
# stacking random forest and logistic regression for classification | |
# and lasso/ridge for regression; the best stacking proportion is | |
# found through trial and error | |
# obtaining a score of 10,904 | |
y_pred <- predict(glmnet_reg, newdata = X_test) | |
y_pred_proba_stack1 <- rf_class_y_pred_proba[['YES']] * 0.21 + y_pred_proba * 0.79 | |
get_expected_score(X_test, y_pred_proba_stack, y_pred) | |
y_pred <- predict(model_lm, newdata = X_test) | |
y_pred_proba_stack2 <- rf_class_y_pred_proba[['YES']] * 0.25 + y_pred_proba * 0.75 | |
get_expected_score(X_test, y_pred_proba_stack, y_pred) | |
y_pred <- predict(model_rlm, newdata = X_test) | |
y_pred_proba_stack3 <- rf_class_y_pred_proba[['YES']] * 0.3 + y_pred_proba * 0.77 | |
get_expected_score(X_test, y_pred_proba_stack, y_pred) | |
dt_var_imp <- h2o.varimp(rf_class_best_model)[ , c('variable', 'percentage') ] %>% | |
as.data.table() | |
dt_var_imp | |
h2o.shutdown(prompt = FALSE) | |
# --------------------------------------------------------------------------------- | |
# model validation | |
# linear/robust linear regression's performance | |
summary(model_lm)$r.squared | |
summary(model_rlm) | |
# accessing lasso/ridge's performance coefficients | |
confusionMatrix(glmnet_class) | |
glmnet_class$results | |
glmnet_class$bestTune | |
glmnet_reg$results | |
glmnet_reg$bestTune | |
# obtain auc score | |
y_predictions <- list(y_pred_proba, y_pred_proba_stack1, | |
y_pred_proba_stack2, y_pred_proba_stack1) | |
auc_scores <- lapply( y_predictions, function(y_proba) { | |
rocr_pred <- ROCR::prediction(y_proba, X_test$DONATED) | |
auc <- ROCR::performance(rocr_pred, 'auc')@y.values[[1]] | |
return(auc) | |
}) | |
auc_scores | |
# obtain lasso/ridge's estimated coefficients | |
# we'll will have to select lambda value to get the optimal coefficients, | |
# and then we only selected the ones that were not shrunk to 0 by the model | |
coef_class <- coef(glmnet_class$finalModel, s = glmnet_class$bestTune$lambda) %>% | |
as.matrix() | |
coef_class <- data.table( names = row.names(coef_class), estimates = coef_class[, 1] ) | |
sum( coef_class[['estimates']] != 0 ) | |
coef_class <- coef_class[ estimates != 0, ] | |
coef_class | |
# regression's coefficients | |
coef_reg <- coef(glmnet_reg$finalModel, s = glmnet_reg$bestTune$lambda) %>% | |
as.matrix() | |
coef_reg <- data.table( names = row.names(coef_reg), estimates = coef_reg[, 1] ) | |
sum( coef_reg[['estimates']] != 0 ) | |
coef_reg <- coef_reg[ estimates != 0, ] | |
coef_reg | |
# --------------------------------------------------------------------------------- | |
# visualization | |
library(ggplot2) | |
library(ggfortify) | |
library(gridExtra) | |
# remove the 0 contribution and the obvious outlier prior to visualization | |
has_donated <- donation[ TARGDOL > 0 & TARGDOL < 1500, ] | |
# distribution of TARGDOL | |
donation_TARGDOL <- donation[ TARGDOL < 1500, .(TARGDOL, RemoveZero = 'KeptZero') ] | |
has_donated_TARGDOL <- has_donated[ , .(TARGDOL, RemoveZero = 'RemovedZero') ] | |
distribution_TARGDOL <- rbind(donation_TARGDOL, has_donated_TARGDOL) | |
ggplot( distribution_TARGDOL, aes(TARGDOL, color = RemoveZero) ) + | |
geom_density() + theme_bw() + | |
labs(title = 'Distribution of TARGDOL') + | |
guides(color = FALSE) + | |
facet_grid(~ RemoveZero) | |
ggsave('distribution_TARGDOL.png') | |
# Percentage of Donors vs. Non-Donors | |
dt_donated <- data.table( table(donation$DONATED) / nrow(donation) ) | |
setnames( dt_donated, c('donated', 'frequency') ) | |
ggplot( dt_donated, aes(donated, frequency, fill = donated) ) + | |
geom_col() + theme_bw() + | |
labs(title = 'Percentage of Donors vs. Non-Donors') | |
ggsave('donors_and_non_donors.png') | |
# boxplot of the donated amount between the different | |
# solicitation and contribution code | |
slcod1 <- ggplot(has_donated, aes(SLCOD1, TARGDOL, fill = SLCOD1) ) + | |
geom_boxplot() + theme_bw() + | |
labs(title = 'TARGDOL > 0 vs. SLCOD1') + | |
ylim(0, 100) | |
cncod1 <- ggplot(has_donated, aes(CNCOD1, TARGDOL, fill = CNCOD1) ) + | |
geom_boxplot() + theme_bw() + | |
labs(title = 'TARGDOL > 0 vs. CNCOD1') + | |
ylim(0, 100) | |
g <- arrangeGrob(slcod1, cncod1, nrow = 2) | |
ggsave('donated_amount_sl_and_cn_code.png', g) | |
# see if CNDOL1 (the latest contribution amount) if | |
# correlated with TARGDOL | |
ggplot(has_donated, aes(CNDOL1, log(TARGDOL) ) ) + | |
geom_point() + theme_bw() + | |
labs(x = 'Log(CNDOL1)', title = 'Log(TARGDOL > 0) vs. Log(CNDOL1)') | |
ggsave('targdol_cndol.png') | |
# Proportion of donors in each SLCOD1 category | |
dt_SLCOD_prop <- data.table( table(has_donated$SLCOD1) / table(donation$SLCOD1) ) | |
setnames( dt_SLCOD_prop, c('SLCOD1', 'Proportion') ) | |
ggplot( dt_SLCOD_prop, aes(SLCOD1, Proportion, fill = SLCOD1) ) + | |
geom_col() + theme_bw() + | |
labs(y = 'Proportion of donors', | |
title = 'Proportion of donors in each SLCOD1 category') | |
ggsave('donors_proportion_sl_code.png') | |
# linear model's diagnostic plot | |
png('lm_diagnostics.png', width = 700, height = 700) | |
autoplot(model_lm, which = 1:6, ncol = 3, label.size = 3) + | |
theme_bw() | |
dev.off() | |
I just ran multiple Regression:
mult_fit <- lm(TARGDOL ~ ., data = X_train_donated)
Removing the 1500 for both the classification and regression
groups <- cut(x=donation$TARGDOL, breaks=c(-1,seq(from=0, to=ceiling(max(X_train$TARGDOL)), by = 5)))
bygroup <- tapply(donation$TARGDOL, groups, length)
names(bygroup)[1] <- "[0]"
barplot(bygroup, xlab="Amount", ylab="Frequency", main="Distribution of TARGDOL")
groups2 <- ifelse(donation$TARGDOL==0,"Non-Donor","Donor")
bygroup2 <- tapply(donation$TARGDOL, groups2, length)
barplot(bygroup2, ylab="Frequency", main="Number of Donors vs. Non-Donors")
donation$CNCOD1 <- droplevels(donation$CNCOD1)
boxplot(TARGDOL~CNCOD1,data=donation[TARGDOL>0,],ylim=c(0,40), xlab="CNCOD1", ylab="TARGDOL", main="TARGDOL vs. CNCOD1 (TARGDOL > 0)")
boxplot(TARGDOL~SLCOD1,data=donation[TARGDOL>0,],ylim=c(0,40), xlab="SLCOD1", ylab="TARGDOL", main="TARGDOL vs. SLCOD1 (TARGDOL > 0)")
plot(donation$CNDOL1, donation$TARGDOL, xlab="Log(CNDOL1)", ylab="TARGDOL", main="TARGDOL vs. Log(CNDOL1)")
plot(donation[TARGDOL>0,]$CNDOL1, donation[TARGDOL>0,]$TARGDOL, xlab="Log(CNDOL1)", ylab="TARGDOL", main="TARGDOL vs. Log(CNDOL1) (TARGDOL > 0)")
donors <- ifelse(donation$TARGDOL==0,0,1)
total <- aggregate(donors ~ donation$SLCOD1, FUN=length)
donated <- aggregate(donors ~ donation$SLCOD1, FUN=sum)
percent <- donated$donors/total$donors
names(percent) <- c("A", "B", "C", "D", "M")
barplot(percent, xlab="SLCOD1", ylab="Proportion of Donors", main="Proportion of Donors in each SLCOD1 Category")
total <- total$donors
names(total) <- c("A", "B", "C", "D", "M")
t <- barplot(total, ylim=c(0,85000), xlab="SLCOD1", ylab="Frequency", main="Amount of People Classified Into Each SLCOD1 Category")
text(x = t, y = total, label = total, pos = 3, cex = 0.8, col = "red")
plot(donation[TARGDOL>0,]$CONLARG, donation[TARGDOL>0,]$TARGDOL, xlab="Log(CONLARG)", ylab="TARGDOL", main="TARGDOL vs. Log(CONLARG) (TARGDOL > 0)")
plot(donation[TARGDOL>0,]$CONTRFST, donation[TARGDOL>0,]$TARGDOL, xlab="CNTRFST", ylab="TARGDOL", main="TARGDOL vs. CNTRFST (TARGDOL > 0)")
plot(donation[TARGDOL>0,]$CNTMLIF, donation[TARGDOL>0,]$TARGDOL, xlab="CNTMLIF", ylab="TARGDOL", main="TARGDOL vs. CNTMLIF (TARGDOL > 0)")
P <- ecdf(donation$CNMON1)
plot(P, xlab="CNMON1", ylab="Cumulative Proportion", main="Cumulative Proportion of CNMON1")
P2 <- ecdf(donation$CNMONL)
plot(P2, xlab="CNMONL", ylab="Cumulative Proportion", main="Cumulative Proportion of CNMONL")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
predicting
shuffle the row observations of the training data
set.seed(1234)
X_train <- X_train[ sample( nrow(X_train) ) ]
Baseline
##Binomial logistic
##Removing Outlier
X_train <- X_train[TARGDOL < 1500,]
fit <- glm(DONATED ~ . - TARGDOL - CONTRFST, family=binomial, data = X_train)
##Multiple Linear Regression
X_train2 <- X_train[TARGDOL > 0,]
X_train2$DONATED <- NULL
##CONLARG and CNTRLIF collinearity
mult_fit <- lm(TARGDOL ~ .-CONLARG - CNTRLIF, data = X_train2)
summary(mult_fit)
vif(mult_fit)
predicted <- predict(fit, X_test, type="response")
predicted2 <- predict(mult_fit, X_test)
X_test$expected <- predicted * predicted2
X_test2 <- X_test[order(expected, decreasing = TRUE),]
targdol1000 <- sum(head(X_test2$TARGDOL,1000))
expected1000 <- sum(head(X_test2$expected,1000))
targdol1000
Try changing codes to most recent/most frequent solicitation/contrbution
With Outlier Robust is best with 10096.23
##Robust
robust_fit <- rlm(TARGDOL ~ .-CONLARG + CNDOL1:CNTMLIF- CNTRLIF - CNCOD1 - CNCOD2 - CNCOD3 - SLCOD1 - SLCOD2 - SLCOD3, data = X_train2)
summary(robust_fit)
vif(robust_fit)
predicted <- predict(fit, X_test, type="response")
predicted2 <- predict(robust_fit, X_test)
X_test$expected <- predicted * predicted2
X_test2 <- X_test[order(expected, decreasing = TRUE),]
targdol1000 <- sum(head(X_test2$TARGDOL,1000))
expected1000 <- sum(head(X_test2$expected,1000))
targdol1000