library(MASS)
library(ROCR)
library(caret)
library(glmnet)
library(magrittr)
library(data.table)
library(doParallel)
setwd('/Users/ethen/Desktop/predictive-project')
# ---------------------------------------------------------------------------------
# preprocessing
path_donation <- 'donation data.csv'
path_code <- 'dmef1code.csv'
path_state_mapping <- 'states.csv'
# main dataset, remove the id column
donation <- fread(path_donation, stringsAsFactors = TRUE)
donation[ , ID := NULL ]
# delete the last row, since it has a questionable '*' CODETYPE
code <- fread(path_code, stringsAsFactors = TRUE)
code <- code[-nrow(code), ]
code[ , CODETYPE := factor( CODETYPE, levels = c('A', 'B', 'C', 'D', 'M') ) ]
# table that maps the states into regions
# https://drive.google.com/open?id=0B5-Q31Rgtkixb0t3MkNTNXI2djA
states <- fread(path_state_mapping, stringsAsFactors = TRUE)
preprocessing <- function(donation, code, states) {
    # general preprocessing steps for the donation data
    # for people that have donated only once, CNCOD2 and
    # CNCOD3 (the codes for their previous donations) will be missing;
    # the same goes for SLCOD2 and SLCOD3;
    # match CNCOD (Latest Contribution Code) and
    # SLCOD (Latest Solicitation Code) to the five types in the code file
    cols_mapping <- c('CNCOD1', 'CNCOD2', 'CNCOD3',
                      'SLCOD1', 'SLCOD2', 'SLCOD3')
    donation[ , (cols_mapping) := lapply( .SD, function(col) {
        code$CODETYPE[ match(col, code$CODE) ]
    }), .SDcols = cols_mapping ]
    # for the CNCOD and SLCOD columns, replace the
    # NA values with a 'None' category
    for(j in cols_mapping) {
        na_rows <- which( is.na(donation[[j]]) )
        set(donation, i = na_rows, j = j, value = 'None')
    }
    # drop all the CNDAT (Latest Contribution Date) columns,
    # because we can use CNMON (months since latest
    # contribution) instead
    cols_drop <- c('CNDAT1', 'CNDAT2', 'CNDAT3')
    donation[ , (cols_drop) := NULL ]
    # if a person donated only once, then both CNDOL2
    # (2nd latest contribution) and CNDOL3 (3rd latest
    # contribution) should be 0; if they donated twice,
    # only CNDOL3 should be 0
    donation[ CNTMLIF == 1, CNDOL2 := 0 ]
    donation[ CNTMLIF == 1, CNDOL3 := 0 ]
    donation[ CNTMLIF == 2, CNDOL3 := 0 ]
    # merge states into regions
    # 1. New England: Connecticut, Maine, Massachusetts, New Hampshire, Rhode Island and Vermont
    # 2. Mideast: Delaware, District of Columbia, Maryland, New Jersey, New York, and Pennsylvania
    # 3. Great Lakes: Illinois, Indiana, Michigan, Ohio, and Wisconsin
    # 4. Plains: Iowa, Kansas, Minnesota, Missouri, Nebraska, North Dakota, and South Dakota
    # 5. Southeast: Alabama, Arkansas, Florida, Georgia, Kentucky, Louisiana, Mississippi,
    #    North Carolina, South Carolina, Tennessee, Virginia, and West Virginia
    # 6. Southwest: Arizona, New Mexico, Oklahoma, and Texas
    # 7. Rocky Mountain: Colorado, Idaho, Montana, Utah, and Wyoming
    # 8. Far West: Alaska, California, Hawaii, Nevada, Oregon, and Washington
    donation[ , region := states$region[ match(STATCODE, states$statecode) ] ]
    donation[ , STATCODE := NULL ]
    donation[ is.na(region), region := 'Other' ]
    # if CNMON2 and CNMON3 are both missing, the person has
    # only donated once, so fill them with CNMON1; the same idea
    # extends to CNMON3 for people who donated twice
    donated_once <- with( donation, is.na(CNMON2) & is.na(CNMON3) )
    donation[ donated_once, CNMON2 := CNMON1 ]
    donation[ donated_once, CNMON3 := CNMON1 ]
    donated_twice <- is.na(donation$CNMON3)
    donation[ donated_twice, CNMON3 := CNMON2 ]
    # outliers exist in these columns (some people
    # donated a lot more than others); log-transform
    # them to make the values more normally distributed
    donation[ , `:=`( CNDOL1 = log(CNDOL1),
                      CNTRLIF = log(CNTRLIF),
                      CONLARG = log(CONLARG) ) ]
    # binary response for the logistic regression to see
    # whether that person donated or not
    donation[ , DONATED := as.factor( ifelse(TARGDOL > 0, 'YES', 'NO') ) ]
    return(donation)
}
donation <- preprocessing(donation, code, states)
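# a quick sanity check on the preprocessed data (a minimal sketch, assuming the
# column names used above): the mapped contribution-code column should now only
# contain the five code types plus the 'None' category, and the per-column NA
# counts show whether any missingness is left elsewhere
table(donation$CNCOD1, useNA = 'ifany')
colSums( is.na(donation) )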
# train/test split:
# split into training and test sets in a 2:1 ratio by putting every
# 3rd observation in the test set and all remaining observations in
# the training set
test_index <- seq(3, nrow(donation), by = 3)
X_train <- donation[-test_index]
X_test <- donation[test_index]
# remove the obvious outliers from the training set
X_train <- X_train[ TARGDOL != 1500, ]
remove_correlations <- function(X_train, X_test) {
    # identify multicollinearity using the variance inflation factor (vif)
    # and remove the offending columns from both the train and test set;
    # conventionally, a column with a vif score larger than 10 should be removed.
    # after this procedure, four columns (SLCOD2, CNCOD2, CNTRLIF
    # and CNMON3) were removed
    # assign a placeholder column name to start the loop
    # and keep removing columns until no multicollinearity remains
    multi_cor_cols <- 'TARGDOL'
    while( length(multi_cor_cols) > 0 ) {
        model_lm <- lm(TARGDOL ~ . - DONATED, data = X_train)
        multi_cor <- car::vif(model_lm)[ , 'GVIF']
        max_col <- names( which.max(multi_cor) )
        if( multi_cor[[max_col]] > 10 ) {
            multi_cor_cols <- max_col
            print(multi_cor_cols)
            X_train[ , (multi_cor_cols) := NULL ]
            X_test[ , (multi_cor_cols) := NULL ]
        } else {
            multi_cor_cols <- NULL
        }
    }
    removed_cor <- list(train = X_train, test = X_test)
    return(removed_cor)
}
removed_cor <- remove_correlations(X_train, X_test)
X_train <- removed_cor$train
X_test <- removed_cor$test
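# a minimal follow-up check (assuming the columns listed above were dropped):
# refit the linear model once more and confirm that no remaining predictor
# has a GVIF above the threshold of 10
model_vif_check <- lm(TARGDOL ~ . - DONATED, data = X_train)
car::vif(model_vif_check)[ , 'GVIF']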
create_interactions <- function(X_train, X_test) {
    # create interaction terms across all possible pairs of numeric columns
    # (numeric here means columns of class integer or numeric);
    # this also creates 2nd-degree polynomial terms (squaring each column)
    num_cols <- !vapply( X_train, class, character(1) ) %in% 'factor'
    num_name <- colnames(X_train)[num_cols]
    num_name <- setdiff(num_name, 'TARGDOL')
    num_len <- length(num_name)
    for(i in 1:num_len) {
        col1 <- num_name[i]
        for(j in i:num_len) {
            col2 <- num_name[j]
            col_name <- paste0(col1, '_TIMES_', col2)
            X_train[ , (col_name) := X_train[[col1]] * X_train[[col2]] ]
            X_test[ , (col_name) := X_test[[col1]] * X_test[[col2]] ]
        }
    }
    created_inter <- list(train = X_train, test = X_test)
    return(created_inter)
}
# after creating the interaction terms, we ended up with a total of
# 96 features. The reference levels for the factor columns are:
# B for SEX, FarWest for region, and A for CNCOD1, CNCOD3, SLCOD1 and SLCOD3
created_inter <- create_interactions(X_train, X_test)
X_train <- created_inter$train
X_test <- created_inter$test
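# a quick way to confirm the reference levels mentioned above (a small sketch;
# it relies on the default treatment contrasts, where the first level of each
# factor acts as the reference in the model matrix)
factor_cols <- names(X_train)[ vapply(X_train, is.factor, logical(1)) ]
lapply( X_train[ , factor_cols, with = FALSE ], function(col) levels(col)[1] )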
# ---------------------------------------------------------------------------------
# model training
# set the number of cross validation folds for all models
nfolds <- 10
# shuffle the row observations of the training data
# to help with model stability
set.seed(1234)
X_train <- X_train[ sample( nrow(X_train) ) ]
# use all but one of the available cores to train the lasso/ridge,
# remember to stop the created cluster afterwards
registerDoParallel(cores = detectCores() - 1)
# specify the parameters to search over: alpha = 1 is lasso, alpha = 0 is ridge,
# together with a fixed range of lambda values
glmnet_grid <- expand.grid( alpha = c(0, 1),
                            lambda = c(0, 0.01, 0.05, 0.1, 0.5, 0.6) )
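# a minimal sketch of what a single elastic-net fit looks like outside of caret,
# in case the coefficient path is of interest (assumption: the model matrix is
# built from the same formula as the caret call below)
x_mat <- model.matrix(DONATED ~ . - TARGDOL, data = X_train)[ , -1]
fit_lasso <- glmnet(x_mat, X_train$DONATED, family = 'binomial', alpha = 1)
plot(fit_lasso, xvar = 'lambda', label = TRUE)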
# define the training control (10-fold cross validation); to use ROC to
# determine the best model for the binary output/response,
# we need to add classProbs = TRUE and twoClassSummary to the trainControl
control_class <- trainControl(method = 'cv', number = nfolds, classProbs = TRUE,
                              summaryFunction = twoClassSummary)
glmnet_class <- train(
    DONATED ~ . - TARGDOL,
    data = X_train,
    method = 'glmnet',
    metric = 'ROC',
    trControl = control_class,
    tuneGrid = glmnet_grid
)
# predict TARGDOL
# remember to remove the columns that are related to
# whether or not they made a donation (DONATED)
X_train_donated <- X_train[ TARGDOL > 0, ]
X_train_donated[ , DONATED := NULL ]
# for the regression model, logging the response led to an inferior result
control_reg <- trainControl(method = 'cv', number = nfolds)
glmnet_reg <- train(
    TARGDOL ~ .,
    data = X_train_donated,
    method = 'glmnet',
    trControl = control_reg,
    tuneGrid = glmnet_grid
)
stopImplicitCluster()
# multiply the predicted probability of donating, P(DONATED = 'YES'), by the
# predicted donation amount to approximate the expected donation E[TARGDOL];
# then take the top 1000 prospects by expected value and sum up the amount
# they actually donated
get_expected_score <- function(X_test, y_pred_proba, y_pred) {
    expected <- y_pred_proba * y_pred
    sorted_index <- order(expected, decreasing = TRUE)[1:1000]
    expected_score <- sum(X_test[ sorted_index, ][['TARGDOL']])
    return(expected_score)
}
# lasso/ridge (for both the classification and regression model)
# obtaining a score around 10664.84
y_pred <- predict(glmnet_reg, newdata = X_test)
y_pred_proba <- predict(glmnet_class, newdata = X_test, type = 'prob')[['YES']]
get_expected_score(X_test, y_pred_proba, y_pred)
# 10698.73
model_lm <- lm(TARGDOL ~ ., data = X_train_donated)
y_pred <- predict(model_lm, newdata = X_test)
get_expected_score(X_test, y_pred_proba, y_pred)
# 10822
model_rlm <- rlm(TARGDOL ~ ., data = X_train_donated, maxit = 50)
y_pred <- predict(model_rlm, newdata = X_test)
get_expected_score(X_test, y_pred_proba, y_pred)
# what the optimal amount looks like (a score around 23000);
# take the top 1000 without reordering X_test in place, so that the
# predictions computed above stay aligned with its rows
sum( sort(X_test[['TARGDOL']], decreasing = TRUE)[1:1000] )
# -----------------------------------------------------------------------------
# advanced model training
# only random forest will be explored
library(h2o)
h2o.init(nthreads = -1)
# classification model (random forest)
# specify the response and predictor columns
# and the evaluation metric (auc)
X_train_h2o <- as.h2o(X_train)
X_test_h2o <- as.h2o(X_test)
y <- 'DONATED'
x <- setdiff( colnames(X_train), c('TARGDOL', y) )
grid_id_rf <- 'rs_rf'
stopping_metric <- 'AUC'
# the random search criteria and number of trees
# are shared across all tree models
ntrees <- 300
stopping_tolerance <- 0.05
stopping_rounds <- 5
search_criteria <- list(
    strategy = 'RandomDiscrete',
    stopping_metric = stopping_metric,
    stopping_tolerance = stopping_tolerance,
    stopping_rounds = stopping_rounds
)
# random forest's random search hyperparameters:
# max depth and sample rate (row sample rate per tree)
max_depth_opt <- 4:15
sample_rate_opt <- seq(from = 0.8, to = 0.95, by = 0.05)
rf_params_opt <- list(
    max_depth = max_depth_opt,
    sample_rate = sample_rate_opt
)
rf_class <- h2o.grid(
    algorithm = 'randomForest',
    grid_id = grid_id_rf,
    x = x,
    y = y,
    training_frame = X_train_h2o,
    nfolds = nfolds,
    ntrees = ntrees,
    balance_classes = TRUE,
    stopping_metric = stopping_metric,
    stopping_tolerance = stopping_tolerance,
    stopping_rounds = stopping_rounds,
    hyper_params = rf_params_opt,
    search_criteria = search_criteria
)
# sort the models by the specified evaluation metric
# and obtain the top one (the best model)
rf_class_sorted <- h2o.getGrid(
    grid_id = grid_id_rf,
    sort_by = 'auc',
    decreasing = TRUE
)
rf_class_best_model <- h2o.getModel(rf_class_sorted@model_ids[[1]])
rf_class_y_pred_proba <- h2o.predict(rf_class_best_model, X_test_h2o)[['YES']] %>%
    as.data.table()
# stacking the random forest and the logistic regression for classification,
# and lasso/ridge for regression; the best stacking proportion is
# found through trial and error,
# obtaining a score of 10,904
y_pred <- predict(glmnet_reg, newdata = X_test)
y_pred_proba_stack1 <- rf_class_y_pred_proba[['YES']] * 0.21 + y_pred_proba * 0.79
get_expected_score(X_test, y_pred_proba_stack1, y_pred)
y_pred <- predict(model_lm, newdata = X_test)
y_pred_proba_stack2 <- rf_class_y_pred_proba[['YES']] * 0.25 + y_pred_proba * 0.75
get_expected_score(X_test, y_pred_proba_stack2, y_pred)
y_pred <- predict(model_rlm, newdata = X_test)
y_pred_proba_stack3 <- rf_class_y_pred_proba[['YES']] * 0.3 + y_pred_proba * 0.7
get_expected_score(X_test, y_pred_proba_stack3, y_pred)
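# a minimal sketch of how the stacking weight could be searched instead of pure
# trial and error (assumption: rf_class_y_pred_proba[['YES']] and y_pred_proba
# are probability vectors for the positive class on X_test, and y_pred holds
# the corresponding regression predictions)
blend_weights <- seq(0, 1, by = 0.01)
blend_scores <- vapply(blend_weights, function(w) {
    blended <- rf_class_y_pred_proba[['YES']] * w + y_pred_proba * (1 - w)
    get_expected_score(X_test, blended, y_pred)
}, numeric(1))
blend_weights[ which.max(blend_scores) ]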
dt_var_imp <- h2o.varimp(rf_class_best_model)[ , c('variable', 'percentage') ] %>%
    as.data.table()
dt_var_imp
h2o.shutdown(prompt = FALSE)
# ---------------------------------------------------------------------------------
# model validation
# linear/robust linear regression's performance
summary(model_lm)$r.squared
summary(model_rlm)
# assess lasso/ridge's performance and best tuning parameters
confusionMatrix(glmnet_class)
glmnet_class$results
glmnet_class$bestTune
glmnet_reg$results
glmnet_reg$bestTune
# obtain the auc scores
y_predictions <- list(y_pred_proba, y_pred_proba_stack1,
                      y_pred_proba_stack2, y_pred_proba_stack3)
auc_scores <- lapply( y_predictions, function(y_proba) {
    rocr_pred <- ROCR::prediction(y_proba, X_test$DONATED)
    auc <- ROCR::performance(rocr_pred, 'auc')@y.values[[1]]
    return(auc)
})
auc_scores
# obtain lasso/ridge's estimated coefficients;
# we have to supply the chosen lambda value to get the optimal coefficients,
# and then keep only the ones that were not shrunk to 0 by the model
coef_class <- coef(glmnet_class$finalModel, s = glmnet_class$bestTune$lambda) %>%
    as.matrix()
coef_class <- data.table( names = row.names(coef_class), estimates = coef_class[, 1] )
sum( coef_class[['estimates']] != 0 )
coef_class <- coef_class[ estimates != 0, ]
coef_class
# regression's coefficients
coef_reg <- coef(glmnet_reg$finalModel, s = glmnet_reg$bestTune$lambda) %>%
    as.matrix()
coef_reg <- data.table( names = row.names(coef_reg), estimates = coef_reg[, 1] )
sum( coef_reg[['estimates']] != 0 )
coef_reg <- coef_reg[ estimates != 0, ]
coef_reg
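# optionally rank the surviving coefficients by absolute size to eyeball the
# most influential terms (a rough gauge only, since the estimates are reported
# on the original scale of each feature)
coef_class[ order( -abs(estimates) ) ]
coef_reg[ order( -abs(estimates) ) ]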
# ---------------------------------------------------------------------------------
# visualization
library(ggplot2)
library(ggfortify)
library(gridExtra)
# remove the 0 contribution and the obvious outlier prior to visualization
has_donated <- donation[ TARGDOL > 0 & TARGDOL < 1500, ]
# distribution of TARGDOL
donation_TARGDOL <- donation[ TARGDOL < 1500, .(TARGDOL, RemoveZero = 'KeptZero') ]
has_donated_TARGDOL <- has_donated[ , .(TARGDOL, RemoveZero = 'RemovedZero') ]
distribution_TARGDOL <- rbind(donation_TARGDOL, has_donated_TARGDOL)
ggplot( distribution_TARGDOL, aes(TARGDOL, color = RemoveZero) ) +
    geom_density() + theme_bw() +
    labs(title = 'Distribution of TARGDOL') +
    guides(color = FALSE) +
    facet_grid(~ RemoveZero)
ggsave('distribution_TARGDOL.png')
# Percentage of Donors vs. Non-Donors
dt_donated <- data.table( table(donation$DONATED) / nrow(donation) )
setnames( dt_donated, c('donated', 'frequency') )
ggplot( dt_donated, aes(donated, frequency, fill = donated) ) +
    geom_col() + theme_bw() +
    labs(title = 'Percentage of Donors vs. Non-Donors')
ggsave('donors_and_non_donors.png')
# boxplots of the donated amount across the different
# solicitation and contribution codes
slcod1 <- ggplot(has_donated, aes(SLCOD1, TARGDOL, fill = SLCOD1) ) +
    geom_boxplot() + theme_bw() +
    labs(title = 'TARGDOL > 0 vs. SLCOD1') +
    ylim(0, 100)
cncod1 <- ggplot(has_donated, aes(CNCOD1, TARGDOL, fill = CNCOD1) ) +
    geom_boxplot() + theme_bw() +
    labs(title = 'TARGDOL > 0 vs. CNCOD1') +
    ylim(0, 100)
g <- arrangeGrob(slcod1, cncod1, nrow = 2)
ggsave('donated_amount_sl_and_cn_code.png', g)
# see if CNDOL1 (the latest contribution amount, already logged)
# is correlated with TARGDOL
ggplot(has_donated, aes(CNDOL1, log(TARGDOL) ) ) +
    geom_point() + theme_bw() +
    labs(x = 'Log(CNDOL1)', title = 'Log(TARGDOL > 0) vs. Log(CNDOL1)')
ggsave('targdol_cndol.png')
# Proportion of donors in each SLCOD1 category
dt_SLCOD_prop <- data.table( table(has_donated$SLCOD1) / table(donation$SLCOD1) )
setnames( dt_SLCOD_prop, c('SLCOD1', 'Proportion') )
ggplot( dt_SLCOD_prop, aes(SLCOD1, Proportion, fill = SLCOD1) ) +
    geom_col() + theme_bw() +
    labs(y = 'Proportion of donors',
         title = 'Proportion of donors in each SLCOD1 category')
ggsave('donors_proportion_sl_code.png')
# linear model's diagnostic plot
png('lm_diagnostics.png', width = 700, height = 700)
autoplot(model_lm, which = 1:6, ncol = 3, label.size = 3) +
    theme_bw()
dev.off()
# ---------------------------------------------------------------------------------
# baseline models for predicting TARGDOL
# shuffle the row observations of the training data
set.seed(1234)
X_train <- X_train[ sample( nrow(X_train) ) ]
# binomial logistic regression;
# remove the outlier before fitting
X_train <- X_train[TARGDOL < 1500, ]
fit <- glm(DONATED ~ . - TARGDOL - CONTRFST, family = binomial, data = X_train)
# multiple linear regression on donors only;
# drop CONLARG and CNTRLIF due to collinearity
X_train2 <- X_train[TARGDOL > 0, ]
X_train2$DONATED <- NULL
mult_fit <- lm(TARGDOL ~ . - CONLARG - CNTRLIF, data = X_train2)
summary(mult_fit)
car::vif(mult_fit)
predicted <- predict(fit, X_test, type = "response")
predicted2 <- predict(mult_fit, X_test)
X_test$expected <- predicted * predicted2
X_test2 <- X_test[order(expected, decreasing = TRUE), ]
targdol1000 <- sum(head(X_test2$TARGDOL, 1000))
expected1000 <- sum(head(X_test2$expected, 1000))
targdol1000
# try changing codes to the most recent / most frequent solicitation/contribution code
# robust regression; with the outlier kept, this is the best baseline, scoring 10096.23
robust_fit <- rlm(TARGDOL ~ . - CONLARG + CNDOL1:CNTMLIF - CNTRLIF - CNCOD1 - CNCOD2 - CNCOD3 - SLCOD1 - SLCOD2 - SLCOD3, data = X_train2)
summary(robust_fit)
car::vif(robust_fit)
predicted <- predict(fit, X_test, type = "response")
predicted2 <- predict(robust_fit, X_test)
X_test$expected <- predicted * predicted2
X_test2 <- X_test[order(expected, decreasing = TRUE), ]
targdol1000 <- sum(head(X_test2$TARGDOL, 1000))
expected1000 <- sum(head(X_test2$expected, 1000))
targdol1000
# multiple regression fit on donors only
mult_fit <- lm(TARGDOL ~ ., data = X_train_donated)
# removing the 1500 outlier for both the classification and regression models
# exploratory plots (base R)
# distribution of TARGDOL, binned in increments of 5
groups <- cut(x = donation$TARGDOL, breaks = c(-1, seq(from = 0, to = ceiling(max(X_train$TARGDOL)), by = 5)))
bygroup <- tapply(donation$TARGDOL, groups, length)
names(bygroup)[1] <- "[0]"
barplot(bygroup, xlab = "Amount", ylab = "Frequency", main = "Distribution of TARGDOL")
# number of donors vs. non-donors
groups2 <- ifelse(donation$TARGDOL == 0, "Non-Donor", "Donor")
bygroup2 <- tapply(donation$TARGDOL, groups2, length)
barplot(bygroup2, ylab = "Frequency", main = "Number of Donors vs. Non-Donors")
# boxplots of TARGDOL (donors only) across CNCOD1 and SLCOD1
donation$CNCOD1 <- droplevels(donation$CNCOD1)
boxplot(TARGDOL ~ CNCOD1, data = donation[TARGDOL > 0, ], ylim = c(0, 40), xlab = "CNCOD1", ylab = "TARGDOL", main = "TARGDOL vs. CNCOD1 (TARGDOL > 0)")
boxplot(TARGDOL ~ SLCOD1, data = donation[TARGDOL > 0, ], ylim = c(0, 40), xlab = "SLCOD1", ylab = "TARGDOL", main = "TARGDOL vs. SLCOD1 (TARGDOL > 0)")
# TARGDOL against the (already logged) latest contribution amount
plot(donation$CNDOL1, donation$TARGDOL, xlab = "Log(CNDOL1)", ylab = "TARGDOL", main = "TARGDOL vs. Log(CNDOL1)")
plot(donation[TARGDOL > 0, ]$CNDOL1, donation[TARGDOL > 0, ]$TARGDOL, xlab = "Log(CNDOL1)", ylab = "TARGDOL", main = "TARGDOL vs. Log(CNDOL1) (TARGDOL > 0)")
# proportion of donors within each SLCOD1 category
donors <- ifelse(donation$TARGDOL == 0, 0, 1)
total <- aggregate(donors ~ donation$SLCOD1, FUN = length)
donated <- aggregate(donors ~ donation$SLCOD1, FUN = sum)
percent <- donated$donors / total$donors
names(percent) <- c("A", "B", "C", "D", "M")
barplot(percent, xlab = "SLCOD1", ylab = "Proportion of Donors", main = "Proportion of Donors in each SLCOD1 Category")
# number of people classified into each SLCOD1 category
total <- total$donors
names(total) <- c("A", "B", "C", "D", "M")
t <- barplot(total, ylim = c(0, 85000), xlab = "SLCOD1", ylab = "Frequency", main = "Amount of People Classified Into Each SLCOD1 Category")
text(x = t, y = total, label = total, pos = 3, cex = 0.8, col = "red")
# TARGDOL (donors only) against other contribution-history variables
plot(donation[TARGDOL > 0, ]$CONLARG, donation[TARGDOL > 0, ]$TARGDOL, xlab = "Log(CONLARG)", ylab = "TARGDOL", main = "TARGDOL vs. Log(CONLARG) (TARGDOL > 0)")
plot(donation[TARGDOL > 0, ]$CONTRFST, donation[TARGDOL > 0, ]$TARGDOL, xlab = "CONTRFST", ylab = "TARGDOL", main = "TARGDOL vs. CONTRFST (TARGDOL > 0)")
plot(donation[TARGDOL > 0, ]$CNTMLIF, donation[TARGDOL > 0, ]$TARGDOL, xlab = "CNTMLIF", ylab = "TARGDOL", main = "TARGDOL vs. CNTMLIF (TARGDOL > 0)")
# empirical cumulative distributions of months since latest contribution
P <- ecdf(donation$CNMON1)
plot(P, xlab = "CNMON1", ylab = "Cumulative Proportion", main = "Cumulative Proportion of CNMON1")
P2 <- ecdf(donation$CNMONL)
plot(P2, xlab = "CNMONL", ylab = "Cumulative Proportion", main = "Cumulative Proportion of CNMONL")
# pairwise scatterplots of the main contribution variables
with(donation, pairs(data.frame(TARGDOL, CNDOL1, CNTRLIF, CONLARG, CONTRFST)))