Last active
December 18, 2015 19:19
-
-
Save dylanjf/5832136 to your computer and use it in GitHub Desktop.
greedy selection R
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# read and separate data
# Train: drop column 10 (presumably redundant with another column — TODO
# confirm against the data dictionary) and split off the response in column 1.
amazon_train <- read.csv("C:/Users/dylanjf/Desktop/amazon/train.csv")
amazon_train <- amazon_train[, -10]
amazon_Ytrain <- as.matrix(amazon_train[, 1])
amazon_train <- amazon_train[, -1]

# Test: keep the first 8 columns so they line up with the train features.
amazon_test <- read.csv("C:/Users/dylanjf/Desktop/amazon/test.csv")
amazon_test <- amazon_test[, c(1:8)]

# Stack train on top of test so interaction features are built consistently
# over both sets.
amazon_comb <- as.matrix(rbind(amazon_train, amazon_test))
####shameless stealing of groupData function####
# group data => create combinations of a given order
#
# Builds every combination of `degree` columns from `xmat` and concatenates
# their values row-wise with "_", so each combination becomes one composite
# categorical feature.
#
# xmat   : matrix (values are coerced to character by paste())
# degree : number of columns per combination (>= 1)
#
# Returns a character matrix with choose(ncol(xmat), degree) columns,
# named f<degree>_1 .. f<degree>_k.
#
# Rewritten with base apply/do.call, removing the foreach dependency.
# Also fixes the degree == 1 case: the original `for (jj in 2:nrow(xind))`
# evaluated to 2:1 and indexed a nonexistent row.
groupData <- function(xmat, degree)
{
  # column-index combinations, one combination per column of xind
  xind <- combn(seq_len(ncol(xmat)), degree)
  # paste the selected columns together for every combination;
  # do.call(paste, ...) concatenates all `degree` columns in one pass
  agx <- apply(xind, 2, function(cols) {
    do.call(paste, c(lapply(cols, function(j) xmat[, j]), sep = "_"))
  })
  # guard: apply() collapses to a vector when xmat has a single row
  if (is.null(dim(agx))) {
    agx <- matrix(agx, nrow = nrow(xmat))
  }
  colnames(agx) <- paste(paste("f", degree, sep = ""), seq_len(ncol(agx)),
                         sep = "_")
  return(agx)
}
####end shameless stealing####
# Append all pairwise and 3-way categorical interactions to the raw columns,
# then split the combined matrix back into train / test rows.
amazon_X <- cbind(amazon_comb,
                  groupData(amazon_comb, 2),
                  groupData(amazon_comb, 3))
amazon_Xtrain <- amazon_X[seq_len(nrow(amazon_train)), ]
amazon_Xtest <- amazon_X[-seq_len(nrow(amazon_train)), ]
amazon_Ytest <- numeric()

# free the large intermediates before the CV loop
rm(amazon_comb, amazon_train, amazon_test, amazon_X)
###
# function for forward stepwise logistic regression fitting to determine
# optimal features to encode
###
set.seed(508)

# library() errors immediately if a package is missing, unlike require(),
# which only returns FALSE and lets the script fail later.
library(glmnet, quietly = TRUE)
library(Matrix, quietly = TRUE)

# initialize optimization metrics
cv_max_auc <- 0.5       # target minimum.... represents a random guess
cv_fold_auc <- numeric()
cv_train_auc <- numeric()

# initialize feature holding
best_col <- numeric()
num_features <- numeric()
############### big inefficient for loop that does everything #################
# Greedy forward selection: for each candidate column, fit repeated
# cv.glmnet lasso logistic models on (already-selected columns + candidate)
# and keep the candidate only if its mean CV AUC beats the best so far.
for (i in seq_len(ncol(amazon_Xtrain))) {
  # add columns selected so far + the candidate column
  colName <- colnames(amazon_Xtrain)[c(best_col, i)]
  vars <- as.data.frame(amazon_Xtrain[, c(best_col, i)])
  colnames(vars) <- colName

  # encode into a sparse model matrix (no intercept)
  vars <- sparse.model.matrix(~ . - 1, data = vars)

  # 10 repeats of k-fold lasso logistic CV; record each repeat's best mean AUC
  for (j in 1:10) {
    cv_train <- cv.glmnet(x = vars, y = amazon_Ytrain[, 1],
                          family = "binomial", type.measure = "auc")
    cv_fold_auc[j] <- max(cv_train$cvm)
  }
  cv_train_auc[i] <- mean(cv_fold_auc)
  # reset per-repeat storage
  cv_fold_auc <- numeric()

  # determining if new column is useful. if so, adding to the model and
  # raising the AUC bar
  if (cv_train_auc[i] > cv_max_auc) {
    # for next iteration: remember indices of the columns to keep
    best_col <- c(best_col, i)
    # count nonzero coefficients at lambda.min to plot against AUC later;
    # the original used `> 0`, which silently dropped negative lasso
    # coefficients from the count
    best_features <- which(coef(cv_train, s = cv_train$lambda.min) != 0)
    num_features[i] <- length(best_features)
    # raise AUC bar
    cv_max_auc <- cv_train_auc[i]
  }

  # live progress update; the original wrapped cat() in print(), which also
  # printed cat()'s NULL return value, inside a pointless one-pass for loop.
  # num_features[i] is NA when the candidate was rejected this iteration.
  cat("Feature Loop", i, "complete. Max validation AUC:", cv_max_auc,
      "Number of features:", num_features[i], "\n")
}
# report the selected column indices and keep only those columns
print(best_col)
amazon_Xtrain <- amazon_Xtrain[, best_col]
#... etc
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment