# greedy selection R
# greedy forward feature selection: candidate interaction columns are accepted
# when they raise the cross-validated AUC of a lasso-regularized logistic
# regression (glmnet)
#read and separate data
amazon_train = read.csv("C:/Users/dylanjf/Desktop/amazon/train.csv")
amazon_train = amazon_train[,-10]              #drop ROLE_CODE (column 10)
amazon_Ytrain = as.matrix(amazon_train[,1])    #ACTION: the binary response
amazon_train = amazon_train[,-1]               #keep the 8 categorical predictors
amazon_test = read.csv("C:/Users/dylanjf/Desktop/amazon/test.csv")
#columns 2:9 are the same 8 predictors; column 1 is the Kaggle id and column 10
#is ROLE_CODE, so both are dropped to line the test columns up with training
amazon_test = amazon_test[,c(2:9)]
#stack train + test so interaction features are built over consistent levels
amazon_comb = rbind(amazon_train, amazon_test)
amazon_comb = as.matrix(amazon_comb)
####shameless stealing of groupData function####
#group data => create combinations of a given order
#pastes the values of every 'degree'-way combination of columns into a single
#categorical feature (assumes degree >= 2)
groupData <- function(xmat, degree)
{
  require(foreach, quietly = T)
  #indices of combinations
  xind <- combn(1:ncol(xmat), degree)
  #storage structure for the result
  agx <- foreach(ii = 1:ncol(xind), .combine = cbind) %do%
  {
    x <- xmat[, xind[1, ii]]
    for (jj in 2:nrow(xind))
    {
      x <- paste(x, xmat[, xind[jj, ii]], sep = "_")
    }
    x
  }
  colnames(agx) <- paste(paste("f", degree, sep = ""), 1:ncol(agx), sep = "_")
  return(agx)
}
####end shameless stealing####
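#a quick illustration (toy data, not part of the pipeline): groupData pastes
#column values together, so each degree-2 combination of columns becomes one
#new categorical feature
toy = matrix(c("a", "b", "x", "y", "p", "q"), nrow = 2,
             dimnames = list(NULL, c("c1", "c2", "c3")))
groupData(toy, 2)
#     f2_1  f2_2  f2_3
#[1,] "a_x" "a_p" "x_p"
#[2,] "b_y" "b_q" "y_q"
rm(toy)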
#original columns plus all 2-way and 3-way categorical interactions
amazon_X = cbind(amazon_comb, groupData(amazon_comb, 2), groupData(amazon_comb, 3))
amazon_Xtrain = amazon_X[1:nrow(amazon_train),]
amazon_Xtest = amazon_X[(nrow(amazon_train)+1):nrow(amazon_X),]
amazon_Ytest = numeric()   #placeholder: test labels are unknown
#free memory: the intermediate matrices are no longer needed
rm(amazon_comb)
rm(amazon_train)
rm(amazon_test)
rm(amazon_X)
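#quick sanity check (illustrative): with 8 base columns this yields
#8 + choose(8, 2) + choose(8, 3) = 8 + 28 + 56 = 92 candidate features
dim(amazon_Xtrain)   #expected: (number of training rows) x 92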
###
#forward stepwise logistic regression fitting to determine the optimal
#features to encode
###
set.seed(508)
require(glmnet, quietly = T)
require(Matrix, quietly = T)
#initialize optimization metrics
cv_max_auc = 0.5        #target minimum: an AUC of 0.5 represents a random guess
cv_fold_auc = numeric()
cv_train_auc = numeric()
#initialize feature holding
best_col = numeric()    #indices of the columns accepted so far
num_features = numeric()
###############big inefficient for loop that does everything###################
for (i in 1:ncol(amazon_Xtrain)) {
  #candidate set = columns selected so far + this iteration's column
  colName = colnames(amazon_Xtrain)[c(best_col, i)]
  vars = as.data.frame(amazon_Xtrain[, c(best_col, i)])
  colnames(vars) = colName
  #one-hot encode the categorical columns into a sparse model matrix
  vars = sparse.model.matrix(~ . - 1, data = vars)
  #10 repeats of 10-fold cross-validated lasso logistic regression; each repeat
  #contributes its best (max over lambda) mean validation AUC
  for (j in 1:10) {
    cv_train = cv.glmnet(x = vars, y = amazon_Ytrain[, 1], family = "binomial",
                         type.measure = "auc")
    cv_fold_auc[j] = max(cv_train$cvm)
  }
  cv_train_auc[i] = mean(cv_fold_auc)
  #reset cv fold auc
  cv_fold_auc = numeric()
  #determining if the new column is useful. if so, adding it to the model and
  #raising the auc bar
  if (cv_train_auc[i] > cv_max_auc) {
    #for next iteration: know the indices of the columns to keep
    best_col = c(best_col, i)
    #store how many features the lasso keeps (nonzero non-intercept
    #coefficients at the best lambda) to plot against auc
    best_features = which(coef(cv_train, s = cv_train$lambda.min)[-1, ] != 0)
    num_features[i] = length(best_features)
    #raise auc bar
    cv_max_auc = cv_train_auc[i]
  }
  #live update (num_features[i] prints NA when the column was rejected)
  cat('Feature loop', i, 'complete. Max validation AUC:', cv_max_auc,
      'Number of features:', num_features[i], '\n')
}
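#a minimal sketch (not in the original gist) of inspecting the bookkeeping:
#plot lasso model size against mean validation AUC for the accepted iterations
kept = which(!is.na(num_features))   #iterations that raised the auc bar
plot(num_features[kept], cv_train_auc[kept],
     xlab = "nonzero lasso coefficients", ylab = "mean validation AUC",
     main = "greedy selection: model size vs AUC")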
print(best_col)
#keep only the greedily selected columns for downstream modeling
amazon_Xtrain = amazon_Xtrain[, best_col]
#... etc
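#a hedged sketch of one possible continuation (not in the original gist):
#encode the selected columns for train and test together, fit a final
#cross-validated lasso model, and predict access probabilities for the test set
all_vars = as.data.frame(rbind(amazon_Xtrain, amazon_Xtest[, best_col]))
all_vars = sparse.model.matrix(~ . - 1, data = all_vars)
n_tr = nrow(amazon_Xtrain)
final_fit = cv.glmnet(x = all_vars[1:n_tr, ], y = amazon_Ytrain[, 1],
                      family = "binomial", type.measure = "auc")
pred = predict(final_fit, newx = all_vars[-(1:n_tr), ],
               s = "lambda.min", type = "response")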