dylanjf · December 18, 2015 11:19
diff --git a/gistfile1.txt b/gistfile1.txt
 # Align the dummy-coded columns of the test set with those of the training
 # set, so that both sparse matrices expose exactly the same features in
 # exactly the same order.

 # Placeholder response column. Without it the test matrix would have no
 # ACTION1 column, and the intersection below would then drop ACTION1 from
 # the training matrix as well. The original author advises keeping it at
 # the same column position as in the training data, since term order
 # determines the order of the generated columns.
 amazon_test$ACTION1 <- 1

 # Turn the test set into a sparse matrix. The training set is assumed to
 # be in this format already.
 amazon_test <- sparse.model.matrix(~ . - 1, data = amazon_test)

 # Column names present in BOTH matrices, in training-set order.
 # (match(x, table) returns, for each element of x, its position in table,
 # or NA when absent; intersect() expresses the same membership test
 # directly and yields the names themselves.)
 shared_cols <- intersect(colnames(amazon_train), colnames(amazon_test))

 # Drop the dummies that exist in training but not in test ...
 amazon_train <- amazon_train[, shared_cols, drop = FALSE]

 # ... and the dummies that exist in test but not in training. Indexing by
 # name (rather than by a per-matrix logical/position filter) also reorders
 # the test columns to match training; drop = FALSE keeps a matrix even if
 # only one column survives.
 amazon_test <- amazon_test[, shared_cols, drop = FALSE]

 # Should have the same number of columns now.
 dim(amazon_train)
 dim(amazon_test)
	# Make the training and test design matrices share one feature set: same
	# dummy columns, same column order.

	# Dummy response column for the test data. If it were missing, the test
	# matrix would lack ACTION1 and the name intersection further down would
	# remove ACTION1 from the training matrix too. Per the original author,
	# keep it at the same column position as in the training data (term
	# order drives the order of the generated columns).
	amazon_test$ACTION1 <- 1

	# Convert the test set to a sparse model matrix; the training set is
	# assumed to already be one.
	amazon_test <- sparse.model.matrix(~ . - 1, data = amazon_test)

	# Names found in both matrices, ordered as in the training matrix.
	# Note on match(): match(x, table) gives the position in `table` of each
	# element of `x` (NA if not found). intersect() performs that membership
	# test in one step and returns the common names.
	common_names <- intersect(colnames(amazon_train), colnames(amazon_test))

	# Remove training-only dummies.
	amazon_train <- amazon_train[, common_names, drop = FALSE]

	# Remove test-only dummies and, because the selection is by name, put
	# the remaining test columns in the training order. drop = FALSE avoids
	# collapsing to a vector when a single column remains.
	amazon_test <- amazon_test[, common_names, drop = FALSE]

	# Should have the same number of columns now.
	dim(amazon_train)
	dim(amazon_test)