geneorama · January 25, 2016 20:04
diff --git a/randomForest_factor_extrapolation.R b/randomForest_factor_extrapolation.R

 rm(list=ls())
 library(randomForest)

 ################################################################################
 ## EXAMPLE 1 (ver_1)
 ## Two data frames are created (test and train)
 ## They have their own independent factor levels, and some of the levels in 
 ## `test_ver1` do not appear in `train_ver1`
 ## 
 ## A model created on `train_ver1` will fail to predict on the previously 
 ## unseen factor levels.
 ##
 ################################################################################

 ## Training frame.  With stringsAsFactors = TRUE every column becomes a factor
 ## whose levels are only the values that occur in this frame.
 train_ver1 <- data.frame(x1 = c("f", "d", "b", "d"),
                          x2 = c("e", "c", "c", "d"),
                          y = c("yes", "yes", "no", "yes"),
                          stringsAsFactors = TRUE)

 ## Test frame, built independently, so its factor levels differ from train.
 test_ver1 <- data.frame(x1 = c("b", "unwanted_char"),  ## "unwanted_char" is not in train
                         x2 = c("a", "e"),              ## "a" is not in train
                         y = c("no", "yes"),
                         stringsAsFactors = TRUE)


 ## Fit a classification forest on the training frame only.
 forest_ver1 <- randomForest(y ~ x1 + x2,
                             data = train_ver1,
                             ntree = 500)
 ## Score `test_ver1`; this call is expected to stop with the error shown
 ## below because the test factors carry levels the forest never saw.
 predict(object = forest_ver1, newdata = test_ver1, type = 'response')
 # Error in predict.randomForest(object = forest_ver1, newdata = test_ver1,  : 
 #   New factor levels not present in the training data


 ################################################################################
 ## EXAMPLE 2 (ver_2)
 ## In most text books / papers / examples there is a single data.frame with
 ## known factor levels.  This single data.frame is split into test / train,
 ## so the factor levels match in both subsets.
 ##
 ## This is equivalent to taking the levels in train and test and applying them
 ## to both factors.
 ##
 ################################################################################


 ## Pool the levels that occur in either frame, separately for each predictor.
 (all_levels_x1 <- sort(union(levels(train_ver1$x1), levels(test_ver1$x1))))
 # [1] "b"             "d"             "f"             "unwanted_char"

 (all_levels_x2 <- sort(union(levels(train_ver1$x2), levels(test_ver1$x2))))
 # [1] "a" "c" "d" "e"

 ## Re-encode every factor against the pooled levels.
 ## `factor(x, levels = ...)` matches on the labels, so each observation keeps
 ## its value.  Assigning with `levels(x) <- ...` would instead rename the
 ## existing levels by position (train x2 "c" would silently become "a", and
 ## test x1 "unwanted_char" would silently become "d").
 train_ver2 <- train_ver1
 test_ver2 <- test_ver1
 train_ver2$x1 <- factor(train_ver1$x1, levels = all_levels_x1)
 train_ver2$x2 <- factor(train_ver1$x2, levels = all_levels_x2)
 test_ver2$x1 <- factor(test_ver1$x1, levels = all_levels_x1)
 test_ver2$x2 <- factor(test_ver1$x2, levels = all_levels_x2)

 ## Fit on the re-encoded training frame; the forest now records the pooled
 ## levels even though some of them have no training observations.
 forest_ver2 <- randomForest(y ~ x1 + x2,
                             data = train_ver2,
                             ntree = 500)
 predict(object = forest_ver2, newdata = test_ver2, type = 'response')
 # No "New factor levels" error is raised here.  Example output (the forest is
 # randomized and no seed is set, so the predicted labels may differ per run):
 #   1   2 
 #  no yes 
 # Levels: no yes


 ## The question is:
 ## How is the random forest model able to predict on factor levels that are
 ## not seen in the training data?



 ################################################################################
 ## Other related questions
 ## (This is what I thought you were asking until I re-read the question a few
 ## times)
 ################################################################################

 ## How do you make sure that your factor levels in test and train match?
 ## e.g. 
 ## You build a model on "a", "b", and "c":
 as.numeric(as.factor(c("a", "b", "c")))
 # [1] 1 2 3
 ## But you see "b" and "d" in production:
 as.numeric(as.factor(c("b", "d")))
 # [1] 1 2
 # How do you make sure that "b" is a "2"?
 # What should "d" be, "NA" or "4"?


 ## No matter how you handle "d", what is the implication?
 ## Suppose that "a", "b", and "c" are highly correlated with a positive 
 ## outcome, and NA is highly correlated with a negative outcome.  What if you
 ## want to treat newly observed levels as NA?
 ##
 ## Factors are the best and worst thing about R
 ##

	rm(list=ls())
	library(randomForest)

	################################################################################
	## EXAMPLE 1 (ver_1)
	## Two data frames are created (test and train)
	## They have their own independent factor levels, and some of the levels in
	## `test_ver1` do not appear in `train_ver1`
	##
	## A model created on `train_ver1` will fail to predict on the previously
	## unseen factor levels.
	##
	################################################################################

	## Training frame.  With stringsAsFactors = TRUE every column becomes a factor
	## whose levels are only the values that occur in this frame.
	train_ver1 <- data.frame(x1 = c("f", "d", "b", "d"),
	                         x2 = c("e", "c", "c", "d"),
	                         y = c("yes", "yes", "no", "yes"),
	                         stringsAsFactors = TRUE)

	## Test frame, built independently, so its factor levels differ from train.
	test_ver1 <- data.frame(x1 = c("b", "unwanted_char"), ## "unwanted_char" is not in train
	                        x2 = c("a", "e"), ## "a" is not in train
	                        y = c("no", "yes"),
	                        stringsAsFactors = TRUE)


	## Fit a classification forest on the training frame only.
	forest_ver1 <- randomForest(y ~ x1 + x2,
	                            data = train_ver1,
	                            ntree = 500)
	## Score `test_ver1`; this call is expected to stop with the error shown
	## below because the test factors carry levels the forest never saw.
	predict(object = forest_ver1, newdata = test_ver1, type = 'response')
	# Error in predict.randomForest(object = forest_ver1, newdata = test_ver1, :
	# New factor levels not present in the training data


	################################################################################
	## EXAMPLE 2 (ver_2)
	## In most text books / papers / examples there is a single data.frame with
	## known factor levels. This single data.frame is split into test / train,
	## so the factor levels match in both subsets.
	##
	## This is equivalent to taking the levels in train and test and applying them
	## to both factors.
	##
	################################################################################


	## Pool the levels that occur in either frame, separately for each predictor.
	(all_levels_x1 <- sort(union(levels(train_ver1$x1), levels(test_ver1$x1))))
	# [1] "b" "d" "f" "unwanted_char"

	(all_levels_x2 <- sort(union(levels(train_ver1$x2), levels(test_ver1$x2))))
	# [1] "a" "c" "d" "e"

	## Re-encode every factor against the pooled levels.
	## `factor(x, levels = ...)` matches on the labels, so each observation keeps
	## its value.  Assigning with `levels(x) <- ...` would instead rename the
	## existing levels by position (train x2 "c" would silently become "a", and
	## test x1 "unwanted_char" would silently become "d").
	train_ver2 <- train_ver1
	test_ver2 <- test_ver1
	train_ver2$x1 <- factor(train_ver1$x1, levels = all_levels_x1)
	train_ver2$x2 <- factor(train_ver1$x2, levels = all_levels_x2)
	test_ver2$x1 <- factor(test_ver1$x1, levels = all_levels_x1)
	test_ver2$x2 <- factor(test_ver1$x2, levels = all_levels_x2)

	## Fit on the re-encoded training frame; the forest now records the pooled
	## levels even though some of them have no training observations.
	forest_ver2 <- randomForest(y ~ x1 + x2,
	                            data = train_ver2,
	                            ntree = 500)
	predict(object = forest_ver2, newdata = test_ver2, type = 'response')
	# No "New factor levels" error is raised here.  Example output (the forest is
	# randomized and no seed is set, so the predicted labels may differ per run):
	# 1 2
	# no yes
	# Levels: no yes


	## The question is:
	## How is the random forest model able to predict on factor levels that are
	## not seen in the training data?



	################################################################################
	## Other related questions
	## (This is what I thought you were asking until I re-read the question a few
	## times)
	################################################################################

	## How do you make sure that your factor levels in test and train match?
	## e.g.
	## You build a model on "a", "b", and "c":
	as.numeric(as.factor(c("a", "b", "c")))
	# [1] 1 2 3
	## But you see "b" and "d" in production:
	as.numeric(as.factor(c("b", "d")))
	# [1] 1 2
	# How do you make sure that "b" is a "2"?
	# What should "d" be, "NA" or "4"?


	## No matter how you handle "d", what is the implication?
	## Suppose that "a", "b", and "c" are highly correlated with a positive
	## outcome, and NA is highly correlated with a negative outcome. What if you
	## want to treat newly observed levels as NA?
	##
	## Factors are the best and worst thing about R
	##