Last active
January 25, 2016 20:04
-
-
Save geneorama/6aa6c343506c47b980f0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## NOTE: the workspace is deliberately NOT cleared here.
## Calling rm(list = ls()) from inside a script silently deletes every object
## in the caller's global environment when the file is source()d, and it does
## not reset loaded packages or options. Run this example in a fresh R session
## instead.
library(randomForest) | |
################################################################################ | |
## EXAMPLE 1 (ver_1) | |
## Two data frames are created (test and train) | |
## They have their own independent factor levels, and some of the levels in | |
## `test_ver1` do not appear in `train_ver1` | |
## | |
## A model created on `train_ver1` will fail to predict on the previously | |
## unseen factor levels. | |
## | |
################################################################################ | |
## Training data: two predictors and a response, each stored as a factor whose
## levels are exactly the values that occur in this data frame.
train_ver1 <- data.frame(
  x1 = factor(c("f", "d", "b", "d")),
  x2 = factor(c("e", "c", "c", "d")),
  y  = factor(c("yes", "yes", "no", "yes"))
)

## Test data, built independently so that it carries its own factor levels.
test_ver1 <- data.frame(
  x1 = factor(c("b", "unwanted_char")),  ## "unwanted_char" never occurs in train
  x2 = factor(c("a", "e")),              ## "a" never occurs in train
  y  = factor(c("no", "yes"))
)
## Fit a 500-tree forest on the training data only, so the model is built from
## just the factor levels that occur in `train_ver1`.
forest_ver1 <- randomForest(y ~ x1 + x2,
                            data = train_ver1,
                            ntree = 500)

## Predict on `test_ver1`. `newdata` must name `test_ver1`: no object called
## `test` is defined anywhere in this script.
## The call is expected to fail because `test_ver1` carries factor levels that
## are absent from the training data, so it is wrapped in try() to report the
## error without aborting the rest of the script when the file is source()d.
try(predict(object = forest_ver1, newdata = test_ver1, type = "response"))
# Error in predict.randomForest(object = forest_ver1, newdata = test_ver1, :
#   New factor levels not present in the training data
################################################################################ | |
## EXAMPLE 2 (ver_2) | |
## In most text books / papers / examples there is a single data.frame with | |
## known factor levels. This single data.frame is split into test / train, | |
## so the factor levels match in both subsets. | |
## | |
## This is equivalent to taking the union of the levels in train and test and
## applying that combined set of levels to the factors in both data frames.
## | |
################################################################################ | |
## Pool the levels found in either data frame into one sorted vector per
## predictor. The outer parentheses make R print each result when the lines
## are run interactively.
(all_levels_x1 <- sort(union(levels(train_ver1$x1), levels(test_ver1$x1))))
# [1] "b"             "d"             "f"             "unwanted_char"
(all_levels_x2 <- sort(union(levels(train_ver1$x2), levels(test_ver1$x2))))
# [1] "a" "c" "d" "e"
## Give both data frames the same, complete set of levels for each predictor.
##
## The factors are rebuilt with factor(x, levels = ...), which matches values
## to levels by label, so every observation keeps its original value and the
## extra levels are simply added as unused levels.
##
## Do NOT use `levels(x) <- all_levels` for this: that assignment renames the
## existing levels by position. With the data above it would turn
## "unwanted_char" (the 2nd level of test_ver1$x1) into "d", and it would
## shift the train x2 values "c"/"d"/"e" to "a"/"c"/"d".
train_ver2 <- train_ver1
test_ver2 <- test_ver1
train_ver2$x1 <- factor(train_ver1$x1, levels = all_levels_x1)
train_ver2$x2 <- factor(train_ver1$x2, levels = all_levels_x2)
test_ver2$x1 <- factor(test_ver1$x1, levels = all_levels_x1)
test_ver2$x2 <- factor(test_ver1$x2, levels = all_levels_x2)
## Refit the same 500-tree model, this time on the training data whose
## predictor factors carry the combined level set.
forest_ver2 <- randomForest(
  y ~ x1 + x2,
  ntree = 500,
  data = train_ver2
)

## Train and test now declare the same factor levels, so this call returns
## predictions instead of stopping with an error.
predict(forest_ver2, newdata = test_ver2, type = "response")
## Output recorded by the author:
#   1   2
#  no yes
# Levels: no yes
## The question is:
## How is the random forest model able to predict on factor levels that were
## not seen in the training data?
################################################################################ | |
## Other related questions | |
## (This is what I thought you were asking until I re-read the question a few | |
## times) | |
################################################################################ | |
## How do you make sure that your factor levels in test and train match? | |
## e.g. | |
## as.numeric() on a factor exposes the integer code assigned to each value.
as.numeric(as.factor(c("a", "b", "c")))  ## You build a model on "a", "b", and "c"
# [1] 1 2 3
as.numeric(as.factor(c("b", "d")))       ## But you see "b" and "d" in production
# [1] 1 2
# "b" is coded 2 in the first factor but 1 in the second.
# How do you make sure that "b" is a "2"?
# What should "d" be, "NA" or "4"?
## No matter how you handle "d", what is the implication? | |
## Suppose that "a", "b", and "c" are highly correlated with a positive | |
## outcome, and NA is highly correlated with a negative outcome. What if you | |
## want to treat newly observed levels as NA? | |
## | |
## Factors are the best and worst thing about R | |
## | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment