Last active
January 19, 2019 04:15
-
-
Save philipp-baumann/40c20a578e6adfe55911a1f8813b93a9 to your computer and use it in GitHub Desktop.
Cubist: Special character values in factorial predictors
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Attach the packages used by this script. `invisible()` suppresses the list
# of search paths that lapply() would otherwise print at the console.
pkgs <- c("mlbench", "Cubist")
invisible(lapply(pkgs, library, character.only = TRUE))

# Example data set
data(BostonHousing)

# Test with only 2 factorial predictors (plus the numeric outcome `medv`)
boston_housing <- BostonHousing[, c("crim", "zn", "medv")]

# Convert numeric `crim` and `zn` to factors
boston_housing$zn <- as.factor(boston_housing$zn)
boston_housing$crim <- as.factor(boston_housing$crim)
## See https://www.rulequest.com/cubist-win.html for exceptions:
## Special characters (comma, colon, period, vertical bar `|`) can appear in
## names and values, but must be prefixed by the escape character `\`.
## For example, the name "Filch, Grabbit, and Co." would be written as
## `Filch\, Grabbit\, and Co\.`. (However, it is not necessary to escape
## colons in times and periods in numbers.)
# Test (0) Some special characters ---------------------------------------------
# Rename only the last level of each factor, so that exactly one level carries
# the characters under test and all other labels keep their original values.
levels(boston_housing$zn)[nlevels(boston_housing$zn)] <- "a@_è$?"
levels(boston_housing$crim)[nlevels(boston_housing$crim)] <- "a@_è$"

# Fine, works
mod_housing <- cubist(
  x = boston_housing[, -3],
  y = boston_housing$medv,
  committees = 10
)
# Test (1.1) including comma "," -----------------------------------------------
# Rename only the last level of each factor; the unescaped comma is the
# character under test.
levels(boston_housing$zn)[nlevels(boston_housing$zn)] <- "a@_è$?,"
levels(boston_housing$crim)[nlevels(boston_housing$crim)] <- "a@_è$"

# Error
# Out of bounds because of the extra comma in the data file: Cubist
# (RuleQuest) cannot parse the generated data file correctly because comma,
# colon, period and vertical bar `|` have special meaning.
mod_housing <- cubist(
  x = boston_housing[, -3],
  y = boston_housing$medv,
  committees = 10
)

# See what the generated ".data" and ".names" contents look like.
# The generators return the file content as character strings, so write it
# verbatim with writeLines(); write.table() would add a header, row names and
# quoting that are not part of what Cubist actually reads.
writeLines(
  Cubist:::makeDataFile(x = boston_housing[, -3], y = boston_housing$medv),
  con = "dataFile_small.txt"
)
writeLines(
  Cubist:::makeNamesFile(x = boston_housing[, -3], y = boston_housing$medv),
  con = "namesFile_small.txt"
)
# Test (1.2) escape the comma "," ----------------------------------------------
# Need to recreate boston_housing, otherwise RStudio crashes... strange
# -> probably because Cubist uses a lot of global variables and there was an
#    error before; maybe related to details of the C call and the garbage
#    collector.
boston_housing <- BostonHousing[, c("crim", "zn", "medv")]

# Convert numeric `crim` and `zn` to factors
boston_housing$zn <- as.factor(boston_housing$zn)
boston_housing$crim <- as.factor(boston_housing$crim)

# Rename only the last level of each factor. The R literal "\\," is the two
# characters `\,`, i.e. the comma prefixed by the Cubist escape character.
levels(boston_housing$zn)[nlevels(boston_housing$zn)] <- "a@_è$?\\,"
levels(boston_housing$crim)[nlevels(boston_housing$crim)] <- "a@_è$"

# Works!
mod_housing <- cubist(
  x = boston_housing[, -3],
  y = boston_housing$medv,
  committees = 10
)
# Test (2.1) including vertical bar ("|") --------------------------------------
# Rename only the last level of each factor, so the unescaped vertical bar is
# the only special character present in the level names.
levels(boston_housing$zn)[nlevels(boston_housing$zn)] <- "a@_è$?|"
levels(boston_housing$crim)[nlevels(boston_housing$crim)] <- "a@_è$"

# Fails!
mod_housing <- cubist(
  x = boston_housing[, -3],
  y = boston_housing$medv,
  committees = 10
)
# Test (2.2) escape "|" --------------------------------------------------------
# Start from fresh data because the previous cubist() call failed.
boston_housing <- BostonHousing[, c("crim", "zn", "medv")]

# Convert numeric `crim` and `zn` to factors
boston_housing$zn <- as.factor(boston_housing$zn)
boston_housing$crim <- as.factor(boston_housing$crim)

# Rename only the last level of each factor. The R literal "\\|" is the two
# characters `\|`, i.e. the vertical bar prefixed by the escape character.
levels(boston_housing$zn)[nlevels(boston_housing$zn)] <- "a@_è$?\\|"
levels(boston_housing$crim)[nlevels(boston_housing$crim)] <- "a@_è$"

# Probably does not work because "|" has special meaning and indicates
# comments in Cubist files.
# No solution
mod_housing <- cubist(
  x = boston_housing[, -3],
  y = boston_housing$medv,
  committees = 10
)
# Test (3.1) including the period (".") ----------------------------------------
# Start from fresh data, as done for the other tests that follow a cubist()
# call which may have failed.
boston_housing <- BostonHousing[, c("crim", "zn", "medv")]

# Convert numeric `crim` and `zn` to factors
boston_housing$zn <- as.factor(boston_housing$zn)
boston_housing$crim <- as.factor(boston_housing$crim)

# Rename only the last level of each factor; the unescaped period is the
# character under test.
levels(boston_housing$zn)[nlevels(boston_housing$zn)] <- "a@_è$?."
levels(boston_housing$crim)[nlevels(boston_housing$crim)] <- "a@_è$"

# Fails!
mod_housing <- cubist(
  x = boston_housing[, -3],
  y = boston_housing$medv,
  committees = 10
)
# Test (3.2) escape "." --------------------------------------------------------
# Start from fresh data because the previous cubist() call failed.
boston_housing <- BostonHousing[, c("crim", "zn", "medv")]

# Convert numeric `crim` and `zn` to factors
boston_housing$zn <- as.factor(boston_housing$zn)
boston_housing$crim <- as.factor(boston_housing$crim)

# Rename only the last level of each factor. The R literal "\\." is the two
# characters `\.`, i.e. the period prefixed by the escape character.
levels(boston_housing$zn)[nlevels(boston_housing$zn)] <- "a@_è$?\\."
levels(boston_housing$crim)[nlevels(boston_housing$crim)] <- "a@_è$"

# Escaping does not work
mod_housing <- cubist(
  x = boston_housing[, -3],
  y = boston_housing$medv,
  committees = 10
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment