Skip to content

Instantly share code, notes, and snippets.

@philipp-baumann
Last active January 19, 2019 04:15
Show Gist options
  • Save philipp-baumann/40c20a578e6adfe55911a1f8813b93a9 to your computer and use it in GitHub Desktop.
Save philipp-baumann/40c20a578e6adfe55911a1f8813b93a9 to your computer and use it in GitHub Desktop.
Cubist: Special character values in factorial predictors
# Attach the packages used throughout this script
pkgs <- c("mlbench", "Cubist")
lapply(X = pkgs, FUN = library, character.only = TRUE)
# Load the example data set
data("BostonHousing")
# Keep the test case small: two predictors plus the response
boston_housing <- BostonHousing[, c("crim", "zn", "medv")]
# Turn the numeric predictors `zn` and `crim` into factors
boston_housing[c("zn", "crim")] <- lapply(
  boston_housing[c("zn", "crim")], as.factor
)
## See https://www.rulequest.com/cubist-win.html for exceptions
## Special characters (comma, colon, period, vertical bar `|') can appear in
## names and values, but must be prefixed by the escape character `\'.
## For example, the name "Filch, Grabbit, and Co." would be written as `Filch\,
## Grabbit\, and Co\.'. (However, it is not necessary to escape colons in times
## and periods in numbers.)
# Test (0) Some special characters ---------------------------------------------
# Relabel the levels: every level takes over the label of its successor and
# the last level gets a label made of special characters that are not among
# the reserved ones (comma, colon, period, vertical bar)
levels(boston_housing$zn) <- c(
  tail(levels(boston_housing$zn), -1L), "a@_è$?"
)
levels(boston_housing$crim) <- c(
  tail(levels(boston_housing$crim), -1L), "a@_è$"
)
# Result: the model fits without problems
mod_housing <- cubist(
  x = boston_housing[, -3L],
  y = boston_housing[["medv"]],
  committees = 10
)
# Test (1.1) including comma "," -----------------------------------------------
levels(boston_housing$zn) <- c(levels(boston_housing$zn)[-1], "a@_è$?,")
levels(boston_housing$crim) <- c(levels(boston_housing$crim)[-1], "a@_è$")
# Expected to fail with an out-of-bounds error: the unescaped comma adds an
# extra field to the generated data file, and Cubist cannot parse that file
# correctly because comma, colon, period and vertical bar `|` have special
# meaning.
# The error is caught and reported so that the file dump below still runs when
# the script is sourced; on failure `mod_housing` keeps its previous value.
tryCatch(
  {
    mod_housing <- cubist(x = boston_housing[, -3], y = boston_housing$medv,
      committees = 10)
  },
  error = function(e) message("Test (1.1) failed: ", conditionMessage(e))
)
# Inspect what the ".data" and ".names" content handed to Cubist looks like.
# The helpers return the file content as text (per the Cubist sources), so it
# is written verbatim; write.table() would add quotes, a header and a row name.
writeLines(
  Cubist:::makeDataFile(x = boston_housing[, -3], y = boston_housing$medv),
  con = "dataFile_small.txt"
)
writeLines(
  Cubist:::makeNamesFile(x = boston_housing[, -3], y = boston_housing$medv),
  con = "namesFile_small.txt"
)
# Test (1.2) escape the comma "," ----------------------------------------------
# Start again from a fresh copy of the data; without this step RStudio crashed
# (strangely). Possibly related to the error in the previous run combined with
# Cubist's heavy use of global variables, or to details of the C call and the
# garbage collector.
boston_housing <- BostonHousing[, c("crim", "zn", "medv")]
# Turn the numeric predictors `zn` and `crim` into factors
boston_housing[c("zn", "crim")] <- lapply(
  boston_housing[c("zn", "crim")], as.factor
)
# Same relabelling as before, but the comma is prefixed with a backslash
levels(boston_housing$zn) <- c(
  tail(levels(boston_housing$zn), -1L), "a@_è$?\\,"
)
levels(boston_housing$crim) <- c(
  tail(levels(boston_housing$crim), -1L), "a@_è$"
)
# Result: works!
mod_housing <- cubist(
  x = boston_housing[, -3L],
  y = boston_housing[["medv"]],
  committees = 10
)
# Test (2.1) including vertical bar ("|") --------------------------------------
# Start from fresh data: the relabelling below only drops the first level and
# appends a new one, so labels added by earlier tests would otherwise remain
# in `zn` and this test would not be isolated.
boston_housing <- BostonHousing[, c("crim", "zn", "medv")]
# Convert numeric `crim` and `zn` to factors
boston_housing$zn <- as.factor(boston_housing$zn)
boston_housing$crim <- as.factor(boston_housing$crim)
levels(boston_housing$zn) <- c(levels(boston_housing$zn)[-1], "a@_è$?|")
levels(boston_housing$crim) <- c(levels(boston_housing$crim)[-1], "a@_è$")
# Expected to fail. The error is caught and reported so that a sourced script
# can continue; on failure `mod_housing` keeps its previous value.
tryCatch(
  {
    mod_housing <- cubist(x = boston_housing[, -c(3)], y = boston_housing$medv,
      committees = 10)
  },
  error = function(e) message("Test (2.1) failed: ", conditionMessage(e))
)
# Test (2.2) escape "|" --------------------------------------------------------
# Start from fresh data so that only the escaped vertical bar is under test
boston_housing <- BostonHousing[, c("crim", "zn", "medv")]
# Convert numeric `crim` and `zn` to factors
boston_housing$zn <- as.factor(boston_housing$zn)
boston_housing$crim <- as.factor(boston_housing$crim)
levels(boston_housing$zn) <- c(levels(boston_housing$zn)[-1], "a@_è$?\\|")
levels(boston_housing$crim) <- c(levels(boston_housing$crim)[-1], "a@_è$")
# Probably does not work because "|" has a special meaning and introduces
# comments in Cubist files. No solution found so far.
# The error is caught and reported so that a sourced script can continue; on
# failure `mod_housing` keeps its previous value.
tryCatch(
  {
    mod_housing <- cubist(x = boston_housing[, -c(3)], y = boston_housing$medv,
      committees = 10)
  },
  error = function(e) message("Test (2.2) failed: ", conditionMessage(e))
)
# Test (3.1) including the period (".") ----------------------------------------
# Start from fresh data: the relabelling below only drops the first level and
# appends a new one, so special-character labels added by earlier tests would
# otherwise remain in `zn` and a failure could not be attributed to the period.
boston_housing <- BostonHousing[, c("crim", "zn", "medv")]
# Convert numeric `crim` and `zn` to factors
boston_housing$zn <- as.factor(boston_housing$zn)
boston_housing$crim <- as.factor(boston_housing$crim)
levels(boston_housing$zn) <- c(levels(boston_housing$zn)[-1], "a@_è$?.")
levels(boston_housing$crim) <- c(levels(boston_housing$crim)[-1], "a@_è$")
# Expected to fail. The error is caught and reported so that a sourced script
# can continue; on failure `mod_housing` keeps its previous value.
tryCatch(
  {
    mod_housing <- cubist(x = boston_housing[, -c(3)], y = boston_housing$medv,
      committees = 10)
  },
  error = function(e) message("Test (3.1) failed: ", conditionMessage(e))
)
# Test (3.2) escape "." --------------------------------------------------------
# Start from fresh data, as the other escape tests do. The relabelling below
# only drops the first level and appends a new one, so without a reset the
# unescaped "." label of the preceding test would still be among the levels.
boston_housing <- BostonHousing[, c("crim", "zn", "medv")]
# Convert numeric `crim` and `zn` to factors
boston_housing$zn <- as.factor(boston_housing$zn)
boston_housing$crim <- as.factor(boston_housing$crim)
levels(boston_housing$zn) <- c(levels(boston_housing$zn)[-1], "a@_è$?\\.")
levels(boston_housing$crim) <- c(levels(boston_housing$crim)[-1], "a@_è$")
# Originally noted: escaping does not work.
# NOTE(review): that observation was made without resetting the data first;
# re-verify the outcome with the clean data used here.
# Any error is caught and reported so that a sourced script can continue; on
# failure `mod_housing` keeps its previous value.
tryCatch(
  {
    mod_housing <- cubist(x = boston_housing[, -c(3)], y = boston_housing$medv,
      committees = 10)
  },
  error = function(e) message("Test (3.2) failed: ", conditionMessage(e))
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment