Last active
December 10, 2020 05:27
-
-
Save mrdwab/0e50d7601e99027d94451572bbaf7db4 to your computer and use it in GitHub Desktop.
Testing different approaches for replacing characters with integers. https://stackoverflow.com/q/65227663/1270695
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## fac2int(): encode the values of `x` as integer factor codes.
##
## With the default `labels`, each element of `x` is replaced by the position
## of its match in `levels` (1, 2, 3, ...). This is therefore only a valid
## substitute for a lookup table when the replacement values are exactly
## `seq_along(levels)`; in that case it avoids building character labels and
## converting them back, which should make it quite fast.
##
## Arguments mirror those of factor() and are forwarded to it by name:
##   x        vector of values to encode.
##   levels   the values to recognise, in code order.
##   labels, exclude, ordered, nmax
##            passed to factor() unchanged.
##
## Returns an integer vector the same length as `x`. Values that are not in
## `levels` (and NA, under the default `exclude = NA`) come back as NA.
fac2int <- function(x, levels, labels = levels, exclude = NA,
                    ordered = is.ordered(x), nmax = NA) {
  as.integer(factor(x, levels = levels, labels = labels, exclude = exclude,
                    ordered = ordered, nmax = nmax))
}
### DIFFERENT APPROACHES TO TEST
## Approach: convert every column to a factor whose levels are the lookup
## keys, then let data.matrix() collapse those factors to their integer codes.
##
##   data     data frame whose cells are to be replaced (defaults to the
##            global `df`, so the zero-argument call keeps working).
##   lookup   data frame with a `Group` column holding the keys (defaults to
##            the global `df2`).
##
## Returns `data` with every column replaced by integer codes. The codes are
## positions within `lookup$Group`, not `lookup$Value`; the two coincide only
## when `Value` is `seq_along(Group)`. Cells with no matching key become NA.
fun_datamatrix <- function(data = df, lookup = df2) {
  as_factors <- lapply(data, factor, levels = lookup$Group)
  data[] <- data.matrix(as.data.frame(as_factors))
  data
}
## Approach: flatten the whole data frame to one vector, look every cell up
## with a single match() call, and pour the replacement values back in.
##
##   data     data frame whose cells are to be replaced (defaults to the
##            global `df`, so the zero-argument call keeps working).
##   lookup   data frame with `Group` (keys) and `Value` (replacements)
##            columns (defaults to the global `df2`).
##
## Returns `data` with every cell replaced by the `Value` whose `Group`
## matches it; cells with no matching key become NA.
fun_match <- function(data = df, lookup = df2) {
  # use.names = FALSE: match() ignores names, so skip generating one name
  # per cell while flattening.
  keys <- unlist(data, use.names = FALSE)
  data[] <- lookup$Value[match(keys, lookup$Group)]
  data
}
## Approach: column by column, build a factor whose levels are the lookup
## keys and whose labels are the replacement values, then convert the labels
## back to integers via their character form.
##
##   data     data frame whose cells are to be replaced (defaults to the
##            global `df`, so the zero-argument call keeps working).
##   lookup   data frame with `Group` (keys) and `Value` (replacements)
##            columns (defaults to the global `df2`).
##
## Returns `data` with every column replaced by an integer vector; cells with
## no matching key become NA.
fun_lapply <- function(data = df, lookup = df2) {
  data[] <- lapply(data, function(column) {
    relabelled <- factor(column, levels = lookup$Group, labels = lookup$Value)
    # Go through character so we get the label text, not the factor code.
    as.integer(as.character(relabelled))
  })
  data
}
## Approach: same factor-relabelling idea as the per-column version, but done
## once on the whole data frame after converting it to a matrix.
##
##   data     data frame whose cells are to be replaced (defaults to the
##            global `df`, so the zero-argument call keeps working).
##   lookup   data frame with `Group` (keys) and `Value` (replacements)
##            columns (defaults to the global `df2`).
##
## Returns `data` with every cell replaced by the integer form of the matching
## `Value`; cells with no matching key become NA.
fun_matrix <- function(data = df, lookup = df2) {
  relabelled <- factor(as.matrix(data), levels = lookup$Group,
                       labels = lookup$Value)
  # Go through character so we get the label text, not the factor code.
  data[] <- as.integer(as.character(relabelled))
  data
}
## Approach: dplyr::mutate() + across(), applying a match()-based lookup to
## every column.
##
##   data     data frame whose cells are to be replaced (defaults to the
##            global `df`, so the zero-argument call keeps working).
##   lookup   data frame with `Group` (keys) and `Value` (replacements)
##            columns (defaults to the global `df2`).
##
## Returns `data` with every cell replaced by the `Value` whose `Group`
## matches it; cells with no matching key become NA.
fun_dplyr <- function(data = df, lookup = df2) {
  recode_column <- function(column) lookup$Value[match(column, lookup$Group)]
  # Select the columns explicitly: calling across() without `.cols` is
  # deprecated in current dplyr. Namespace-qualified so the function does not
  # depend on dplyr being attached.
  dplyr::mutate(data, dplyr::across(dplyr::everything(), recode_column))
}
## Approach: apply fac2int() to each column, encoding every cell as its
## position within the lookup keys.
##
##   data     data frame whose cells are to be replaced (defaults to the
##            global `df`, so the zero-argument call keeps working).
##   lookup   data frame with a `Group` column holding the keys (defaults to
##            the global `df2`).
##
## Returns `data` with every column replaced by integer codes. The codes are
## positions within `lookup$Group`, not `lookup$Value`; the two coincide only
## when `Value` is `seq_along(Group)`. Cells with no matching key become NA.
fun_fac2int <- function(data = df, lookup = df2) {
  data[] <- lapply(data, fac2int, levels = lookup$Group)
  data
}
### MAKE SOME LARGER SAMPLE DATA
## Dimensions of the sample data. Note that these plain variables share their
## names with base::nrow()/base::ncol(); function calls such as nrow(x) are
## unaffected because R skips non-function bindings when looking up a call.
nrow <- 10000L
ncol <- 100L
set.seed(1)
## Every column is drawn independently: pick 20 of the 52 letters, add NA as
## a 21st possible value, then sample `nrow` cells from that pool with
## replacement. Columns are named V1, V2, ..., V<ncol>.
df <- setNames(
  data.frame(
    replicate(
      ncol,
      sample(c(sample(c(letters, LETTERS), 20), NA), nrow, replace = TRUE),
      simplify = FALSE
    ),
    stringsAsFactors = FALSE
  ),
  paste0("V", seq_len(ncol))
)
## Lookup table: each letter (Group) maps to its position 1..52 (Value).
## Because Value is exactly seq_along(Group), the approaches that return
## factor codes give the same answer as the ones that look up Value.
df2 <- data.frame(Group = c(letters, LETTERS), Value = 1:52,
                  stringsAsFactors = FALSE)
### BENCHMARK THE DIFFERENT APPROACHES
bench::mark(
  fun_datamatrix(),
  fun_match(),
  fun_lapply(),
  fun_matrix(),
  fun_dplyr(),
  fun_fac2int()
)
# # A tibble: 6 x 13
#   expression            min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc
#   <bch:expr>       <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>
# 1 fun_datamatrix() 809.35ms 809.35ms     1.24     77.5MB     4.94     1     4
# 2 fun_match()         1.86s    1.86s     0.538   103.6MB     1.62     1     3
# 3 fun_lapply()     689.02ms 689.02ms     1.45     38.9MB     1.45     1     1
# 4 fun_matrix()     915.66ms 915.66ms     1.09    111.4MB     3.28     1     3
# 5 fun_dplyr()       284.5ms 326.96ms     3.06     19.8MB     1.53     2     1
# 6 fun_fac2int()     99.83ms 184.53ms     5.97     31.3MB     4.48     4     3
# # … with 5 more variables: total_time <bch:tm>, result <list>, memory <list>,
# #   time <list>, gc <list>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment