Last active
December 6, 2020 00:01
-
-
Save mrdwab/97243716dd7a0c11f395820a28b92e10 to your computer and use it in GitHub Desktop.
Testing the different options shared at https://stackoverflow.com/q/65151555/1270695
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## SETUP: Sample data and packages
library(data.table)
library(readr)
library(dplyr)
library(iotools)

# Number of rows in the sample table.
n <- 5000
set.seed(1)

# How many values each row's string will hold (drawn from 1..2000).
vals_row <- sample(2000, n, TRUE)

# `vals` holds one string per row: integers drawn from 1..100 joined by ";".
# vapply() pins the result to a character vector, whereas sapply() would
# silently return a list for n = 0.
DT <- data.table(
  ID = seq_len(n),
  vals = vapply(
    vals_row,
    function(x) paste(sample(100, x, TRUE), collapse = ";"),
    character(1)
  )
)

# Replace `vals` with NA in a random 5% of the rows so that every approach
# under test has to deal with missing input.
DT[sample(n, n * .05), vals := NA]
## FUNCTIONS TO TEST
### Custom function to split, sum, and add the column back into the data.table
# Sum the numbers stored in one ";"-delimited string.
#
# Only the first element of the split result is used, so this is meant to be
# called with a single string at a time. The pieces are converted to double
# and summed without `na.rm`.
split_sum <- function(string) {
  pieces <- stringr::str_split(string, ";")[[1]]
  sum(as.double(pieces))
}
# Row-by-row approach: apply split_sum() to every entry of DT$vals and return
# DT with `vals` replaced by the resulting sums.
#
# Reads the global `DT` and takes no arguments. map_dbl() is namespace-
# qualified because none of the packages attached by this script export it.
fun_baraliuh <- function() {
  DT %>%
    mutate(vals = purrr::map_dbl(vals, split_sum))
}
### fread, forced to read a specified number of columns
# `col` is a character vector of delimited integers and `sep` its delimiter.
# A header line with one name per field of the longest entry is prepended so
# that fread() creates that many columns; shorter rows are filled with NA and
# the row sums ignore those NAs.
fun_a5 <- function(col, sep) {
  n_fields <- max(stringi::stri_count_fixed(col, sep), na.rm = TRUE) + 1
  header <- paste0("V_", sequence(n_fields), collapse = sep)
  parsed <- fread(text = c(header, col), sep = sep, fill = TRUE, header = TRUE)
  rowSums(parsed, na.rm = TRUE)
}
### read_delim, forced to read a specified number of columns as a specific type
# `col` is a character vector of delimited integers and `sep` its delimiter.
# Returns one sum per element of `col`, ignoring missing fields.
fun_dave <- function(col, sep) {
  cols <- max(stringi::stri_count_fixed(col, sep), na.rm = TRUE) + 1
  header <- paste0("V_", sequence(cols), collapse = sep)
  # Pass the data as one newline-joined string wrapped in I(): readr >= 2.0
  # treats a bare character vector of length > 1 as a set of file paths, while
  # a single string containing a newline is literal data in older versions too.
  literal <- I(paste(c(header, col), collapse = "\n"))
  parsed <- read_delim(
    literal,
    delim = sep,  # honour the caller's delimiter instead of a fixed ";"
    col_names = TRUE,
    col_types = paste(rep("i", cols), collapse = "")
  )
  rowSums(parsed, na.rm = TRUE)
}
### read.table, forced to read a specified number of columns and with other
### arguments specified for optimization of reading speed
# `col` is a character vector of delimited integers and `sep` its delimiter.
# The prepended header line fixes the number of columns, every column is read
# as integer, and rows with fewer fields are padded with NA before summing.
fun_base <- function(col, sep) {
  n_fields <- max(stringi::stri_count_fixed(col, sep), na.rm = TRUE) + 1
  header <- paste0("V_", sequence(n_fields), collapse = sep)
  parsed <- read.table(
    text = c(header, col),
    sep = sep,
    fill = TRUE,
    header = TRUE,
    blank.lines.skip = FALSE,
    colClasses = "integer"
  )
  rowSums(parsed, na.rm = TRUE)
}
### dstrsplit, forced to read a specific number of columns
# `col` is a character vector of delimited integers and `sep` its delimiter.
# One "integer" column type is requested per field of the longest entry, and
# the row sums skip NA cells.
fun_iotools_d <- function(col, sep) {
  n_fields <- max(stringi::stri_count_fixed(col, sep), na.rm = TRUE) + 1
  field_types <- rep("integer", n_fields)
  parsed <- dstrsplit(col, col_types = field_types, sep = sep)
  rowSums(parsed, na.rm = TRUE)
}
### mstrsplit, forced to read a specified number of columns
# `col` is a character vector of delimited integers and `sep` its delimiter.
# Returns one sum per element of `col`, ignoring missing fields.
fun_iotools_m <- function(col, sep) {
  cols <- max(stringi::stri_count_fixed(col, sep), na.rm = TRUE) + 1
  # Split on the caller's delimiter; a fixed ";" here would disagree with the
  # field count computed from `sep` above.
  parsed <- mstrsplit(col, sep = sep, type = "integer", ncol = cols)
  rowSums(parsed, na.rm = TRUE)
}
## BENCHMARKING
# Time all six approaches on the same column. `check = FALSE` turns off
# bench::mark()'s comparison of results: fun_baraliuh() returns the modified
# table while the other functions return a vector of row sums.
bench::mark(
  fun_baraliuh(),
  fun_a5(DT$vals, ";"),
  fun_base(DT$vals, ";"),
  fun_dave(DT$vals, ";"),
  fun_iotools_d(DT$vals, ";"),
  fun_iotools_m(DT$vals, ";"),
  check = FALSE
)
# # A tibble: 6 x 13
# expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc
# <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl>
# 1 fun_baraliuh() 1.06s 1.06s 0.939 71.9MB 0 1 0
# 2 fun_a5(DT$vals, ";") 268.64ms 280.84ms 3.56 151.7MB 1.78 2 1
# 3 fun_base(DT$vals, ";") 622.49ms 622.49ms 1.61 192.1MB 1.61 1 1
# 4 fun_dave(DT$vals, ";") 692.94ms 692.94ms 1.44 89.5MB 1.44 1 1
# 5 fun_iotools_d(DT$vals, ";") 528.12ms 528.12ms 1.89 77MB 0 1 0
# 6 fun_iotools_m(DT$vals, ";") 281.8ms 285.08ms 3.51 38.2MB 0 2 0
# # … with 5 more variables: total_time <bch:tm>, result <list>, memory <list>, time <list>,
# # gc <list>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment