Skip to content

Instantly share code, notes, and snippets.

@oseiskar
Last active September 18, 2019 20:29
Show Gist options
  • Save oseiskar/c8117c5eed1c4522f7e0 to your computer and use it in GitHub Desktop.
Save oseiskar/c8117c5eed1c4522f7e0 to your computer and use it in GitHub Desktop.
library(data.table)
library(microbenchmark)
#' Benchmark `fwrite` against `write.csv` on a synthetic table.
#'
#' Builds a data.table with `nrows` rows and `ncols` generated columns, writes
#' it to a temporary file with each writer three times, and compares the mean
#' elapsed times.
#'
#' @param nrows Number of rows in the generated table.
#' @param ncols Number of generated columns (`col1`, `col2`, ...).
#' @param quote Forwarded as the `quote` argument of both writers.
#' @param col.type One of 'character', 'double', 'integer', 'factor' or
#'   'mixed'. 'mixed' cycles through the four types column by column. A factor
#'   value is accepted and converted to its label.
#' @return A list with `speedup` (mean `write.csv` time divided by mean
#'   `fwrite` time) and `fwrite_time` (mean `fwrite` time, in seconds).
performanceTest <- function(nrows = 100000, ncols = 400, quote = TRUE,
                            col.type = "character") {
  # Callers may hand over a factor (e.g. a cell of an expand.grid() table).
  # Indexing a list with a factor uses its integer codes, not its labels, so
  # normalise to a plain string before it is used as a list name.
  col.type <- as.character(col.type)
  if (length(col.type) != 1L || is.na(col.type)) {
    stop("col.type must be a single non-missing value", call. = FALSE)
  }

  message(paste0("generating test table, col type ", col.type))

  # seq_len() keeps the table empty for nrows = 0 (1:0 would yield c(1, 0)).
  x <- seq_len(nrows)
  # col1 is a placeholder; it is overwritten by the first generated column.
  dt <- data.table(col1 = x)

  col_generators <- list(
    character = function(x) paste0("char", x),
    double = function(x) as.double(x / 0.9),
    integer = function(x) x,
    factor = function(x) as.factor(paste0("factor", x %% 1000))
  )

  if (col.type != "mixed") {
    # Fail early with a readable message instead of a late
    # "attempt to apply non-function" from inside the column loop.
    if (!col.type %in% names(col_generators)) {
      stop(
        "unknown col.type '", col.type, "'; expected one of: ",
        paste(c(names(col_generators), "mixed"), collapse = ", "),
        call. = FALSE
      )
    }
    col_generators <- col_generators[col.type]
  }

  # Cycle through the available generators, one per column.
  n_generators <- length(col_generators)
  for (col in seq_len(ncols)) {
    col_generator <- col_generators[[((col - 1L) %% n_generators) + 1L]]
    dt[, (paste0("col", col)) := col_generator(x)]
  }

  message(paste0("dt of size ", nrow(dt), "x", ncol(dt), " quoting ", quote))

  # Time a single write of `dt` to a fresh temporary file, in seconds.
  time_it <- function(func) {
    f <- tempfile()
    # Clean up even when the writer fails, so aborted runs leave no files.
    on.exit(unlink(f), add = TRUE)
    t0 <- get_nanotime()
    func(dt, f)
    (get_nanotime() - t0) * 1e-9
  }

  alternatives <- list(
    fwrite = function(dt, f) {
      fwrite(dt, f, quote = quote)
    },
    write.csv = function(dt, f) {
      write.csv(dt, f, quote = quote, row.names = FALSE)
    }
  )

  n_rounds <- 3
  # One row per round, one column per writer.
  results <- rbindlist(
    lapply(seq_len(n_rounds), function(i) {
      timings <- list()
      for (alt in names(alternatives)) {
        message(alt, " ", i)
        timings[[alt]] <- time_it(alternatives[[alt]])
      }
      timings
    })
  )

  means <- results[, lapply(.SD, mean)][, lapply(.SD, as.numeric)]
  # Extract by name so the lookup cannot be confused with the functions
  # fwrite() / write.csv() of the same name.
  fwrite_time <- means[["fwrite"]]
  speedup <- means[["write.csv"]] / fwrite_time

  list(speedup = speedup, fwrite_time = fwrite_time)
}
#' Run `performanceTest` for every column type / quoting combination.
#'
#' @return A data.table with one row per combination of `col.type`
#'   ('character', 'double', 'integer', 'factor', 'mixed') and `quote`
#'   (TRUE, FALSE), plus the measured `speedup` and `fwrite_time` columns
#'   taken from the result of `performanceTest`.
performanceTable <- function() {
  tab <- data.table(expand.grid(list(
    col.type = c("character", "double", "integer", "factor", "mixed"),
    quote = c(TRUE, FALSE)
  )))

  for (j in seq_len(nrow(tab))) {
    r <- performanceTest(
      # expand.grid() stores strings as factors by default; pass the label,
      # not the factor, so the callee never indexes by integer level code.
      col.type = as.character(tab[["col.type"]][j]),
      quote = tab[["quote"]][j]
    )
    tab[j, speedup := r$speedup]
    tab[j, fwrite_time := r$fwrite_time]
  }

  tab
}
@jangorecki
Copy link

@oseiskar Sys.time() is not a reliable way to time this; for the reason why, see the gc argument in ?system.time.

@oseiskar
Copy link
Author

Ran again with this version:

col.type quote speedup fwrite_time (s)
character TRUE 2.03 8.941
double TRUE 3.27 17.885
integer TRUE 4.33 4.943
factor TRUE 2.31 8.409
mixed TRUE 2.99 10.181
character FALSE 2.15 7.594
factor FALSE 2.33 7.643
mixed FALSE 3.027 9.749

@oseiskar
Copy link
Author

Updated code to use microbenchmark::get_nanotime() (thanks @MichaelChirico).
Results are similar.

col.type quote speedup fwrite_time (s)
character TRUE 2.00 8.91
double TRUE 3.25 17.85
integer TRUE 4.35 4.92
factor TRUE 2.21 8.5
mixed TRUE 2.91 10.31
character FALSE 2.17 7.57
factor FALSE 2.30 7.75
mixed FALSE 3.03 9.68

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment