Last active
March 3, 2024 07:46
-
-
Save shackett/95c0e5d01882f89779594fc7ceb150c9 to your computer and use it in GitHub Desktop.
Parallelizing fread for fast file reading -- split a file into mc.cores pieces and then aggregate them
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Read a delimited text file in parallel with `data.table::fread()`
#'
#' Counts the lines of `path`, divides the data rows into at most `mc.cores`
#' contiguous chunks, reads every chunk in its own worker through
#' `parallel::mclapply()` and row-binds the pieces with `dplyr::bind_rows()`.
#'
#' The split is purely line based, so a quoted field containing an embedded
#' newline will be cut across chunks. Each chunk also has its column types
#' guessed independently unless `colClasses` is supplied through `...`.
#'
#' @param path Path to an existing file.
#' @param mc.cores Single number >= 1: the maximum number of chunks/workers.
#'   Fewer chunks are used when the file has fewer data rows than `mc.cores`.
#' @param header `TRUE` when the first line of the file holds column names.
#' @param ... Extra arguments. Those whose names match a formal argument of
#'   `data.table::fread()` are forwarded to every read; the rest are ignored.
#'   `skip` and `nrows` are reserved for the chunking and raise an error.
#' @return The row-bound chunks, carrying the header's column names when
#'   `header` is `TRUE`.
parallel_fread <- function(path, mc.cores = parallel::detectCores(), header = TRUE, ...) {
  stopifnot(file.exists(path))
  stopifnot(
    is.numeric(mc.cores),
    length(mc.cores) == 1L,
    !is.na(mc.cores),
    mc.cores >= 1
  )

  dots <- list(...)

  # The guard has to look at argument *names*; the chunking below supplies
  # `skip` and `nrows` itself, so user-provided values would collide.
  reserved <- intersect(c("skip", "nrows"), names(dots))
  if (length(reserved) > 0) {
    stop(paste(reserved, collapse = " & "), " cannot be provided", call. = FALSE)
  }
  fread_args <- dots[intersect(names(formals(data.table::fread)), names(dots))]

  file_rows <- .count_file_lines(path)
  header_lines <- if (header) 1 else 0
  data_rows <- file_rows - header_lines

  # Nothing to split (empty or header-only file): a single plain read.
  if (data_rows < 1) {
    return(do.call(
      data.table::fread,
      c(list(input = path, header = header), fread_args)
    ))
  }

  # Never create more chunks than data rows, otherwise some chunks would be
  # empty and their `skip` would point at or past the end of the file.
  n_chunks <- min(floor(mc.cores), data_rows)

  # Integer division yields strictly increasing line offsets that begin right
  # after the header and finish on the last line of the file.
  breaks <- header_lines + (seq.int(0, n_chunks) * data_rows) %/% n_chunks
  chunk_skip <- breaks[-length(breaks)]
  chunk_nrows <- diff(breaks)

  chunks <- parallel::mclapply(
    seq_len(n_chunks),
    function(i) {
      do.call(
        data.table::fread,
        c(
          list(
            input = path,
            skip = chunk_skip[i],
            nrows = chunk_nrows[i],
            # Without this fread may auto-detect the first row of a chunk as
            # a header and silently drop it from the data.
            header = FALSE
          ),
          fread_args
        )
      )
    },
    mc.cores = n_chunks
  )

  # mclapply hands back failures as "try-error" objects instead of raising.
  failed <- vapply(chunks, inherits, logical(1), what = "try-error")
  if (any(failed)) {
    stop(
      "reading chunk(s) ", paste(which(failed), collapse = ", "), " failed: ",
      as.character(chunks[[which(failed)[1]]]),
      call. = FALSE
    )
  }

  output <- dplyr::bind_rows(chunks)

  if (header) {
    # Let fread parse the header line with the same options (sep, quote, ...)
    # as the data; nrows = 0 reads the column names only.
    header_names <- names(do.call(
      data.table::fread,
      c(list(input = path, nrows = 0L, header = TRUE), fread_args)
    ))
    if (length(header_names) == ncol(output)) {
      colnames(output) <- header_names
    } else {
      warning(
        "header has ", length(header_names), " field(s) but the data has ",
        ncol(output), " column(s); column names were left unchanged",
        call. = FALSE
      )
    }
  }

  output
}

# Count the lines of a text file with `wc -l`, adding one when the final line
# is not newline-terminated (wc counts newline characters, not lines).
.count_file_lines <- function(path) {
  # Expand "~" in R because the shell will not once the path is quoted.
  full_path <- path.expand(path)
  wc_out <- system2("wc", args = c("-l", shQuote(full_path)), stdout = TRUE)
  n_newlines <- suppressWarnings(
    as.numeric(strsplit(trimws(wc_out[1]), "[[:space:]]+")[[1]][1])
  )
  if (length(n_newlines) != 1L || is.na(n_newlines)) {
    stop("could not count the lines of ", path, " with `wc -l`", call. = FALSE)
  }

  size <- file.size(full_path)
  if (is.na(size) || size == 0) {
    return(n_newlines)
  }

  con <- file(full_path, open = "rb")
  on.exit(close(con), add = TRUE)
  seek(con, where = size - 1, origin = "start")
  last_byte <- readBin(con, what = "raw", n = 1L)

  if (length(last_byte) == 1L && last_byte != as.raw(10L)) {
    n_newlines + 1
  } else {
    n_newlines
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment