Created
May 29, 2015 09:05
-
-
Save russellpierce/2f69fc2ac8d37ed724d9 to your computer and use it in GitHub Desktop.
Provided the right tools are installed, i.e. xz and pigz, will offload the compression handling to an external program and leave R free to do the data import. This ends up being quite a bit more efficient for large files. Some tweaks may be needed for operating systems other than Ubuntu; there may be additional dependencies on the github repo dr…
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(parallel) | |
# Serialize `object` to `file` as an xz-compressed RDS file.
#
# When the external `pxz` tool is usable, the serialized stream is piped to it
# so compression runs outside of R on `threads` threads. Otherwise this falls
# back to R's built-in single-threaded xz compression.
#
# Arguments:
#   object  - any R object accepted by base::saveRDS().
#   file    - path of the file to write (length-one character).
#   threads - number of threads handed to pxz via -T.
# Returns NULL invisibly.
saveRDS.xz <- function(object, file, threads = parallel::detectCores()) {
  # parallel::detectCores() may return NA; never pass "-TNA" to pxz.
  if (length(threads) != 1L || is.na(threads)) {
    threads <- 1L
  }

  # Look pxz up on the PATH before running it: system(..., intern = TRUE)
  # raises an R error for a missing command, which would otherwise prevent
  # the fallback below from ever being reached.
  pxz_avail <- nzchar(Sys.which("pxz")) &&
    tryCatch(
      any(grepl("XZ Utils", system("pxz -V", intern = TRUE), fixed = TRUE)),
      error = function(e) FALSE
    )

  if (!pxz_avail) {
    base::saveRDS(object, file = file, compress = "xz")
    return(invisible(NULL))
  }

  # Expand "~" ourselves, then quote, so paths containing spaces or shell
  # metacharacters survive the trip through the shell.
  cmd <- paste0("pxz -T", threads, " > ", shQuote(path.expand(file)))
  con <- pipe(cmd, "wb")
  # Closing the pipe waits for pxz to finish; do it even if saveRDS() fails.
  on.exit(close(con), add = TRUE)
  base::saveRDS(object, file = con)
  invisible(NULL)
}
# pxz does not appear to decompress in parallel, and decompression appears to be compute-limited inside R, but offloading the decompression to a separate process provides some speed benefit.
# This code should work for any regular RDS file saved using xz compression.
# Read an xz-compressed RDS file, decompressing in an external `pxz` process.
#
# If `pxz` is not on the PATH, falls back to base::readRDS(), which can read
# xz-compressed RDS files on its own.
#
# Arguments:
#   file    - path of the file to read (length-one character).
#   threads - number of threads handed to pxz via -T.
# Returns the deserialized R object.
readRDS.xz <- function(file, threads = parallel::detectCores()) {
  # parallel::detectCores() may return NA; never pass "-TNA" to pxz.
  if (length(threads) != 1L || is.na(threads)) {
    threads <- 1L
  }

  if (!nzchar(Sys.which("pxz"))) {
    return(base::readRDS(file))
  }

  # Expand "~" ourselves, then quote, so paths containing spaces or shell
  # metacharacters survive the trip through the shell.
  cmd <- paste0(
    "pxz -d -k -c -T", threads, " ", shQuote(path.expand(file))
  )
  con <- pipe(cmd, "rb")
  # Release the pipe even if deserialization fails part-way through.
  on.exit(close(con), add = TRUE)
  base::readRDS(file = con)
}
# Serialize `object` to `file` as a gzip-compressed RDS file.
#
# When the external `pigz` tool is on the PATH, the serialized stream is piped
# to it so compression runs outside of R on `threads` threads. Otherwise the
# file is written through R's own gzfile() connection.
#
# Arguments:
#   object            - any R object accepted by base::saveRDS().
#   file              - path of the file to write (length-one character).
#   threads           - number of threads handed to pigz via -p.
#   compression_level - gzip compression level passed to pigz.
# Returns NULL invisibly.
saveRDS.gz <- function(object, file, threads = parallel::detectCores(),
                       compression_level = 6) {
  # parallel::detectCores() may return NA; never pass "-pNA" to pigz.
  if (length(threads) != 1L || is.na(threads)) {
    threads <- 1L
  }

  if (nzchar(Sys.which("pigz"))) {
    # Expand "~" ourselves, then quote, so paths containing spaces or shell
    # metacharacters survive the trip through the shell.
    cmd <- paste0(
      "pigz -c", compression_level, " -p", threads,
      " > ", shQuote(path.expand(file))
    )
    con <- pipe(cmd, "wb")
  } else {
    # Without pigz the shell redirect would still create an empty file and the
    # data would be lost; write with R's own gzip support instead. gzfile()
    # only accepts levels up to 9.
    con <- gzfile(file, "wb", compression = min(compression_level, 9))
  }
  # Closing flushes the stream (and waits for pigz); do it even on error.
  on.exit(close(con), add = TRUE)
  base::saveRDS(object, file = con)
  invisible(NULL)
}
# pigz does not decompress in parallel, and decompression appears to be compute-limited inside R, but offloading the decompression to a separate process provides some speed benefit.
# This code should work for any regular RDS file saved using gzip compression.
# Read a gzip-compressed RDS file, decompressing in an external `pigz` process.
#
# If `pigz` is not on the PATH, falls back to base::readRDS(), which can read
# gzip-compressed RDS files on its own.
#
# Arguments:
#   file    - path of the file to read (length-one character).
#   threads - number of threads handed to pigz via -p.
# Returns the deserialized R object.
readRDS.gz <- function(file, threads = parallel::detectCores()) {
  # parallel::detectCores() may return NA; never pass "-pNA" to pigz.
  if (length(threads) != 1L || is.na(threads)) {
    threads <- 1L
  }

  if (!nzchar(Sys.which("pigz"))) {
    return(base::readRDS(file))
  }

  # Expand "~" ourselves, then quote, so paths containing spaces or shell
  # metacharacters survive the trip through the shell.
  cmd <- paste0("pigz -d -c -p", threads, " ", shQuote(path.expand(file)))
  con <- pipe(cmd, "rb")
  # Release the pipe even if deserialization fails part-way through.
  on.exit(close(con), add = TRUE)
  base::readRDS(file = con)
}
# Read an RDS file, choosing the reader from the file's compression format.
#
# The format is detected from the file's leading magic bytes rather than from
# the output of the external `file` utility. That output contains the file
# name, so a name containing "gzip" or "XZ" could select the wrong reader or
# match both formats at once.
#
# Arguments:
#   file    - path of the file to read (length-one character).
#   threads - number of threads forwarded to the external decompressor.
# Returns the deserialized R object. Stops if `file` does not exist.
readRDS.p <- function(file, threads = parallel::detectCores()) {
  if (!file.exists(file)) {
    stop(paste0(file, " does not exist!"))
  }

  gzip_magic <- as.raw(c(0x1f, 0x8b))
  xz_magic <- as.raw(c(0xfd, 0x37, 0x7a, 0x58, 0x5a, 0x00))

  # Files shorter than six bytes simply yield a shorter header.
  header <- readBin(file, what = "raw", n = length(xz_magic))
  is_gzip <- length(header) >= length(gzip_magic) &&
    identical(header[seq_along(gzip_magic)], gzip_magic)
  is_xz <- length(header) >= length(xz_magic) &&
    identical(header[seq_along(xz_magic)], xz_magic)

  if (is_gzip) {
    readRDS.gz(file, threads = threads)
  } else if (is_xz) {
    readRDS.xz(file, threads = threads)
  } else {
    # Anything else (uncompressed, bzip2, ...) is left to base R.
    base::readRDS(file)
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment