Last active
March 29, 2017 13:42
-
-
Save rBatt/ea3c72f575acb9224a7e010a7a2f602f to your computer and use it in GitHub Desktop.
How much space does one save by reformatting the coverage .txt files? For SGW
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(reshape2) | |
hasDT <- library(data.table, logical.return=TRUE) # load package, if not installed return FALSE | |
# ==================
# = Make Fake Data =
# ==================
# ---- Dimensions of the simulated data set (these drive the size of 'big') ----
nScaff <- 50 # how many scaffolds ('pages') to simulate
nIndex <- 100 # how many loci / indices ('words') per scaffold

# ---- A scaffold name is a character string: 3 capital letters then 5 digits ----
sL <- function() {
  LETTERS[sample.int(length(LETTERS), 3, replace=TRUE)] # letter part: 3 draws with replacement
}
sN <- function() {
  sample.int(9, 5, replace=TRUE) # digit part: 5 draws from 1-9 with replacement
}
scaffs <- replicate(nScaff, paste0(c(sL(), sN()), collapse="")) # glue letters + digits into one name per scaffold

# ---- The index is the 'word'; it is a running integer ----
index <- seq_len(nIndex)

# ---- The value is the coverage (or other measurement of interest); an integer from 1 to 4 ----
value <- sample.int(4, length(scaffs)*length(index), replace=TRUE) # one draw per scaffold-index combination
# ---- Assemble scaffold, index and value into one long table ('big') ----
# data.table is a lot faster than data.frame, though the difference is not noticeable at this size
if(hasDT){ # data.table could be loaded, so use it
  big <- data.table::CJ(scaffold=scaffs, index=index) # CJ is a cross join: every scaffold paired with every index
  big[, value:=value] # add the value column by reference
  # note: the data.table:: prefix is optional, plain CJ() works too; the namespace is spelled out for clarity
}else{ # data.table is unavailable: fall back to a plain data.frame
  big <- cbind(
    expand.grid(scaffold=scaffs, index=index),
    value=value
  )
  # note: expand.grid() is analogous to CJ(); cbind() is slow compared to data.table's :=
}
# ===============================
# = 'Cast' into a matrix/ array =
# ===============================
# Change the object's dimensions: the long table 'big' becomes a wide matrix.
# SPACE-SAVING STEP
# value.var is given explicitly so acast() does not have to guess which column
# holds the fill values (guessing works here, but prints a message).
smaller <- reshape2::acast(data=big, formula=index~scaffold, value.var="value") # syntax is rows ~ columns
# note: if there was another dimension, you could do row~column~z~4thD etc.
# note: value.var takes the column name as a string, i.e. value.var="value";
# it matters most when there are other columns that are neither dimensions nor fill values
# ====================================
# = Compare object sizes in R memory =
# ====================================
# Store each object's size for the comparison table, then print it in Kb for a quick look
bigMem <- object.size(big)
print(bigMem, units='Kb')
smallerMem <- object.size(smaller)
print(smallerMem, units='Kb')
# ============================================
# = Compare object sizes saved as text files =
# ============================================
# ---- names ----
# Temporary files are used so the script does not depend on a ~/Desktop folder
# existing; each file is deleted again as soon as its size has been recorded.
bTxt <- tempfile(pattern="big", fileext=".txt")
sTxt <- tempfile(pattern="smaller", fileext=".txt")

# ---- do big ----
write.table(big, file=bTxt, sep="\t") # write as tab-delimited text
bigTxt <- file.size(bTxt) # size on disk, in bytes
file.remove(bTxt) # remove

# ---- do small ----
write.table(smaller, file=sTxt, sep="\t") # write as tab-delimited text
smallerTxt <- file.size(sTxt) # size on disk, in bytes
file.remove(sTxt) # remove
# ==============================================
# = Compare object sizes saved as .RData files =
# ==============================================
# ---- names ----
# Temporary files are used so the script does not depend on a ~/Desktop folder
# existing; each file is deleted again as soon as its size has been recorded.
bRD <- tempfile(pattern="big", fileext=".RData")
sRD <- tempfile(pattern="smaller", fileext=".RData")

# ---- do big ----
save(big, file=bRD) # write with save()'s default settings
bigRD <- file.size(bRD) # size on disk, in bytes
file.remove(bRD) # remove

# ---- do small ----
save(smaller, file=sRD) # write with save()'s default settings
smallerRD <- file.size(sRD) # size on disk, in bytes
file.remove(sRD) # remove
# ====================
# = Comparison Table =
# ====================
rowN <- c("big (original)","small (matrix)") # row names: one row per object
colN <- c("in R memory","as .txt file","as .RData file") # column names: one column per storage format
sizes <- c(bigMem, smallerMem, bigTxt, smallerTxt, bigRD, smallerRD) # sizes in bytes, ordered to fill the matrix column by column
sumMat <- matrix(sizes, nrow=2, ncol=3, dimnames=list(rowN,colN))/1E3 # bytes -> KB
smallPercent <- sumMat[2,]/sumMat[1,] # size of smaller relative to big, per column (a proportion: 0.1 means 10%)
(sumMat2 <- rbind(sumMat, smallPercent=smallPercent)) # assign and print
Also, if files are currently being saved as non-reshaped (big) .txt files, then reshaping them AND saving them as compressed .RData files would bring them down to about 2% of the original file size.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The answer:
In R's memory, the reshaped object is 1/3 the size of the original. Saved on disk as a .txt file, it's 10% of the original. Saved on disk as a compressed (default) .RData file, it is 82% of the original's size.