rBatt · March 29, 2017 13:42 · rBatt · Mar 29, 2017
diff --git a/reformatCoverage.R b/reformatCoverage.R
 library(reshape2)
 hasDT <- library(data.table, logical.return=TRUE) # load package, if not installed return FALSE

 # ==================
 # = Make Fake Data =
 # ==================
 # Builds the ingredients of the long-format table 'big': random scaffold names,
 # integer indices, and one random coverage value per scaffold/index pair.
 # No RNG seed is set here, so the generated data differ from run to run.

 # ---- Options affecting size of 'big' ----
 nScaff <- 50 # number of scaffolds (or 'pages')
 nIndex <- 100 # number of loci or indices (or 'words')

 # ---- Scaffold is combination of letters and numbers; a character ----
 sL <- function() sample(LETTERS, 3, replace=TRUE) # 3 random capital letters per scaffold name
 sN <- function() sample(1:9, 5, replace=TRUE) # 5 random digits (1 to 9) per scaffold name
 # One 8-character name per scaffold. vapply() always returns a character vector,
 # even when nScaff is 0 (replicate() would return an empty list in that case).
 scaffs <- vapply(
 	seq_len(nScaff),
 	function(i) paste0(c(sL(), sN()), collapse=""), # letters are drawn first, then digits
 	character(1)
 )

 # ---- index is the "word" and is an integer ----
 index <- seq_len(nIndex) # 1..nIndex; empty (rather than c(1, 0)) when nIndex is 0

 # ---- value is the coverage or some measured value of interest; is an integer ----
 # one value for every scaffold/index combination
 value <- sample(1:4, length(scaffs)*length(index), replace=TRUE) # coverage is either 1, 2, 3, or 4

 # ---- assemble the long-format table 'big': one row per scaffold/index pair ----
 # data.table is a lot faster than data.frame, though the gain is not noticeable at this size
 if(!hasDT){ # data.table could not be loaded: fall back to a plain data.frame
 	# expand.grid() is the base-R analogue of CJ(); cbind() is slow compared to data.table's :=
 	big <- cbind(expand.grid(scaffold=scaffs, index=index), value=value)
 }else{ # data.table is available: cross join (CJ), then add the value column with :=
 	# the data.table:: prefix is optional here; it is spelled out for clarity
 	big <- data.table::CJ(scaffold=scaffs, index=index)[, value:=value]
 }

 # ===============================
 # = 'Cast' into a matrix/ array =
 # ===============================
 # Reshape the long table 'big' into a wide matrix: one row per index, one
 # column per scaffold, cells filled from the 'value' column.
 # SPACE-SAVING STEP
 # value.var is named explicitly (as a string) so acast() does not have to guess
 # which column holds the fill values, and stays correct if 'big' gains columns.
 smaller <- reshape2::acast(data=big, formula=index~scaffold, value.var="value") # syntax is rows ~ columns
 # note: if there was another dimension, you could do row~column~z~4thD etc.


 # ====================================
 # = Compare object sizes in R memory =
 # ====================================
 # The object.size() results are what get stored; the print() calls only
 # display each size in Kb as the script runs.
 bigMem <- object.size(big)
 print(bigMem, units='Kb')
 smallerMem <- object.size(smaller)
 print(smallerMem, units='Kb')


 # ============================================
 # = Compare object sizes saved as text files =
 # ============================================
 # Each object is written as a tab-separated text file, its size on disk is
 # recorded, and the file is deleted again.

 # ---- names ----
 # Temporary files are used rather than fixed paths on the Desktop, so the
 # script also runs where no ~/Desktop folder exists and can never overwrite
 # or delete one of the user's own files.
 bTxt <- tempfile(pattern="big", fileext=".txt")
 sTxt <- tempfile(pattern="smaller", fileext=".txt")

 # ---- do big ----
 write.table(big, file=bTxt, sep="\t") # write
 bigTxt <- file.size(bTxt) # size in bytes
 file.remove(bTxt) # remove

 # ---- do small ----
 write.table(smaller, file=sTxt, sep="\t") # write
 smallerTxt <- file.size(sTxt) # size in bytes
 file.remove(sTxt) # remove

 # ==============================================
 # = Compare object sizes saved as .RData files =
 # ==============================================
 # Each object is saved with save(), its size on disk is recorded, and the
 # file is deleted again.

 # ---- names ----
 # Temporary files are used rather than fixed paths on the Desktop, so the
 # script also runs where no ~/Desktop folder exists and can never overwrite
 # or delete one of the user's own files.
 bRD <- tempfile(pattern="big", fileext=".RData")
 sRD <- tempfile(pattern="smaller", fileext=".RData")

 # ---- do big ----
 save(big, file=bRD) # write
 bigRD <- file.size(bRD) # size in bytes
 file.remove(bRD) # remove

 # ---- do small ----
 save(smaller, file=sRD) # write
 smallerRD <- file.size(sRD) # size in bytes
 file.remove(sRD) # remove

 # ====================
 # = Comparison Table =
 # ====================
 # Collects the six sizes into a 2 x 3 table (rows: big / small, columns: storage
 # type) and appends a row giving the small object's size relative to the big one.
 sizes <- c(bigMem, smallerMem, bigTxt, smallerTxt, bigRD, smallerRD) # object sizes in bytes
 rowN <- c("big (original)","small (matrix)") # row names
 colN <- c("in R memory","as .txt file","as .RData file") # column names
 # matrix() fills column by column, so each column holds (big, small) for one storage type
 sumMat <- matrix(sizes, nrow=2, ncol=3, dimnames=list(rowN,colN))/1E3 # in KB
 # size of smaller as a percent of big: row 2 over row 1, scaled to 0-100 so the
 # value matches the 'smallPercent' name
 smallPercent <- 100 * sumMat[2, ] / sumMat[1, ]
 (sumMat2 <- rbind(sumMat, smallPercent=smallPercent)) # assign and print
	library(reshape2)
	hasDT <- library(data.table, logical.return=TRUE) # load package, if not installed return FALSE

	# ==================
	# = Make Fake Data =
	# ==================
	# Builds the ingredients of the long-format table 'big': random scaffold names,
	# integer indices, and one random coverage value per scaffold/index pair.
	# No RNG seed is set here, so the generated data differ from run to run.

	# ---- Options affecting size of 'big' ----
	nScaff <- 50 # number of scaffolds (or 'pages')
	nIndex <- 100 # number of loci or indices (or 'words')

	# ---- Scaffold is combination of letters and numbers; a character ----
	sL <- function() sample(LETTERS, 3, replace=TRUE) # 3 random capital letters per scaffold name
	sN <- function() sample(1:9, 5, replace=TRUE) # 5 random digits (1 to 9) per scaffold name
	# One 8-character name per scaffold. vapply() always returns a character vector,
	# even when nScaff is 0 (replicate() would return an empty list in that case).
	scaffs <- vapply(
		seq_len(nScaff),
		function(i) paste0(c(sL(), sN()), collapse=""), # letters are drawn first, then digits
		character(1)
	)

	# ---- index is the "word" and is an integer ----
	index <- seq_len(nIndex) # 1..nIndex; empty (rather than c(1, 0)) when nIndex is 0

	# ---- value is the coverage or some measured value of interest; is an integer ----
	# one value for every scaffold/index combination
	value <- sample(1:4, length(scaffs)*length(index), replace=TRUE) # coverage is either 1, 2, 3, or 4

	# ---- assemble the long-format table 'big': one row per scaffold/index pair ----
	# data.table is a lot faster than data.frame, though the gain is not noticeable at this size
	if(!hasDT){ # data.table could not be loaded: fall back to a plain data.frame
		# expand.grid() is the base-R analogue of CJ(); cbind() is slow compared to data.table's :=
		big <- cbind(expand.grid(scaffold=scaffs, index=index), value=value)
	}else{ # data.table is available: cross join (CJ), then add the value column with :=
		# the data.table:: prefix is optional here; it is spelled out for clarity
		big <- data.table::CJ(scaffold=scaffs, index=index)[, value:=value]
	}

	# ===============================
	# = 'Cast' into a matrix/ array =
	# ===============================
	# Reshape the long table 'big' into a wide matrix: one row per index, one
	# column per scaffold, cells filled from the 'value' column.
	# SPACE-SAVING STEP
	# value.var is named explicitly (as a string) so acast() does not have to guess
	# which column holds the fill values, and stays correct if 'big' gains columns.
	smaller <- reshape2::acast(data=big, formula=index~scaffold, value.var="value") # syntax is rows ~ columns
	# note: if there was another dimension, you could do row~column~z~4thD etc.


	# ====================================
	# = Compare object sizes in R memory =
	# ====================================
	# The object.size() results are what get stored; the print() calls only
	# display each size in Kb as the script runs.
	bigMem <- object.size(big)
	print(bigMem, units='Kb')
	smallerMem <- object.size(smaller)
	print(smallerMem, units='Kb')


	# ============================================
	# = Compare object sizes saved as text files =
	# ============================================
	# Each object is written as a tab-separated text file, its size on disk is
	# recorded, and the file is deleted again.

	# ---- names ----
	# Temporary files are used rather than fixed paths on the Desktop, so the
	# script also runs where no ~/Desktop folder exists and can never overwrite
	# or delete one of the user's own files.
	bTxt <- tempfile(pattern="big", fileext=".txt")
	sTxt <- tempfile(pattern="smaller", fileext=".txt")

	# ---- do big ----
	write.table(big, file=bTxt, sep="\t") # write
	bigTxt <- file.size(bTxt) # size in bytes
	file.remove(bTxt) # remove

	# ---- do small ----
	write.table(smaller, file=sTxt, sep="\t") # write
	smallerTxt <- file.size(sTxt) # size in bytes
	file.remove(sTxt) # remove

	# ==============================================
	# = Compare object sizes saved as .RData files =
	# ==============================================
	# Each object is saved with save(), its size on disk is recorded, and the
	# file is deleted again.

	# ---- names ----
	# Temporary files are used rather than fixed paths on the Desktop, so the
	# script also runs where no ~/Desktop folder exists and can never overwrite
	# or delete one of the user's own files.
	bRD <- tempfile(pattern="big", fileext=".RData")
	sRD <- tempfile(pattern="smaller", fileext=".RData")

	# ---- do big ----
	save(big, file=bRD) # write
	bigRD <- file.size(bRD) # size in bytes
	file.remove(bRD) # remove

	# ---- do small ----
	save(smaller, file=sRD) # write
	smallerRD <- file.size(sRD) # size in bytes
	file.remove(sRD) # remove

	# ====================
	# = Comparison Table =
	# ====================
	# Collects the six sizes into a 2 x 3 table (rows: big / small, columns: storage
	# type) and appends a row giving the small object's size relative to the big one.
	sizes <- c(bigMem, smallerMem, bigTxt, smallerTxt, bigRD, smallerRD) # object sizes in bytes
	rowN <- c("big (original)","small (matrix)") # row names
	colN <- c("in R memory","as .txt file","as .RData file") # column names
	# matrix() fills column by column, so each column holds (big, small) for one storage type
	sumMat <- matrix(sizes, nrow=2, ncol=3, dimnames=list(rowN,colN))/1E3 # in KB
	# size of smaller as a percent of big: row 2 over row 1, scaled to 0-100 so the
	# value matches the 'smallPercent' name
	smallPercent <- 100 * sumMat[2, ] / sumMat[1, ]
	(sumMat2 <- rbind(sumMat, smallPercent=smallPercent)) # assign and print