Last active
August 29, 2015 14:09
-
-
Save mikelove/17eae9d41d5396343260 to your computer and use it in GitHub Desktop.
recommended chunk
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The idea: a function that helps decide, for reduce by range, | |
# how to optimally chunk GRanges into a GRangesList. | |
# Practically, this involves sampling the size of the imported data | |
# for a subset of cells in the (ranges, files) matrix, and | |
# asking/estimating the amount of memory available for each worker. | |
# user code | |
library(GenomicFiles) | |
register(SerialParam()) | |
# use 4 BigWigs from | |
# ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeCshlLongRnaSeq/ | |
files <- list.files("~/data/bigwig/",full=TRUE) | |
library(TxDb.Hsapiens.UCSC.hg19.knownGene) | |
si <- seqinfo(TxDb.Hsapiens.UCSC.hg19.knownGene) | |
ranges <- tileGenome(si,tilewidth=1e7,cut.last.tile.in.chrom=TRUE) | |
ranges <- keepStandardChromosomes(ranges) | |
### BEGIN: my proposed functions | |
#' most likely an unexported function | |
#' | |
#' @param ranges the ranges over which to sample | |
#' @param files the files over which to sample | |
#' @param IMPORT the function which will import data one range and one file | |
#' at a time. The point is to observe the maximal memory size in R of the object | |
#' which will be operated on in the MAP and REDUCE calls. | |
#' @param minWidth given that tileGenome() produces tiles of varying width, | |
#' this allows the user to insist that the sampling should be only over tiles | |
#' of a given minWidth. | |
#' | |
#' @return the function returns the object size in bytes of one imported object | |
#' from the (ranges, files) matrix | |
#' | |
sampleRandomRangeAndFile <- function(ranges, files, IMPORT, minWidth=NULL) { | |
if (is.null(minWidth)) { | |
idx <- sample(seq_along(ranges),1) | |
} else { | |
idx <- sample(which(width(ranges) >= minWidth),1) | |
} | |
range <- ranges[idx] | |
file <- files[sample(seq_along(files),1)] | |
object.size(IMPORT(range, file)) | |
} | |
#' the user-facing function | |
#' | |
#' @param ceiling the maximal memory in megabytes to be used over all files for a single range | |
#' @param upperBoundFunction a function which takes an upper bound as an esimate from | |
#' the vector of sizes of the sample objects | |
#' @param ranges the ranges over which to sample | |
#' @param files to files over which to sample | |
#' @param IMPORT the function which will import data one range and one file | |
#' at a time. The point is to observe the maximal memory size in R of the object | |
#' which will be operated on in the MAP and REDUCE calls. | |
#' @param minWidth given that tileGenome() produces tiles of varying width, | |
#' this allows the user to insist that the sampling should be only over tiles | |
#' of a given minWidth. | |
#' @param n the number of cells to sample | |
#' | |
#' @return the recommended number of ranges to chunk at one time | |
#' | |
recommendedChunk <- function(ceiling, upperBoundFunction, ranges, files, IMPORT, minWidth=NULL, n=20) { | |
taste <- replicate(n, sampleRandomRangeAndFile(ranges=ranges, files=files, | |
IMPORT=IMPORT, minWidth=minWidth)) | |
upper.bound <- upperBoundFunction(taste) / 2^20 | |
ceiling / (upper.bound * length(files)) | |
} | |
### END: my proposed functions | |
# user code: | |
IMPORT <- function(range, file) { | |
import(file, which = range, as = "Rle", format = "bw")[range] | |
} | |
upperBoundFunction <- function(x) mean(x) + 2 * sd(x) | |
(rc <- recommendedChunk(50, upperBoundFunction, ranges, files, IMPORT, minWidth=1e7)) | |
(nchunks <- round(length(ranges)/rc)) | |
f <- factor(sort(rep(seq_len(nchunks), length=length(ranges)))) | |
mean(table(f)) | |
rangeslist <- split(ranges, f) | |
## | |
MAP <- function(range, file) { | |
rle <- import(file, which = range, as = "Rle", format = "bw")[range] | |
mean(rle) | |
} | |
REDUCE <- function(mapped) { | |
Reduce(cbind, mapped) | |
} | |
rangeslist.sub <- rangeslist[1:2] | |
(res <- reduceByRange(rangeslist.sub, files, MAP, REDUCE)) | |
(resOut <- do.call(rbind, res)) | |
sessionInfo() | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
R Under development (unstable) (2014-10-15 r66774) -- "Unsuffered Consequences" | |
Copyright (C) 2014 The R Foundation for Statistical Computing | |
Platform: x86_64-apple-darwin12.5.0 (64-bit) | |
R is free software and comes with ABSOLUTELY NO WARRANTY. | |
You are welcome to redistribute it under certain conditions. | |
Type 'license()' or 'licence()' for distribution details. | |
Natural language support but running in an English locale | |
R is a collaborative project with many contributors. | |
Type 'contributors()' for more information and | |
'citation()' on how to cite R or R packages in publications. | |
Type 'demo()' for some demos, 'help()' for on-line help, or | |
'help.start()' for an HTML browser interface to help. | |
Type 'q()' to quit R. | |
[Previously saved workspace restored] | |
Bioconductor version 3.1 (BiocInstaller 1.17.1), ?biocLite for help | |
> # The idea: a function that helps decide, for reduce by range, | |
> # how to optimally chunk GRanges into a GRangesList. | |
> | |
> # Practically, this involves sampling the size of the imported data | |
> # for a subset of cells in the (ranges, files) matrix, and | |
> # asking/estimating the amount of memory available for each worker. | |
> | |
> | |
> | |
> # user code | |
> | |
> library(GenomicFiles) | |
Loading required package: BiocGenerics | |
Loading required package: parallel | |
Attaching package: ‘BiocGenerics’ | |
The following objects are masked from ‘package:parallel’: | |
clusterApply, clusterApplyLB, clusterCall, clusterEvalQ, | |
clusterExport, clusterMap, parApply, parCapply, parLapply, | |
parLapplyLB, parRapply, parSapply, parSapplyLB | |
The following object is masked from ‘package:stats’: | |
xtabs | |
The following objects are masked from ‘package:base’: | |
anyDuplicated, append, as.data.frame, as.vector, cbind, colnames, | |
do.call, duplicated, eval, evalq, Filter, Find, get, intersect, | |
is.unsorted, lapply, Map, mapply, match, mget, order, paste, pmax, | |
pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce, rep.int, | |
rownames, sapply, setdiff, sort, table, tapply, union, unique, | |
unlist, unsplit | |
Loading required package: S4Vectors | |
Loading required package: stats4 | |
Loading required package: IRanges | |
Loading required package: Rsamtools | |
Loading required package: GenomeInfoDb | |
Loading required package: GenomicRanges | |
Loading required package: XVector | |
Loading required package: Biostrings | |
Loading required package: rtracklayer | |
Loading required package: BiocParallel | |
> register(SerialParam()) | |
> # use 4 BigWigs from | |
> # ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/encodeDCC/wgEncodeCshlLongRnaSeq/ | |
> files <- list.files("~/data/bigwig/",full=TRUE) | |
> library(TxDb.Hsapiens.UCSC.hg19.knownGene) | |
Loading required package: GenomicFeatures | |
Loading required package: AnnotationDbi | |
Loading required package: Biobase | |
Welcome to Bioconductor | |
Vignettes contain introductory material; view with | |
'browseVignettes()'. To cite Bioconductor, see | |
'citation("Biobase")', and for packages 'citation("pkgname")'. | |
Attaching package: ‘AnnotationDbi’ | |
The following object is masked from ‘package:GenomeInfoDb’: | |
species | |
> si <- seqinfo(TxDb.Hsapiens.UCSC.hg19.knownGene) | |
> ranges <- tileGenome(si,tilewidth=1e7,cut.last.tile.in.chrom=TRUE) | |
> ranges <- keepStandardChromosomes(ranges) | |
> | |
> | |
> ### BEGIN: my proposed functions | |
> | |
> #' most likely an unexported function | |
> #' | |
> #' @param ranges the ranges over which to sample | |
> #' @param files the files over which to sample | |
> #' @param IMPORT the function which will import data one range and one file | |
> #' at a time. The point is to observe the maximal memory size in R of the object | |
> #' which will be operated on in the MAP and REDUCE calls. | |
> #' @param minWidth given that tileGenome() produces tiles of varying width, | |
> #' this allows the user to insist that the sampling should be only over tiles | |
> #' of a given minWidth. | |
> #' | |
> #' @return the function returns the object size in bytes of one imported object | |
> #' from the (ranges, files) matrix | |
> #' | |
> sampleRandomRangeAndFile <- function(ranges, files, IMPORT, minWidth=NULL) { | |
+ if (is.null(minWidth)) { | |
+ idx <- sample(seq_along(ranges),1) | |
+ } else { | |
+ idx <- sample(which(width(ranges) >= minWidth),1) | |
+ } | |
+ range <- ranges[idx] | |
+ file <- files[sample(seq_along(files),1)] | |
+ object.size(IMPORT(range, file)) | |
+ } | |
> | |
> #' the user-facing function | |
> #' | |
> #' @param ceiling the maximal memory in megabytes to be used over all files for a single range | |
> #' @param upperBoundFunction a function which takes an upper bound as an esimate from | |
> #' the vector of sizes of the sample objects | |
> #' @param ranges the ranges over which to sample | |
> #' @param files to files over which to sample | |
> #' @param IMPORT the function which will import data one range and one file | |
> #' at a time. The point is to observe the maximal memory size in R of the object | |
> #' which will be operated on in the MAP and REDUCE calls. | |
> #' @param minWidth given that tileGenome() produces tiles of varying width, | |
> #' this allows the user to insist that the sampling should be only over tiles | |
> #' of a given minWidth. | |
> #' @param n the number of cells to sample | |
> #' | |
> #' @return the recommended number of ranges to chunk at one time | |
> #' | |
> recommendedChunk <- function(ceiling, upperBoundFunction, ranges, files, IMPORT, minWidth=NULL, n=20) { | |
+ taste <- replicate(n, sampleRandomRangeAndFile(ranges=ranges, files=files, | |
+ IMPORT=IMPORT, minWidth=minWidth)) | |
+ upper.bound <- upperBoundFunction(taste) / 2^20 | |
+ ceiling / (upper.bound * length(files)) | |
+ } | |
> | |
> ### END: my proposed functions | |
> | |
> | |
> # user code: | |
> | |
> IMPORT <- function(range, file) { | |
+ import(file, which = range, as = "Rle", format = "bw")[range] | |
+ } | |
> | |
> upperBoundFunction <- function(x) mean(x) + 2 * sd(x) | |
> | |
> (rc <- recommendedChunk(50, upperBoundFunction, ranges, files, IMPORT, minWidth=1e7)) | |
[1] 5.774204 | |
> | |
> (nchunks <- round(length(ranges)/rc)) | |
[1] 56 | |
> f <- factor(sort(rep(seq_len(nchunks), length=length(ranges)))) | |
> mean(table(f)) | |
[1] 5.767857 | |
> rangeslist <- split(ranges, f) | |
> | |
> ## | |
> | |
> MAP <- function(range, file) { | |
+ rle <- import(file, which = range, as = "Rle", format = "bw")[range] | |
+ mean(rle) | |
+ } | |
> | |
> REDUCE <- function(mapped) { | |
+ Reduce(cbind, mapped) | |
+ } | |
> | |
> rangeslist.sub <- rangeslist[1:2] | |
> (res <- reduceByRange(rangeslist.sub, files, MAP, REDUCE)) | |
$`1` | |
init | |
chr1 2.772341 2.2201188 3.0041389 2.3258021 | |
chr1 3.025716 3.1525184 2.4992048 2.6526405 | |
chr1 1.320788 1.0617699 1.7907983 1.5505196 | |
chr1 1.151013 0.9161564 1.4714571 1.1631958 | |
chr1 1.235252 0.9907793 1.6580848 1.4077788 | |
chr1 1.084928 0.8862725 0.4555268 0.3595216 | |
$`2` | |
init | |
chr1 0.6801988 0.5708432 2.3426269 1.8456729 | |
chr1 0.3163817 0.2278151 0.4011951 0.2862280 | |
chr1 0.2942333 0.2352593 0.3319902 0.2584270 | |
chr1 22.2271557 21.0384108 0.9220905 0.7089621 | |
chr1 1.0939601 1.4992705 0.5730236 0.4314522 | |
chr1 0.8636410 0.6543298 1.3248102 0.9992793 | |
> (resOut <- do.call(rbind, res)) | |
init | |
chr1 2.7723412 2.2201188 3.0041389 2.3258021 | |
chr1 3.0257160 3.1525184 2.4992048 2.6526405 | |
chr1 1.3207877 1.0617699 1.7907983 1.5505196 | |
chr1 1.1510127 0.9161564 1.4714571 1.1631958 | |
chr1 1.2352518 0.9907793 1.6580848 1.4077788 | |
chr1 1.0849282 0.8862725 0.4555268 0.3595216 | |
chr1 0.6801988 0.5708432 2.3426269 1.8456729 | |
chr1 0.3163817 0.2278151 0.4011951 0.2862280 | |
chr1 0.2942333 0.2352593 0.3319902 0.2584270 | |
chr1 22.2271557 21.0384108 0.9220905 0.7089621 | |
chr1 1.0939601 1.4992705 0.5730236 0.4314522 | |
chr1 0.8636410 0.6543298 1.3248102 0.9992793 | |
> sessionInfo() | |
R Under development (unstable) (2014-10-15 r66774) | |
Platform: x86_64-apple-darwin12.5.0 (64-bit) | |
locale: | |
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8 | |
attached base packages: | |
[1] stats4 parallel stats graphics grDevices datasets utils | |
[8] methods base | |
other attached packages: | |
[1] TxDb.Hsapiens.UCSC.hg19.knownGene_3.0.0 | |
[2] GenomicFeatures_1.19.6 | |
[3] AnnotationDbi_1.29.1 | |
[4] Biobase_2.27.0 | |
[5] GenomicFiles_1.3.8 | |
[6] BiocParallel_1.1.5 | |
[7] rtracklayer_1.27.5 | |
[8] Rsamtools_1.19.9 | |
[9] Biostrings_2.35.4 | |
[10] XVector_0.7.2 | |
[11] GenomicRanges_1.19.10 | |
[12] GenomeInfoDb_1.3.7 | |
[13] IRanges_2.1.14 | |
[14] S4Vectors_0.5.7 | |
[15] BiocGenerics_0.13.2 | |
[16] RUnit_0.4.27 | |
[17] devtools_1.6.1 | |
[18] slidify_0.4.5 | |
[19] knitr_1.7 | |
[20] BiocInstaller_1.17.1 | |
loaded via a namespace (and not attached): | |
[1] base64enc_0.1-2 BatchJobs_1.5 BBmisc_1.8 | |
[4] biomaRt_2.23.1 bitops_1.0-6 brew_1.0-6 | |
[7] checkmate_1.5.0 codetools_0.2-9 DBI_0.3.1 | |
[10] digest_0.6.4 evaluate_0.5.5 fail_1.2 | |
[13] foreach_1.4.2 formatR_1.0 GenomicAlignments_1.3.8 | |
[16] iterators_1.0.7 markdown_0.7.4 RCurl_1.95-4.3 | |
[19] RSQLite_1.0.0 sendmailR_1.2-1 stringr_0.6.2 | |
[22] tools_3.2.0 whisker_0.3-2 XML_3.98-1.1 | |
[25] yaml_2.1.13 zlibbioc_1.13.0 | |
> | |
> | |
> proc.time() | |
user system elapsed | |
23.510 0.729 24.338 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment