require(data.table)
require(microbenchmark)
require(GenomicRanges)

x <- fread("
chrom start end hgnc
1 100 200 MYC
1 150 300 MYC
1 400 500 MYC
1 150 230 TP53
1 200 350 TP53
1 420 550 TP53")
fromZx <- function(Input){
  # reduce() the intervals of each gene separately via data.table grouping
  return(Input[, as.data.table(reduce(IRanges(start, end))), by = hgnc])
}
fromZx(x)
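For the toy input above, all three approaches should merge overlapping intervals per gene. Worked out by hand, the reduced intervals are (shown here in fromZx()'s column layout):

#  hgnc start end width
#   MYC   100 300   201
#   MYC   400 500   101
#  TP53   150 350   201
#  TP53   420 550   131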
fromMike <- function(Input){
  gr <- GRanges(Input)
  # in short:
  final <- unlist(reduce(split(gr, gr$hgnc)))
  # split into a GRangesList, one element per gene
  grl <- split(gr, gr$hgnc)
  # all(names(grl) == sort(unique(gr$hgnc)))
  # reduce each element (independently reduce ranges for each gene)
  grl_redux <- reduce(grl) # element-wise, like lapply(grl, reduce)
  # all(names(grl_redux) == names(grl)) & all(lengths(grl_redux) <= lengths(grl))
  # return a single GRanges, with names derived from hgnc
  final <- unlist(grl_redux)
  # all(unique(names(final)) == names(grl_redux))
  return(final)
}
fromMike(x)
fromMe <- function(Input){
  # encode the gene into the seqnames so reduce() cannot merge ranges across genes
  x.gr <- GRanges(seqnames = paste(Input$chrom, Input$hgnc, sep="_"),
                  ranges = IRanges(start = Input$start,
                                   end = Input$end))
  reduced.gr <- GenomicRanges::reduce(x.gr)
  # split the "chrom_gene" seqnames back into chromosome and gene
  splitted <- strsplit(as.character(seqnames(reduced.gr)), split="_")
  new.seqnames <- unlist(lapply(splitted, function(x) x[1]))
  # rebuild the GRanges on the original chromosomes, keeping the gene as metadata
  final <- GRanges(seqnames = new.seqnames,
                   ranges = ranges(reduced.gr))
  elementMetadata(final) <- unlist(lapply(splitted, function(x) x[2]))
  return(final)
}
fromMe(x)
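As a quick sanity check (a sketch added here, not part of the original gist), the three results can be coerced to a plain gene/start/end table and compared; this assumes fromMike() carries the gene in names() and fromMe() in its single metadata column:

# coerce each result to hgnc/start/end, sorted the same way, then compare
res_zx   <- fromZx(x)[order(hgnc, start), .(hgnc, start, end)]
gr_mike  <- fromMike(x)
res_mike <- data.table(hgnc  = names(gr_mike),
                       start = start(gr_mike),
                       end   = end(gr_mike))[order(hgnc, start)]
gr_me    <- fromMe(x)
res_me   <- data.table(hgnc  = as.character(mcols(gr_me)[[1]]),
                       start = start(gr_me),
                       end   = end(gr_me))[order(hgnc, start)]
all(res_zx == res_mike) & all(res_zx == res_me)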
Speed benchmarks: | |
> microbenchmark(fromZx(x)) | |
Unit: milliseconds | |
expr min lq mean median uq max neval | |
fromZx(x) 5.662879 6.76689 9.452377 7.903199 11.01084 22.11675 100 | |
> microbenchmark(fromMike(x)) | |
Unit: milliseconds | |
expr min lq mean median uq max neval | |
fromMike(x) 97.53267 103.5992 126.5775 113.4482 131.6864 725.7022 100 | |
> microbenchmark(fromMe(x)) | |
Unit: milliseconds | |
expr min lq mean median uq max neval | |
fromMe(x) 29.36052 32.10575 40.09418 35.39014 42.27503 163.3626 100
This is what I have. Yes, on bigger data yours is the most efficient; Mike's is about 1.8x slower and the data.table approach is about 22x slower.
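For reference, a sketch of how a larger random input could be generated to check that scaling; the row count, gene names, and interval widths below are arbitrary assumptions (kept on a single chromosome, as in the toy example), not the data behind the numbers quoted above:

set.seed(1)
n      <- 1e5
starts <- sample(1e6, n, replace = TRUE)
# hypothetical larger input in the same chrom/start/end/hgnc layout as x
big <- data.table(chrom = 1,
                  start = starts,
                  end   = starts + sample(50:500, n, replace = TRUE),
                  hgnc  = sample(paste0("GENE", 1:2000), n, replace = TRUE))
microbenchmark(fromZx(big), fromMike(big), fromMe(big), times = 10)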