Created
June 9, 2011 15:20
-
-
Save malcook/1016957 to your computer and use it in GitHub Desktop.
bamTabulateGaps : For a bam file containing gapped alignments tabulate the (unstranded) coverage of all gaps therein.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(IRanges) # for: coverage, psetdiff, etc | |
library(GenomicRanges) # for: readGappedAlignments, | |
library(Rsamtools) # for: reading bam files scanBam, countBam etc | |
library(rtracklayer) # for: track file IO (bed, wig, etc) | |
library(sqldf) # for: querying dataframes using SQL\ | |
bamTabulateGaps <- function(bamPath, | |
bedPath=paste(gsub('.bam$','',bamPath),'.junctions.bed',sep=''), | |
trackName=gsub('\\..*','',basename(bamPath)), | |
trackLine=sprintf("track name=%s graphType=junctions",trackName), | |
... ){ | |
### Tabulate the (unstranded) coverage of all gaps present in the | |
### gapped alignments in <bamPath>, such as may be produced by a | |
### splice-aware aligner (e.g. GSNAP). | |
### | |
### Optionally, write a bed formatted trackfile at <bedPath> (unless | |
### <bedPath> is provided as ''). | |
### | |
### Include in the bed file an initial <trackLine> declaring the | |
### <trackName> as well as 'graphType=junctions'. | |
### | |
### <bedPath> defaults to the bamPath with any trailing .bam removed, | |
### suffixed with 'junctions.bed'. | |
### | |
### <trackName> defaults to the basename of <bamPath> without any | |
### .extensions. | |
### | |
### 'graphType=junctions' is a convention established (AFAIK) by | |
### TopHat and respected by IGV (at least) causing it to display the | |
### regions using arc-shaped glyphs with a weight proportional to the | |
### score (up to 50). | |
### | |
### Additional options in <...> are passed to bed.export. | |
### | |
### Values: a list of the tabulatedGaps, as a data.frame and the | |
### bedPath | |
### | |
### AUTHOR: [email protected] | |
### | |
### EXAMPLE | |
### 1) using this code from gist (requires the R devtools library installed) | |
### RScript --default-packages=devtools,utils,methods,stats -e "suppressPackageStartupMessages(source_gist(1016957))" -e "invisible(do.call(bamTabulateGaps,as.list(commandArgs(TRUE))))" foo.bam | |
GA <- readGappedAlignments(bamPath) | |
GA.gapped <- | |
## which ones have gaps? | |
GA[ngap(GA) != 0] | |
GA.gapped.grg <- | |
## a GRanges, with one component per alignment (which looses | |
## spliced internals). | |
granges(GA.gapped); | |
GA.gapped.grglist <- | |
## a GRangesList of GRanges, one per alignment | |
grglist(GA.gapped); | |
J2J.grglist <- | |
## the gapped regions (presumably intronic), as GRangesList, | |
## indexed by read. | |
psetdiff(GA.gapped.grg,GA.gapped.grglist) | |
J2J.GRanges <- | |
## the gapped regions, as genome wide GRanges | |
unlist(J2J.grglist) | |
## Offset to include the last base of upstream exon and first base | |
## of downstream exon: | |
start(J2J.GRanges) <- start(J2J.GRanges) - 1 | |
end(J2J.GRanges) <- end(J2J.GRanges) + 1 | |
J2JDF <- | |
## coerce to data.frame so we can query using sqldf. | |
as.data.frame(J2J.GRanges) | |
tabulatedGaps <- | |
## Count the number of reads grouped by chromosome,start,end,strand | |
sqldf(paste("SELECT seqnames AS space,start,end, count(*) AS score", | |
"FROM J2JDF", | |
"GROUP BY seqnames,start,end")) | |
if ('' != bedPath) { | |
write(trackLine,file=bedPath) | |
export.bed(tabulatedGaps,bedPath,append=TRUE) | |
} | |
list(bedPath=bedPath, | |
tabulatedGaps=tabulatedGaps | |
) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment