malcook · June 9, 2011 15:20
diff --git a/bamTabulateGaps.R b/bamTabulateGaps.R
 library(IRanges)       # for: coverage, psetdiff, etc
 library(GenomicRanges) # for: readGappedAlignments, 
 library(Rsamtools)     # for: reading bam files scanBam, countBam etc
 library(rtracklayer)   # for: track file IO (bed, wig, etc)
 library(sqldf)         # for: querying dataframes using SQL\

 bamTabulateGaps <- function(bamPath,
                            bedPath=paste(gsub('.bam$','',bamPath),'.junctions.bed',sep=''),
                            trackName=gsub('\\..*','',basename(bamPath)),
                            trackLine=sprintf("track name=%s graphType=junctions",trackName),
                            ...  ){
 ### Tabulate the (unstranded) coverage of all gaps present in the
 ### gapped alignments in <bamPath>, such as may be produced by a
 ### splice-aware aligner (e.g. GSNAP).
 ###
 ### Optionally, write a bed formatted trackfile at <bedPath> (unless
 ### <bedPath> is provided as '').
 ###
 ### Include in the bed file an initial <trackLine> declaring the
 ### <trackName> as well as 'graphType=junctions'.
 ###
 ### <bedPath> defaults to the bamPath with any trailing .bam removed,
 ### suffixed with 'junctions.bed'.
 ###
 ### <trackName> defaults to the basename of <bamPath> without any
 ### .extensions.
 ###
 ### 'graphType=junctions' is a convention established (AFAIK) by
 ### TopHat and respected by IGV (at least) causing it to display the
 ### regions using arc-shaped glyphs with a weight proportional to the
 ### score (up to 50).
 ###
 ### Additional options in <...> are passed to bed.export.
 ###
 ### Values: a list of the tabulatedGaps, as a data.frame and the
 ### bedPath
 ###
 ### AUTHOR: [email protected]
 ###
 ### EXAMPLE
 ### 1) using this code from gist (requires the R devtools library installed)
 ### RScript --default-packages=devtools,utils,methods,stats -e "suppressPackageStartupMessages(source_gist(1016957))" -e "invisible(do.call(bamTabulateGaps,as.list(commandArgs(TRUE))))"  foo.bam  
  GA <- readGappedAlignments(bamPath)
  GA.gapped <-
    ## which ones have gaps?
    GA[ngap(GA) != 0]
  GA.gapped.grg <-
    ## a GRanges, with one component per alignment (which looses
    ## spliced internals).
    granges(GA.gapped);
  GA.gapped.grglist <-
    ## a GRangesList of GRanges, one per alignment
    grglist(GA.gapped);
  J2J.grglist <-
    ## the gapped regions (presumably intronic), as GRangesList,
    ## indexed by read.
    psetdiff(GA.gapped.grg,GA.gapped.grglist)
  J2J.GRanges <-
    ## the gapped regions, as genome wide GRanges
    unlist(J2J.grglist)
  ## Offset to include the last base of upstream exon and first base
  ## of downstream exon:
  start(J2J.GRanges) <- start(J2J.GRanges) - 1
  end(J2J.GRanges)   <- end(J2J.GRanges)   + 1
  J2JDF <-
    ## coerce to data.frame so we can query using sqldf.
    as.data.frame(J2J.GRanges)
  tabulatedGaps <-
    ## Count the number of reads grouped by chromosome,start,end,strand
    sqldf(paste("SELECT seqnames AS space,start,end, count(*) AS score",
                "FROM J2JDF",
                "GROUP BY seqnames,start,end"))
  if ('' != bedPath) {
    write(trackLine,file=bedPath)
    export.bed(tabulatedGaps,bedPath,append=TRUE)
  }
  list(bedPath=bedPath,
       tabulatedGaps=tabulatedGaps
       )
 }
	library(IRanges) # for: coverage, psetdiff, etc
	library(GenomicRanges) # for: readGappedAlignments,
	library(Rsamtools) # for: reading bam files scanBam, countBam etc
	library(rtracklayer) # for: track file IO (bed, wig, etc)
	library(sqldf) # for: querying dataframes using SQL\

	bamTabulateGaps <- function(bamPath,
	bedPath=paste(gsub('.bam$','',bamPath),'.junctions.bed',sep=''),
	trackName=gsub('\\..*','',basename(bamPath)),
	trackLine=sprintf("track name=%s graphType=junctions",trackName),
	... ){
	### Tabulate the (unstranded) coverage of all gaps present in the
	### gapped alignments in <bamPath>, such as may be produced by a
	### splice-aware aligner (e.g. GSNAP).
	###
	### Optionally, write a bed formatted trackfile at <bedPath> (unless
	### <bedPath> is provided as '').
	###
	### Include in the bed file an initial <trackLine> declaring the
	### <trackName> as well as 'graphType=junctions'.
	###
	### <bedPath> defaults to the bamPath with any trailing .bam removed,
	### suffixed with 'junctions.bed'.
	###
	### <trackName> defaults to the basename of <bamPath> without any
	### .extensions.
	###
	### 'graphType=junctions' is a convention established (AFAIK) by
	### TopHat and respected by IGV (at least) causing it to display the
	### regions using arc-shaped glyphs with a weight proportional to the
	### score (up to 50).
	###
	### Additional options in <...> are passed to bed.export.
	###
	### Values: a list of the tabulatedGaps, as a data.frame and the
	### bedPath
	###
	### AUTHOR: [email protected]
	###
	### EXAMPLE
	### 1) using this code from gist (requires the R devtools library installed)
	### RScript --default-packages=devtools,utils,methods,stats -e "suppressPackageStartupMessages(source_gist(1016957))" -e "invisible(do.call(bamTabulateGaps,as.list(commandArgs(TRUE))))" foo.bam
	GA <- readGappedAlignments(bamPath)
	GA.gapped <-
	## which ones have gaps?
	GA[ngap(GA) != 0]
	GA.gapped.grg <-
	## a GRanges, with one component per alignment (which looses
	## spliced internals).
	granges(GA.gapped);
	GA.gapped.grglist <-
	## a GRangesList of GRanges, one per alignment
	grglist(GA.gapped);
	J2J.grglist <-
	## the gapped regions (presumably intronic), as GRangesList,
	## indexed by read.
	psetdiff(GA.gapped.grg,GA.gapped.grglist)
	J2J.GRanges <-
	## the gapped regions, as genome wide GRanges
	unlist(J2J.grglist)
	## Offset to include the last base of upstream exon and first base
	## of downstream exon:
	start(J2J.GRanges) <- start(J2J.GRanges) - 1
	end(J2J.GRanges) <- end(J2J.GRanges) + 1
	J2JDF <-
	## coerce to data.frame so we can query using sqldf.
	as.data.frame(J2J.GRanges)
	tabulatedGaps <-
	## Count the number of reads grouped by chromosome,start,end,strand
	sqldf(paste("SELECT seqnames AS space,start,end, count(*) AS score",
	"FROM J2JDF",
	"GROUP BY seqnames,start,end"))
	if ('' != bedPath) {
	write(trackLine,file=bedPath)
	export.bed(tabulatedGaps,bedPath,append=TRUE)
	}
	list(bedPath=bedPath,
	tabulatedGaps=tabulatedGaps
	)
	}