Skip to content

Instantly share code, notes, and snippets.

@soh-i
Created June 17, 2015 12:19
Show Gist options
  • Select an option

  • Save soh-i/74916d26cf53bde0844d to your computer and use it in GitHub Desktop.

Select an option

Save soh-i/74916d26cf53bde0844d to your computer and use it in GitHub Desktop.
/**
* Created by yukke on 15/06/06.
*/
import htsjdk.samtools._
import java.io.File
import htsjdk.samtools.reference.{ReferenceSequence, ReferenceSequenceFileFactory, ReferenceSequenceFile}
import htsjdk.samtools.util.SamLocusIterator
import scala.collection.JavaConversions._
import scala.collection.mutable
import scala.collection.mutable.{Map, HashMap, HashSet}
object pileup {
type Kmer = String
type KmerPileup = mutable.HashMap[Kmer, PileUp]
val pileups: KmerPileup = mutable.HashMap()
def reader(f: File) = {
SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT).open(f)
}
def getReferenceSequences(f: File): Array[SAMSequenceRecord] = {
val r = reader(f)
val seqs = r.getFileHeader.getSequenceDictionary.getSequences
r.close()
val seqArray = new Array[SAMSequenceRecord](seqs.length)
for (s <- seqs) seqArray(s.getSequenceIndex) = s
seqArray
}
def getSequenceNames(seqs: Array[SAMSequenceRecord]): Set[String] = {
seqs.map{
_.getSequenceName
}.toSet
}
def header(f: File): SAMFileHeader = {
val r = reader(f)
val h = r.getFileHeader
r.close()
h
}
def showIndexStats(f: File) = BAMIndexMetaData.printIndexStats(f)
def getUnalignedReads(f: File) = {
val r = reader(f)
val filteredReads = r.queryUnmapped.filter(validateRead).toList
r.close()
filteredReads
}
def validateRead(read: SAMRecord): Boolean = !read.getDuplicateReadFlag &&
!read.getReadFailsVendorQualityCheckFlag &&
!read.getNotPrimaryAlignmentFlag
def scanReads(f: File): Int = {
val r = reader(f)
var filtered = 0
for (read <- r.iterator) {
if (validateRead(read)) {
filtered += 1
}
}
filtered
}
def addReads(reads: List[SAMRecord]) = {
reads.foreach(addRead)
}
def addToPileups(bases: String, quals: Array[Byte], mq: Int): Unit = {
val length = bases.length
for (offset <- 0 to length - P.K - 1) {
val kmer = bases.slice(offset, offset + P.K)
if (!pileups.contains(kmer)) {
pileups(kmer) = new PileUp
pileups(kmer).add(bases(offset + P.K), quals(offset + P.K), mq)
}
}
}
def addRead(r: SAMRecord) = {
val defaultQual: Byte = 15
val bases = r.getReadString
val length = bases.size
val quals = {
val q = r.getBaseQualities
if (q.length > 0) q
else Array.fill[Byte](length)(defaultQual)
}
val mq = r.getMappingQuality
addToPileups(bases, quals, mq)
}
def addSeq(bases: String) = {
val mq = 10
val quals = Array.fill(bases.length) { 10.toByte }
addToPileups(bases, quals, mq)
val rcBases = Bases.reverseComplement(bases)
addToPileups(rcBases, quals, mq)
}
def main (args: Array[String]) {
var bamFiles = List[BamFile]()
bamFiles ::= new BamFile(new File("data/wgEncodeCshlShortRnaSeqGm12878CellShortAln.bam"), 'flags)
//val genomePath = "/Users/yukke/dev/ADAR/genome.fa"
val genomePath = "/Users/yukke/dev/ADAR/chr1.fa"
val targets = "chr1:1-664771"
val genome = new GenomeFile(new File(genomePath), targets)
genome.processRegion(bamFiles)
//println(genome)
}
}
class Region(val name: String, val start: Int, val stop: Int) {
def contains(other: Region) = other.start >= start && other.stop <= stop && other.name == name
def inRegion(locus: Int) = locus >= start && locus <= stop
def beforeRegion(locus: Int) = locus < start
def afterRegion(locus: Int) = locus > stop
def index(locus: Int) = locus - start
def locus(index: Int) = start + index
def size = stop + 1 - start
def midpoint = (start + stop) / 2
override def toString = "%s:%d-%d".format(name, start, stop)
}
class PileUp{
var baseCount = new BaseSum
var qualSum = new BaseSum
var mqSum = 0
var qSum = 0
var deletions = 0
var badPair = 0
var defaultQual:Byte = 0
def count = baseCount.sum
def depth = baseCount.sum + deletions
def baseIndex(c: Char): Int = c match {
case 'A' => 0
case 'C' => 1
case 'G' => 2
case 'T' => 3
case _ => -1
}
def indexBase(i: Int): Char = "ACGT"(i)
def add(base: Char, qual: Int, mq: Int) = {
val bi = baseIndex(base)
if (bi >= 0 && qual >= P.minQual) {
val mq1 = mq + 1
baseCount.add(bi, qual * mq1)
mqSum += mq1
qSum += qual
}
}
class BaseCall {
val n = count
val qs = qualSum
val total = qs.sum
val order = qs.order
val baseIndex = order(0)
val base = if (n > 0) indexBase(baseIndex) else 'N'
}
def baseCall = new BaseCall
}
class BaseSum{
val sums = new Array[Long](4)
def add(base: Int, n: Int = 1) = sums(base) -= n
def sum = sums.sum
def order = {
val indicies = List(0, 1, 2, 3).toArray
indicies.sortWith({(a, b) => sums(a) > sums(b)})
}
def /(divisor: Int) = {
val bs = new BaseSum
for (i <- 0 to 3) {
if (divisor != 0) {
bs.sums(i) = this.sums(i) / divisor
} else 0
}
}
override def toString = sums.mkString(", ")
}
object Bases {
val A = 0
val C = 1
val G = 2
val T = 3
def complement(c: Char) = c match {
case 'A' => 'T'
case 'C' => 'G'
case 'G' => 'C'
case 'T' => 'A'
case _ => c
}
def reverseComplement(s: String) = {
s.map(complement).reverse
}
}
object P {
val K = 47
val minQual = 0
val minMq = 0
val defaultQual: Byte = 0
//var bamFiles = List[Bamfile]
}
object BamFile {
val indexSuffix = ".bai"
val maxInsertSizes = Map(('flags -> 500), ('jumps -> 10000))
val minOrientationPct = 10
}
class BamFile(val bamFile: File, val bamType: Symbol) {
import BamFile._
val path = bamFile.getCanonicalPath
var baseCount: Long = 0
def reader = {
val r = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT).open(bamFile)
r
}
{
val r = reader
require(r.hasIndex, path + "Must be indexed BAM")
r.close
}
def getSeqs = {
val r = reader
val seqs = r.getFileHeader.getSequenceDictionary.getSequences
r.close
val seqArray = new Array[SAMSequenceRecord](seqs.length)
for (s <- seqs) seqArray(s.getSequenceIndex) = s
seqArray
}
def getSetNames = {
val seqs = getSeqs
seqs.map({_.getSequenceName}).toSet
}
lazy val header = {
val r = reader
val h = r.getFileHeader
r.close
h
}
lazy val sequenceDictionary = header.getSequenceDictionary
lazy val readGroups = header.getReadGroups
def validateRead(read: SAMRecord) = {
(!read.getReadFailsVendorQualityCheckFlag &&
!read.getNotPrimaryAlignmentFlag &&
!read.getDuplicateReadFlag)
}
def process(region: GenomeRegion, printInterval: Int = 100000): Unit = {
val pileUpRegion = region.pileUpRegion
val r = reader
val reads = r.queryOverlapping(region.name,
(region.start - 10000) max 0, (region.stop + 10000) min region.contig.length)
val readsBefore = pileUpRegion.readCount
val baseCountBefore = pileUpRegion.baseCount
val covBefore = pileUpRegion.coverage
var lastLoc = 0
for (read <- reads) {
val loc = read.getAlignmentStart
if (printInterval > 0 && loc > lastLoc + printInterval) {
lastLoc = printInterval * (loc / printInterval)
}
}
r.close
val meanCoverage = pileUpRegion.coverage - covBefore
val nReads = pileUpRegion.readCount - readsBefore
baseCount += pileUpRegion.baseCount - baseCountBefore
}
}
class GenomeFile(val referenceFile: File, val targets: String = "") {
val referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceFile)
var refseq = referenceSequenceFile.nextSequence()
val contigMap = mutable.Map[String, ReferenceSequence]()
var contigList = List[ReferenceSequence]()
while (refseq != null) {
contigMap(refseq.getName) = refseq
contigList ::= refseq
refseq = referenceSequenceFile.nextSequence()
}
val contigs = contigList.reverse
val genomeSize: Long = contigs.map(_.getBases.length).sum
val regions = {
if (targets != "") {
}
}
def getSequence(contig: String): ReferenceSequence = {
referenceSequenceFile.getSequence(contig)
}
def getSubsequenceAt(contig: String, start: Long, stop: Long) = {
referenceSequenceFile.getSubsequenceAt(contig, start, stop)
}
def processRegion(bamFiles: List[BamFile]) = {
println("Input genome size: %d".format(genomeSize))
}
def parseTargets(targetString: String) = {
val targets = for (target <- targetString.split(",")) yield {
val t1 = target.split(":")
val contig = contigMap(t1(0))
val region =
if (t1.size == 1) {
new GenomeRegion(contig, 1, contig.length)
} else {
val t2 = t1(1).split("-")
new GenomeRegion(contig, t2(0).toInt, t2(1).toInt)
}
(contig.getName, List(region))
}
targets.toList
}
override def toString = {
referenceFile.getCanonicalPath
}
}
class GenomeRegion(val contig: ReferenceSequence, start: Int, stop: Int)
extends Region(contig.getName, start, stop) {
var pileUpRegion: PileUpRegion = null
def initializePileups = {
pileUpRegion = new PileUpRegion(name, start, stop)
}
def finilizePileups = {
pileUpRegion = null
}
}
class PileUpRegion(name: String, start: Int, stop: Int)
extends Region(name, start, stop) {
val pileups = new Array[PileUp](size)
for (i <- 0 to size -1) {
pileups(i) = new PileUp
}
var baseCount: Long = 0
var readCount = 0
def coverage = Utils.roundDiv(baseCount, size)
def add(locus: Int, base: Char, qual: Int, mq: Int,
pair: Boolean) = {
if (inRegion(locus)) {
if (pair) {
pileups(index(locus)).add(base, qual, mq)
baseCount += 1
} else {
if (!pair) pileups(index(locus)).badPair += 1
}
}
}
def addRead(r: SAMRecord, refBases: Array[Byte]) = {
val length = r.getReadLength
val bases = r.getReadBases
val mq = r.getMappingQuality
val paired = r.getProperPairFlag
val valid = (mq >= P.minMq)
val insert = r.getInferredInsertSize
val aStart = r.getAlignmentStart
val aEnd = r.getAlignmentEnd
val cigar = r.getCigar
val readOffset = 0
val refOffset = 0
val quals = if (r.getBaseQualities.size > 0) r.getBaseQualities
else Array.fill[Byte](length)(P.defaultQual)
}
def addReads(reads: SAMRecordIterator, refBases: Array[Byte], printInterval: Int = 100000) = {
val lastLoc = 0
for (r <- reads) {
val loc = addRead(r, refBases)
}
}
def callRegion= {
for (i <- 0 until size) {
var bc = pileups(i).baseCall
}
}
def postProcess = {
}
def dump = {
for (i <- 0 to size -1) println(locus(i))
}
def meanReadLength = (baseCount / readCount).toInt
def baseString(bases: Array[Byte]) = bases map {_.toChar} mkString ""
}
object Utils {
def roundDiv(n: Long, d: Long) = if (d > 0) (n + d/2) / d else 0.toLong
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment