Created
June 17, 2015 12:19
-
-
Save soh-i/74916d26cf53bde0844d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /** | |
| * Created by yukke on 15/06/06. | |
| */ | |
| import htsjdk.samtools._ | |
| import java.io.File | |
| import htsjdk.samtools.reference.{ReferenceSequence, ReferenceSequenceFileFactory, ReferenceSequenceFile} | |
| import htsjdk.samtools.util.SamLocusIterator | |
| import scala.collection.JavaConversions._ | |
| import scala.collection.mutable | |
| import scala.collection.mutable.{Map, HashMap, HashSet} | |
| object pileup { | |
| type Kmer = String | |
| type KmerPileup = mutable.HashMap[Kmer, PileUp] | |
| val pileups: KmerPileup = mutable.HashMap() | |
| def reader(f: File) = { | |
| SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT).open(f) | |
| } | |
| def getReferenceSequences(f: File): Array[SAMSequenceRecord] = { | |
| val r = reader(f) | |
| val seqs = r.getFileHeader.getSequenceDictionary.getSequences | |
| r.close() | |
| val seqArray = new Array[SAMSequenceRecord](seqs.length) | |
| for (s <- seqs) seqArray(s.getSequenceIndex) = s | |
| seqArray | |
| } | |
| def getSequenceNames(seqs: Array[SAMSequenceRecord]): Set[String] = { | |
| seqs.map{ | |
| _.getSequenceName | |
| }.toSet | |
| } | |
| def header(f: File): SAMFileHeader = { | |
| val r = reader(f) | |
| val h = r.getFileHeader | |
| r.close() | |
| h | |
| } | |
| def showIndexStats(f: File) = BAMIndexMetaData.printIndexStats(f) | |
| def getUnalignedReads(f: File) = { | |
| val r = reader(f) | |
| val filteredReads = r.queryUnmapped.filter(validateRead).toList | |
| r.close() | |
| filteredReads | |
| } | |
| def validateRead(read: SAMRecord): Boolean = !read.getDuplicateReadFlag && | |
| !read.getReadFailsVendorQualityCheckFlag && | |
| !read.getNotPrimaryAlignmentFlag | |
| def scanReads(f: File): Int = { | |
| val r = reader(f) | |
| var filtered = 0 | |
| for (read <- r.iterator) { | |
| if (validateRead(read)) { | |
| filtered += 1 | |
| } | |
| } | |
| filtered | |
| } | |
| def addReads(reads: List[SAMRecord]) = { | |
| reads.foreach(addRead) | |
| } | |
| def addToPileups(bases: String, quals: Array[Byte], mq: Int): Unit = { | |
| val length = bases.length | |
| for (offset <- 0 to length - P.K - 1) { | |
| val kmer = bases.slice(offset, offset + P.K) | |
| if (!pileups.contains(kmer)) { | |
| pileups(kmer) = new PileUp | |
| pileups(kmer).add(bases(offset + P.K), quals(offset + P.K), mq) | |
| } | |
| } | |
| } | |
| def addRead(r: SAMRecord) = { | |
| val defaultQual: Byte = 15 | |
| val bases = r.getReadString | |
| val length = bases.size | |
| val quals = { | |
| val q = r.getBaseQualities | |
| if (q.length > 0) q | |
| else Array.fill[Byte](length)(defaultQual) | |
| } | |
| val mq = r.getMappingQuality | |
| addToPileups(bases, quals, mq) | |
| } | |
| def addSeq(bases: String) = { | |
| val mq = 10 | |
| val quals = Array.fill(bases.length) { 10.toByte } | |
| addToPileups(bases, quals, mq) | |
| val rcBases = Bases.reverseComplement(bases) | |
| addToPileups(rcBases, quals, mq) | |
| } | |
| def main (args: Array[String]) { | |
| var bamFiles = List[BamFile]() | |
| bamFiles ::= new BamFile(new File("data/wgEncodeCshlShortRnaSeqGm12878CellShortAln.bam"), 'flags) | |
| //val genomePath = "/Users/yukke/dev/ADAR/genome.fa" | |
| val genomePath = "/Users/yukke/dev/ADAR/chr1.fa" | |
| val targets = "chr1:1-664771" | |
| val genome = new GenomeFile(new File(genomePath), targets) | |
| genome.processRegion(bamFiles) | |
| //println(genome) | |
| } | |
| } | |
| class Region(val name: String, val start: Int, val stop: Int) { | |
| def contains(other: Region) = other.start >= start && other.stop <= stop && other.name == name | |
| def inRegion(locus: Int) = locus >= start && locus <= stop | |
| def beforeRegion(locus: Int) = locus < start | |
| def afterRegion(locus: Int) = locus > stop | |
| def index(locus: Int) = locus - start | |
| def locus(index: Int) = start + index | |
| def size = stop + 1 - start | |
| def midpoint = (start + stop) / 2 | |
| override def toString = "%s:%d-%d".format(name, start, stop) | |
| } | |
| class PileUp{ | |
| var baseCount = new BaseSum | |
| var qualSum = new BaseSum | |
| var mqSum = 0 | |
| var qSum = 0 | |
| var deletions = 0 | |
| var badPair = 0 | |
| var defaultQual:Byte = 0 | |
| def count = baseCount.sum | |
| def depth = baseCount.sum + deletions | |
| def baseIndex(c: Char): Int = c match { | |
| case 'A' => 0 | |
| case 'C' => 1 | |
| case 'G' => 2 | |
| case 'T' => 3 | |
| case _ => -1 | |
| } | |
| def indexBase(i: Int): Char = "ACGT"(i) | |
| def add(base: Char, qual: Int, mq: Int) = { | |
| val bi = baseIndex(base) | |
| if (bi >= 0 && qual >= P.minQual) { | |
| val mq1 = mq + 1 | |
| baseCount.add(bi, qual * mq1) | |
| mqSum += mq1 | |
| qSum += qual | |
| } | |
| } | |
| class BaseCall { | |
| val n = count | |
| val qs = qualSum | |
| val total = qs.sum | |
| val order = qs.order | |
| val baseIndex = order(0) | |
| val base = if (n > 0) indexBase(baseIndex) else 'N' | |
| } | |
| def baseCall = new BaseCall | |
| } | |
| class BaseSum{ | |
| val sums = new Array[Long](4) | |
| def add(base: Int, n: Int = 1) = sums(base) -= n | |
| def sum = sums.sum | |
| def order = { | |
| val indicies = List(0, 1, 2, 3).toArray | |
| indicies.sortWith({(a, b) => sums(a) > sums(b)}) | |
| } | |
| def /(divisor: Int) = { | |
| val bs = new BaseSum | |
| for (i <- 0 to 3) { | |
| if (divisor != 0) { | |
| bs.sums(i) = this.sums(i) / divisor | |
| } else 0 | |
| } | |
| } | |
| override def toString = sums.mkString(", ") | |
| } | |
| object Bases { | |
| val A = 0 | |
| val C = 1 | |
| val G = 2 | |
| val T = 3 | |
| def complement(c: Char) = c match { | |
| case 'A' => 'T' | |
| case 'C' => 'G' | |
| case 'G' => 'C' | |
| case 'T' => 'A' | |
| case _ => c | |
| } | |
| def reverseComplement(s: String) = { | |
| s.map(complement).reverse | |
| } | |
| } | |
| object P { | |
| val K = 47 | |
| val minQual = 0 | |
| val minMq = 0 | |
| val defaultQual: Byte = 0 | |
| //var bamFiles = List[Bamfile] | |
| } | |
| object BamFile { | |
| val indexSuffix = ".bai" | |
| val maxInsertSizes = Map(('flags -> 500), ('jumps -> 10000)) | |
| val minOrientationPct = 10 | |
| } | |
| class BamFile(val bamFile: File, val bamType: Symbol) { | |
| import BamFile._ | |
| val path = bamFile.getCanonicalPath | |
| var baseCount: Long = 0 | |
| def reader = { | |
| val r = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT).open(bamFile) | |
| r | |
| } | |
| { | |
| val r = reader | |
| require(r.hasIndex, path + "Must be indexed BAM") | |
| r.close | |
| } | |
| def getSeqs = { | |
| val r = reader | |
| val seqs = r.getFileHeader.getSequenceDictionary.getSequences | |
| r.close | |
| val seqArray = new Array[SAMSequenceRecord](seqs.length) | |
| for (s <- seqs) seqArray(s.getSequenceIndex) = s | |
| seqArray | |
| } | |
| def getSetNames = { | |
| val seqs = getSeqs | |
| seqs.map({_.getSequenceName}).toSet | |
| } | |
| lazy val header = { | |
| val r = reader | |
| val h = r.getFileHeader | |
| r.close | |
| h | |
| } | |
| lazy val sequenceDictionary = header.getSequenceDictionary | |
| lazy val readGroups = header.getReadGroups | |
| def validateRead(read: SAMRecord) = { | |
| (!read.getReadFailsVendorQualityCheckFlag && | |
| !read.getNotPrimaryAlignmentFlag && | |
| !read.getDuplicateReadFlag) | |
| } | |
| def process(region: GenomeRegion, printInterval: Int = 100000): Unit = { | |
| val pileUpRegion = region.pileUpRegion | |
| val r = reader | |
| val reads = r.queryOverlapping(region.name, | |
| (region.start - 10000) max 0, (region.stop + 10000) min region.contig.length) | |
| val readsBefore = pileUpRegion.readCount | |
| val baseCountBefore = pileUpRegion.baseCount | |
| val covBefore = pileUpRegion.coverage | |
| var lastLoc = 0 | |
| for (read <- reads) { | |
| val loc = read.getAlignmentStart | |
| if (printInterval > 0 && loc > lastLoc + printInterval) { | |
| lastLoc = printInterval * (loc / printInterval) | |
| } | |
| } | |
| r.close | |
| val meanCoverage = pileUpRegion.coverage - covBefore | |
| val nReads = pileUpRegion.readCount - readsBefore | |
| baseCount += pileUpRegion.baseCount - baseCountBefore | |
| } | |
| } | |
| class GenomeFile(val referenceFile: File, val targets: String = "") { | |
| val referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceFile) | |
| var refseq = referenceSequenceFile.nextSequence() | |
| val contigMap = mutable.Map[String, ReferenceSequence]() | |
| var contigList = List[ReferenceSequence]() | |
| while (refseq != null) { | |
| contigMap(refseq.getName) = refseq | |
| contigList ::= refseq | |
| refseq = referenceSequenceFile.nextSequence() | |
| } | |
| val contigs = contigList.reverse | |
| val genomeSize: Long = contigs.map(_.getBases.length).sum | |
| val regions = { | |
| if (targets != "") { | |
| } | |
| } | |
| def getSequence(contig: String): ReferenceSequence = { | |
| referenceSequenceFile.getSequence(contig) | |
| } | |
| def getSubsequenceAt(contig: String, start: Long, stop: Long) = { | |
| referenceSequenceFile.getSubsequenceAt(contig, start, stop) | |
| } | |
| def processRegion(bamFiles: List[BamFile]) = { | |
| println("Input genome size: %d".format(genomeSize)) | |
| } | |
| def parseTargets(targetString: String) = { | |
| val targets = for (target <- targetString.split(",")) yield { | |
| val t1 = target.split(":") | |
| val contig = contigMap(t1(0)) | |
| val region = | |
| if (t1.size == 1) { | |
| new GenomeRegion(contig, 1, contig.length) | |
| } else { | |
| val t2 = t1(1).split("-") | |
| new GenomeRegion(contig, t2(0).toInt, t2(1).toInt) | |
| } | |
| (contig.getName, List(region)) | |
| } | |
| targets.toList | |
| } | |
| override def toString = { | |
| referenceFile.getCanonicalPath | |
| } | |
| } | |
| class GenomeRegion(val contig: ReferenceSequence, start: Int, stop: Int) | |
| extends Region(contig.getName, start, stop) { | |
| var pileUpRegion: PileUpRegion = null | |
| def initializePileups = { | |
| pileUpRegion = new PileUpRegion(name, start, stop) | |
| } | |
| def finilizePileups = { | |
| pileUpRegion = null | |
| } | |
| } | |
| class PileUpRegion(name: String, start: Int, stop: Int) | |
| extends Region(name, start, stop) { | |
| val pileups = new Array[PileUp](size) | |
| for (i <- 0 to size -1) { | |
| pileups(i) = new PileUp | |
| } | |
| var baseCount: Long = 0 | |
| var readCount = 0 | |
| def coverage = Utils.roundDiv(baseCount, size) | |
| def add(locus: Int, base: Char, qual: Int, mq: Int, | |
| pair: Boolean) = { | |
| if (inRegion(locus)) { | |
| if (pair) { | |
| pileups(index(locus)).add(base, qual, mq) | |
| baseCount += 1 | |
| } else { | |
| if (!pair) pileups(index(locus)).badPair += 1 | |
| } | |
| } | |
| } | |
| def addRead(r: SAMRecord, refBases: Array[Byte]) = { | |
| val length = r.getReadLength | |
| val bases = r.getReadBases | |
| val mq = r.getMappingQuality | |
| val paired = r.getProperPairFlag | |
| val valid = (mq >= P.minMq) | |
| val insert = r.getInferredInsertSize | |
| val aStart = r.getAlignmentStart | |
| val aEnd = r.getAlignmentEnd | |
| val cigar = r.getCigar | |
| val readOffset = 0 | |
| val refOffset = 0 | |
| val quals = if (r.getBaseQualities.size > 0) r.getBaseQualities | |
| else Array.fill[Byte](length)(P.defaultQual) | |
| } | |
| def addReads(reads: SAMRecordIterator, refBases: Array[Byte], printInterval: Int = 100000) = { | |
| val lastLoc = 0 | |
| for (r <- reads) { | |
| val loc = addRead(r, refBases) | |
| } | |
| } | |
| def callRegion= { | |
| for (i <- 0 until size) { | |
| var bc = pileups(i).baseCall | |
| } | |
| } | |
| def postProcess = { | |
| } | |
| def dump = { | |
| for (i <- 0 to size -1) println(locus(i)) | |
| } | |
| def meanReadLength = (baseCount / readCount).toInt | |
| def baseString(bases: Array[Byte]) = bases map {_.toChar} mkString "" | |
| } | |
| object Utils { | |
| def roundDiv(n: Long, d: Long) = if (d > 0) (n + d/2) / d else 0.toLong | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment