Last active
August 29, 2015 14:21
-
-
Save soh-i/119e690ac6f9000ceb33 to your computer and use it in GitHub Desktop.
read GFF in scala
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import java.io.File | |
| import htsjdk.samtools._ | |
| import scala.compat.Platform | |
| import scala.collection.JavaConversions._ | |
| import scala.util.control.Breaks.break | |
| /** | |
| * Created by yukke on 15/05/23. | |
| */ | |
| trait Benchmark { | |
| def run() | |
| var multiplier = 1 | |
| def runBenchmark(noTimes: Int): List[Int] = { | |
| for (i <- List.range(1, noTimes + 1)) yield { | |
| val startTime = Platform.currentTime | |
| var i = 0 | |
| while (i < multiplier) { | |
| run() | |
| i += 1 | |
| } | |
| val stopTime = Platform.currentTime | |
| Platform.collectGarbage() | |
| (stopTime - startTime).toInt | |
| } | |
| } | |
| } | |
| object bamReader { | |
| def readAllBAMRecords(f: File): Int = { | |
| //val reader: SamReader = SamReaderFactory.makeDefault().open(f) | |
| val reader = SamReaderFactory.make() | |
| .disable(SamReaderFactory.Option.DONT_MEMORY_MAP_INDEX) | |
| .validationStringency(ValidationStringency.SILENT) | |
| .samRecordFactory(DefaultSAMRecordFactory.getInstance()) | |
| .open(f) | |
| val iter: SAMRecordIterator = reader.iterator() | |
| var nRecs = 0 | |
| while (iter.hasNext) { | |
| val rec: SAMRecord = iter.next() | |
| rec.getAlignmentBlocks | |
| //nRecs += 1 | |
| //val c = nRecs % 1000000 | |
| //if (c == 0) { | |
| //println(nRecs) | |
| //} | |
| } | |
| iter.close() | |
| reader.close() | |
| //println(nRecs) | |
| nRecs | |
| } | |
| def benchmark(times: Int)(f: => Any) = new Benchmark{def run = f }.runBenchmark(times) | |
| def getProcedures(bamHeader: SamReader): List[htsjdk.samtools.SAMProgramRecord] = { | |
| bamHeader.getFileHeader.getProgramRecords.toList | |
| } | |
| def getMaxChromosomeLength(bamHeaderRecs: java.util.List[htsjdk.samtools.SAMSequenceRecord]): Int = { | |
| bamHeaderRecs.map(_.getSequenceLength).max | |
| } | |
| def createIndexOfChromosomes(bamReader: SamReader): Map[String, Int] = { | |
| val chromIndex = { | |
| val b = Map.newBuilder[String, Int] | |
| for (rec <- bamReader.getFileHeader.getSequenceDictionary.getSequences) { | |
| b += rec.getSequenceName -> rec.getSequenceLength | |
| } | |
| b.result | |
| } | |
| chromIndex | |
| } | |
| def averageBaseQuality(qual: Array[Byte]): Int = { | |
| var sum = 0 | |
| for (b <- qual) sum += b | |
| sum / qual.length | |
| } | |
| def main(args: Array[String]) { | |
| val GM12872 = new File("data/wgEncodeCshlShortRnaSeqGm12878CellShortAln.bam") | |
| val bamReader = SamReaderFactory.makeDefault().open(GM12872) | |
| val bamHeaderRecs = bamReader.getFileHeader.getSequenceDictionary.getSequences | |
| val chrIndex = createIndexOfChromosomes(bamReader) | |
| //println(chrIndex.valuesIterator.reduceLeft((x, y) => if(x>y) x else y)) | |
| val maxChrom = getMaxChromosomeLength(bamHeaderRecs) | |
| val genome = new File("") | |
| //val fastaReader = htsjdk.samtools.reference.FastaSequenceFile | |
| for (read: htsjdk.samtools.SAMRecord <- bamReader.iterator) { | |
| //println(averageBaseQuality(read.getBaseQualities)) | |
| //println("%d-%d".format(read.getStart, read.getEnd)) | |
| var mismatches = 0 | |
| val readBases: Array[Byte] = read.getReadBases | |
| //println(readBases.take(1).mkString("")) | |
| val referenceBase = Array(41.toByte) | |
| for (block <- read.getAlignmentBlocks) { | |
| val readBlockStart = block.getReadStart - 1 | |
| val referenceBlockStart = block.getReferenceStart - 1 | |
| val length = block.getLength | |
| println("Read block start: <%d>".format(readBlockStart)) | |
| println("Ref block start: <%d>".format(referenceBlockStart-1)) | |
| println("Block length: <%d>".format(length)) | |
| //println(referenceBase(referenceBlockStart+1)) | |
| //val i = htsjdk.samtools.util.SequenceUtil.basesEqual(readBases(readBlockStart), referenceBase(referenceBlockStart)) | |
| //println(i) | |
| for (i <- 0 to length-1) { | |
| if (!htsjdk.samtools.util.SequenceUtil.basesEqual(readBases(readBlockStart + i), referenceBase(0))) { | |
| mismatches += 1 | |
| } | |
| } | |
| } | |
| println("# of mismatches: <%d>".format(mismatches)) | |
| //val ref:Array[Byte] = Array("T".toByte, "C".toByte) | |
| //val mismatch = htsjdk.samtools.util.SequenceUtil.countMismatches(rec, ref, 0, true) | |
| //println(mismatch) | |
| break | |
| } | |
| } | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /** | |
| * Created by yukke on 15/05/21. | |
| */ | |
| import scala.io.Source | |
| object Program { | |
| val gffFile: String = "/Users/yukke/IdeaProjects/hello-scala/src/mature.gtf" | |
| def countLine(file: String): Int = { | |
| Source.fromFile(file, "utf-8").getLines().length | |
| } | |
| def tabToComma(file: String): Iterator[String] = { | |
| Source.fromFile(file, "utf-8").getLines().map(_.replaceAll("\t", ",")) | |
| } | |
| def showHead(file: String, N: Int): List[String] = { | |
| Source.fromFile(file, "utf-8").getLines().toList.take(N) | |
| } | |
| def showTail(file: String, N: Int): List[String] = { | |
| Source.fromFile(file, "utf-8").getLines().toList.takeRight(N) | |
| } | |
| def removeSpaces(s: String): String = { | |
| s.replaceAll("[\\s]+", "") | |
| } | |
| def parseAttributeLine(col: String): Map[String, String] = { | |
| val attributes = col.split(";") | |
| val trID = attributes(0).replaceAll("(?:transcript_id)|[\"\\s]", "") | |
| val miID = attributes(1).replaceAll("(?:Alias)|[\"\\s]", "") | |
| val name = attributes(2).replaceAll("(?:Name)|[\"\\s]", "") | |
| val derivesFrom = attributes(3).replaceAll("(?:Derives_from)|[\"\\s]", "") | |
| Map("TranscriptID" -> trID, "Alias" -> miID, "Name" -> name, "Derives_from" -> derivesFrom) | |
| } | |
| def generateGffMap(file: String): Map[Int, Map[String, Any]] = { | |
| val pair = Map.newBuilder[Int, Map[String, Any]] | |
| for (line <- Source.fromFile(gffFile, "utf-8").getLines()) { | |
| val cols = line.split("\t") | |
| val chromosome = cols(0) | |
| val start = cols(3) | |
| val end = cols(4) | |
| val strand = cols(6) | |
| val primaryKey: Int = start.toInt + end.toInt | |
| val attrs = parseAttributeLine(cols(8)) | |
| pair += primaryKey -> Map("chr" -> chromosome, "start" -> start, "end" -> end, "strand" -> strand, "attributes" -> attrs) | |
| } | |
| pair.result | |
| } | |
| def main(args: Array[String]): Unit = { | |
| val miData = generateGffMap(gffFile) | |
| println(miData.take(1)) | |
| } | |
| } | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /** | |
| * Created by yukke on 15/05/21. | |
| */ | |
| import scala.io.Source | |
| object Annotation { | |
| def loadMIRbase(file: String) = { | |
| val b = Array.newBuilder[miRbase] | |
| for (line <- Source.fromFile(file, "utf-8").getLines; | |
| mirna <- miRbase.parse(line)) { | |
| b += mirna | |
| } | |
| b.result | |
| } | |
| def getEntryCount(gene: Array[Annotation.miRbase]): Int = { | |
| gene.length | |
| } | |
| def getmiRNAinChromosome(mirbase: Array[Annotation.miRbase], chromosome: String): Int = { | |
| var c = 0 | |
| for (m <- mirbase) { | |
| if (m.chr == chromosome) { | |
| c += 1 | |
| } | |
| } | |
| c | |
| } | |
| def main(args: Array[String]): Unit = { | |
| val file = "/Users/yukke/IdeaProjects/hello-scala/src/mature.gtf" | |
| val matureMina = loadMIRbase(file) | |
| println(getEntryCount(matureMina)) | |
| println(getmiRNAinChromosome(matureMina, "chr1")) | |
| } | |
| class miRbase(val chr: String, | |
| val source: String, | |
| val feature: String, | |
| val start: Int, | |
| val end: Int, | |
| val score: String, | |
| val strand: String, | |
| val frame: String, | |
| val attributes: Array[String]) { | |
| override def toString = { | |
| "%s:%d-%d:%s".format(chr, start, end, strand) | |
| } | |
| } | |
| object miRbase { | |
| def parse(line: String): Option[miRbase] = { | |
| def splitBySemicolon(attr: String): Array[String] = { | |
| val col = attr.split(";") | |
| val trID = col(0).replaceAll("(?:transcript_id)|[\"\\s]", "") | |
| val miID = col(1).replaceAll("(?:Alias)|[\"\\s]", "") | |
| val name = col(2).replaceAll("(?:Name)|[\"\\s]", "") | |
| val derivesFrom = col(3).replaceAll("(?:Derives_from)|[\"\\s]", "") | |
| Array(trID, miID, name, derivesFrom) | |
| } | |
| val col = line.split("\\t") | |
| if (col.length != 9) { | |
| sys.error("Wrong number of columns are found") | |
| None | |
| } | |
| else { | |
| Some(new miRbase(col(0), col(1), col(2), col(3).toInt, col(4).toInt, | |
| col(5), col(6), col(7), splitBySemicolon(col(8)))) | |
| } | |
| } | |
| } | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /** | |
| * Created by yukke on 15/06/06. | |
| */ | |
| import htsjdk.samtools._ | |
| import java.io.File | |
| import scala.collection.JavaConversions._ | |
| object pileup { | |
| def reader(f: File) = { | |
| SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT).open(f) | |
| } | |
| def getReferenceSequences(f: File): Array[SAMSequenceRecord] = { | |
| val r = reader(f) | |
| val seqs = r.getFileHeader.getSequenceDictionary.getSequences | |
| r.close() | |
| val seqArray = new Array[SAMSequenceRecord](seqs.length) | |
| for (s <- seqs) seqArray(s.getSequenceIndex) = s | |
| seqArray | |
| } | |
| def getSequenceNames(seqs: Array[SAMSequenceRecord]): Set[String] = { | |
| seqs.map{ | |
| _.getSequenceName | |
| }.toSet | |
| } | |
| def header(f: File): SAMFileHeader = { | |
| val r = reader(f) | |
| val h = r.getFileHeader | |
| r.close() | |
| h | |
| } | |
| def showIndexStats(f: File) = BAMIndexMetaData.printIndexStats(f) | |
| def getUnalignedReads(f: File) = { | |
| val r = reader(f) | |
| val filteredReads = r.queryUnmapped.filter(validateRead).toList | |
| r.close() | |
| filteredReads | |
| } | |
| def validateRead(read: SAMRecord): Boolean = !read.getDuplicateReadFlag && | |
| !read.getReadFailsVendorQualityCheckFlag && | |
| !read.getNotPrimaryAlignmentFlag | |
| def scanReads(f: File): Int = { | |
| val r = reader(f) | |
| var filtered = 0 | |
| for (read <- r.iterator) { | |
| if (validateRead(read)) { | |
| filtered += 1 | |
| } | |
| } | |
| filtered | |
| } | |
| def readToPileup(reads: Iterator[SAMRecord]){ | |
| } | |
| def main (args: Array[String]) { | |
| val bamFile = new File("data/wgEncodeCshlShortRnaSeqGm12878CellShortAln.bam") | |
| val r = reader(bamFile) | |
| val reads = r.queryOverlapping("chr1", 10573, 20000) | |
| for (read <- reads) { | |
| val loc = read.getAlignmentStart | |
| if (validateRead(read)) { | |
| for (b <- read.getAlignmentBlocks) { | |
| println(b.getLength) | |
| println(read.getReadString) | |
| } | |
| } else println("Bad alignment") | |
| } | |
| } | |
| } |
Author
soh-i
commented
May 21, 2015
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment