Skip to content

Instantly share code, notes, and snippets.

@soh-i
Last active August 29, 2015 14:21
Show Gist options
  • Select an option

  • Save soh-i/119e690ac6f9000ceb33 to your computer and use it in GitHub Desktop.

Select an option

Save soh-i/119e690ac6f9000ceb33 to your computer and use it in GitHub Desktop.
read GFF in scala
import java.io.File
import htsjdk.samtools._
import scala.compat.Platform
import scala.collection.JavaConversions._
import scala.util.control.Breaks.break
/**
* Created by yukke on 15/05/23.
*/
trait Benchmark {
def run()
var multiplier = 1
def runBenchmark(noTimes: Int): List[Int] = {
for (i <- List.range(1, noTimes + 1)) yield {
val startTime = Platform.currentTime
var i = 0
while (i < multiplier) {
run()
i += 1
}
val stopTime = Platform.currentTime
Platform.collectGarbage()
(stopTime - startTime).toInt
}
}
}
object bamReader {
def readAllBAMRecords(f: File): Int = {
//val reader: SamReader = SamReaderFactory.makeDefault().open(f)
val reader = SamReaderFactory.make()
.disable(SamReaderFactory.Option.DONT_MEMORY_MAP_INDEX)
.validationStringency(ValidationStringency.SILENT)
.samRecordFactory(DefaultSAMRecordFactory.getInstance())
.open(f)
val iter: SAMRecordIterator = reader.iterator()
var nRecs = 0
while (iter.hasNext) {
val rec: SAMRecord = iter.next()
rec.getAlignmentBlocks
//nRecs += 1
//val c = nRecs % 1000000
//if (c == 0) {
//println(nRecs)
//}
}
iter.close()
reader.close()
//println(nRecs)
nRecs
}
def benchmark(times: Int)(f: => Any) = new Benchmark{def run = f }.runBenchmark(times)
def getProcedures(bamHeader: SamReader): List[htsjdk.samtools.SAMProgramRecord] = {
bamHeader.getFileHeader.getProgramRecords.toList
}
def getMaxChromosomeLength(bamHeaderRecs: java.util.List[htsjdk.samtools.SAMSequenceRecord]): Int = {
bamHeaderRecs.map(_.getSequenceLength).max
}
def createIndexOfChromosomes(bamReader: SamReader): Map[String, Int] = {
val chromIndex = {
val b = Map.newBuilder[String, Int]
for (rec <- bamReader.getFileHeader.getSequenceDictionary.getSequences) {
b += rec.getSequenceName -> rec.getSequenceLength
}
b.result
}
chromIndex
}
def averageBaseQuality(qual: Array[Byte]): Int = {
var sum = 0
for (b <- qual) sum += b
sum / qual.length
}
def main(args: Array[String]) {
val GM12872 = new File("data/wgEncodeCshlShortRnaSeqGm12878CellShortAln.bam")
val bamReader = SamReaderFactory.makeDefault().open(GM12872)
val bamHeaderRecs = bamReader.getFileHeader.getSequenceDictionary.getSequences
val chrIndex = createIndexOfChromosomes(bamReader)
//println(chrIndex.valuesIterator.reduceLeft((x, y) => if(x>y) x else y))
val maxChrom = getMaxChromosomeLength(bamHeaderRecs)
val genome = new File("")
//val fastaReader = htsjdk.samtools.reference.FastaSequenceFile
for (read: htsjdk.samtools.SAMRecord <- bamReader.iterator) {
//println(averageBaseQuality(read.getBaseQualities))
//println("%d-%d".format(read.getStart, read.getEnd))
var mismatches = 0
val readBases: Array[Byte] = read.getReadBases
//println(readBases.take(1).mkString(""))
val referenceBase = Array(41.toByte)
for (block <- read.getAlignmentBlocks) {
val readBlockStart = block.getReadStart - 1
val referenceBlockStart = block.getReferenceStart - 1
val length = block.getLength
println("Read block start: <%d>".format(readBlockStart))
println("Ref block start: <%d>".format(referenceBlockStart-1))
println("Block length: <%d>".format(length))
//println(referenceBase(referenceBlockStart+1))
//val i = htsjdk.samtools.util.SequenceUtil.basesEqual(readBases(readBlockStart), referenceBase(referenceBlockStart))
//println(i)
for (i <- 0 to length-1) {
if (!htsjdk.samtools.util.SequenceUtil.basesEqual(readBases(readBlockStart + i), referenceBase(0))) {
mismatches += 1
}
}
}
println("# of mismatches: <%d>".format(mismatches))
//val ref:Array[Byte] = Array("T".toByte, "C".toByte)
//val mismatch = htsjdk.samtools.util.SequenceUtil.countMismatches(rec, ref, 0, true)
//println(mismatch)
break
}
}
}
/**
* Created by yukke on 15/05/21.
*/
import scala.io.Source
object Program {
val gffFile: String = "/Users/yukke/IdeaProjects/hello-scala/src/mature.gtf"
def countLine(file: String): Int = {
Source.fromFile(file, "utf-8").getLines().length
}
def tabToComma(file: String): Iterator[String] = {
Source.fromFile(file, "utf-8").getLines().map(_.replaceAll("\t", ","))
}
def showHead(file: String, N: Int): List[String] = {
Source.fromFile(file, "utf-8").getLines().toList.take(N)
}
def showTail(file: String, N: Int): List[String] = {
Source.fromFile(file, "utf-8").getLines().toList.takeRight(N)
}
def removeSpaces(s: String): String = {
s.replaceAll("[\\s]+", "")
}
def parseAttributeLine(col: String): Map[String, String] = {
val attributes = col.split(";")
val trID = attributes(0).replaceAll("(?:transcript_id)|[\"\\s]", "")
val miID = attributes(1).replaceAll("(?:Alias)|[\"\\s]", "")
val name = attributes(2).replaceAll("(?:Name)|[\"\\s]", "")
val derivesFrom = attributes(3).replaceAll("(?:Derives_from)|[\"\\s]", "")
Map("TranscriptID" -> trID, "Alias" -> miID, "Name" -> name, "Derives_from" -> derivesFrom)
}
def generateGffMap(file: String): Map[Int, Map[String, Any]] = {
val pair = Map.newBuilder[Int, Map[String, Any]]
for (line <- Source.fromFile(gffFile, "utf-8").getLines()) {
val cols = line.split("\t")
val chromosome = cols(0)
val start = cols(3)
val end = cols(4)
val strand = cols(6)
val primaryKey: Int = start.toInt + end.toInt
val attrs = parseAttributeLine(cols(8))
pair += primaryKey -> Map("chr" -> chromosome, "start" -> start, "end" -> end, "strand" -> strand, "attributes" -> attrs)
}
pair.result
}
def main(args: Array[String]): Unit = {
val miData = generateGffMap(gffFile)
println(miData.take(1))
}
}
/**
* Created by yukke on 15/05/21.
*/
import scala.io.Source
object Annotation {
def loadMIRbase(file: String) = {
val b = Array.newBuilder[miRbase]
for (line <- Source.fromFile(file, "utf-8").getLines;
mirna <- miRbase.parse(line)) {
b += mirna
}
b.result
}
def getEntryCount(gene: Array[Annotation.miRbase]): Int = {
gene.length
}
def getmiRNAinChromosome(mirbase: Array[Annotation.miRbase], chromosome: String): Int = {
var c = 0
for (m <- mirbase) {
if (m.chr == chromosome) {
c += 1
}
}
c
}
def main(args: Array[String]): Unit = {
val file = "/Users/yukke/IdeaProjects/hello-scala/src/mature.gtf"
val matureMina = loadMIRbase(file)
println(getEntryCount(matureMina))
println(getmiRNAinChromosome(matureMina, "chr1"))
}
class miRbase(val chr: String,
val source: String,
val feature: String,
val start: Int,
val end: Int,
val score: String,
val strand: String,
val frame: String,
val attributes: Array[String]) {
override def toString = {
"%s:%d-%d:%s".format(chr, start, end, strand)
}
}
object miRbase {
def parse(line: String): Option[miRbase] = {
def splitBySemicolon(attr: String): Array[String] = {
val col = attr.split(";")
val trID = col(0).replaceAll("(?:transcript_id)|[\"\\s]", "")
val miID = col(1).replaceAll("(?:Alias)|[\"\\s]", "")
val name = col(2).replaceAll("(?:Name)|[\"\\s]", "")
val derivesFrom = col(3).replaceAll("(?:Derives_from)|[\"\\s]", "")
Array(trID, miID, name, derivesFrom)
}
val col = line.split("\\t")
if (col.length != 9) {
sys.error("Wrong number of columns are found")
None
}
else {
Some(new miRbase(col(0), col(1), col(2), col(3).toInt, col(4).toInt,
col(5), col(6), col(7), splitBySemicolon(col(8))))
}
}
}
}
/**
* Created by yukke on 15/06/06.
*/
import htsjdk.samtools._
import java.io.File
import scala.collection.JavaConversions._
object pileup {
def reader(f: File) = {
SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT).open(f)
}
def getReferenceSequences(f: File): Array[SAMSequenceRecord] = {
val r = reader(f)
val seqs = r.getFileHeader.getSequenceDictionary.getSequences
r.close()
val seqArray = new Array[SAMSequenceRecord](seqs.length)
for (s <- seqs) seqArray(s.getSequenceIndex) = s
seqArray
}
def getSequenceNames(seqs: Array[SAMSequenceRecord]): Set[String] = {
seqs.map{
_.getSequenceName
}.toSet
}
def header(f: File): SAMFileHeader = {
val r = reader(f)
val h = r.getFileHeader
r.close()
h
}
def showIndexStats(f: File) = BAMIndexMetaData.printIndexStats(f)
def getUnalignedReads(f: File) = {
val r = reader(f)
val filteredReads = r.queryUnmapped.filter(validateRead).toList
r.close()
filteredReads
}
def validateRead(read: SAMRecord): Boolean = !read.getDuplicateReadFlag &&
!read.getReadFailsVendorQualityCheckFlag &&
!read.getNotPrimaryAlignmentFlag
def scanReads(f: File): Int = {
val r = reader(f)
var filtered = 0
for (read <- r.iterator) {
if (validateRead(read)) {
filtered += 1
}
}
filtered
}
def readToPileup(reads: Iterator[SAMRecord]){
}
def main (args: Array[String]) {
val bamFile = new File("data/wgEncodeCshlShortRnaSeqGm12878CellShortAln.bam")
val r = reader(bamFile)
val reads = r.queryOverlapping("chr1", 10573, 20000)
for (read <- reads) {
val loc = read.getAlignmentStart
if (validateRead(read)) {
for (b <- read.getAlignmentBlocks) {
println(b.getLength)
println(read.getReadString)
}
} else println("Bad alignment")
}
}
}
@soh-i
Copy link
Author

soh-i commented May 21, 2015

Map(152141126 ->
  Map(attributes ->
    Map(TranscriptID -> MIMAT0029782_1,
            Alias -> MIMAT0029782,
            Name -> hsa-miR-7641,
            Derives_from -> MI0024976),
    chr -> chr14,
    end -> 76070572, 
    strand -> +,
    start -> 76070554
   )
)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment