Skip to content

Instantly share code, notes, and snippets.

@matiasinsaurralde
Last active November 17, 2021 11:46
Show Gist options
  • Save matiasinsaurralde/82270cd624ffd22283dccb024bad5961 to your computer and use it in GitHub Desktop.
Save matiasinsaurralde/82270cd624ffd22283dccb024bad5961 to your computer and use it in GitHub Desktop.
program
% go test -benchmem -run=^$ -bench Bench github.com/matiasinsaurralde/chronomics
goos: darwin
goarch: amd64
pkg: github.com/matiasinsaurralde/chronomics
cpu: VirtualApple @ 2.50GHz
BenchmarkLineByLine-8 4094 262549 ns/op 330682 B/op 1713 allocs/op
BenchmarkFullRead-8 4506 261635 ns/op 330682 B/op 1713 allocs/op
BenchmarkIndexed-8 27243445 43.77 ns/op 16 B/op 1 allocs/op
PASS
ok github.com/matiasinsaurralde/chronomics 3.945s
package main
import (
"bufio"
"errors"
"fmt"
"io/ioutil"
"os"
"strings"
"time"
)
const (
inputFileName = "input_tiny.vcf"
)
var (
errNoMatch = errors.New("no match")
// Used by readSequenceIndexed
chromosomePositionIndex map[string]string
)
func readSequenceLineByLine(inputChromosome, inputPosition string) (reference *string, err error) {
file, err := os.Open(inputFileName)
if err != nil {
return nil, err
}
defer file.Close()
scanner := bufio.NewScanner(file)
for scanner.Scan() {
ln := scanner.Text()
if strings.HasPrefix(ln, "#") {
continue
}
rows := strings.Split(ln, "\t")
if len(rows) < 3 {
continue
}
var (
chromosome = rows[0]
position = rows[1]
ref = rows[3]
)
if inputChromosome == chromosome && inputPosition == position {
reference = &ref
return
}
}
return nil, errNoMatch
}
func readSequenceFullRead(inputChromosome, inputPosition string) (reference *string, err error) {
rawData, err := ioutil.ReadFile(inputFileName)
if err != nil {
return nil, err
}
lines := strings.Split(string(rawData), "\n")
for _, ln := range lines {
if strings.HasPrefix(ln, "#") {
continue
}
rows := strings.Split(ln, "\t")
if len(rows) < 3 {
continue
}
var (
chromosome = rows[0]
position = rows[1]
ref = rows[3]
)
if inputChromosome == chromosome && inputPosition == position {
reference = &ref
return
}
}
return nil, errNoMatch
}
func readSequenceIndexed(inputChromosome, inputPosition string) (reference *string, err error) {
if chromosomePositionIndex == nil {
chromosomePositionIndex = make(map[string]string, 0)
var rawData []byte
rawData, err = ioutil.ReadFile(inputFileName)
if err != nil {
return nil, err
}
lines := strings.Split(string(rawData), "\n")
for _, ln := range lines {
if strings.HasPrefix(ln, "#") {
continue
}
rows := strings.Split(ln, "\t")
if len(rows) < 3 {
continue
}
var (
chromosome = rows[0]
position = rows[1]
ref = rows[3]
)
key := chromosome + position
chromosomePositionIndex[key] = ref
}
}
inputKey := inputChromosome + inputPosition
val, found := chromosomePositionIndex[inputKey]
if found {
return &val, nil
}
return nil, errNoMatch
}
func main() {
ref, err := readSequenceFullRead("chr1", "16837")
if err != nil {
panic(err)
}
fmt.Println("ref=", *ref)
ref2, err := readSequenceLineByLine("chr1", "16837")
if err != nil {
panic(err)
}
fmt.Println("ref2=", *ref2)
}
const fs = require('fs'),
readline = require('readline')
var readSequence = (inputChromosome, inputPosition) => {
return new Promise(function (resolve, reject) {
const rl = readline.createInterface({
input: fs.createReadStream("input_tiny.vcf")
})
let foundRef = null;
rl.on('line', function (line) {
if (foundRef != null) {
return
}
// Skip comments:
if (line.startsWith("#")) {
return
}
// Split rows (TSV format):
const rows = line.split("\t")
if (rows.length == 0) {
return
}
// Map fields:
const chromosome = rows[0],
position = rows[1],
ref = rows[3]
// If there's a match resolve immediately:
if (chromosome == inputChromosome && inputPosition == position) {
foundRef = ref
resolve(foundRef)
}
})
// Return an error when the stream closes with no match:
rl.on('close', function () {
if (foundRef == null) {
reject(new Error("No match"))
}
})
})
}
// Lookup valid match:
readSequence("chr1", "10049").then(function (ref) {
console.log("ref is:", ref)
}, function (err) {
console.log("error:", err)
})
// Lookup invalid match:
readSequence("chr1", "16831").then(function (ref) {
console.log("ref is:", ref)
}, function (err) {
console.log("error:", err)
})
package main
import "testing"
func BenchmarkLineByLine(b *testing.B) {
for n := 0; n < b.N; n++ {
readSequenceFullRead("chr1", "16837")
}
}
func BenchmarkFullRead(b *testing.B) {
for n := 0; n < b.N; n++ {
readSequenceFullRead("chr1", "16837")
}
}
func BenchmarkIndexed(b *testing.B) {
for n := 0; n < b.N; n++ {
readSequenceIndexed("chr1", "16837")
}
}

Summary

A very simple vanilla implementation was done in Javascript (main.js), it might not be the simplest approach. The simplest approach would probably consists of a function that loads the full input file on memory (with the consequent impact it might have depending on the file size and system capacity). The implemented function uses the readline read stream and Promises approach to resolve the result as early as possible, while the file is read line by line.

I've explored additional scenarios in Golang, providing reference implementations for different possible strategies. Main reason is that I'm pretty familiar with Golang benchmark tooling. These were the results:

BenchmarkLineByLine-8   	    4276	    259729 ns/op	  330681 B/op	    1713 allocs/op
BenchmarkFullRead-8     	    4486	    262854 ns/op	  330681 B/op	    1713 allocs/op
BenchmarkIndexed-8      	27449694	        43.70 ns/op	      16 B/op	       1 allocs/op

The BenchmarkLineByLine replicates the same implemented logic from the Javascript sample.

The BenchmarkFullRead is different to the previous one as it loads the full input file into memory before reading line-by-line. This is one of the fastest approaches and the memory usage is pretty much the same (see number of allocs). The main disadvantage is that we're limited to the amount of memory that's available, which should be enough to temporarily store the file on memory. Could be a good option if we know what the file size looks like. It's slightly faster than the line by line approach.

The BenchmarkIndexed follows a completely different approach. Even though the file is also completely loaded into memory like the full read approach, this is only done once in order to generate an index -all benchmarks are ran multiple times-. The index consists of a key (chromosome+position) while the reference is used as the value. For every lookup, we rely on a hashmap lookup allowing way faster reads. It's multiple times faster than any of the previous approaches.

To summarize there might be different possible optimizations depending on the constraints (speed requirements, memory requirements, etc.) and context of the application. In the context of this small experiment the full read or indexing approach might be the fastest available approaches despite the involved memory penalty they may present.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment