Last active
August 29, 2015 14:03
-
-
Save def-/c6f3ae893b935464dab0 to your computer and use it in GitHub Desktop.
Fasta_Pairs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 0.19 seconds, 79 MB/s | |
proc fgets(c: cstring, n: int, f: TFile): cstring {. | |
importc: "fgets", header: "<stdio.h>", tags: [FReadIO].} | |
proc myReadLine(f: TFile, line: var TaintedString): bool = | |
var buf {.noinit.}: array[8192, char] | |
setLen(line.string, 0) | |
result = true | |
while true: | |
if fgets(buf, 8192, f) == nil: | |
result = false | |
break | |
if buf[cstring(buf).len-1] == '\l': | |
buf[cstring(buf).len-1] = '\0' | |
add(line, cstring(buf)) | |
break | |
add(line, cstring(buf)) | |
iterator fasta_pairs*(f: TFile): tuple[header: string, sequence: string] = | |
#var sequence = newStringOfCap(15_000_000) # If you know the approximate size beforehand | |
var sequence = "" | |
var header = "" | |
var seq_index = -1 | |
var line = "" | |
while f.myReadLine(line): | |
if line[0] == '>': | |
if seq_index >= 0: | |
yield (header, sequence) | |
header = line | |
sequence.setLen(0) | |
seq_index += 1 | |
else: | |
sequence.add(line) | |
yield (header, sequence) | |
var f = open("sequence.fasta") | |
for i in fasta_pairs(f): | |
echo i[0].len, " ", i[1].len |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 0.09 seconds, 167 MB/s | |
proc handleFasta(header, sequence) = | |
echo header.len, " ", sequence.len | |
var f = system.open("sequence.fasta") | |
const bufSize = 8192 | |
var | |
size = bufSize | |
buf {.noinit.}: array[bufSize, char] | |
sequence = "" | |
header = "" | |
afterNewLine = true | |
afterNewSeq = false | |
while size == bufSize: | |
size = f.readBuffer(addr buf, bufSize) | |
var lastPos = 0 | |
for i in 0 .. <size: | |
if buf[i] == '\l': | |
afterNewLine = true | |
if afterNewSeq: | |
afterNewSeq = false | |
header.setLen(i - lastPos) | |
copyMem(addr header[0], addr buf[lastPos], i - lastPos) | |
else: | |
let oldLen = sequence.len | |
sequence.setLen(oldLen + i - lastPos) | |
copyMem(addr sequence[oldLen], addr buf[lastPos], i - lastPos) | |
lastPos = i+1 | |
elif afterNewLine and buf[i] == '>': | |
afterNewLine = false | |
afterNewSeq = true | |
lastPos = i | |
if header != "": | |
handleFasta(header, sequence) | |
header.setLen(0) | |
sequence.setLen(0) | |
let oldLen = sequence.len | |
sequence.setLen(oldLen + size - lastPos) | |
copyMem(addr sequence[oldLen], addr buf[lastPos], size - lastPos) | |
handleFasta(header, sequence) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Ok, here's the really fast one, doesn't even create the sequence string, | |
# just splits a file into multiple sequences, which you can access over f.mem | |
# with the accompanying start and length. Also contains newlines, which makes | |
# this unfit for most usecases I guess. | |
# 0.03 seconds, 500 MB/s | |
import memfiles | |
var | |
f = memfiles.open("sequence.fasta") | |
fastas: seq[tuple[header: string; seqStart, seqLen: int]] = @[] | |
afterNewLine = true | |
afterNewSeq = false | |
header = "" | |
lastPos = 0 | |
i = 0 | |
seqStart = 0 | |
for c in cast[cstring](f.mem): | |
if c == '\l': | |
if afterNewSeq: | |
header.setLen(i - lastPos - 1) | |
copyMem(addr header[0], cast[pointer](cast[int](f.mem) + lastPos + 1), i - lastPos - 1) | |
afterNewSeq = false | |
seqStart = i + 1 | |
afterNewLine = true | |
elif c == '>': | |
if seqStart > 0: | |
fastas.add((header, seqStart, i - seqStart - 1)) | |
afterNewLine = false | |
afterNewSeq = true | |
else: | |
afterNewLine = false | |
inc(i) | |
fastas.add((header, seqStart, i - seqStart - 1)) | |
echo fastas |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment