Last active
January 2, 2023 23:50
-
-
Save zachcp/40a58b426c60fc212155a456d39a9fa2 to your computer and use it in GitHub Desktop.
Experiment with Cyber for Fasta
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
run: | |
./cyber readfq.cy < test.fasta | |
# 190 Mb | |
downloadref: | |
wget https://ftp.ncbi.nlm.nih.gov/genomes/genbank/metagenomes/human_skin_metagenome/latest_assembly_versions/GCA_013297495.1_ASM1329749v1/GCA_013297495.1_ASM1329749v1_genomic.fna.gz | |
gunzip GCA_013297495.1_ASM1329749v1_genomic.fna.gz | |
test_cy: | |
time ./cyber readfq.cy < GCA_013297495.1_ASM1329749v1_genomic.fna | |
test_cy2: | |
time ./cyber readfq2.cy < GCA_013297495.1_ASM1329749v1_genomic.fna | |
test_py: | |
time python3 readfq.py < GCA_013297495.1_ASM1329749v1_genomic.fna | |
test: test_py test_cy test_cy2 | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- attempt to use cyber as a fast scripting language for bio. | |
--- tried to implement the readfq script. | |
--- Notes: | |
--- io can only do single line at a time. | |
--- as a workaround I created an object to handle the parsing logic. | |
--- match is not yet available | |
--- can you type hint fields in the objects? | |
--- type hints didn't work on in the is_fastx function | |
--- '@+>' is 64 / 43 / 62 | |
func is_fastx(chr) bool: | |
if chr == 64: | |
return true | |
if chr == 62: | |
return true | |
return false | |
func is_plus(chr) bool: | |
if chr == 43: | |
return true | |
return false | |
object Seq: | |
name | |
sequences | |
qualities | |
in_qual | |
func create(): | |
return Seq{ | |
name: "", | |
sequences: [], | |
qualities: [], | |
in_qual: false | |
} | |
func get_seq_len(self): | |
local_n = 0 | |
for self.sequences as i, it: | |
local_len = it.len() | |
local_n += local_len | |
return local_n | |
func add_seq(self, seqtoadd): | |
self.sequences.add(seqtoadd) | |
func add_quals(self, qualtoadd): | |
self.qualities.add(qualtoadd) | |
func dump(self): | |
print "\n----------\nDumping Seq...." | |
print self.name | |
for self.sequences as s: | |
print s | |
for self.qualities as s: | |
print s | |
print '---------\n\n' | |
n = 0 | |
slen = 0 | |
qlen = 0 | |
line_count = 0 | |
seq = Seq.create() | |
for: | |
line_count += 1 | |
--- print 'line count is {line_count}' | |
line = readLine() | |
if line == error(#EndOfStream): | |
--- print 'end of stream' | |
n += 1 | |
slen += seq.get_seq_len() | |
break | |
-- if its a new fasta line then register the global counts | |
-- and then start a new record. | |
if is_fastx(line.charAt(0)): | |
--- print 'Matched a new sequence!' | |
if seq.name != "": | |
n += 1 | |
slen += seq.get_seq_len() | |
seq = Seq.create() | |
seq.name = line | |
continue | |
else is_plus(line.charAt(0)): | |
seq.in_qual = true | |
else: | |
if seq.in_qual == false: | |
seq.add_seq(line) | |
--- print seq.dump() | |
print 'There are {slen} bases from {n} records in this file.' | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def readfq(fp): # this is a generator function | |
last = None # this is a buffer keeping the last unprocessed line | |
while True: # mimic closure; is it a bad idea? | |
if not last: # the first record or a record following a fastq | |
for l in fp: # search for the start of the next record | |
if l[0] in '>@': # fasta/q header line | |
last = l[:-1] # save this line | |
break | |
if not last: | |
break | |
name, seqs, last = last[1:].partition(" ")[0], [], None | |
for l in fp: # read the sequence | |
if l[0] in '@+>': | |
last = l[:-1] | |
break | |
seqs.append(l[:-1]) | |
if not last or last[0] != '+': # this is a fasta record | |
yield name, ''.join(seqs), None # yield a fasta record | |
if not last: break | |
else: # this is a fastq record | |
seq, leng, seqs = ''.join(seqs), 0, [] | |
for l in fp: # read the quality | |
seqs.append(l[:-1]) | |
leng += len(l) - 1 | |
if leng >= len(seq): # have read enough quality | |
last = None | |
yield name, seq, ''.join(seqs); # yield a fastq record | |
break | |
if last: # reach EOF before reading enough quality | |
yield name, seq, None # yield a fasta record instead | |
break | |
if __name__ == "__main__": | |
import sys | |
n, slen, qlen = 0, 0, 0 | |
for name, seq, qual in readfq(sys.stdin): | |
n += 1 | |
slen += len(seq) | |
qlen += qual and len(qual) or 0 | |
print(f" There are {n} records and {slen} bases") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- minimal parse. don't use object or fastq | |
--- '@+>' is 64 / 43 / 62 | |
func is_fastx(chr) bool: | |
if chr == 64: | |
return true | |
if chr == 62: | |
return true | |
return false | |
n = 0 | |
slen = 0 | |
qlen = 0 | |
for: | |
line = readLine() | |
if line == error(#EndOfStream): | |
break | |
if is_fastx(line.charAt(0)): | |
n += 1 | |
else: | |
slen += line.len() | |
print 'There are {slen} bases from {n} records in this file.' | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(base) zcharlop-powers@ZL-12760 cyber-fasta % make test | |
time python3 readfq.py < GCA_013297495.1_ASM1329749v1_genomic.fna | |
There are 341540 records and 161512289 bases | |
real 0m1.065s | |
user 0m0.981s | |
sys 0m0.060s | |
time ./cyber readfq.cy < GCA_013297495.1_ASM1329749v1_genomic.fna | |
There are 27323200 bases from 341540 records in this file. | |
real 2m24.335s | |
user 0m53.681s | |
sys 1m28.221s | |
time ./cyber readfq2.cy < GCA_013297495.1_ASM1329749v1_genomic.fna | |
There are 161512289 bases from 341540 records in this file. | |
real 2m30.641s | |
user 0m52.714s | |
sys 1m28.372s |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment