zachcp · January 2, 2023 23:50
diff --git a/Makefile b/Makefile
 run:
 	./cyber readfq.cy < test.fasta

 # 190 Mb
 downloadref:
 	wget https://ftp.ncbi.nlm.nih.gov/genomes/genbank/metagenomes/human_skin_metagenome/latest_assembly_versions/GCA_013297495.1_ASM1329749v1/GCA_013297495.1_ASM1329749v1_genomic.fna.gz
 	gunzip GCA_013297495.1_ASM1329749v1_genomic.fna.gz 


 test_cy:
 	time ./cyber readfq.cy <  GCA_013297495.1_ASM1329749v1_genomic.fna

 test_cy2:
 	time ./cyber readfq2.cy <  GCA_013297495.1_ASM1329749v1_genomic.fna

 test_py:
 	time python3 readfq.py < GCA_013297495.1_ASM1329749v1_genomic.fna


 test: test_py test_cy test_cy2 


diff --git a/readfq.cy b/readfq.cy
 --- attempt to use cyber as a fast scripting language for bio.
 --- tried to implement the readfq script.


 --- Notes:
 ---    io can only do single line at a time.
 ---    as a workaround I created an object to handle the parsing logic.
 ---    match is not yet available
 ---    can you type hint fields in the objects?
 ---    type hints didn't work on in the is_fastx function


 ---  '@+>' is  64 / 43 / 62
 func is_fastx(chr) bool:
    if chr == 64:
        return true
    if chr == 62:
        return true
    return false

 func is_plus(chr) bool:
    if chr == 43:
        return true
    return false


 object Seq:
    name
    sequences
    qualities
    in_qual

    func create():
        return Seq{
            name: "", 
            sequences: [],
            qualities: [],
            in_qual: false
        }

    func get_seq_len(self):
        local_n = 0
        for self.sequences as i, it: 
            local_len = it.len()
            local_n += local_len
            return local_n

    func add_seq(self, seqtoadd):
        self.sequences.add(seqtoadd)

    func add_quals(self, qualtoadd):
        self.qualities.add(qualtoadd)

    func dump(self):
        print "\n----------\nDumping Seq...."
        print self.name
        for self.sequences as s:
            print s
        for self.qualities as s:
            print s
        print '---------\n\n'



 n     = 0
 slen  = 0
 qlen  = 0
 line_count = 0

 seq   = Seq.create()


 for:
    line_count += 1
    --- print 'line count is {line_count}'

    line = readLine()
    if line == error(#EndOfStream):

        --- print 'end of stream'
        n    += 1
        slen += seq.get_seq_len()
        break

    -- if its a new fasta line then register the global counts
    -- and then start a new record.
    if is_fastx(line.charAt(0)):
        --- print 'Matched a new sequence!'

        if seq.name != "":
            n    += 1
            slen += seq.get_seq_len()
            

        seq = Seq.create()
        seq.name = line
        continue
    else is_plus(line.charAt(0)):
        seq.in_qual = true
    else:
        if seq.in_qual == false:
            seq.add_seq(line)

    --- print seq.dump()

 print 'There are {slen} bases from {n} records in this file.'

diff --git a/readfq.py b/readfq.py


 def readfq(fp): # this is a generator function
    last = None # this is a buffer keeping the last unprocessed line
    while True: # mimic closure; is it a bad idea?
        if not last: # the first record or a record following a fastq
            for l in fp: # search for the start of the next record
                if l[0] in '>@': # fasta/q header line
                    last = l[:-1] # save this line
                    break
        if not last: 
            break
        name, seqs, last = last[1:].partition(" ")[0], [], None
        for l in fp: # read the sequence
            if l[0] in '@+>':
                last = l[:-1]
                break
            seqs.append(l[:-1])
        if not last or last[0] != '+': # this is a fasta record
            yield name, ''.join(seqs), None # yield a fasta record
            if not last: break
        else: # this is a fastq record
            seq, leng, seqs = ''.join(seqs), 0, []
            for l in fp: # read the quality
                seqs.append(l[:-1])
                leng += len(l) - 1
                if leng >= len(seq): # have read enough quality
                    last = None
                    yield name, seq, ''.join(seqs); # yield a fastq record
                    break
            if last: # reach EOF before reading enough quality
                yield name, seq, None # yield a fasta record instead
                break

 if __name__ == "__main__":
    import sys
    n, slen, qlen = 0, 0, 0
    for name, seq, qual in readfq(sys.stdin):
        n += 1
        slen += len(seq)
        qlen += qual and len(qual) or 0
    print(f" There are {n} records and {slen} bases")
diff --git a/readfq2.cy b/readfq2.cy
 --- minimal parse. don't use object or fastq 


 ---  '@+>' is  64 / 43 / 62
 func is_fastx(chr) bool:
    if chr == 64:
        return true
    if chr == 62:
        return true
    return false


 n     = 0
 slen  = 0
 qlen  = 0

 for:
    line = readLine()
    if line == error(#EndOfStream):
        break

    if is_fastx(line.charAt(0)):
        n    += 1
    else:
        slen += line.len()
    

 print 'There are {slen} bases from {n} records in this file.'

diff --git a/results.txt b/results.txt
 (base) zcharlop-powers@ZL-12760 cyber-fasta % make test
 time python3 readfq.py < GCA_013297495.1_ASM1329749v1_genomic.fna
 There are 341540 records and 161512289 bases

 real    0m1.065s
 user    0m0.981s
 sys     0m0.060s
 time ./cyber readfq.cy <  GCA_013297495.1_ASM1329749v1_genomic.fna
 There are 27323200 bases from 341540 records in this file.

 real    2m24.335s
 user    0m53.681s
 sys     1m28.221s
 time ./cyber readfq2.cy <  GCA_013297495.1_ASM1329749v1_genomic.fna
 There are 161512289 bases from 341540 records in this file.

 real    2m30.641s
 user    0m52.714s
 sys     1m28.372s
	run:
	./cyber readfq.cy < test.fasta

	# 190 Mb
	downloadref:
	wget https://ftp.ncbi.nlm.nih.gov/genomes/genbank/metagenomes/human_skin_metagenome/latest_assembly_versions/GCA_013297495.1_ASM1329749v1/GCA_013297495.1_ASM1329749v1_genomic.fna.gz
	gunzip GCA_013297495.1_ASM1329749v1_genomic.fna.gz


	test_cy:
	time ./cyber readfq.cy < GCA_013297495.1_ASM1329749v1_genomic.fna

	test_cy2:
	time ./cyber readfq2.cy < GCA_013297495.1_ASM1329749v1_genomic.fna

	test_py:
	time python3 readfq.py < GCA_013297495.1_ASM1329749v1_genomic.fna


	test: test_py test_cy test_cy2
	--- attempt to use cyber as a fast scripting language for bio.
	--- tried to implement the readfq script.


	--- Notes:
	--- io can only do single line at a time.
	--- as a workaround I created an object to handle the parsing logic.
	--- match is not yet available
	--- can you type hint fields in the objects?
	--- type hints didn't work on in the is_fastx function


	--- '@+>' is 64 / 43 / 62
	func is_fastx(chr) bool:
	if chr == 64:
	return true
	if chr == 62:
	return true
	return false

	func is_plus(chr) bool:
	if chr == 43:
	return true
	return false


	object Seq:
	name
	sequences
	qualities
	in_qual

	func create():
	return Seq{
	name: "",
	sequences: [],
	qualities: [],
	in_qual: false
	}

	func get_seq_len(self):
	local_n = 0
	for self.sequences as i, it:
	local_len = it.len()
	local_n += local_len
	return local_n

	func add_seq(self, seqtoadd):
	self.sequences.add(seqtoadd)

	func add_quals(self, qualtoadd):
	self.qualities.add(qualtoadd)

	func dump(self):
	print "\n----------\nDumping Seq...."
	print self.name
	for self.sequences as s:
	print s
	for self.qualities as s:
	print s
	print '---------\n\n'



	n = 0
	slen = 0
	qlen = 0
	line_count = 0

	seq = Seq.create()


	for:
	line_count += 1
	--- print 'line count is {line_count}'

	line = readLine()
	if line == error(#EndOfStream):

	--- print 'end of stream'
	n += 1
	slen += seq.get_seq_len()
	break

	-- if its a new fasta line then register the global counts
	-- and then start a new record.
	if is_fastx(line.charAt(0)):
	--- print 'Matched a new sequence!'

	if seq.name != "":
	n += 1
	slen += seq.get_seq_len()


	seq = Seq.create()
	seq.name = line
	continue
	else is_plus(line.charAt(0)):
	seq.in_qual = true
	else:
	if seq.in_qual == false:
	seq.add_seq(line)

	--- print seq.dump()

	print 'There are {slen} bases from {n} records in this file.'


	def readfq(fp): # this is a generator function
	last = None # this is a buffer keeping the last unprocessed line
	while True: # mimic closure; is it a bad idea?
	if not last: # the first record or a record following a fastq
	for l in fp: # search for the start of the next record
	if l[0] in '>@': # fasta/q header line
	last = l[:-1] # save this line
	break
	if not last:
	break
	name, seqs, last = last[1:].partition(" ")[0], [], None
	for l in fp: # read the sequence
	if l[0] in '@+>':
	last = l[:-1]
	break
	seqs.append(l[:-1])
	if not last or last[0] != '+': # this is a fasta record
	yield name, ''.join(seqs), None # yield a fasta record
	if not last: break
	else: # this is a fastq record
	seq, leng, seqs = ''.join(seqs), 0, []
	for l in fp: # read the quality
	seqs.append(l[:-1])
	leng += len(l) - 1
	if leng >= len(seq): # have read enough quality
	last = None
	yield name, seq, ''.join(seqs); # yield a fastq record
	break
	if last: # reach EOF before reading enough quality
	yield name, seq, None # yield a fasta record instead
	break

	if __name__ == "__main__":
	import sys
	n, slen, qlen = 0, 0, 0
	for name, seq, qual in readfq(sys.stdin):
	n += 1
	slen += len(seq)
	qlen += qual and len(qual) or 0
	print(f" There are {n} records and {slen} bases")
	--- minimal parse. don't use object or fastq


	--- '@+>' is 64 / 43 / 62
	func is_fastx(chr) bool:
	if chr == 64:
	return true
	if chr == 62:
	return true
	return false


	n = 0
	slen = 0
	qlen = 0

	for:
	line = readLine()
	if line == error(#EndOfStream):
	break

	if is_fastx(line.charAt(0)):
	n += 1
	else:
	slen += line.len()


	print 'There are {slen} bases from {n} records in this file.'
	(base) zcharlop-powers@ZL-12760 cyber-fasta % make test
	time python3 readfq.py < GCA_013297495.1_ASM1329749v1_genomic.fna
	There are 341540 records and 161512289 bases

	real 0m1.065s
	user 0m0.981s
	sys 0m0.060s
	time ./cyber readfq.cy < GCA_013297495.1_ASM1329749v1_genomic.fna
	There are 27323200 bases from 341540 records in this file.

	real 2m24.335s
	user 0m53.681s
	sys 1m28.221s
	time ./cyber readfq2.cy < GCA_013297495.1_ASM1329749v1_genomic.fna
	There are 161512289 bases from 341540 records in this file.

	real 2m30.641s
	user 0m52.714s
	sys 1m28.372s