Skip to content

Instantly share code, notes, and snippets.

@jimhester
Last active January 8, 2018 14:50
Show Gist options
  • Save jimhester/3169859 to your computer and use it in GitHub Desktop.
Save jimhester/3169859 to your computer and use it in GitHub Desktop.
Parsing FASTA files in Perl, Ruby, Python, and Go
#!/usr/bin/env perl
# Print "<header>\t<sequence length>" for every record of a FASTA stream
# (files named on the command line, or STDIN), reading one line at a time.
# The header is printed without its leading '>'.
use warnings;use strict;
my ($header,$sequence);
while(my $line = <>){
    chomp $line;
    $line =~ s/\r\z//;    # tolerate CRLF line endings
    if($line =~ /^>/){
        # A new header closes the previous record, so report that one first,
        # before the accumulator is reset.
        print "$header\t" . length($sequence) . "\n" if defined $header;
        $header = substr($line,1);
        $sequence = '';
    } elsif(defined $header){
        # Lines before the first header belong to no record and are ignored.
        $sequence .= $line;
    }
}
# The last record has no following header to trigger its output.
print "$header\t" . length($sequence) . "\n" if defined $header;
#!/usr/bin/env perl
# Print "<display id> <length>" for every sequence in the FASTA file named by
# the first command-line argument, using Bio::SeqIO.
use warnings;use strict;
use Bio::SeqIO;
my $fasta_path = shift;
my $reader = Bio::SeqIO->new(-file => $fasta_path, -format => 'Fasta');
while (my $entry = $reader->next_seq) {
    my @fields = ($entry->display_id, $entry->length);
    print join(" ", @fields), "\n";
}
#!/usr/bin/env perl
# Same report as the plain Bio::SeqIO reader ("<display id> <length>" per
# record), but the reader is constructed with -alphabet => 'dna'.
use warnings;use strict;
use Bio::SeqIO;
my @reader_args = (
    -file     => shift,
    -format   => 'Fasta',
    -alphabet => 'dna',
);
my $seqio = Bio::SeqIO->new(@reader_args);
while (my $seq = $seqio->next_seq()) {
    my $id     = $seq->display_id;
    my $length = $seq->length;
    print join(" ", $id, $length) . "\n";
}
#!/usr/bin/env python
# Print "<id> <length>" for every record of the FASTA file named by the first
# command-line argument, using Biopython's SeqIO parser.
import sys
from Bio import SeqIO
for record in SeqIO.parse(sys.argv[1],'fasta'):
    # A single pre-formatted string prints identically under the Python 2
    # print statement and the Python 3 print function.
    print("%s %d" % (record.id, len(record)))
#!/usr/bin/env ruby
# Print "<definition> <nalen>" for every FASTA entry read from ARGF,
# using BioRuby's FlatFile reader.
require 'bio'
reader = Bio::FlatFile.new(Bio::FastaFormat, ARGF)
reader.each_entry { |entry| puts "#{entry.definition} #{entry.nalen}" }
package main
import (
"bufio"
"io"
"fmt"
"os"
"strings"
)
type fasta struct {
header string
sequence string
}
func NewFastxReader(f io.Reader) *FastxReader {
return &FastxReader{
r: bufio.NewReader(f),
}
}
type FastxReader struct {
r *bufio.Reader
}
func (r *FastxReader) next_seq() (record fasta, err error) {
var str string
if str, err = r.r.ReadString('>'); err == nil {
if str, err = r.r.ReadString('>'); err == nil {
split_result := strings.SplitN(str, "\n", 2)
record.header = split_result[0]
//remove newlines and trailing >
record.sequence = chomp(strings.Replace(split_result[1], "\n", "", -1), ">")
}
}
return record, err
}
// chomp returns s without its final byte when that byte equals the first
// byte of delim; otherwise s is returned unchanged. An empty s or an empty
// delim leaves s unchanged instead of indexing out of range.
func chomp(s string, delim string) string {
	if len(s) == 0 || len(delim) == 0 {
		return s
	}
	if s[len(s)-1] == delim[0] {
		return s[:len(s)-1]
	}
	return s
}
// main prints "<header>\t<sequence length>" for every record of the FASTA
// file named by the first command-line argument. It exits with status 1 when
// the argument is missing, the file cannot be opened, or reading fails.
func main() {
	if len(os.Args) < 2 {
		fmt.Fprintln(os.Stderr, "usage:", os.Args[0], "<fasta-file>")
		os.Exit(1)
	}
	file, err := os.Open(os.Args[1])
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	defer file.Close()

	fastx := NewFastxReader(file)
	rec, err := fastx.next_seq()
	for ; err == nil; rec, err = fastx.next_seq() {
		// Printf rather than Println: Println separates its operands with
		// spaces, which would surround the tab with stray blanks.
		fmt.Printf("%s\t%d\n", rec.header, len(rec.sequence))
	}
	// io.EOF is the normal end of input; anything else is a real failure.
	if err != io.EOF {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
}
#!/usr/bin/env perl
# Print "<header>\t<sequence length>" for every record of a FASTA stream
# (files named on the command line, or STDIN), reading one whole record per
# readline by using '>' as the input record separator.
# NOTE: because the separator is a bare '>', a '>' inside a header line is
# also treated as a record boundary.
use warnings;use strict;
local $/ = ">";
my $first = <>;    # discard everything up to and including the first '>'
while(my $record = <>){
    chomp $record;                 # drop the trailing '>' that opens the next record
    next unless length $record;    # e.g. the bare '>' that starts a further input file
    # First line is the header; everything after it is sequence.
    my ($header, $sequence) = split /\n/, $record, 2;
    $sequence = '' unless defined $sequence;    # header-only record
    $header =~ s/\r\z//;                        # tolerate CRLF line endings
    $sequence =~ tr/\r\n//d;
    print "$header\t" . length($sequence) . "\n";
}
#!/usr/bin/env python
"""Print "<header> <sequence length>" for each record of a FASTA file.

The sequence of each record is accumulated by string concatenation.
Usage: script.py FILE
"""
import os,sys


def parse_fasta(lines):
    """Yield (header, sequence) for each FASTA record in an iterable of lines.

    header is the header line without its leading '>' and line terminator;
    sequence is the record's sequence lines concatenated. Lines before the
    first header are ignored, and blank lines contribute nothing.
    """
    header = None
    sequence = ''
    for line in lines:
        line = line.rstrip('\r\n')
        if line.startswith('>'):
            # A new header closes the previous record: emit it before the
            # header and accumulator are replaced.
            if header is not None:
                yield header, sequence
            header = line[1:]
            sequence = ''
        elif header is not None:
            sequence += line
    # The last record has no following header to trigger its output.
    if header is not None:
        yield header, sequence


def main(path):
    """Print "<header> <length>" for every record of the FASTA file at path."""
    # Plain text mode instead of 'rU': the 'U' flag is rejected by current
    # Python 3 releases, and line endings are stripped explicitly in
    # parse_fasta.
    with open(path) as handle:
        for header, sequence in parse_fasta(handle):
            # One pre-formatted string prints the same on Python 2 and 3.
            print("%s %d" % (header, len(sequence)))


if __name__ == '__main__':
    main(sys.argv[1])
#!/usr/bin/env ruby
# Print "<header> <sequence length>" for every FASTA record read from ARGF.
# '>' is used as the input record separator, so each gets returns one record
# followed by the '>' that opens the next one.
$/ = ">"
ARGF.gets # consume input up to and including the first '>'
while (entry = ARGF.gets)
  entry.chomp! # removes the trailing record separator
  break_at = entry.index("\n")
  title = entry[0..break_at - 1]
  residues = entry[(break_at + 1)..-1].delete("\n")
  puts "#{title} #{residues.length}"
end
#!/usr/bin/env python
"""Print "<header> <sequence length>" for each record of a FASTA file.

The sequence lines of each record are collected in a list and joined once
per record.
Usage: script.py FILE
"""
import os,sys


def parse_fasta(lines):
    """Yield (header, sequence) for each FASTA record in an iterable of lines.

    header is the header line without its leading '>' and line terminator;
    sequence is the record's sequence lines joined together. Lines before
    the first header are ignored, and blank lines contribute nothing.
    """
    header = None
    sequences = []
    for line in lines:
        line = line.rstrip('\r\n')
        if line.startswith('>'):
            # A new header closes the previous record: emit it before the
            # header and the collected lines are replaced.
            if header is not None:
                yield header, "".join(sequences)
            header = line[1:]
            sequences = []
        elif header is not None:
            sequences.append(line)
    # The last record has no following header to trigger its output.
    if header is not None:
        yield header, "".join(sequences)


def main(path):
    """Print "<header> <length>" for every record of the FASTA file at path."""
    # Plain text mode instead of 'rU': the 'U' flag is rejected by current
    # Python 3 releases, and line endings are stripped explicitly in
    # parse_fasta.
    with open(path) as handle:
        for header, sequence in parse_fasta(handle):
            # One pre-formatted string prints the same on Python 2 and 3.
            print("%s %d" % (header, len(sequence)))


if __name__ == '__main__':
    main(sys.argv[1])
>chr1|this is a fasta header|example
CCTAAACCCTGAACCCTAAACCCTAAACCCTGAACCCTAAACCCTGAACCCTGAACCCTAAACCCTGAACCCTAAACCTA
AACCCTGAACCCTGAACCCTAAACCCTGAACCCTAAACCCTAAACCCTAAACCTAAACCCTGAACCCTAAACCTAAACCC
TGAACCCTAAACCCTAAACCCTGAACCCTAAACCTAAACCCTGAACCCTAACCCCTGAACCCTAAACCCTGAACCCTAAA
CCCTGAAACCTGAACCCTGAACCCTAAACCCTAAACCCTGAACCCTAAACCCTGAACCCTGAACCCTAAACCCTGAACCC
>chr2|another record
TAAACCCTGAACCCTAAACCCTAAACCCTGAACCCTAAACCTAAACCATGAACCCTAAACCCTGAACCCTAAACCCTAAA
CCCTGAACCCTAAACCCTGAACCCTAAACCTAAACCCTAAACCCTGAACCCTAAACCTGAACCCTAAACCCCTAAACCTA
AACCCTGAAACCTAAACCCTAAACCCTGAACCCTAAACCCTAAACCCTGAACCCTGAAACCCTGAACCCTAAACCATGAA
CCCTGAACCCTAAACCCTAAACCCTAAACCCTGAACCCTGAACCCTAAACCTAAACCCTGAACCCTGAACCCTAAACCCT
GAACCCTAAACCCTAAACCCTGAACCCTAAACCCTGAACCCTAAACCCTGAACCCTGAACCCTAAACCCTGAACCCTAAA
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment