Skip to content

Instantly share code, notes, and snippets.

@nylander
Created October 30, 2025 12:17
Show Gist options
  • Save nylander/3e197cb3c683419965b84d327a5217d1 to your computer and use it in GitHub Desktop.
Save nylander/3e197cb3c683419965b84d327a5217d1 to your computer and use it in GitHub Desktop.
Simple fasta to nexus converter reading from stdin and writingto stdout
#!/usr/bin/env perl
=pod
=head2
FILE: fas2nex.stdinout
USAGE: ./fas2nex.stdinout <input>
DESCRIPTION: Convert Fasta file to a 'stripped' Nexus format (readable by MrBayes v.3.2)
OPTIONS: ---
REQUIREMENTS: ---
BUGS: ---
NOTES: The original script was able to see if there where several headers with
the same label, and then tried to merge those sequences - if the total
length for the concatenated sequence was the same as any other labels'
sequence. Hence, it allowed an "interleaved" fasta format.
The current version, however, will not allow this, but quit with an
error indicating that there are duplicated sequence labels.
To change this behaviour, change the variable $ALLOW_INTERLEAVED to 1.
AUTHOR: Johan Nylander (JN)
COMPANY: NRM
VERSION: 1.0
CREATED: 2007
REVISION: Wed 23 Jan 2019 02:35:42 PM CET
TODO:
=cut
use strict;
use warnings;
my $ALLOW_INTERLEAVED = 0;
sub get_data_type {
#=== FUNCTION ================================================================
# NAME: get_data_type
# VERSION: 04/15/2010 08:37:45 AM CEST
# DESCRIPTION: Counts the number of A+C+G+T-'-' in max_n sequences, and if the
# ratio sum_acgt/length_of_all_max_n_seqs > cutoff, then the
# data_type is considered as 'DNA' (else 'PROTEIN')
# PARAMETERS: ref to hash containing the sequences
# RETURNS: string: 'DNA' or 'PROTEIN'
#===============================================================================
my ($hash_ref) = @_;
my $data_type = q{};
my $max_n = 10; # count letters in no more than n sequences
my $cutoff = 0.7; # sum of A+C+G+T-'gap' should be above cutoff
my $nseqs = keys %$hash_ref;
## Adjust max_n for smaller data sets
if ($max_n > $nseqs) {
$max_n = $nseqs;
}
## Get the first max_n keys from the hash
my (@first_max_n_names) = (keys %$hash_ref)[0..$max_n-1];
## Get the (concatenated) sequence from max_n sequences
my $long_seq = @{$hash_ref}{@first_max_n_names}; # Hash slice accesses multiple keys. Note the '@'
# Count the A,C,G,T's. Remove gaps first.
$long_seq =~ s/-//g;
my $acgt_count = $long_seq =~ s/([a|c|g|t])/$1/gi;
## Check ratio
if (($acgt_count/length($long_seq)) > $cutoff) {
$data_type = 'DNA';
}
else {
$data_type = 'PROTEIN';
}
return($data_type);
}
sub get_ntax_nchar {
#=== FUNCTION ================================================================
# NAME: get_ntax_nchar
# VERSION: 02/02/2012 10:03:53 AM
# DESCRIPTION: Get ntax, nseq
# PARAMETERS: ref to hash containing the sequences
# RETURNS: $ntax, $nchar
#===============================================================================
my ($hash_ref) = @_;
my $nchar = q{};
my $ntax = keys %$hash_ref; # length of list of keys
my @taxa = keys %$hash_ref;
## Check sequence lengths
my $tmp_taxon = shift(@taxa);
my $tmp_length = length($hash_ref->{$tmp_taxon}); # length of first seq
foreach my $taxon (keys %$hash_ref) {
my $length = length($hash_ref->{$taxon}); # val is a length
if ($length != $tmp_length) {
print STDERR "\nError!\nLength of sequence differ ($taxon has $length, $tmp_taxon has $tmp_length)\n";
exit(1);
}
else {
$tmp_taxon = $taxon;
$tmp_length = length($hash_ref->{$taxon});
$nchar = $tmp_length;
}
}
return($ntax, $nchar);
}
sub prettydate {
#=== FUNCTION ================================================================
# NAME: prettydate
# VERSION: 02/02/2012 10:58:32 AM
# DESCRIPTION: Returns a date string, e.g. '13:07 10/30/2025'
# PARAMETERS: none
# RETURNS: string
#===============================================================================
@_ = localtime(shift || time);
return(sprintf("%02d:%02d %02d/%02d/%04d", @_[2,1], $_[4]+1, $_[3], $_[5]+1900));
}
MAIN:
{
#=== FUNCTION ================================================================
# NAME: "main"
# VERSION: Wed 23 Jan 2019 02:40:06 PM CET
# DESCRIPTION: Convert fasta to simple nexus. No extensive error checking!
# PARAMETERS: none. Reads from stdin.
# RETURNS: prints to stdout
#===============================================================================
my $taxname = "";
my $ntax = 0;
my $nchar = 0;
my $nheaders = 0;
my %tax_seq = (); # key: taxon name, value: sequence
my %count_tax_hash = (); # key: taxon name, value: count
my @fasta_headers = ();
my $state = 0;
my $data_type = 'DNA';
my $missing_symbol = '?';
my $gap_symbol = '-';
my $space = ' ';
my $datestring = prettydate();
while (<>) {
chomp;
if (/\>(\S*.*)$/i) { # greedy: takes whole line as sequence label
$state = 1;
$taxname = $1;
$nheaders++;
push @fasta_headers, $taxname;
$count_tax_hash{$taxname}++;
}
elsif ($state == 1) {
$tax_seq{$taxname} .= $_;
}
}
$ntax = keys %tax_seq;
## If number of headers ne ntax, then sequences could be interleaved (and then to be concatenated)
## or there could simply be som mislabelling. Checking for sequence length will catch the error
## but will not guide in finding the labels that differ.
if (scalar (keys %tax_seq) != scalar @fasta_headers) {
if ($ALLOW_INTERLEAVED) {
print STDERR "\n[Note: Found duplicate fasta headers. Will see if those could be concatenated.\n";
}
else {
die "\nError: Found duplicate fasta headers. Check input.\n";
}
}
($ntax, $nchar) = get_ntax_nchar(\%tax_seq);
$data_type = get_data_type(\%tax_seq);
print STDOUT "\#NEXUS\n[Converted $datestring]\nBEGIN DATA\;\n";
print STDOUT "DIMENSIONS NTAX=$ntax NCHAR=$nchar\;\n";
print STDOUT "FORMAT MISSING=$missing_symbol GAP=$gap_symbol DATATYPE=$data_type\;\nMATRIX\n";
## Find longest seq label by sorting labels from hash and take the first
my $longest = (reverse sort { $a <=> $b } map { length($_) } keys %tax_seq)[0];
foreach my $seq_label (@fasta_headers) {
my $npad = $longest - length($seq_label) + 1;
my $pad = $space x $npad;
my $label;
if ($seq_label =~ /[\s|-]+/) { # if any spaces or dashes in label, need to print with single ticks
$label = "'" . $seq_label . "'" . $pad;
}
else {
$label = $seq_label . $pad;
}
print STDOUT $label, $tax_seq{$seq_label}, "\n";
}
print STDOUT "\t\;\nEND;\n";
}
__END__
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment