nylander · October 30, 2025 12:17
diff --git a/fas2nex.stdinout b/fas2nex.stdinout
 #!/usr/bin/env perl

 =pod

 =head2

         FILE: fas2nex.stdinout

        USAGE: ./fas2nex.stdinout <input>

  DESCRIPTION: Convert Fasta file to a 'stripped' Nexus format (readable by MrBayes v.3.2)

      OPTIONS: ---

 REQUIREMENTS: ---

         BUGS: ---

        NOTES: The original script was able to see if there where several headers with
               the same label, and then tried to merge those sequences - if the total
               length for the concatenated sequence was the same as any other labels'
               sequence. Hence, it allowed an "interleaved" fasta format.
               The current version, however, will not allow this, but quit with an
               error indicating that there are duplicated sequence labels.
               To change this behaviour, change the variable $ALLOW_INTERLEAVED to 1.

       AUTHOR: Johan Nylander (JN)

      COMPANY: NRM

      VERSION: 1.0

      CREATED: 2007

     REVISION: Wed 23 Jan 2019 02:35:42 PM CET

         TODO: 

 =cut

 use strict;
 use warnings;

 my $ALLOW_INTERLEAVED = 0;

 sub get_data_type {

    #===  FUNCTION  ================================================================
    #         NAME:  get_data_type
    #      VERSION:  04/15/2010 08:37:45 AM CEST
    #  DESCRIPTION:  Counts the number of A+C+G+T-'-' in max_n sequences, and if the
    #                ratio sum_acgt/length_of_all_max_n_seqs > cutoff, then the
    #                data_type is considered as 'DNA' (else 'PROTEIN')
    #   PARAMETERS:  ref to hash containing the sequences
    #      RETURNS:  string: 'DNA' or 'PROTEIN'
    #===============================================================================

    my ($hash_ref) = @_;
    my $data_type  = q{};
    my $max_n      = 10;              # count letters in no more than n sequences
    my $cutoff     = 0.7;             # sum of A+C+G+T-'gap' should be above cutoff
    my $nseqs      = keys %$hash_ref;

    ## Adjust max_n for smaller data sets
    if ($max_n > $nseqs) {
        $max_n = $nseqs;
    }

    ## Get the first max_n keys from the hash
    my (@first_max_n_names) = (keys %$hash_ref)[0..$max_n-1];

    ## Get the (concatenated) sequence from max_n sequences
    my $long_seq = @{$hash_ref}{@first_max_n_names}; # Hash slice accesses multiple keys. Note the '@'

    # Count the A,C,G,T's. Remove gaps first.
    $long_seq =~ s/-//g;
    my $acgt_count = $long_seq =~ s/([a|c|g|t])/$1/gi;

    ## Check ratio
    if (($acgt_count/length($long_seq)) > $cutoff) {
        $data_type = 'DNA';
    }
    else {
        $data_type = 'PROTEIN';
    }

    return($data_type);

 }

 sub get_ntax_nchar {

    #===  FUNCTION  ================================================================
    #         NAME:  get_ntax_nchar
    #      VERSION:  02/02/2012 10:03:53 AM
    #  DESCRIPTION:  Get ntax, nseq
    #   PARAMETERS:  ref to hash containing the sequences
    #      RETURNS:  $ntax, $nchar
    #===============================================================================

    my ($hash_ref) = @_;
    my $nchar      = q{};
    my $ntax       = keys %$hash_ref; # length of list of keys
    my @taxa       = keys %$hash_ref;

    ## Check sequence lengths
    my $tmp_taxon = shift(@taxa);
    my $tmp_length = length($hash_ref->{$tmp_taxon}); # length of first seq
    foreach my $taxon (keys %$hash_ref) {
        my $length = length($hash_ref->{$taxon}); # val is a length
        if ($length != $tmp_length) {
            print STDERR "\nError!\nLength of sequence differ ($taxon has $length, $tmp_taxon has $tmp_length)\n";
            exit(1);
        }
        else {
            $tmp_taxon = $taxon;
            $tmp_length = length($hash_ref->{$taxon});
            $nchar = $tmp_length;
        }
    }

    return($ntax, $nchar);

 }

 sub prettydate {

    #===  FUNCTION  ================================================================
    #         NAME:  prettydate
    #      VERSION:  02/02/2012 10:58:32 AM
    #  DESCRIPTION:  Returns a date string, e.g. '13:07 10/30/2025'
    #   PARAMETERS:  none
    #      RETURNS:  string
    #===============================================================================

   @_ = localtime(shift || time);

   return(sprintf("%02d:%02d %02d/%02d/%04d", @_[2,1], $_[4]+1, $_[3], $_[5]+1900));

 }

 MAIN:
 {
    #===  FUNCTION  ================================================================
    #         NAME:  "main"
    #      VERSION:  Wed 23 Jan 2019 02:40:06 PM CET
    #  DESCRIPTION:  Convert fasta to simple nexus. No extensive error checking!
    #   PARAMETERS:  none. Reads from stdin.
    #      RETURNS:  prints to stdout
    #===============================================================================

    my $taxname        = "";
    my $ntax           = 0;
    my $nchar          = 0;
    my $nheaders       = 0;
    my %tax_seq        = (); # key: taxon name, value: sequence
    my %count_tax_hash = (); # key: taxon name, value: count
    my @fasta_headers  = ();
    my $state          = 0;
    my $data_type      = 'DNA';
    my $missing_symbol = '?';
    my $gap_symbol     = '-';
    my $space          = ' ';
    my $datestring     = prettydate();

    while (<>) {
        chomp;
        if (/\>(\S*.*)$/i) { # greedy: takes whole line as sequence label
            $state = 1;
            $taxname = $1;
            $nheaders++;
            push @fasta_headers, $taxname;
            $count_tax_hash{$taxname}++;
        }
        elsif ($state == 1) {
            $tax_seq{$taxname} .= $_;
        }
    }

    $ntax = keys %tax_seq;
    ## If number of headers ne ntax, then sequences could be interleaved (and then to be concatenated)
    ## or there could simply be som mislabelling. Checking for sequence length will catch the error
    ## but will not guide in finding the labels that differ.
    if (scalar (keys %tax_seq) != scalar @fasta_headers) {
        if ($ALLOW_INTERLEAVED) {
            print STDERR "\n[Note: Found duplicate fasta headers. Will see if those could be concatenated.\n";
        }
        else {
            die "\nError: Found duplicate fasta headers. Check input.\n";
        }
    }
    ($ntax, $nchar) = get_ntax_nchar(\%tax_seq);

    $data_type = get_data_type(\%tax_seq);

    print STDOUT "\#NEXUS\n[Converted $datestring]\nBEGIN DATA\;\n";
    print STDOUT "DIMENSIONS NTAX=$ntax NCHAR=$nchar\;\n";
    print STDOUT "FORMAT MISSING=$missing_symbol GAP=$gap_symbol DATATYPE=$data_type\;\nMATRIX\n";

    ## Find longest seq label by sorting labels from hash and take the first
    my $longest = (reverse sort { $a <=> $b } map { length($_) } keys %tax_seq)[0];

    foreach my $seq_label (@fasta_headers) {
        my $npad = $longest - length($seq_label) + 1;
        my $pad = $space x $npad;
        my $label;
        if ($seq_label =~ /[\s|-]+/) { # if any spaces or dashes in label, need to print with single ticks
            $label = "'" . $seq_label . "'" . $pad;
        }
        else {
            $label = $seq_label . $pad;
        }
        print STDOUT $label, $tax_seq{$seq_label}, "\n";
    }

    print STDOUT "\t\;\nEND;\n";

 }
 __END__
	#!/usr/bin/env perl

	=pod

	=head2

	FILE: fas2nex.stdinout

	USAGE: ./fas2nex.stdinout <input>

	DESCRIPTION: Convert Fasta file to a 'stripped' Nexus format (readable by MrBayes v.3.2)

	OPTIONS: ---

	REQUIREMENTS: ---

	BUGS: ---

	NOTES: The original script was able to see if there where several headers with
	the same label, and then tried to merge those sequences - if the total
	length for the concatenated sequence was the same as any other labels'
	sequence. Hence, it allowed an "interleaved" fasta format.
	The current version, however, will not allow this, but quit with an
	error indicating that there are duplicated sequence labels.
	To change this behaviour, change the variable $ALLOW_INTERLEAVED to 1.

	AUTHOR: Johan Nylander (JN)

	COMPANY: NRM

	VERSION: 1.0

	CREATED: 2007

	REVISION: Wed 23 Jan 2019 02:35:42 PM CET

	TODO:

	=cut

	use strict;
	use warnings;

	my $ALLOW_INTERLEAVED = 0;

	sub get_data_type {

	#=== FUNCTION ================================================================
	# NAME: get_data_type
	# VERSION: 04/15/2010 08:37:45 AM CEST
	# DESCRIPTION: Counts the number of A+C+G+T-'-' in max_n sequences, and if the
	# ratio sum_acgt/length_of_all_max_n_seqs > cutoff, then the
	# data_type is considered as 'DNA' (else 'PROTEIN')
	# PARAMETERS: ref to hash containing the sequences
	# RETURNS: string: 'DNA' or 'PROTEIN'
	#===============================================================================

	my ($hash_ref) = @_;
	my $data_type = q{};
	my $max_n = 10; # count letters in no more than n sequences
	my $cutoff = 0.7; # sum of A+C+G+T-'gap' should be above cutoff
	my $nseqs = keys %$hash_ref;

	## Adjust max_n for smaller data sets
	if ($max_n > $nseqs) {
	$max_n = $nseqs;
	}

	## Get the first max_n keys from the hash
	my (@first_max_n_names) = (keys %$hash_ref)[0..$max_n-1];

	## Get the (concatenated) sequence from max_n sequences
	my $long_seq = @{$hash_ref}{@first_max_n_names}; # Hash slice accesses multiple keys. Note the '@'

	# Count the A,C,G,T's. Remove gaps first.
	$long_seq =~ s/-//g;
	my $acgt_count = $long_seq =~ s/([a\|c\|g\|t])/$1/gi;

	## Check ratio
	if (($acgt_count/length($long_seq)) > $cutoff) {
	$data_type = 'DNA';
	}
	else {
	$data_type = 'PROTEIN';
	}

	return($data_type);

	}

	sub get_ntax_nchar {

	#=== FUNCTION ================================================================
	# NAME: get_ntax_nchar
	# VERSION: 02/02/2012 10:03:53 AM
	# DESCRIPTION: Get ntax, nseq
	# PARAMETERS: ref to hash containing the sequences
	# RETURNS: $ntax, $nchar
	#===============================================================================

	my ($hash_ref) = @_;
	my $nchar = q{};
	my $ntax = keys %$hash_ref; # length of list of keys
	my @taxa = keys %$hash_ref;

	## Check sequence lengths
	my $tmp_taxon = shift(@taxa);
	my $tmp_length = length($hash_ref->{$tmp_taxon}); # length of first seq
	foreach my $taxon (keys %$hash_ref) {
	my $length = length($hash_ref->{$taxon}); # val is a length
	if ($length != $tmp_length) {
	print STDERR "\nError!\nLength of sequence differ ($taxon has $length, $tmp_taxon has $tmp_length)\n";
	exit(1);
	}
	else {
	$tmp_taxon = $taxon;
	$tmp_length = length($hash_ref->{$taxon});
	$nchar = $tmp_length;
	}
	}

	return($ntax, $nchar);

	}

	sub prettydate {

	#=== FUNCTION ================================================================
	# NAME: prettydate
	# VERSION: 02/02/2012 10:58:32 AM
	# DESCRIPTION: Returns a date string, e.g. '13:07 10/30/2025'
	# PARAMETERS: none
	# RETURNS: string
	#===============================================================================

	@_ = localtime(shift \|\| time);

	return(sprintf("%02d:%02d %02d/%02d/%04d", @_[2,1], $_[4]+1, $_[3], $_[5]+1900));

	}

	MAIN:
	{
	#=== FUNCTION ================================================================
	# NAME: "main"
	# VERSION: Wed 23 Jan 2019 02:40:06 PM CET
	# DESCRIPTION: Convert fasta to simple nexus. No extensive error checking!
	# PARAMETERS: none. Reads from stdin.
	# RETURNS: prints to stdout
	#===============================================================================

	my $taxname = "";
	my $ntax = 0;
	my $nchar = 0;
	my $nheaders = 0;
	my %tax_seq = (); # key: taxon name, value: sequence
	my %count_tax_hash = (); # key: taxon name, value: count
	my @fasta_headers = ();
	my $state = 0;
	my $data_type = 'DNA';
	my $missing_symbol = '?';
	my $gap_symbol = '-';
	my $space = ' ';
	my $datestring = prettydate();

	while (<>) {
	chomp;
	if (/\>(\S.)$/i) { # greedy: takes whole line as sequence label
	$state = 1;
	$taxname = $1;
	$nheaders++;
	push @fasta_headers, $taxname;
	$count_tax_hash{$taxname}++;
	}
	elsif ($state == 1) {
	$tax_seq{$taxname} .= $_;
	}
	}

	$ntax = keys %tax_seq;
	## If number of headers ne ntax, then sequences could be interleaved (and then to be concatenated)
	## or there could simply be som mislabelling. Checking for sequence length will catch the error
	## but will not guide in finding the labels that differ.
	if (scalar (keys %tax_seq) != scalar @fasta_headers) {
	if ($ALLOW_INTERLEAVED) {
	print STDERR "\n[Note: Found duplicate fasta headers. Will see if those could be concatenated.\n";
	}
	else {
	die "\nError: Found duplicate fasta headers. Check input.\n";
	}
	}
	($ntax, $nchar) = get_ntax_nchar(\%tax_seq);

	$data_type = get_data_type(\%tax_seq);

	print STDOUT "\#NEXUS\n[Converted $datestring]\nBEGIN DATA\;\n";
	print STDOUT "DIMENSIONS NTAX=$ntax NCHAR=$nchar\;\n";
	print STDOUT "FORMAT MISSING=$missing_symbol GAP=$gap_symbol DATATYPE=$data_type\;\nMATRIX\n";

	## Find longest seq label by sorting labels from hash and take the first
	my $longest = (reverse sort { $a <=> $b } map { length($_) } keys %tax_seq)[0];

	foreach my $seq_label (@fasta_headers) {
	my $npad = $longest - length($seq_label) + 1;
	my $pad = $space x $npad;
	my $label;
	if ($seq_label =~ /[\s\|-]+/) { # if any spaces or dashes in label, need to print with single ticks
	$label = "'" . $seq_label . "'" . $pad;
	}
	else {
	$label = $seq_label . $pad;
	}
	print STDOUT $label, $tax_seq{$seq_label}, "\n";
	}

	print STDOUT "\t\;\nEND;\n";

	}
	__END__
No results found