Created
October 30, 2025 12:17
-
-
Save nylander/3e197cb3c683419965b84d327a5217d1 to your computer and use it in GitHub Desktop.
Simple fasta to nexus converter reading from stdin and writingto stdout
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env perl | |
| =pod | |
| =head2 | |
| FILE: fas2nex.stdinout | |
| USAGE: ./fas2nex.stdinout <input> | |
| DESCRIPTION: Convert Fasta file to a 'stripped' Nexus format (readable by MrBayes v.3.2) | |
| OPTIONS: --- | |
| REQUIREMENTS: --- | |
| BUGS: --- | |
| NOTES: The original script was able to see if there where several headers with | |
| the same label, and then tried to merge those sequences - if the total | |
| length for the concatenated sequence was the same as any other labels' | |
| sequence. Hence, it allowed an "interleaved" fasta format. | |
| The current version, however, will not allow this, but quit with an | |
| error indicating that there are duplicated sequence labels. | |
| To change this behaviour, change the variable $ALLOW_INTERLEAVED to 1. | |
| AUTHOR: Johan Nylander (JN) | |
| COMPANY: NRM | |
| VERSION: 1.0 | |
| CREATED: 2007 | |
| REVISION: Wed 23 Jan 2019 02:35:42 PM CET | |
| TODO: | |
| =cut | |
| use strict; | |
| use warnings; | |
| my $ALLOW_INTERLEAVED = 0; | |
| sub get_data_type { | |
| #=== FUNCTION ================================================================ | |
| # NAME: get_data_type | |
| # VERSION: 04/15/2010 08:37:45 AM CEST | |
| # DESCRIPTION: Counts the number of A+C+G+T-'-' in max_n sequences, and if the | |
| # ratio sum_acgt/length_of_all_max_n_seqs > cutoff, then the | |
| # data_type is considered as 'DNA' (else 'PROTEIN') | |
| # PARAMETERS: ref to hash containing the sequences | |
| # RETURNS: string: 'DNA' or 'PROTEIN' | |
| #=============================================================================== | |
| my ($hash_ref) = @_; | |
| my $data_type = q{}; | |
| my $max_n = 10; # count letters in no more than n sequences | |
| my $cutoff = 0.7; # sum of A+C+G+T-'gap' should be above cutoff | |
| my $nseqs = keys %$hash_ref; | |
| ## Adjust max_n for smaller data sets | |
| if ($max_n > $nseqs) { | |
| $max_n = $nseqs; | |
| } | |
| ## Get the first max_n keys from the hash | |
| my (@first_max_n_names) = (keys %$hash_ref)[0..$max_n-1]; | |
| ## Get the (concatenated) sequence from max_n sequences | |
| my $long_seq = @{$hash_ref}{@first_max_n_names}; # Hash slice accesses multiple keys. Note the '@' | |
| # Count the A,C,G,T's. Remove gaps first. | |
| $long_seq =~ s/-//g; | |
| my $acgt_count = $long_seq =~ s/([a|c|g|t])/$1/gi; | |
| ## Check ratio | |
| if (($acgt_count/length($long_seq)) > $cutoff) { | |
| $data_type = 'DNA'; | |
| } | |
| else { | |
| $data_type = 'PROTEIN'; | |
| } | |
| return($data_type); | |
| } | |
| sub get_ntax_nchar { | |
| #=== FUNCTION ================================================================ | |
| # NAME: get_ntax_nchar | |
| # VERSION: 02/02/2012 10:03:53 AM | |
| # DESCRIPTION: Get ntax, nseq | |
| # PARAMETERS: ref to hash containing the sequences | |
| # RETURNS: $ntax, $nchar | |
| #=============================================================================== | |
| my ($hash_ref) = @_; | |
| my $nchar = q{}; | |
| my $ntax = keys %$hash_ref; # length of list of keys | |
| my @taxa = keys %$hash_ref; | |
| ## Check sequence lengths | |
| my $tmp_taxon = shift(@taxa); | |
| my $tmp_length = length($hash_ref->{$tmp_taxon}); # length of first seq | |
| foreach my $taxon (keys %$hash_ref) { | |
| my $length = length($hash_ref->{$taxon}); # val is a length | |
| if ($length != $tmp_length) { | |
| print STDERR "\nError!\nLength of sequence differ ($taxon has $length, $tmp_taxon has $tmp_length)\n"; | |
| exit(1); | |
| } | |
| else { | |
| $tmp_taxon = $taxon; | |
| $tmp_length = length($hash_ref->{$taxon}); | |
| $nchar = $tmp_length; | |
| } | |
| } | |
| return($ntax, $nchar); | |
| } | |
| sub prettydate { | |
| #=== FUNCTION ================================================================ | |
| # NAME: prettydate | |
| # VERSION: 02/02/2012 10:58:32 AM | |
| # DESCRIPTION: Returns a date string, e.g. '13:07 10/30/2025' | |
| # PARAMETERS: none | |
| # RETURNS: string | |
| #=============================================================================== | |
| @_ = localtime(shift || time); | |
| return(sprintf("%02d:%02d %02d/%02d/%04d", @_[2,1], $_[4]+1, $_[3], $_[5]+1900)); | |
| } | |
| MAIN: | |
| { | |
| #=== FUNCTION ================================================================ | |
| # NAME: "main" | |
| # VERSION: Wed 23 Jan 2019 02:40:06 PM CET | |
| # DESCRIPTION: Convert fasta to simple nexus. No extensive error checking! | |
| # PARAMETERS: none. Reads from stdin. | |
| # RETURNS: prints to stdout | |
| #=============================================================================== | |
| my $taxname = ""; | |
| my $ntax = 0; | |
| my $nchar = 0; | |
| my $nheaders = 0; | |
| my %tax_seq = (); # key: taxon name, value: sequence | |
| my %count_tax_hash = (); # key: taxon name, value: count | |
| my @fasta_headers = (); | |
| my $state = 0; | |
| my $data_type = 'DNA'; | |
| my $missing_symbol = '?'; | |
| my $gap_symbol = '-'; | |
| my $space = ' '; | |
| my $datestring = prettydate(); | |
| while (<>) { | |
| chomp; | |
| if (/\>(\S*.*)$/i) { # greedy: takes whole line as sequence label | |
| $state = 1; | |
| $taxname = $1; | |
| $nheaders++; | |
| push @fasta_headers, $taxname; | |
| $count_tax_hash{$taxname}++; | |
| } | |
| elsif ($state == 1) { | |
| $tax_seq{$taxname} .= $_; | |
| } | |
| } | |
| $ntax = keys %tax_seq; | |
| ## If number of headers ne ntax, then sequences could be interleaved (and then to be concatenated) | |
| ## or there could simply be som mislabelling. Checking for sequence length will catch the error | |
| ## but will not guide in finding the labels that differ. | |
| if (scalar (keys %tax_seq) != scalar @fasta_headers) { | |
| if ($ALLOW_INTERLEAVED) { | |
| print STDERR "\n[Note: Found duplicate fasta headers. Will see if those could be concatenated.\n"; | |
| } | |
| else { | |
| die "\nError: Found duplicate fasta headers. Check input.\n"; | |
| } | |
| } | |
| ($ntax, $nchar) = get_ntax_nchar(\%tax_seq); | |
| $data_type = get_data_type(\%tax_seq); | |
| print STDOUT "\#NEXUS\n[Converted $datestring]\nBEGIN DATA\;\n"; | |
| print STDOUT "DIMENSIONS NTAX=$ntax NCHAR=$nchar\;\n"; | |
| print STDOUT "FORMAT MISSING=$missing_symbol GAP=$gap_symbol DATATYPE=$data_type\;\nMATRIX\n"; | |
| ## Find longest seq label by sorting labels from hash and take the first | |
| my $longest = (reverse sort { $a <=> $b } map { length($_) } keys %tax_seq)[0]; | |
| foreach my $seq_label (@fasta_headers) { | |
| my $npad = $longest - length($seq_label) + 1; | |
| my $pad = $space x $npad; | |
| my $label; | |
| if ($seq_label =~ /[\s|-]+/) { # if any spaces or dashes in label, need to print with single ticks | |
| $label = "'" . $seq_label . "'" . $pad; | |
| } | |
| else { | |
| $label = $seq_label . $pad; | |
| } | |
| print STDOUT $label, $tax_seq{$seq_label}, "\n"; | |
| } | |
| print STDOUT "\t\;\nEND;\n"; | |
| } | |
| __END__ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment