avrilcoghlan · March 1, 2013 15:37
diff --git a/badgenes_in_alns2.pl b/badgenes_in_alns2.pl
 #!/usr/local/bin/perl

 # 
 # Perl script badgenes_in_alns2.pl
 # Written by Avril Coghlan ([email protected]).
 # 6-Oct-05. 
 #
 # For the TreeFam project.
 #
 # This reads in an alignment, and finds sequences that align to 
 # <x% of the alignment length. It reads in a fasta format alignment.
 #
 # The command-line format is:
 # % perl <badgenes_in_alns2.pl> aln min_pc_aln                            
 # where aln is the alignment file (fasta format),
 #       min_pc_aln is the minimum percent of a sequence that should be aligned.
 #
 #------------------------------------------------------------------#

 # CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:

 $num_args                  = $#ARGV + 1;
 if ($num_args != 2)
 {
   print "Usage of badgenes_in_alns2.pl\n\n";
   print "perl badgenes_in_alns2.pl <aln> <min_pc_aln>\n";
   print "where <aln> is the alignment file (fasta format),\n";
   print "      <min_pc_aln> is the minimum percent of a sequence that should be aligned.\n";
   print "For example, >perl -w badgenes_in_aln2.pl TF105221_unfiltered_seed.mfa.pep 50\n";
   exit;
 }

 # FIND THE NAME OF THE INPUT ALIGNMENT:

 $aln                       = $ARGV[0];

 # FIND THE MINIMUM PERCENT OF A SEQUENCE THAT SHOULD BE ALIGNED TO THE OUTGROUP:

 $min_pc_aln                = $ARGV[1];

 #------------------------------------------------------------------#

 # READ IN THE ALIGNMENT FROM THE ALIGNMENT FILE:

 %ALN                       = ();
 open(ALN,"$aln") || die "ERROR: cannot open $aln.\n";
 while(<ALN>)
 {
   $line                   = $_;
   chomp $line;
   if (substr($line,0,1) eq '>')
   {
      @temp                = split(/\s+/,$line);
      $gene                = $temp[0];
      $gene                = substr($gene,1,length($gene)-1);
      @proteins            = (@proteins,$gene);
   }
   else
   {
      if (!($ALN{$gene})) { $ALN{$gene} = $line;            }
      else                { $ALN{$gene} = $ALN{$gene}.$line;}
   }
 }
 close(ALN);
 $no_proteins               = $#proteins + 1;

 #------------------------------------------------------------------#

 # FIND WHICH POSITIONS IN THE ALIGNMENT WE HAVE SEQUENCE FOR AT LEAST TWO GENES:

 if ($no_proteins == 0) { print STDERR "ERROR: no proteins is $no_proteins.\n"; exit;}
 $first_gene                = $proteins[0];
 if (!($ALN{$first_gene})) { print STDERR "ERROR: no alignment for $first_gene.\n"; exit;}
 $alignment                 = $ALN{$first_gene};
 $aln_length                = length($alignment);
 $cols_to_count             = 0;
 %COLS_TO_COUNT             = (); # HASH TABLE TO REMEMBER FOR WHICH POSITIONS THERE IS SEQUENCE
                                 # FOR AT LEAST TWO GENES.
 for ($i = 0; $i < $aln_length; $i++)
 {
   # FIND OUT IF THERE IS SEQUENCE FOR AT LEAST TWO GENES AT THIS POSITION:
   $no_prots               = 0;
   for ($j = 0; $j < $no_proteins; $j++)
   {
      $protein             = $proteins[$j];
      if (!($ALN{$protein})) { print STDERR "ERROR: no alignment for $protein.\n"; exit;}
      $alignment           = $ALN{$protein};
      # CHECK WHETHER THERE IS SEQUENCE AT THIS POSITION:   
      $aa                  = substr($alignment,$i,1);
      if ($aa ne '-')
      {
         $no_prots++;
      }
   }
   # COUNT THIS POSITION, IF IT HAS SEQUENCE FOR AT LEAST TWO GENES:
   if ($no_prots >= 2) 
   {
      $cols_to_count++;
      $COLS_TO_COUNT{$i}   = 1;
   }
 }

 #------------------------------------------------------------------#
                     
 # CHECK FOR EACH NON-OUTGROUP SEQUENCE WHETHER IT ALIGNS WITH AT LEAST
 # $min_pc_aln % OF THE WHOLE ALIGNMENT:   

 for ($i = 0; $i < $no_proteins; $i++)
 {
   $protein                = $proteins[$i];

   if (!($ALN{$protein})) { print "ERROR: no alignment for $protein.\n"; exit;}
   $alignment              = $ALN{$protein};
   $aln_length             = length($alignment);
   $aligning_positions     = 0; # NUMBER OF POSITIONS WHERE $protein ALIGNS.                                                                           
   for ($j = 0; $j < $aln_length; $j++)
   {
      # CHECK IF THERE IS SEQUENCE FOR AT LEAST TWO GENES AT THIS POSITION:
      if ($COLS_TO_COUNT{$j})
      {
         # CHECK WHETHER THERE IS SEQUENCE AT THIS POSITION:
         $aa               = substr($alignment,$j,1);
         if ($aa ne "-") 
         { # THERE IS SEQUENCE AT THIS POSITION.
            $aligning_positions++;
         }
      }
   } 

   # FIND THE PERCENT OF $protein THAT ALIGNS:
   $pc_aligning_positions = $aligning_positions*100/$cols_to_count;
   if ($pc_aligning_positions < $min_pc_aln)
   {
      print "$protein has $pc_aligning_positions % aligning ($aligning_positions out of $cols_to_count)\n";
   }
 }

 #------------------------------------------------------------------# 

 print STDERR "FINISHED.\n";

 #------------------------------------------------------------------#
	#!/usr/local/bin/perl

	#
	# Perl script badgenes_in_alns2.pl
	# Written by Avril Coghlan ([email protected]).
	# 6-Oct-05.
	#
	# For the TreeFam project.
	#
	# This reads in an alignment, and finds sequences that align to
	# <x% of the alignment length. It reads in a fasta format alignment.
	#
	# The command-line format is:
	# % perl <badgenes_in_alns2.pl> aln min_pc_aln
	# where aln is the alignment file (fasta format),
	# min_pc_aln is the minimum percent of a sequence that should be aligned.
	#
	#------------------------------------------------------------------#

	# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:

	$num_args = $#ARGV + 1;
	if ($num_args != 2)
	{
	print "Usage of badgenes_in_alns2.pl\n\n";
	print "perl badgenes_in_alns2.pl <aln> <min_pc_aln>\n";
	print "where <aln> is the alignment file (fasta format),\n";
	print " <min_pc_aln> is the minimum percent of a sequence that should be aligned.\n";
	print "For example, >perl -w badgenes_in_aln2.pl TF105221_unfiltered_seed.mfa.pep 50\n";
	exit;
	}

	# FIND THE NAME OF THE INPUT ALIGNMENT:

	$aln = $ARGV[0];

	# FIND THE MINIMUM PERCENT OF A SEQUENCE THAT SHOULD BE ALIGNED TO THE OUTGROUP:

	$min_pc_aln = $ARGV[1];

	#------------------------------------------------------------------#

	# READ IN THE ALIGNMENT FROM THE ALIGNMENT FILE:

	%ALN = ();
	open(ALN,"$aln") \|\| die "ERROR: cannot open $aln.\n";
	while(<ALN>)
	{
	$line = $_;
	chomp $line;
	if (substr($line,0,1) eq '>')
	{
	@temp = split(/\s+/,$line);
	$gene = $temp[0];
	$gene = substr($gene,1,length($gene)-1);
	@proteins = (@proteins,$gene);
	}
	else
	{
	if (!($ALN{$gene})) { $ALN{$gene} = $line; }
	else { $ALN{$gene} = $ALN{$gene}.$line;}
	}
	}
	close(ALN);
	$no_proteins = $#proteins + 1;

	#------------------------------------------------------------------#

	# FIND WHICH POSITIONS IN THE ALIGNMENT WE HAVE SEQUENCE FOR AT LEAST TWO GENES:

	if ($no_proteins == 0) { print STDERR "ERROR: no proteins is $no_proteins.\n"; exit;}
	$first_gene = $proteins[0];
	if (!($ALN{$first_gene})) { print STDERR "ERROR: no alignment for $first_gene.\n"; exit;}
	$alignment = $ALN{$first_gene};
	$aln_length = length($alignment);
	$cols_to_count = 0;
	%COLS_TO_COUNT = (); # HASH TABLE TO REMEMBER FOR WHICH POSITIONS THERE IS SEQUENCE
	# FOR AT LEAST TWO GENES.
	for ($i = 0; $i < $aln_length; $i++)
	{
	# FIND OUT IF THERE IS SEQUENCE FOR AT LEAST TWO GENES AT THIS POSITION:
	$no_prots = 0;
	for ($j = 0; $j < $no_proteins; $j++)
	{
	$protein = $proteins[$j];
	if (!($ALN{$protein})) { print STDERR "ERROR: no alignment for $protein.\n"; exit;}
	$alignment = $ALN{$protein};
	# CHECK WHETHER THERE IS SEQUENCE AT THIS POSITION:
	$aa = substr($alignment,$i,1);
	if ($aa ne '-')
	{
	$no_prots++;
	}
	}
	# COUNT THIS POSITION, IF IT HAS SEQUENCE FOR AT LEAST TWO GENES:
	if ($no_prots >= 2)
	{
	$cols_to_count++;
	$COLS_TO_COUNT{$i} = 1;
	}
	}

	#------------------------------------------------------------------#

	# CHECK FOR EACH NON-OUTGROUP SEQUENCE WHETHER IT ALIGNS WITH AT LEAST
	# $min_pc_aln % OF THE WHOLE ALIGNMENT:

	for ($i = 0; $i < $no_proteins; $i++)
	{
	$protein = $proteins[$i];

	if (!($ALN{$protein})) { print "ERROR: no alignment for $protein.\n"; exit;}
	$alignment = $ALN{$protein};
	$aln_length = length($alignment);
	$aligning_positions = 0; # NUMBER OF POSITIONS WHERE $protein ALIGNS.
	for ($j = 0; $j < $aln_length; $j++)
	{
	# CHECK IF THERE IS SEQUENCE FOR AT LEAST TWO GENES AT THIS POSITION:
	if ($COLS_TO_COUNT{$j})
	{
	# CHECK WHETHER THERE IS SEQUENCE AT THIS POSITION:
	$aa = substr($alignment,$j,1);
	if ($aa ne "-")
	{ # THERE IS SEQUENCE AT THIS POSITION.
	$aligning_positions++;
	}
	}
	}

	# FIND THE PERCENT OF $protein THAT ALIGNS:
	$pc_aligning_positions = $aligning_positions*100/$cols_to_count;
	if ($pc_aligning_positions < $min_pc_aln)
	{
	print "$protein has $pc_aligning_positions % aligning ($aligning_positions out of $cols_to_count)\n";
	}
	}

	#------------------------------------------------------------------#

	print STDERR "FINISHED.\n";

	#------------------------------------------------------------------#