Created
March 1, 2013 15:37
-
-
Save avrilcoghlan/5065446 to your computer and use it in GitHub Desktop.
Perl script that reads in a fasta-format alignment, and finds sequences that align to <x% of the alignment length.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script badgenes_in_alns2.pl | |
# Written by Avril Coghlan ([email protected]). | |
# 6-Oct-05. | |
# | |
# For the TreeFam project. | |
# | |
# This reads in an alignment, and finds sequences that align to | |
# <x% of the alignment length. It reads in a fasta format alignment. | |
# | |
# The command-line format is: | |
# % perl <badgenes_in_alns2.pl> aln min_pc_aln | |
# where aln is the alignment file (fasta format), | |
# min_pc_aln is the minimum percent of a sequence that should be aligned. | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 2) | |
{ | |
print "Usage of badgenes_in_alns2.pl\n\n"; | |
print "perl badgenes_in_alns2.pl <aln> <min_pc_aln>\n"; | |
print "where <aln> is the alignment file (fasta format),\n"; | |
print " <min_pc_aln> is the minimum percent of a sequence that should be aligned.\n"; | |
print "For example, >perl -w badgenes_in_aln2.pl TF105221_unfiltered_seed.mfa.pep 50\n"; | |
exit; | |
} | |
# FIND THE NAME OF THE INPUT ALIGNMENT: | |
$aln = $ARGV[0]; | |
# FIND THE MINIMUM PERCENT OF A SEQUENCE THAT SHOULD BE ALIGNED TO THE OUTGROUP: | |
$min_pc_aln = $ARGV[1]; | |
#------------------------------------------------------------------# | |
# READ IN THE ALIGNMENT FROM THE ALIGNMENT FILE: | |
%ALN = (); | |
open(ALN,"$aln") || die "ERROR: cannot open $aln.\n"; | |
while(<ALN>) | |
{ | |
$line = $_; | |
chomp $line; | |
if (substr($line,0,1) eq '>') | |
{ | |
@temp = split(/\s+/,$line); | |
$gene = $temp[0]; | |
$gene = substr($gene,1,length($gene)-1); | |
@proteins = (@proteins,$gene); | |
} | |
else | |
{ | |
if (!($ALN{$gene})) { $ALN{$gene} = $line; } | |
else { $ALN{$gene} = $ALN{$gene}.$line;} | |
} | |
} | |
close(ALN); | |
$no_proteins = $#proteins + 1; | |
#------------------------------------------------------------------# | |
# FIND WHICH POSITIONS IN THE ALIGNMENT WE HAVE SEQUENCE FOR AT LEAST TWO GENES: | |
if ($no_proteins == 0) { print STDERR "ERROR: no proteins is $no_proteins.\n"; exit;} | |
$first_gene = $proteins[0]; | |
if (!($ALN{$first_gene})) { print STDERR "ERROR: no alignment for $first_gene.\n"; exit;} | |
$alignment = $ALN{$first_gene}; | |
$aln_length = length($alignment); | |
$cols_to_count = 0; | |
%COLS_TO_COUNT = (); # HASH TABLE TO REMEMBER FOR WHICH POSITIONS THERE IS SEQUENCE | |
# FOR AT LEAST TWO GENES. | |
for ($i = 0; $i < $aln_length; $i++) | |
{ | |
# FIND OUT IF THERE IS SEQUENCE FOR AT LEAST TWO GENES AT THIS POSITION: | |
$no_prots = 0; | |
for ($j = 0; $j < $no_proteins; $j++) | |
{ | |
$protein = $proteins[$j]; | |
if (!($ALN{$protein})) { print STDERR "ERROR: no alignment for $protein.\n"; exit;} | |
$alignment = $ALN{$protein}; | |
# CHECK WHETHER THERE IS SEQUENCE AT THIS POSITION: | |
$aa = substr($alignment,$i,1); | |
if ($aa ne '-') | |
{ | |
$no_prots++; | |
} | |
} | |
# COUNT THIS POSITION, IF IT HAS SEQUENCE FOR AT LEAST TWO GENES: | |
if ($no_prots >= 2) | |
{ | |
$cols_to_count++; | |
$COLS_TO_COUNT{$i} = 1; | |
} | |
} | |
#------------------------------------------------------------------# | |
# CHECK FOR EACH NON-OUTGROUP SEQUENCE WHETHER IT ALIGNS WITH AT LEAST | |
# $min_pc_aln % OF THE WHOLE ALIGNMENT: | |
for ($i = 0; $i < $no_proteins; $i++) | |
{ | |
$protein = $proteins[$i]; | |
if (!($ALN{$protein})) { print "ERROR: no alignment for $protein.\n"; exit;} | |
$alignment = $ALN{$protein}; | |
$aln_length = length($alignment); | |
$aligning_positions = 0; # NUMBER OF POSITIONS WHERE $protein ALIGNS. | |
for ($j = 0; $j < $aln_length; $j++) | |
{ | |
# CHECK IF THERE IS SEQUENCE FOR AT LEAST TWO GENES AT THIS POSITION: | |
if ($COLS_TO_COUNT{$j}) | |
{ | |
# CHECK WHETHER THERE IS SEQUENCE AT THIS POSITION: | |
$aa = substr($alignment,$j,1); | |
if ($aa ne "-") | |
{ # THERE IS SEQUENCE AT THIS POSITION. | |
$aligning_positions++; | |
} | |
} | |
} | |
# FIND THE PERCENT OF $protein THAT ALIGNS: | |
$pc_aligning_positions = $aligning_positions*100/$cols_to_count; | |
if ($pc_aligning_positions < $min_pc_aln) | |
{ | |
print "$protein has $pc_aligning_positions % aligning ($aligning_positions out of $cols_to_count)\n"; | |
} | |
} | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment