Last active
June 8, 2024 21:56
-
-
Save josephhughes/1167776 to your computer and use it in GitHub Desktop.
Use this script to replace the stop codons with gaps in the nucleotide alignment
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
ReplaceStopsWithGaps.pl is a perlscript written by Joseph Hughes, University of Glasgow | |
use this to remove stop codons from an alignment | |
typically, this would be done to calculate dN/dS in HYPHY | |
Usage: | |
perl ../Scripts/ReplaceStopWithGaps.pl -pep 104D5_pep.fasta -nuc 104D5.fasta -output 104D5_nostop.fasta | |
use this to replace stop codons from the nucleotide alignment | |
the nucleotide and the peptide alignments are necessary |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w | |
# | |
# use this to remove stop codons from an alignment | |
# typically, this would be done to calculate dN/dS in HYPHY | |
# Usage: perl ../Scripts/ReplaceStopWithGaps.pl -pep 104D5_pep.fasta -nuc 104D5.fasta -output 104D5_nostop.fasta | |
# use this to replace stop codons from the nucleotide alignment | |
# the nucleotide and the peptide alignments are necessary | |
use strict; | |
use Getopt::Long; | |
use Bio::SeqIO; | |
my ($inpep,$innuc,$output, $i, %stop); | |
&GetOptions( | |
'pep:s' => \$inpep,# | |
'nuc:s' => \$innuc, | |
'output:s' => \$output,#file without gaps | |
); | |
my $pep = Bio::SeqIO->new(-file => "$inpep" , '-format' => 'fasta'); | |
my $nuc = Bio::SeqIO->new(-file => "$innuc" , '-format' => 'fasta'); | |
my $out = Bio::SeqIO->new(-file => ">$output" , '-format' => 'fasta'); | |
while ( my $pepseq = $pep->next_seq() ) { | |
my $pep_str=uc($pepseq->seq); | |
if ($pep_str=~/\*/){ | |
my $pep_id=$pepseq->id(); | |
my @aa=split(//,uc($pepseq->seq)); | |
for ($i=0; $i<scalar(@aa); $i++){ | |
if ($aa[$i]=~/\*/){ | |
$stop{$pep_id}{$i}++; | |
print "$pep_id peptide sequence has a stop $aa[$i] at ".($i+1)."\n"; | |
} | |
} | |
} | |
} | |
while (my $nucseq = $nuc->next_seq()){ | |
my $nuc_id=$nucseq->id(); | |
my $nuc_str=uc($nucseq->seq); | |
foreach my $pid (keys %stop){ | |
if ("$nuc_id" eq "$pid"){ | |
foreach my $site (keys %{$stop{$pid}}){ | |
#print "match $nuc_id and $pid\n"; | |
#print "The sequence for $nuc_id is \n$nuc_str\n"; | |
my $nucpos=$site*3; | |
my $codon = substr $nuc_str, $nucpos, 3; | |
print "$codon "; | |
if ($codon =~ /(((U|T)A(A|G|R))|((T|U)GA))/i){ | |
substr($nuc_str, $nucpos, 3) = '---'; | |
print "=> Match to a stop codon at nucleotide position ".($nucpos+1)."\nNew sequence for $nuc_id\n$nuc_str\n"; | |
}else{ | |
print "Doesn't seem to match a stop codon at nucleotide position ".($nucpos+1)." in $nuc_id\n"; | |
} | |
} | |
} | |
} | |
my $newseq = Bio::Seq->new(-seq => "$nuc_str", | |
-display_id => $nuc_id); | |
$out->write_seq($newseq); | |
} | |
I run this code as per the instructions. The script doesn't throw any errors, but it does not remove the stop codons either. I don't seem to get any of the messages that are supposed to be printed out as well. Is this common? I input an alignment multi-fasta with 8 sequences in it and the the translation of the alignment which was obtained using t-coffee.
Thanks, Tal
The perl script only replaces stop codons if the peptide file has a '*' in place of the stop codon
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I run this code as per the instructions. The script doesn't throw any errors, but it does not remove the stop codons either. I don't seem to get any of the messages that are supposed to be printed out as well. Is this common?
I input an alignment multi-fasta with 8 sequences in it and the the translation of the alignment which was obtained using t-coffee.
Thanks,
Tal