Last active
June 8, 2024 21:56
-
-
Save josephhughes/1167776 to your computer and use it in GitHub Desktop.
Use this script to replace the stop codons with gaps in the nucleotide alignment
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
ReplaceStopsWithGaps.pl is a perlscript written by Joseph Hughes, University of Glasgow | |
use this to remove stop codons from an alignment | |
typically, this would be done to calculate dN/dS in HYPHY | |
Usage: | |
perl ../Scripts/ReplaceStopWithGaps.pl -pep 104D5_pep.fasta -nuc 104D5.fasta -output 104D5_nostop.fasta | |
use this to replace stop codons from the nucleotide alignment | |
the nucleotide and the peptide alignments are necessary |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w | |
# | |
# use this to remove stop codons from an alignment | |
# typically, this would be done to calculate dN/dS in HYPHY | |
# Usage: perl ../Scripts/ReplaceStopWithGaps.pl -pep 104D5_pep.fasta -nuc 104D5.fasta -output 104D5_nostop.fasta | |
# use this to replace stop codons from the nucleotide alignment | |
# the nucleotide and the peptide alignments are necessary | |
use strict; | |
use Getopt::Long; | |
use Bio::SeqIO; | |
my ($inpep,$innuc,$output, $i, %stop); | |
&GetOptions( | |
'pep:s' => \$inpep,# | |
'nuc:s' => \$innuc, | |
'output:s' => \$output,#file without gaps | |
); | |
my $pep = Bio::SeqIO->new(-file => "$inpep" , '-format' => 'fasta'); | |
my $nuc = Bio::SeqIO->new(-file => "$innuc" , '-format' => 'fasta'); | |
my $out = Bio::SeqIO->new(-file => ">$output" , '-format' => 'fasta'); | |
while ( my $pepseq = $pep->next_seq() ) { | |
my $pep_str=uc($pepseq->seq); | |
if ($pep_str=~/\*/){ | |
my $pep_id=$pepseq->id(); | |
my @aa=split(//,uc($pepseq->seq)); | |
for ($i=0; $i<scalar(@aa); $i++){ | |
if ($aa[$i]=~/\*/){ | |
$stop{$pep_id}{$i}++; | |
print "$pep_id peptide sequence has a stop $aa[$i] at ".($i+1)."\n"; | |
} | |
} | |
} | |
} | |
while (my $nucseq = $nuc->next_seq()){ | |
my $nuc_id=$nucseq->id(); | |
my $nuc_str=uc($nucseq->seq); | |
foreach my $pid (keys %stop){ | |
if ("$nuc_id" eq "$pid"){ | |
foreach my $site (keys %{$stop{$pid}}){ | |
#print "match $nuc_id and $pid\n"; | |
#print "The sequence for $nuc_id is \n$nuc_str\n"; | |
my $nucpos=$site*3; | |
my $codon = substr $nuc_str, $nucpos, 3; | |
print "$codon "; | |
if ($codon =~ /(((U|T)A(A|G|R))|((T|U)GA))/i){ | |
substr($nuc_str, $nucpos, 3) = '---'; | |
print "=> Match to a stop codon at nucleotide position ".($nucpos+1)."\nNew sequence for $nuc_id\n$nuc_str\n"; | |
}else{ | |
print "Doesn't seem to match a stop codon at nucleotide position ".($nucpos+1)." in $nuc_id\n"; | |
} | |
} | |
} | |
} | |
my $newseq = Bio::Seq->new(-seq => "$nuc_str", | |
-display_id => $nuc_id); | |
$out->write_seq($newseq); | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The perl script only replaces stop codons if the peptide file has a '*' in place of the stop codon