Created
January 17, 2013 09:34
-
-
Save josephhughes/4554822 to your computer and use it in GitHub Desktop.
use this to remove stop codons from an alignment
typically, this would be done to calculate dN/dS in HYPHY
Usage: perl ../Scripts/ReplaceStopWithGaps.pl -pep 104D5_pep.fasta -nuc 104D5.fasta -output 104D5_nostop.fasta -ref 104D5S1
use this to replace stop codons from the nucleotide alignment with the codon of the reference
the nucleotide and the…
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w | |
# | |
# use this to remove stop codons from an alignment | |
# typically, this would be done to calculate dN/dS in HYPHY | |
# Usage: perl ../Scripts/ReplaceStopWithGaps.pl -pep 104D5_pep.fasta -nuc 104D5.fasta -output 104D5_nostop.fasta -ref 104D5S1 | |
# use this to replace stop codons from the nucleotide alignment with the codon of the reference | |
# the nucleotide and the peptide alignments are necessary and the name of the reference sequence | |
# the reference sequence needs to be in the nucleotide alignment | |
use strict; | |
use Getopt::Long; | |
use Bio::SeqIO; | |
my ($inpep,$innuc,$output, $i, %stop,$ref,$refseq,@sequences); | |
&GetOptions( | |
'pep:s' => \$inpep,# | |
'nuc:s' => \$innuc, | |
'output:s' => \$output,#file without gaps | |
'ref:s' => \$ref,#the name of codon sequence to use as reference | |
); | |
my $pep = Bio::SeqIO->new(-file => "$inpep" , '-format' => 'fasta'); | |
my $nuc = Bio::SeqIO->new(-file => "$innuc" , '-format' => 'fasta'); | |
my $out = Bio::SeqIO->new(-file => ">$output" , '-format' => 'fasta'); | |
while ( my $pepseq = $pep->next_seq() ) { | |
my $pep_str=uc($pepseq->seq); | |
if ($pep_str=~/\*/){ | |
my $pep_id=$pepseq->id(); | |
my @aa=split(//,uc($pepseq->seq)); | |
for ($i=0; $i<scalar(@aa); $i++){ | |
if ($aa[$i]=~/\*/){ | |
$stop{$pep_id}=$i; | |
print "$pep_id peptide sequence has a stop $aa[$i] at ".($stop{$pep_id}+1)."\n"; | |
} | |
} | |
} | |
} | |
while (my $nucseq = $nuc->next_seq()){ | |
my $nuc_id=$nucseq->id(); | |
if ($nuc_id=~/$ref/){ | |
$refseq=lc($nucseq->seq()); | |
} | |
push(@sequences,$nucseq); | |
} | |
print "The reference sequence $ref is:\n$refseq\n"; | |
foreach my $nucseq (@sequences){ | |
my $nuc_id=$nucseq->id(); | |
my $nuc_str=uc($nucseq->seq); | |
foreach my $pid (keys %stop){ | |
if ("$nuc_id" eq "$pid"){ | |
#print "match $nuc_id and $pid\n"; | |
#print "The sequence for $nuc_id is \n$nuc_str\n"; | |
my $nucpos=$stop{$pid}*3; | |
my $codon = substr $nuc_str, $nucpos, 3; | |
print "$codon "; | |
if ($codon =~ /(((U|T)A(A|G|R))|((T|U)GA))/){ | |
my $refcodon = substr($refseq, $nucpos, 3); | |
print "reference codon is $refcodon\n"; | |
substr($nuc_str, $nucpos, 3) = substr($refseq, $nucpos, 3); | |
print "=> Match to a stop codon at nucleotide position ".($nucpos+1)."\nNew sequence for $nuc_id\n$nuc_str\n"; | |
}else{ | |
print "Doesn't seem to match a stop codon at nucleotide position ".($nucpos+1)." in $nuc_id\n"; | |
} | |
} | |
} | |
my $newseq = Bio::Seq->new(-seq => "$nuc_str", | |
-display_id => $nuc_id); | |
$out->write_seq($newseq); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi Joseph,
I am very new to perl. Would you explain me what is -pep please? As my understanding, -nuc is the input nucleotide file and -output is the output nucleotide file with all the internal stop codon replace by gaps, right?
But I do not know what file should I put in for -pep.
It would be great if you can guide me through this.
Thank you,
Hien
P/s: Can you send me your example files 104D5_pep.fasta, 104D5.fasta, and 104D5_nostop.fasta? My email is [email protected].