Created
May 3, 2023 17:19
-
-
Save chasemc/cfa3beed287f48e81b494e30841d9b56 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Bio::Pfam::HMM::HMMResults.pm | |
# | |
# Author: finnr | |
# Maintainer: $Id: HMMResults.pm,v 1.3 2009-12-15 14:38:08 jt6 Exp $ | |
# Version: $Revision: 1.3 $ | |
# Created: Nov 19, 2008 | |
# Last Modified: $Date: 2009-12-15 14:38:08 $ | |
=head1 NAME | |
Bio::Pfam::HMM::HMMResults - A object to represents the results from hmmsearch | |
=cut | |
package Bio::Pfam::HMM::HMMResults; | |
=head1 DESCRIPTION | |
A more detailed description of what this class does and how it does it. | |
$Id: HMMResults.pm,v 1.3 2009-12-15 14:38:08 jt6 Exp $ | |
=head1 COPYRIGHT | |
File: Bio::Pfam::HMM::HMMResults.pm | |
Copyright (c) 2007: Genome Research Ltd. | |
Authors: Rob Finn ([email protected]) | |
This is free software; you can redistribute it and/or | |
modify it under the terms of the GNU General Public License | |
as published by the Free Software Foundation; either version 2 | |
of the License, or (at your option) any later version. | |
This program is distributed in the hope that it will be useful, | |
but WITHOUT ANY WARRANTY; without even the implied warranty of | |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
GNU General Public License for more details. | |
You should have received a copy of the GNU General Public License | |
along with this program; if not, write to the Free Software | |
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | |
or see the on-line version at http://www.gnu.org/copyleft/gpl.txt | |
=cut | |
use strict; | |
use warnings; | |
use Moose; | |
use Moose::Util::TypeConstraints; | |
use Bio::Pfam::HMM::HMMSequence; | |
use Bio::Pfam::HMM::HMMUnit; | |
# | |
#------------------------------------------------------------------------------- | |
# Attributes | |
has 'hmmerVersion' => ( | |
isa => 'Str', | |
is => 'rw', | |
); | |
has 'hmmName' => ( | |
isa => 'Str', | |
is => 'rw' | |
); | |
has 'seqDB' => ( | |
isa => 'Str', | |
is => 'rw' | |
); | |
has hmmLength => ( | |
isa => 'Int', | |
is => 'rw' | |
); | |
has 'thisFile' => ( | |
isa => 'Str', | |
is => 'rw' | |
); | |
has seedName => ( | |
isa => 'Str', | |
is => 'rw' | |
); | |
has 'seqs' => ( | |
isa => 'HashRef', | |
is => 'rw', | |
default => sub { {} }, | |
); | |
has 'units' => ( | |
isa => 'ArrayRef', | |
is => 'rw', | |
default => sub { [] }, | |
); | |
has 'domThr' => ( | |
isa => 'Num', | |
is => 'rw', | |
default => '25.0' | |
); | |
has 'seqThr' => ( | |
isa => 'Num', | |
is => 'rw', | |
default => '25.0' | |
); | |
has 'evalueThr' => ( | |
isa => 'Num', | |
is => 'rw' | |
); | |
has 'domTC' => ( | |
isa => 'Num', | |
is => 'rw' | |
); | |
has 'seqTC' => ( | |
isa => 'Num', | |
is => 'rw' | |
); | |
has 'domNC' => ( | |
isa => 'Num', | |
is => 'rw' | |
); | |
has 'seqNC' => ( | |
isa => 'Num', | |
is => 'rw' | |
); | |
has 'randSeedNum' => ( | |
isa => 'Int', | |
is => 'rw' | |
); | |
has 'description' => ( | |
isa => 'Str', | |
is => 'rw' | |
); | |
has 'seqName' => ( | |
isa => 'Str', | |
is => 'rw' | |
); | |
has 'seqLength' => ( | |
isa => 'Int', | |
is => 'rw' | |
); | |
has 'eof' => ( | |
isa => 'Int', | |
is => 'rw', | |
default => 0 | |
); | |
has 'program' => ( | |
isa => 'Str', | |
is => 'rw' | |
); | |
=head1 METHODS | |
=head2 addHMMSeq | |
Title : addHMMSeq | |
Usage : $hmmRes->addHMMSeq( $hmmSeqObj ) | |
Function : Adds a Bio::Pfam::HMM::HMMSequence object to the results object | |
Args : A Bio::Pfam::HMM::HMMSequence object | |
Returns : nothing | |
=cut | |
sub addHMMSeq { | |
my( $self, $hmmSeq ) = @_; | |
unless($hmmSeq->isa('Bio::Pfam::HMM::HMMSequence')){ | |
die 'Trying to add a non Bio::Pfam::HMM::HMMSequence object'; | |
} | |
if($self->seqs){ | |
if($self->seqs->{$hmmSeq->name}){ | |
die "Trying to add the same sequence twice"; | |
} | |
} | |
$self->seqs->{$hmmSeq->name} = $hmmSeq; | |
} | |
=head2 eachHMMSeq | |
Title : eachHMMSeq | |
Usage : my @seqs = $hmmRes->eachHMMSeq | |
Function : Returns an array reference containing the references to all of the | |
: Bio::Pfam::HMM::HMMSequence objects stored in the HMMResults object. | |
Args : None | |
Returns : Array reference | |
=cut | |
sub eachHMMSeq { | |
my ($self ) = @_; | |
my @seqs; | |
my $seqRefs = $self->seqs; | |
foreach my $n (keys %{ $seqRefs }){ | |
push(@seqs, $seqRefs->{$n}); | |
} | |
return(\@seqs); | |
} | |
#------------------------------------------------------------------------------- | |
=head2 addHMMUnit | |
Title : addHMMUnit | |
Usage : $hmmRes | |
Function : Adds HMM units (the actual region hit) to the HMMSequence in the object | |
: and for convenience to the the results sets. All we store are duplicates | |
: of the references. | |
Args : A Bio::Pfam::HMM:HMMUnit | |
Returns : Nothing | |
=cut | |
sub addHMMUnit { | |
my ($self, $hmmUnit) = @_; | |
unless($hmmUnit->isa('Bio::Pfam::HMM::HMMUnit')){ | |
die "Trying to add an non-Bio::Pfam::HMM::HMMUnit\n"; | |
} | |
if($self->seqs){ | |
if($self->seqs->{$hmmUnit->name}){ | |
$self->seqs->{$hmmUnit->name}->addHMMUnit($hmmUnit); | |
}else{ | |
warn "Could not add hmmUnit as the sequence has not been added\n"; | |
} | |
} | |
#More conveinence we store the point to the hmmunit in an array | |
push(@{$self->units},$hmmUnit); | |
} | |
#------------------------------------------------------------------------------- | |
=head2 domainBitsCutoffFromEvalue | |
Title : domainBitsCutoffFromEvalue | |
Usage : $hmmRes->domainBitsCutoffFromEvalue(0.01) | |
Function : From the supplied evalue, it scans through all of the evalues in the results | |
: and calulates the bits score. | |
Args : An evalue. | |
Returns : A bits score. If no evalue is specified, returns nothing | |
=cut | |
sub domainBitsCutoffFromEvalue { | |
my ($self, $eval) = @_; | |
my ($dom,$prev,@doms,$cutoff,$sep,$seen); | |
unless(defined ($eval) ){ | |
warn "No evalue specified\n"; | |
return; | |
} | |
$seen = 0; | |
foreach $_ ( sort { $b->bits <=> $a->bits } @{$self->units}, @{$self->eachHMMSeq} ) { | |
if( $_->evalue > $eval ) { | |
$seen = 1; | |
$dom = $_; | |
last; | |
} | |
$prev = $_; | |
} | |
if( ! defined $prev || $seen == 0) { | |
carp("Evalue is either above or below the list..."); | |
return undef; | |
} | |
$sep = $prev->bits - $dom->bits ; | |
if( $sep < 1 ) { | |
return $prev->bits(); | |
} | |
if( $dom->bits < 25 && $prev->bits > 25 ) { | |
return 25; | |
} | |
return $dom->bits + sprintf("%.1f",$sep/2); | |
} | |
#------------------------------------------------------------------------------- | |
=head2 lowestTrue | |
Title : | |
Usage : | |
Function : | |
Args : | |
Returns : | |
=cut | |
sub lowestTrue { | |
my $self = shift; | |
unless($self->domTC && $self->seqTC) { | |
unless($self->domThr and $self->seqThr){ | |
die "Could not define TC as I am missing a threshold\n"; | |
} | |
#Set it wildly high! | |
my ($lowSeq, $lowDom); | |
$lowSeq = $lowDom = 999999.99; | |
foreach my $seqId (keys %{$self->seqs} ){ | |
if($self->seqs->{$seqId}->bits >= $self->seqThr){ | |
#Is this the lowest sequence thresh | |
if($self->seqs->{$seqId}->bits < $lowSeq){ | |
$lowSeq = $self->seqs->{$seqId}->bits; | |
} | |
#For each of the regions found on the sequence, look to see if the match is great | |
#than the domain threshold. If it is, is it lower than we we have seen previously | |
foreach my $unit (@{ $self->seqs->{$seqId}->hmmUnits } ){ | |
if( $unit->bits() >= $self->domThr && $unit->bits() < $lowDom ) { | |
$lowDom = $unit->bits; | |
} | |
} | |
} | |
} | |
$self->domTC($lowDom); | |
$self->seqTC($lowSeq); | |
} | |
return($self->seqTC, $self->domTC); | |
} | |
#------------------------------------------------------------------------------- | |
=head2 highestNoise | |
Title : | |
Usage : | |
Function : | |
Args : | |
Returns : | |
=cut | |
sub highestNoise { | |
my $self = shift; | |
#See if it is already set | |
unless($self->domNC && $self->seqNC) { | |
unless($self->domThr and $self->seqThr){ | |
die "Could not define TC as I am missing a threshold\n"; | |
} | |
#Set it wildly low | |
my ($highSeq, $highDom); | |
$highSeq = $highDom = -999999.99; | |
foreach my $seqId (keys %{$self->seqs} ){ | |
if($self->seqs->{$seqId}->bits < $self->seqThr){ | |
#Is this the highest sequence thres below the cut-off | |
if($self->seqs->{$seqId}->bits > $highSeq){ | |
$highSeq = $self->seqs->{$seqId}->bits; | |
} | |
} | |
#For each of the regions found on the sequence, look to see if the match is great | |
#than the domain threshold. If it is, is it lower than we we have seen previously | |
foreach my $unit (@{ $self->seqs->{$seqId}->hmmUnits } ){ | |
if( $unit->bits < $self->domThr && $unit->bits > $highDom ) { | |
$highDom = $unit->bits; | |
} | |
} | |
} | |
$self->domNC($highDom); | |
$self->seqNC($highSeq); | |
} | |
return($self->seqNC, $self->domNC); | |
} | |
sub applyEdits { | |
my ($self, $edits, $removeBadEd) = @_; | |
my @validEd; #If removeBadEd flag is on, collect all the valid ED lines in this array and return at end of sub | |
foreach my $e (@$edits){ | |
#{ seq => $1, oldFrom => $2, oldTo => $3, newFrom => $5, newTo => $6 } | |
if($self->seqs->{$e->{seq}}){ | |
my $matched = 0; | |
foreach my $u (@{ $self->seqs->{ $e->{seq} }->hmmUnits }){ | |
if($u->envFrom == $e->{oldFrom} and $u->envTo == $e->{oldTo}) { | |
$matched = 1; #HMM unit found | |
if(defined $e->{newFrom} and $e->{newTo}){ | |
#Check co-ordinates of new start and end positions are in range | |
if( $e->{newFrom} < $u->{envFrom} or $e->{newTo} > $u->{envTo} or $e->{newFrom} > $e->{newTo}) { | |
if($removeBadEd) { | |
print "Removing ED line due to out of range co-ordinates: " . $e->{seq}."/".$e->{newFrom}."-".$e->{newTo}. "\n"; | |
} | |
else { | |
warn $e->{seq}."/".$e->{newFrom}."-".$e->{newTo}." contains out of range co-ordinates - bad ED line\n"; | |
} | |
last; | |
} | |
#Modify the start end positions | |
$u->envFrom($e->{newFrom}); | |
$u->envTo($e->{newTo}); | |
#Check that the ali-positions are still okay | |
if($u->seqFrom < $e->{newFrom}){ | |
$u->seqFrom($e->{newFrom}); | |
} | |
if($u->seqTo > $e->{newTo}){ | |
$u->seqTo($e->{newTo}); | |
} | |
}else{ | |
#Set the score so low it will never get in the align | |
$u->bits(-999999.99); | |
} | |
push(@validEd, $e) if($removeBadEd); | |
last; | |
} | |
} | |
unless($matched){ #HMM unit not found - bad ED | |
if($removeBadEd) { | |
print "Removing ED line for invalid hmm unit: " . $e->{seq}."/".$e->{oldFrom}."-".$e->{oldTo}. "\n"; | |
} | |
else { | |
warn $e->{seq}."/".$e->{oldFrom}."-".$e->{oldTo}." does not appear in the list of hmm units - bad ED line\n"; | |
} | |
} | |
}else{ #Sequence not found - bad ED | |
if($removeBadEd) { | |
print "Removing ED line for invalid hmm unit: " . $e->{seq}."/".$e->{oldFrom}."-".$e->{oldTo}. "\n"; | |
} | |
else { | |
warn $e->{seq}." does not appear in the list of hmm units - bad ED line\n"; | |
} | |
} | |
} | |
return(\@validEd) if($removeBadEd); | |
} | |
sub remove_overlaps_by_clan { | |
my ($self, $clanmap, $nested) = @_; | |
my $new = Bio::Pfam::HMM::HMMResults->new; | |
$new->seqName($self->seqName); | |
foreach my $unit ( sort { $a->evalue <=> $b->evalue } @{ $self->units } ) { | |
#check if it overlaps before adding | |
my $o; | |
foreach my $u ( @{ $new->units } ) { | |
if( exists($clanmap->{$unit->name}) and exists($clanmap->{$u->name}) and ($clanmap->{$unit->name} eq $clanmap->{$u->name}) ) { | |
if( overlap( $unit, $u ) ) { | |
if(exists($$nested{$unit->name}{$u->name})) { | |
next; | |
} | |
else { | |
$o=1; | |
last; | |
} | |
} | |
} | |
} | |
unless($o) { | |
if(! $new->seqs->{$unit->name}) { | |
$new->addHMMSeq( Bio::Pfam::HMM::HMMSequence->new( { name => $self->seqs->{$unit->name}->name, | |
desc => $self->seqs->{$unit->name}->desc, | |
bits => $self->seqs->{$unit->name}->bits, | |
evalue => $self->seqs->{$unit->name}->evalue, | |
numberHits => $self->seqs->{$unit->name}->numberHits}) ); | |
} | |
$new->addHMMUnit($unit); | |
} | |
} | |
return $new; | |
} | |
sub overlap { | |
# does unit1 overlap with unit2? | |
my $unit1 = shift; | |
my $unit2 = shift; | |
my( $u1, $u2 ) = sort { $a->seqFrom <=> $b->seqFrom } ( $unit1, $unit2 ); | |
if( $u2->seqFrom <= $u1->seqTo ) { | |
return 1; | |
} | |
return 0; | |
} | |
sub results { | |
my ( $self, $pfamScanData, $e_value ) = @_; | |
my @results = (); | |
foreach my $unit ( sort { $a->seqFrom <=> $b->seqFrom } @{ $self->units } ) { | |
my $pfamB = $unit->name =~ /^Pfam-B/; | |
#Filter results based on thresholds | |
if ( $unit->name =~ /^Pfam-B/ ) { | |
next unless ( $self->seqs->{$unit->name}->evalue <= 0.001 and $unit->evalue <= 0.001 ); | |
$pfamB = 1; | |
} | |
else { | |
if ( $e_value ) { | |
next unless ( $self->seqs->{$unit->name}->evalue <= $e_value and $unit->evalue <= $e_value ) ; | |
} | |
else { | |
next unless $unit->sig; | |
} | |
} | |
push @results, { | |
seq => { from => $unit->seqFrom, | |
to => $unit->seqTo, | |
name => $self->seqName }, | |
env => { from => $unit->envFrom, | |
to => $unit->envTo }, | |
hmm => { from => $unit->hmmFrom, | |
to => $unit->hmmTo }, | |
model_length => $pfamScanData->{_model_len}->{ $unit->name }, | |
bits => $unit->bits, | |
evalue => $unit->evalue, | |
acc => $pfamScanData->{_accmap}->{ $unit->name }, | |
name => $unit->name, | |
desc => $pfamScanData->{_desc}->{ $unit->name }, | |
type => $pfamB ? undef : $pfamScanData->{_type}->{ $unit->name }, | |
clan => $pfamB ? undef : | |
$pfamScanData->{_clanmap}->{ $unit->name } || 'No_clan', | |
act_site => $pfamB ? undef : $unit->{act_site}, | |
sig => $pfamB ? "NA" : $unit->sig, | |
align => [ sprintf( '#HMM %s', $unit->hmmalign->{hmm} ), | |
sprintf( '#MATCH %s', $unit->hmmalign->{match} ), | |
sprintf( '#PP %s', $unit->hmmalign->{pp} ), | |
sprintf( '#SEQ %s', $unit->hmmalign->{seq} ) ] | |
}; | |
} | |
return \@results; | |
} | |
1; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment