Created
August 9, 2010 00:07
-
-
Save wchristian/514713 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package Greenphyl::Load::Family; | |
use strict; | |
use warnings; | |
no warnings 'once'; | |
use Carp; | |
use Bio::Seq; | |
use Bio::SearchIO; | |
use Bio::SeqIO; | |
use lib "../../lib"; | |
use Greenphyl::Path; | |
use Greenphyl::SequenceFilter 'filter_text_id'; | |
umask 0000; # set the appropriate permissions for created files/folder to anydody as deamon gives only read permission to others | |
my ( $mysql, $dbh ); | |
sub new { | |
my ( $class, %p ) = @_; | |
my $self = {}; | |
$mysql = $p{mysql}; | |
$dbh = $p{dbh}; | |
bless( $self, $class ); | |
return $self; | |
} | |
#calculate the number of sequence in the fasta input file | |
## inefficient. could be done with a grep in 1 line. | |
## qx/ grep '>' $file | wc -l /; | |
sub nbSequence { | |
my ( $file ) = @_; | |
my $total_size = 0; | |
open( IN, $file ); | |
while ( <IN> ) { | |
$total_size++ if /^>/; | |
} | |
close( IN ); | |
return $total_size; | |
} | |
# $liste_family is a text file containing family name that were modified and created | |
# this list will be useful for the maintenance when running MEME and the phylo pipeline | |
# premier hit ignore car sequence contre elle-meme = meilleur hit | |
# second hit : -> soit match contre une famille -> si longueur bonne integration dans la famille | |
# -> sinon ORPHAN | |
# -> match avec ORPHAN -> longueur bonne = nouvelle famille et on regarde les hit suivant -> si longueur bonne integration dans la nouvelle famille | |
# -> autre arret de l'algo (longueur mauvaise ou plus de hit) | |
# -> longueur mauvaise = ORPHAN | |
# -> math avec rien -> ORPHAN | |
sub loadFamily { | |
my ( $self, $file, $count, $evalue, $database, $out_number, $file_out, $file_temp, $species_name, $restart, %family_rem ) = @_; | |
my $pass = new Greenphyl::Path( $species_name ); | |
my $cpt = 0; | |
my $cpt_new_family = 0; | |
my $cpt_modif_family = 0; | |
my $cpt_progress = 0; | |
my $nb_sequence = nbSequence( $file ); | |
my %liste_pourcentage; | |
open( LOG, ">> $Greenphyl::Path::data" ) | |
or die "cannot create the file $Greenphyl::Path::data $!"; | |
open( FAMILY, ">> $pass->{file_result}$Greenphyl::Path::liste_family " ) | |
|| die( "error not create the file $pass->{file_result}$Greenphyl::Path::liste_family\n" ); | |
open( NFAM, ">> $pass->{file_result}$Greenphyl::Path::new_family" ) | |
|| die( "error not create the file $pass->{file_result}$Greenphyl::Path::new_family\n" ); | |
open( AFF, ">> $Greenphyl::Path::data_path/$Greenphyl::Path::log" ) | |
|| die( "error not create the file $Greenphyl::Path::data_path/$Greenphyl::Path::log\n" ); | |
my $in = Bio::SeqIO->newFh( -file => $file, -format => 'fasta' ); | |
while ( <$in> ) { | |
# extract a single query sequence from the fasta file with the newly added sequences | |
my $id = filter_text_id( $_->display_id ); #query | |
my $seq = $_->seq; | |
my $fasta = $_; | |
my $cpt_hit = 0; #counter for the number of hit with blast | |
my $hit_while = 0; #to last the loop | |
$cpt_progress++; | |
if ( $cpt_progress > $restart ) { | |
print "\n----------------------------------------------\n"; | |
print AFF "Gene family modified: " . $cpt_progress . " / " . $nb_sequence . "\n"; | |
print $cpt_progress. " / " . $nb_sequence . " => $id \n"; | |
$cpt++; | |
##check if query is in the database | |
my $sql = "SELECT seq_id FROM sequences WHERE seq_textid = '$id'"; | |
my @rep = $mysql->select( $sql ); | |
unless ( $rep[0] ) { | |
print "ERROR : sequence is not in the db!<br />"; | |
last; | |
} | |
my $seq = $_->seq; | |
#++++ should be in run::blast | |
##creation input fasta file as input for blast software | |
$file_temp = clipboard( $seq, $file_temp ); | |
system( "chmod 777 $file_temp" ); | |
$count++; | |
# run blast to compare it to blast database containing other sequences across all species | |
my $cmd = "$Greenphyl::Config::BLASTALL -i $file_temp -e $evalue -p blastp -F none -d $database -K $out_number -v $out_number -b $out_number -o $file_out"; | |
system( $cmd ); | |
my $found = 0; | |
if ( -e $file_out ) # if blast output exist | |
{ | |
# load blast results | |
my $in = new Bio::SearchIO( | |
-format => 'blast', | |
-file => "$file_out" | |
); | |
system( "chmod 666 $file_out" ); | |
#++++ | |
# collect all sequences that are similar to this one from the blast result | |
# and see if the query sequence can be added to a family or combined with another sequence to form a new family | |
# making sure that it is not accidentally combined with itself | |
while ( my $result = $in->next_result and $hit_while == 0 ) { | |
my $level1 = 0; | |
my $cpt = 0; | |
while ( my $hit = $result->next_hit | |
and $hit_while == 0 ) #sortir ici | |
{ | |
my $seq_textid = filter_text_id( $hit->name ); | |
$cpt_hit++; | |
if ( $seq_textid ne $id ) # ignore first hit | |
{ | |
$found++; | |
$seq_textid = ucfirst( lc( $seq_textid ) ); | |
my $sql = "SELECT S.seq_textid, F.family_name, F.family_id, FO.class_id | |
FROM sequences S, seq_is_in SI, family F, found_in FO | |
WHERE S.seq_textid =\"$seq_textid\" and S.seq_id = SI.seq_id and SI.family_id = F.family_id and F.family_id= FO.family_id"; | |
my @rep7 = $mysql->select( $sql ); | |
my $count_verif; #pour verifier la taille de la sequence | |
# hit found in an existing family | |
if ( @rep7 ) { | |
print "Find family for $seq_textid \t"; | |
my $family_id = $rep7[0][2]; #retrieve family id at level 1 | |
$count_verif = &count( $family_id, $id ); #verifier si ce sont les bons parametres | |
if ( $count_verif == 1 ) # length ok | |
{ | |
## retrieve family_id at level 1 down to 4 (if exists) | |
for my $i ( 0 .. $#rep7 ) { | |
my $family_id = $rep7[$i][2]; | |
if ( $found == 1 ) { | |
#retrieve seq_id for the sequence to be inserted in seq_is_in | |
my $sql = "SELECT seq_id FROM sequences WHERE seq_textid = '" . $id . "'"; | |
my @rep = $mysql->select( $sql ); | |
my $seq_id = $rep[0]; | |
## check if the sequence is not already classified | |
my $sql1 = "SELECT SI.family_id | |
FROM seq_is_in SI, found_in F | |
WHERE SI.seq_id = '" | |
. $seq_id . "' and F.family_id = SI.family_id and F.class_id = '" . $rep7[$i][3] . "'"; | |
my @rep2 = $mysql->select( $sql1 ); | |
##if classified | |
if ( $rep2[0] ) { | |
#Deja classifiee au niveau $rep7[$i][3] !!<br />\n"; | |
last; | |
} | |
#classify second hit | |
else { | |
print FAMILY "$family_id\n"; | |
print LOG "Insertion of the sequence $id in the $family_id family\n"; | |
%liste_pourcentage = &dataFamily( $family_id, %liste_pourcentage ); | |
my $sql2 = "INSERT INTO seq_is_in (seq_id, family_id) VALUES(\"$seq_id\",\"$family_id\")"; | |
$mysql->insert( $sql2 ); | |
$cpt_modif_family++; | |
} | |
} | |
} | |
} | |
else #length not ok : query = orphan | |
{ | |
$hit_while = 1; #last the loop | |
next; | |
} | |
} | |
# hit found est un orphan | |
else { | |
##recup seq_id de query | |
my $sql = "SELECT seq_id FROM sequences WHERE seq_textid = '$id'"; | |
my @rep = $mysql->select( $sql ); | |
my $seq_id = $rep[0]; | |
## verifie que query deja pas classifié dans la base avec 1er orphan de la sortie du blast | |
my $sql2 = "SELECT family_id FROM seq_is_in WHERE seq_id = '$seq_id'"; | |
my @rep2 = $mysql->select( $sql2 ); | |
my $family_query = $rep2[0]; | |
#pour premier orphan trouve cree une famille, voir else | |
##deja classe:va voir si autre sorties de blast sont des orphans et si oui les classe dans meme famille | |
if ( $family_query ) { | |
##recup id des autres sortie du blast | |
my $sql = "SELECT seq_id FROM sequences WHERE seq_textid = '$seq_textid'"; | |
my @rep = $mysql->select( $sql ); | |
my $seq_id = $rep[0]; | |
if ( $seq_id ) { | |
## verifie les autres sortie blast | |
my $sql = "SELECT family_id FROM seq_is_in WHERE seq_id = '$seq_id'"; | |
my @rep = $mysql->select( $sql ); | |
my $family_id2 = $rep[0]; | |
if ( $family_id2 ) { | |
$hit_while = 1; #on va pas regarder les autres hits, on stop la boucle ici. | |
} | |
else #sortie est un orphan | |
{ | |
$count_verif = &count( $family_query, $id ); #test la taille de la sequence | |
if ( $count_verif == 1 ) #le test de taille est concluant | |
{ | |
%liste_pourcentage = &dataFamily( $family_query, %liste_pourcentage ); | |
my $sql2 = "INSERT INTO seq_is_in (seq_id, family_id) VALUES(\"$seq_id\",\"$family_query\")"; | |
$mysql->insert( $sql2 ); | |
$dbh->do( "TRUNCATE go_family_cache;" ); | |
print FAMILY "$family_query\n"; | |
$cpt_new_family++; | |
} | |
} | |
} | |
} | |
#pas de famille cree : premier passage dans la boucle : creation d'une nouvelle famille avec l'orphan | |
#match avec un orphan | |
else { | |
print "new_family with $seq_textid\t"; | |
###compare sequence length between the query and the hit | |
my $sql00 = "SELECT seq_length FROM sequences WHERE seq_textid = \"$id\""; #query | |
my $sql01 = "SELECT seq_length FROM sequences WHERE seq_textid = \"$seq_textid\""; #hit | |
my @rep00 = $mysql->select( $sql00 ); | |
my @rep01 = $mysql->select( $sql01 ); | |
# should be set at the begining of the script | |
my $soixante = ( $rep01[0] * 60 ) / 100; | |
my $centquarante = ( $rep01[0] * 140 ) / 100; | |
# TODO: this size comparison is pretty broken and will need to be replaced once cleaning is done ( minimum to value is wider range than max to value ) | |
if ( ( $rep00[0] < $centquarante ) | |
and ( $rep00[0] > $soixante ) ) #if length control is ok | |
{ | |
# va chercher max family_number pour creer nouvelle famille | |
my $sql1 = 'SELECT MAX(family_number) FROM family'; | |
my @rep = $mysql->select( $sql1 ); | |
my $old_number = $rep[0] || 0; | |
my $family_number = $old_number + 1; | |
## create a new family in the db table family | |
my $new = "New created family by $species_name"; | |
my $sql = "INSERT INTO family (family_name, family_number, inference) VALUES(\"$new\", \"$family_number\", '')"; | |
$mysql->insert( $sql ); | |
$dbh->do( "TRUNCATE go_family_cache;" ); | |
## recup le family_id creer | |
my $sql3 = "SELECT family_id FROM family WHERE family_number LIKE \"$family_number\""; | |
my @rep1 = $mysql->select( $sql3 ); | |
my $family_id = $rep1[0]; | |
## new family is classify at level 1 | |
my $nb = 1; | |
my $sql4 = "INSERT INTO found_in (family_id, class_id) values(\"$family_id\", \"$nb\")"; | |
$mysql->insert( $sql4 ); | |
$dbh->do( "TRUNCATE go_family_cache;" ); | |
print "New family level 1: Family_id: $family_id\n"; | |
## 2 sequences in the new family | |
##Query | |
my $sql7 = "SELECT seq_id FROM sequences WHERE seq_textid = '" . $seq_textid . "'"; | |
my @rep3 = $mysql->select( $sql7 ); | |
my $seq_id = $rep3[0]; | |
my $sql2 = "INSERT INTO seq_is_in (seq_id, family_id) values(\"$seq_id\",\"$family_id\")"; | |
$mysql->insert( $sql2 ); | |
$dbh->do( "TRUNCATE go_family_cache;" ); | |
##Ancien orphan | |
my $sql6 = "SELECT seq_id FROM sequences WHERE seq_textid = '" . $id . "'"; | |
my @rep2 = $mysql->select( $sql6 ); | |
my $seq_id2 = $rep2[0]; | |
my $sql5 = "INSERT INTO seq_is_in (seq_id, family_id) values(\"$seq_id2\",\"$family_id\")"; | |
$mysql->insert( $sql5 ); | |
$dbh->do( "TRUNCATE go_family_cache;" ); | |
%liste_pourcentage = &dataFamily( $family_id, %liste_pourcentage ); | |
print FAMILY "$family_id\n"; | |
print NFAM "$family_id\n"; | |
} | |
else { #taille est incorrecte = orphan | |
print " too small ORPHAN \n"; | |
$hit_while = 1; | |
} | |
} | |
} | |
} #end control if hit diff query | |
} # end boucle sur list HIT | |
} | |
} | |
} | |
} | |
$dbh->do( "call countFamilySeqBySpecies()" ) if $ENV{TESTING}; # TODO: this needs to be removed and count_family_seq updated properly! | |
print LOG "number of families amended : $cpt_modif_family \n"; | |
print LOG "number of families created : $cpt_new_family \n"; | |
close FAMILY; | |
close AFF; | |
&pourcentage( $species_name, \%liste_pourcentage, \%family_rem ); | |
return $cpt_new_family, $cpt_modif_family; | |
} | |
#track the number of sequence added to the family | |
#by tracking the number at the beginning and added, we can calculate the number of sequence removed | |
#allow to calculate the percentage of modification | |
sub dataFamily { | |
my ( $family_id, %liste_pourcentage ) = @_; | |
if ( keys( %liste_pourcentage ) ) { | |
if ( exists $liste_pourcentage{$family_id} ) #family was modified at least once | |
{ | |
$liste_pourcentage{$family_id} = $liste_pourcentage{$family_id} + 1; | |
} | |
else { | |
my $sql = "SELECT seq_id FROM seq_is_in WHERE family_id = $family_id"; | |
my @rep = $mysql->select( $sql ); | |
my $old_size = scalar( @rep ); | |
$liste_pourcentage{$family_id} = 1; | |
} | |
} | |
else { | |
my $sql = "SELECT seq_id FROM seq_is_in WHERE family_id = $family_id"; | |
my @rep = $mysql->select( $sql ); | |
$liste_pourcentage{$family_id} = 1; | |
} | |
return %liste_pourcentage; | |
} | |
#track the number of changes in a family (number of new and/or removed sequences) | |
#calculate the percentage of modification for a family | |
# %liste_family contains id of each famiily with new sequences members : key = family_id value = number of sequences added | |
# %family contains id of each famiily where sequences were removed | |
sub pourcentage (\%) #BAD way to specify paramter type | |
{ | |
my ( $species_name, $ref_liste_pourcentage, $ref_family_rem ) = @_; | |
my %liste_pourcentage = %$ref_liste_pourcentage; | |
my %family_rem = %$ref_family_rem; | |
my $pass = new Greenphyl::Path( $species_name ); | |
open( POURCENTAGE, "> $pass->{file_result}$Greenphyl::Path::pourcentage " ) | |
|| die( "error not create the file $pass->{file_result}$Greenphyl::Path::liste_family\n" ); | |
for my $key ( keys %liste_pourcentage ) #regarde toutes les familles modifiees avec des ADD | |
{ | |
my $sql = "SELECT seq_id FROM seq_is_in WHERE family_id = \"$key\""; | |
my @rep = $mysql->select( $sql ); | |
my $new_size = scalar( @rep ); #family size | |
if ( exists $family_rem{$key} ) #famille a ete modifiee en + et -, on a une valeur de cle commune a %liste_family et %family_rem | |
{ | |
my $old_size = $new_size + $liste_pourcentage{$key} - $family_rem{$key}; #ancienne valeur de la taille | |
my $pourcentage = ( ( $liste_pourcentage{$key} + $family_rem{$key} ) / $old_size ) * 100; | |
print POURCENTAGE "Family_id => " . $key . "\ttaille origine =>" . $old_size . "\t taille actuelle => " . $new_size . "\t Number of add= > " . $liste_pourcentage{$key} . "\tnumber of remove => " . $family_rem{$key} . "\t pourcentage of modification => " . $pourcentage . "\n"; | |
delete( $family_rem{$key} ); #efface de la table des familles - cette entree car deja traitee | |
} | |
else #famille a ete modifiee en + seulement | |
{ | |
my $old_size = $new_size - $liste_pourcentage{$key}; #ancienne taille = taille actuelle - ce qui a ete ajoute | |
my $pourcentage = ( $liste_pourcentage{$key} / $old_size ) * 100; | |
print POURCENTAGE "Family_id => " . $key . "\ttaille origine =>" . $old_size . "\t taille actuelle = > " . $new_size . "\t Number of add => " . $liste_pourcentage{$key} . "\tnumber of remove = /\t pourcentage of modification => " . $pourcentage . "\n"; | |
} | |
} | |
for my $key ( keys %family_rem ) # que ce qui a ete modifie en - (car on a enleve ce qui etait commun avec %liste_family | |
{ | |
my $sql = "SELECT seq_id FROM seq_is_in WHERE family_id = $key"; | |
my @rep = $mysql->select( $sql ); | |
my $new_size = scalar( @rep ); #taille de la famille | |
my $old_size = $new_size + $family_rem{$key}; | |
my $pourcentage = ( $family_rem{$key} / $old_size ) * 100; | |
print POURCENTAGE "Family_id => " . $key . "\ttaille origine => " . $old_size . "\t taille actuelle => " . $new_size . "\t Number of add => /\tnumber of remove => " . $family_rem{$key} . "\t pourcentage of modification => " . $pourcentage . "\n"; | |
} | |
close POURCENTAGE; | |
} | |
# if new_size = 0 obsolete family was probably removed from the db! | |
# function duplicated in many scripts, i don't know how to get rid of that function; TODO | |
sub clipboard { | |
my ( $clipboard, $file_temp ) = @_; | |
my @donnees = ( split( /\n/, $clipboard ) ); | |
my ( @sequence, $name, $seq ); | |
$name ||= ''; | |
open( OUT, ">$file_temp" ) or die "cannot open file : $!"; | |
for my $i ( 0 .. $#donnees ) { | |
if ( substr( $donnees[$i], 0, 1 ) eq ">" ) { $name = "$donnees[$i]"; } | |
else { | |
$seq .= $donnees[$i]; | |
} | |
print OUT $name . $seq . "\n"; | |
$name = ""; | |
$seq = ""; | |
} | |
close OUT; | |
return $file_temp; | |
} | |
=pod | |
=head2 count | |
B<Description>: perform control length on the sequences. sequences are inserted in the classification if the pass the test. | |
B<ArgsCount>: 2 | |
=over 4 | |
=item $seq_textid: (string) | |
sequence name of the query | |
=item $family_id: (int) | |
family identifier in which the query sequence may be classified | |
=back | |
=cut | |
sub count { | |
my ( $family_id, $seq_textid ) = @_; | |
my $cpt = 0; | |
my $somme_length = 0; | |
my %result; | |
my $sql = "SELECT seq_length FROM sequences WHERE seq_textid = \"$seq_textid\""; | |
my @rep = $mysql->select( $sql ); | |
if ( @rep ) { | |
my $seq_length = $rep[0]; | |
#select les sequences qui appartiennent a la meme famille que la seq que l'on veut etudier (pour le niveau 1 uniquement) | |
my $sql = "SELECT S.seq_textid, S.seq_length FROM sequences S, seq_is_in SI WHERE S.seq_id = SI.seq_id and SI.family_id = $family_id"; | |
my @rep = $mysql->select( $sql ); | |
if ( @rep ) #si trouve des sequences (si pas trouve alors que toutes les sequences de ALL sont dans la base = probleme !!!!!!) | |
{ | |
for my $i ( 0 .. $#rep ) { | |
$cpt++; | |
$result{$cpt}{'length'} = $rep[$i][1]; #recupere pour chaque sequence de la famille dans une hashatable, la taille de chaque sequence | |
} | |
for my $test ( keys %result ) { | |
$somme_length += $result{$test}{'length'}; #fait la somme des longueurs des sequences pour la famille | |
} | |
my $moy_length = $somme_length / $cpt; #fait la moyenne | |
# should be more visible for configuration and should work with median | |
my $soixante = ( $moy_length * 60 ) / 100; | |
my $centquarante = ( $moy_length * 140 ) / 100; | |
if ( $seq_length < $soixante ) { | |
print "Too small sequence $seq_textid !!! $seq_length < $soixante \n"; | |
return 0; # orphan, too small, sequence not integrated | |
} | |
if ( $seq_length > $centquarante ) { | |
print "Too large sequence $seq_textid !!! $seq_length > $centquarante \n"; | |
return 0; #mauvais c'est un orphan, il est trop grand, on integre pas cette sequence. | |
} | |
else { | |
print "OK min $soixante max $centquarante for $seq_length\n"; | |
return 1; #bon, on integre cette sequence. | |
} | |
} | |
else { | |
print "ERROR: the sequence is not in the Database ! \n"; | |
exit; | |
} | |
} | |
else { | |
print "cannot find length for $seq_textid , maybe the sequence is not in the database\n"; | |
} | |
} | |
1; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment