wchristian · August 9, 2010 00:07
diff --git a/gistfile1.pl b/gistfile1.pl
 package Greenphyl::Load::Family;

 use strict;
 use warnings;
 no warnings 'once';
 use Carp;
 use Bio::Seq;
 use Bio::SearchIO;
 use Bio::SeqIO;
 use lib "../../lib";
 use Greenphyl::Path;
 use Greenphyl::SequenceFilter 'filter_text_id';

 umask 0000;    # set the appropriate permissions for created files/folder to anydody as deamon gives only read permission to others

 my ( $mysql, $dbh );

 sub new {
    my ( $class, %p ) = @_;
    my $self = {};

    $mysql = $p{mysql};
    $dbh   = $p{dbh};

    bless( $self, $class );
    return $self;
 }

 #calculate the number of sequence in the fasta input file
 ## inefficient. could be done with a grep in 1 line.
 ## qx/ grep '>' $file | wc -l /;

 sub nbSequence {
    my ( $file ) = @_;
    my $total_size = 0;
    open( IN, $file );

    while ( <IN> ) {
        $total_size++ if /^>/;
    }

    close( IN );
    return $total_size;
 }

 # $liste_family is a text file containing family name that were modified and created
 # this list will be useful for the maintenance when running MEME and the phylo pipeline

 # premier hit ignore car sequence contre elle-meme = meilleur hit
 # second hit : -> soit match contre une famille -> si longueur bonne integration dans la famille
 #                                              -> sinon ORPHAN
 #              -> match avec ORPHAN -> longueur bonne = nouvelle famille et on regarde les hit suivant -> si longueur bonne integration dans la nouvelle famille
 #                                                                                                      -> autre arret de l'algo (longueur mauvaise ou plus de hit)
 #                                   -> longueur mauvaise = ORPHAN
 #              -> math avec rien    -> ORPHAN

 sub loadFamily {
    my ( $self, $file, $count, $evalue, $database, $out_number, $file_out, $file_temp, $species_name, $restart, %family_rem ) = @_;

    my $pass             = new Greenphyl::Path( $species_name );
    my $cpt              = 0;
    my $cpt_new_family   = 0;
    my $cpt_modif_family = 0;
    my $cpt_progress     = 0;
    my $nb_sequence      = nbSequence( $file );

    my %liste_pourcentage;

    open( LOG, ">> $Greenphyl::Path::data" )
      or die "cannot create the file $Greenphyl::Path::data $!";
    open( FAMILY, ">> $pass->{file_result}$Greenphyl::Path::liste_family " )
      || die( "error not create the file $pass->{file_result}$Greenphyl::Path::liste_family\n" );
    open( NFAM, ">> $pass->{file_result}$Greenphyl::Path::new_family" )
      || die( "error not create the file $pass->{file_result}$Greenphyl::Path::new_family\n" );
    open( AFF, ">> $Greenphyl::Path::data_path/$Greenphyl::Path::log" )
      || die( "error not create the file $Greenphyl::Path::data_path/$Greenphyl::Path::log\n" );
    my $in = Bio::SeqIO->newFh( -file => $file, -format => 'fasta' );

    while ( <$in> ) {

        # extract a single query sequence from the fasta file with the newly added sequences

        my $id        = filter_text_id( $_->display_id );    #query
        my $seq       = $_->seq;
        my $fasta     = $_;
        my $cpt_hit   = 0;                                   #counter for the number of hit with blast
        my $hit_while = 0;                                   #to last the loop

        $cpt_progress++;

        if ( $cpt_progress > $restart ) {
            print "\n----------------------------------------------\n";
            print AFF "Gene family modified: " . $cpt_progress . " / " . $nb_sequence . "\n";
            print $cpt_progress. " / " . $nb_sequence . " => $id \n";

            $cpt++;
            ##check if query is in the database
            my $sql = "SELECT seq_id FROM sequences WHERE seq_textid = '$id'";
            my @rep = $mysql->select( $sql );

            unless ( $rep[0] ) {
                print "ERROR : sequence is not in the db!<br />";
                last;
            }

            my $seq = $_->seq;

            #++++ should be in run::blast
            ##creation input fasta file as input for blast software
            $file_temp = clipboard( $seq, $file_temp );
            system( "chmod 777 $file_temp" );
            $count++;

            # run blast to compare it to blast database containing other sequences across all species
            my $cmd = "$Greenphyl::Config::BLASTALL -i $file_temp -e $evalue -p blastp -F none -d $database -K $out_number -v $out_number -b $out_number -o $file_out";
            system( $cmd );

            my $found = 0;
            if ( -e $file_out )    # if blast output exist
            {

                # load blast results
                my $in = new Bio::SearchIO(
                    -format => 'blast',
                    -file   => "$file_out"
                );
                system( "chmod 666 $file_out" );

                #++++

                # collect all sequences that are similar to this one from the blast result
                # and see if the query sequence can be added to a family or combined with another sequence to form a new family
                # making sure that it is not accidentally combined with itself
                while ( my $result = $in->next_result and $hit_while == 0 ) {
                    my $level1 = 0;
                    my $cpt    = 0;

                    while ( my $hit = $result->next_hit
                        and $hit_while == 0 )    #sortir ici
                    {
                        my $seq_textid = filter_text_id( $hit->name );
                        $cpt_hit++;

                        if ( $seq_textid ne $id )    # ignore first hit
                        {
                            $found++;

                            $seq_textid = ucfirst( lc( $seq_textid ) );
                            my $sql = "SELECT S.seq_textid, F.family_name, F.family_id, FO.class_id 
                                                   FROM sequences S, seq_is_in SI, family F, found_in FO  
                                                   WHERE S.seq_textid =\"$seq_textid\" and S.seq_id = SI.seq_id and SI.family_id = F.family_id and F.family_id= FO.family_id";
                            my @rep7 = $mysql->select( $sql );
                            my $count_verif;         #pour verifier la taille de la sequence

                            # hit found in an existing family
                            if ( @rep7 ) {
                                print "Find family for $seq_textid \t";
                                my $family_id = $rep7[0][2];    #retrieve family id at level 1
                                $count_verif = &count( $family_id, $id );    #verifier si ce sont les bons parametres

                                if ( $count_verif == 1 )                     # length ok
                                {
                                    ## retrieve family_id at level 1 down to 4 (if exists)
                                    for my $i ( 0 .. $#rep7 ) {
                                        my $family_id = $rep7[$i][2];

                                        if ( $found == 1 ) {

                                            #retrieve seq_id for the sequence to be inserted in seq_is_in
                                            my $sql    = "SELECT seq_id FROM sequences WHERE seq_textid = '" . $id . "'";
                                            my @rep    = $mysql->select( $sql );
                                            my $seq_id = $rep[0];

                                            ## check if the sequence is not already classified
                                            my $sql1 = "SELECT SI.family_id 
                                                                   FROM seq_is_in SI, found_in F 
                                                                    WHERE SI.seq_id = '"
                                              . $seq_id . "' and F.family_id = SI.family_id and F.class_id = '" . $rep7[$i][3] . "'";
                                            my @rep2 = $mysql->select( $sql1 );

                                            ##if classified
                                            if ( $rep2[0] ) {

                                                #Deja classifiee au niveau $rep7[$i][3] !!<br />\n";
                                                last;
                                            }

                                            #classify second hit
                                            else {
                                                print FAMILY "$family_id\n";
                                                print LOG "Insertion of the sequence $id in the $family_id family\n";
                                                %liste_pourcentage = &dataFamily( $family_id, %liste_pourcentage );
                                                my $sql2 = "INSERT INTO seq_is_in (seq_id, family_id) VALUES(\"$seq_id\",\"$family_id\")";
                                                $mysql->insert( $sql2 );
                                                $cpt_modif_family++;
                                            }
                                        }
                                    }
                                }
                                else    #length not ok : query = orphan
                                {
                                    $hit_while = 1;    #last the loop
                                    next;
                                }
                            }

                            # hit found est un orphan
                            else {
                                ##recup seq_id de query
                                my $sql    = "SELECT seq_id FROM sequences WHERE seq_textid = '$id'";
                                my @rep    = $mysql->select( $sql );
                                my $seq_id = $rep[0];

                                ## verifie que query deja pas classifié dans la base avec 1er orphan de la sortie du blast
                                my $sql2         = "SELECT family_id FROM seq_is_in WHERE seq_id = '$seq_id'";
                                my @rep2         = $mysql->select( $sql2 );
                                my $family_query = $rep2[0];

                                #pour premier orphan trouve cree une famille, voir else

                                ##deja classe:va voir si autre sorties de blast sont des orphans et si oui les classe dans meme famille
                                if ( $family_query ) {
                                    ##recup id des autres sortie du blast
                                    my $sql    = "SELECT seq_id FROM sequences WHERE seq_textid = '$seq_textid'";
                                    my @rep    = $mysql->select( $sql );
                                    my $seq_id = $rep[0];

                                    if ( $seq_id ) {
                                        ## verifie les autres sortie blast
                                        my $sql        = "SELECT family_id FROM seq_is_in WHERE seq_id = '$seq_id'";
                                        my @rep        = $mysql->select( $sql );
                                        my $family_id2 = $rep[0];

                                        if ( $family_id2 ) {
                                            $hit_while = 1;    #on va pas regarder les autres hits, on stop la boucle ici.
                                        }
                                        else                   #sortie est un orphan
                                        {
                                            $count_verif = &count( $family_query, $id );    #test la taille de la sequence

                                            if ( $count_verif == 1 )                        #le test de taille est concluant
                                            {
                                                %liste_pourcentage = &dataFamily( $family_query, %liste_pourcentage );
                                                my $sql2 = "INSERT INTO seq_is_in (seq_id, family_id) VALUES(\"$seq_id\",\"$family_query\")";
                                                $mysql->insert( $sql2 );
                                                $dbh->do( "TRUNCATE go_family_cache;" );
                                                print FAMILY "$family_query\n";
                                                $cpt_new_family++;
                                            }
                                        }
                                    }
                                }

                                #pas de famille cree : premier passage dans la boucle : creation d'une nouvelle famille avec l'orphan
                                #match avec un orphan
                                else {
                                    print "new_family  with $seq_textid\t";

                                    ###compare sequence length between the query and the hit
                                    my $sql00 = "SELECT seq_length FROM sequences WHERE seq_textid = \"$id\"";            #query
                                    my $sql01 = "SELECT seq_length FROM sequences WHERE seq_textid = \"$seq_textid\"";    #hit
                                    my @rep00 = $mysql->select( $sql00 );
                                    my @rep01 = $mysql->select( $sql01 );

                                    # should be set at the begining of the script
                                    my $soixante     = ( $rep01[0] * 60 ) / 100;
                                    my $centquarante = ( $rep01[0] * 140 ) / 100;

                                    # TODO: this size comparison is pretty broken and will need to be replaced once cleaning is done ( minimum to value is wider range than max to value )
                                    if (    ( $rep00[0] < $centquarante )
                                        and ( $rep00[0] > $soixante ) )                                                   #if length control is ok
                                    {

                                        # va chercher max family_number pour creer nouvelle famille
                                        my $sql1          = 'SELECT MAX(family_number) FROM family';
                                        my @rep           = $mysql->select( $sql1 );
                                        my $old_number    = $rep[0] || 0;
                                        my $family_number = $old_number + 1;

                                        ## create a new family in the db table family
                                        my $new = "New created family by $species_name";

                                        my $sql = "INSERT INTO family (family_name, family_number, inference) VALUES(\"$new\", \"$family_number\", '')";

                                        $mysql->insert( $sql );
                                        $dbh->do( "TRUNCATE go_family_cache;" );

                                        ## recup le family_id creer

                                        my $sql3 = "SELECT family_id FROM family WHERE family_number LIKE \"$family_number\"";

                                        my @rep1      = $mysql->select( $sql3 );
                                        my $family_id = $rep1[0];

                                        ## new family is classify at level 1
                                        my $nb   = 1;
                                        my $sql4 = "INSERT INTO found_in (family_id, class_id) values(\"$family_id\", \"$nb\")";

                                        $mysql->insert( $sql4 );
                                        $dbh->do( "TRUNCATE go_family_cache;" );

                                        print "New family  level 1: Family_id: $family_id\n";

                                        ##  2 sequences in the new family

                                        ##Query
                                        my $sql7   = "SELECT seq_id FROM sequences WHERE seq_textid = '" . $seq_textid . "'";
                                        my @rep3   = $mysql->select( $sql7 );
                                        my $seq_id = $rep3[0];
                                        my $sql2   = "INSERT INTO seq_is_in (seq_id, family_id) values(\"$seq_id\",\"$family_id\")";
                                        $mysql->insert( $sql2 );
                                        $dbh->do( "TRUNCATE go_family_cache;" );

                                        ##Ancien orphan
                                        my $sql6    = "SELECT seq_id FROM sequences WHERE seq_textid = '" . $id . "'";
                                        my @rep2    = $mysql->select( $sql6 );
                                        my $seq_id2 = $rep2[0];
                                        my $sql5    = "INSERT INTO seq_is_in (seq_id, family_id) values(\"$seq_id2\",\"$family_id\")";
                                        $mysql->insert( $sql5 );
                                        $dbh->do( "TRUNCATE go_family_cache;" );

                                        %liste_pourcentage = &dataFamily( $family_id, %liste_pourcentage );
                                        print FAMILY "$family_id\n";
                                        print NFAM "$family_id\n";

                                    }
                                    else {    #taille est incorrecte = orphan
                                        print " too small  ORPHAN \n";
                                        $hit_while = 1;
                                    }
                                }
                            }
                        }    #end control if hit diff query
                    }    # end boucle sur list HIT
                }
            }
        }
    }

    $dbh->do( "call countFamilySeqBySpecies()" ) if $ENV{TESTING};    # TODO: this needs to be removed and count_family_seq updated properly!

    print LOG "number of families amended : $cpt_modif_family \n";
    print LOG "number of families created : $cpt_new_family \n";

    close FAMILY;
    close AFF;

    &pourcentage( $species_name, \%liste_pourcentage, \%family_rem );
    return $cpt_new_family, $cpt_modif_family;
 }

 #track the number of sequence added to the family
 #by tracking the number at the beginning and added, we can calculate the number of sequence removed
 #allow to calculate the percentage of modification
 sub dataFamily {
    my ( $family_id, %liste_pourcentage ) = @_;

    if ( keys( %liste_pourcentage ) ) {

        if ( exists $liste_pourcentage{$family_id} )    #family was modified at least once
        {
            $liste_pourcentage{$family_id} = $liste_pourcentage{$family_id} + 1;
        }
        else {
            my $sql      = "SELECT seq_id FROM seq_is_in WHERE family_id = $family_id";
            my @rep      = $mysql->select( $sql );
            my $old_size = scalar( @rep );
            $liste_pourcentage{$family_id} = 1;
        }
    }

    else {
        my $sql = "SELECT seq_id FROM seq_is_in WHERE family_id = $family_id";
        my @rep = $mysql->select( $sql );
        $liste_pourcentage{$family_id} = 1;
    }

    return %liste_pourcentage;
 }

 #track the number of changes in a family (number of new and/or removed sequences)
 #calculate the percentage of modification for a family
 # %liste_family contains id of each famiily with new sequences members : key = family_id value = number of sequences added
 # %family contains id of each famiily where sequences were removed

 sub pourcentage (\%)    #BAD way to specify paramter type
 {
    my ( $species_name, $ref_liste_pourcentage, $ref_family_rem ) = @_;
    my %liste_pourcentage = %$ref_liste_pourcentage;
    my %family_rem        = %$ref_family_rem;

    my $pass = new Greenphyl::Path( $species_name );

    open( POURCENTAGE, "> $pass->{file_result}$Greenphyl::Path::pourcentage " )
      || die( "error not create the file $pass->{file_result}$Greenphyl::Path::liste_family\n" );
    for my $key ( keys %liste_pourcentage )    #regarde toutes les familles modifiees avec des ADD
    {
        my $sql      = "SELECT seq_id FROM seq_is_in WHERE family_id = \"$key\"";
        my @rep      = $mysql->select( $sql );
        my $new_size = scalar( @rep );                                              #family size

        if ( exists $family_rem{$key} )                                             #famille a ete modifiee en + et -, on a une valeur de cle commune a %liste_family et %family_rem
        {
            my $old_size = $new_size + $liste_pourcentage{$key} - $family_rem{$key};                    #ancienne valeur de la taille
            my $pourcentage = ( ( $liste_pourcentage{$key} + $family_rem{$key} ) / $old_size ) * 100;
            print POURCENTAGE "Family_id => " . $key . "\ttaille origine =>" . $old_size . "\t taille actuelle => " . $new_size . "\t Number of add= > " . $liste_pourcentage{$key} . "\tnumber of remove => " . $family_rem{$key} . "\t pourcentage of modification => " . $pourcentage . "\n";
            delete( $family_rem{$key} );                                                                #efface de la table des familles - cette entree car deja traitee
        }

        else                                                                                            #famille a ete modifiee en + seulement
        {
            my $old_size    = $new_size - $liste_pourcentage{$key};                                     #ancienne taille = taille actuelle - ce qui a ete ajoute
            my $pourcentage = ( $liste_pourcentage{$key} / $old_size ) * 100;
            print POURCENTAGE "Family_id => " . $key . "\ttaille origine =>" . $old_size . "\t taille actuelle = > " . $new_size . "\t Number of add => " . $liste_pourcentage{$key} . "\tnumber of remove = /\t pourcentage of modification => " . $pourcentage . "\n";
        }
    }

    for my $key ( keys %family_rem )                                                                    # que ce qui a ete modifie en - (car on a enleve ce qui etait commun avec %liste_family
    {
        my $sql         = "SELECT seq_id FROM seq_is_in WHERE family_id = $key";
        my @rep         = $mysql->select( $sql );
        my $new_size    = scalar( @rep );                                                               #taille de la famille
        my $old_size    = $new_size + $family_rem{$key};
        my $pourcentage = ( $family_rem{$key} / $old_size ) * 100;
        print POURCENTAGE "Family_id => " . $key . "\ttaille origine => " . $old_size . "\t taille actuelle => " . $new_size . "\t Number of  add => /\tnumber of remove => " . $family_rem{$key} . "\t pourcentage of modification => " . $pourcentage . "\n";
    }
    close POURCENTAGE;
 }

 # if new_size = 0 obsolete family was probably removed from the db!

 # function duplicated in many scripts, i don't know how to get rid of that function; TODO
 sub clipboard {
    my ( $clipboard, $file_temp ) = @_;
    my @donnees = ( split( /\n/, $clipboard ) );
    my ( @sequence, $name, $seq );
    $name ||= '';
    open( OUT, ">$file_temp" ) or die "cannot open file : $!";

    for my $i ( 0 .. $#donnees ) {

        if ( substr( $donnees[$i], 0, 1 ) eq ">" ) { $name = "$donnees[$i]"; }
        else {
            $seq .= $donnees[$i];
        }

        print OUT $name . $seq . "\n";
        $name = "";
        $seq  = "";
    }
    close OUT;
    return $file_temp;

 }

 =pod

 =head2 count

 B<Description>: perform control length on the sequences. sequences are inserted in the classification if the pass the test.

 B<ArgsCount>: 2

 =over 4

 =item $seq_textid: (string) 

 sequence name of the query

 =item $family_id: (int) 

 family identifier in which the query sequence may be classified

 =back

 =cut

 sub count {
    my ( $family_id, $seq_textid ) = @_;

    my $cpt          = 0;
    my $somme_length = 0;

    my %result;

    my $sql = "SELECT seq_length FROM  sequences  WHERE seq_textid = \"$seq_textid\"";
    my @rep = $mysql->select( $sql );

    if ( @rep ) {
        my $seq_length = $rep[0];

        #select les sequences qui appartiennent a la meme famille que la seq que l'on veut etudier (pour le niveau 1 uniquement)
        my $sql = "SELECT S.seq_textid, S.seq_length FROM sequences S, seq_is_in SI WHERE S.seq_id = SI.seq_id and SI.family_id = $family_id";
        my @rep = $mysql->select( $sql );

        if ( @rep )    #si trouve des sequences (si pas trouve alors que toutes les sequences de ALL sont dans la base = probleme !!!!!!)
        {
            for my $i ( 0 .. $#rep ) {
                $cpt++;
                $result{$cpt}{'length'} = $rep[$i][1];    #recupere pour chaque sequence de la famille dans une hashatable, la taille de chaque sequence
            }

            for my $test ( keys %result ) {
                $somme_length += $result{$test}{'length'};    #fait la somme des longueurs des sequences pour la famille
            }

            my $moy_length = $somme_length / $cpt;            #fait la moyenne

            # should be more visible for configuration and should work with median
            my $soixante     = ( $moy_length * 60 ) / 100;
            my $centquarante = ( $moy_length * 140 ) / 100;

            if ( $seq_length < $soixante ) {
                print "Too small sequence $seq_textid !!!  $seq_length < $soixante \n";
                return 0;                                     # orphan, too small, sequence not integrated
            }
            if ( $seq_length > $centquarante ) {
                print "Too large sequence $seq_textid !!!  $seq_length > $centquarante \n";
                return 0;                                     #mauvais c'est un orphan, il est trop  grand, on integre pas cette sequence.
            }
            else {
                print "OK min $soixante max $centquarante  for $seq_length\n";
                return 1;                                     #bon, on integre cette sequence.
            }
        }
        else {
            print "ERROR: the sequence is not in the Database ! \n";
            exit;
        }
    }
    else {
        print "cannot find length for $seq_textid , maybe the sequence is not in the database\n";
    }
 }

 1;
	package Greenphyl::Load::Family;

	use strict;
	use warnings;
	no warnings 'once';
	use Carp;
	use Bio::Seq;
	use Bio::SearchIO;
	use Bio::SeqIO;
	use lib "../../lib";
	use Greenphyl::Path;
	use Greenphyl::SequenceFilter 'filter_text_id';

	umask 0000; # set the appropriate permissions for created files/folder to anydody as deamon gives only read permission to others

	my ( $mysql, $dbh );

	sub new {
	my ( $class, %p ) = @_;
	my $self = {};

	$mysql = $p{mysql};
	$dbh = $p{dbh};

	bless( $self, $class );
	return $self;
	}

	#calculate the number of sequence in the fasta input file
	## inefficient. could be done with a grep in 1 line.
	## qx/ grep '>' $file \| wc -l /;

	sub nbSequence {
	my ( $file ) = @_;
	my $total_size = 0;
	open( IN, $file );

	while ( <IN> ) {
	$total_size++ if /^>/;
	}

	close( IN );
	return $total_size;
	}

	# $liste_family is a text file containing family name that were modified and created
	# this list will be useful for the maintenance when running MEME and the phylo pipeline

	# premier hit ignore car sequence contre elle-meme = meilleur hit
	# second hit : -> soit match contre une famille -> si longueur bonne integration dans la famille
	# -> sinon ORPHAN
	# -> match avec ORPHAN -> longueur bonne = nouvelle famille et on regarde les hit suivant -> si longueur bonne integration dans la nouvelle famille
	# -> autre arret de l'algo (longueur mauvaise ou plus de hit)
	# -> longueur mauvaise = ORPHAN
	# -> math avec rien -> ORPHAN

	sub loadFamily {
	my ( $self, $file, $count, $evalue, $database, $out_number, $file_out, $file_temp, $species_name, $restart, %family_rem ) = @_;

	my $pass = new Greenphyl::Path( $species_name );
	my $cpt = 0;
	my $cpt_new_family = 0;
	my $cpt_modif_family = 0;
	my $cpt_progress = 0;
	my $nb_sequence = nbSequence( $file );

	my %liste_pourcentage;

	open( LOG, ">> $Greenphyl::Path::data" )
	or die "cannot create the file $Greenphyl::Path::data $!";
	open( FAMILY, ">> $pass->{file_result}$Greenphyl::Path::liste_family " )
	\|\| die( "error not create the file $pass->{file_result}$Greenphyl::Path::liste_family\n" );
	open( NFAM, ">> $pass->{file_result}$Greenphyl::Path::new_family" )
	\|\| die( "error not create the file $pass->{file_result}$Greenphyl::Path::new_family\n" );
	open( AFF, ">> $Greenphyl::Path::data_path/$Greenphyl::Path::log" )
	\|\| die( "error not create the file $Greenphyl::Path::data_path/$Greenphyl::Path::log\n" );
	my $in = Bio::SeqIO->newFh( -file => $file, -format => 'fasta' );

	while ( <$in> ) {

	# extract a single query sequence from the fasta file with the newly added sequences

	my $id = filter_text_id( $_->display_id ); #query
	my $seq = $_->seq;
	my $fasta = $_;
	my $cpt_hit = 0; #counter for the number of hit with blast
	my $hit_while = 0; #to last the loop

	$cpt_progress++;

	if ( $cpt_progress > $restart ) {
	print "\n----------------------------------------------\n";
	print AFF "Gene family modified: " . $cpt_progress . " / " . $nb_sequence . "\n";
	print $cpt_progress. " / " . $nb_sequence . " => $id \n";

	$cpt++;
	##check if query is in the database
	my $sql = "SELECT seq_id FROM sequences WHERE seq_textid = '$id'";
	my @rep = $mysql->select( $sql );

	unless ( $rep[0] ) {
	print "ERROR : sequence is not in the db!<br />";
	last;
	}

	my $seq = $_->seq;

	#++++ should be in run::blast
	##creation input fasta file as input for blast software
	$file_temp = clipboard( $seq, $file_temp );
	system( "chmod 777 $file_temp" );
	$count++;

	# run blast to compare it to blast database containing other sequences across all species
	my $cmd = "$Greenphyl::Config::BLASTALL -i $file_temp -e $evalue -p blastp -F none -d $database -K $out_number -v $out_number -b $out_number -o $file_out";
	system( $cmd );

	my $found = 0;
	if ( -e $file_out ) # if blast output exist
	{

	# load blast results
	my $in = new Bio::SearchIO(
	-format => 'blast',
	-file => "$file_out"
	);
	system( "chmod 666 $file_out" );

	#++++

	# collect all sequences that are similar to this one from the blast result
	# and see if the query sequence can be added to a family or combined with another sequence to form a new family
	# making sure that it is not accidentally combined with itself
	while ( my $result = $in->next_result and $hit_while == 0 ) {
	my $level1 = 0;
	my $cpt = 0;

	while ( my $hit = $result->next_hit
	and $hit_while == 0 ) #sortir ici
	{
	my $seq_textid = filter_text_id( $hit->name );
	$cpt_hit++;

	if ( $seq_textid ne $id ) # ignore first hit
	{
	$found++;

	$seq_textid = ucfirst( lc( $seq_textid ) );
	my $sql = "SELECT S.seq_textid, F.family_name, F.family_id, FO.class_id
	FROM sequences S, seq_is_in SI, family F, found_in FO
	WHERE S.seq_textid =\"$seq_textid\" and S.seq_id = SI.seq_id and SI.family_id = F.family_id and F.family_id= FO.family_id";
	my @rep7 = $mysql->select( $sql );
	my $count_verif; #pour verifier la taille de la sequence

	# hit found in an existing family
	if ( @rep7 ) {
	print "Find family for $seq_textid \t";
	my $family_id = $rep7[0][2]; #retrieve family id at level 1
	$count_verif = &count( $family_id, $id ); #verifier si ce sont les bons parametres

	if ( $count_verif == 1 ) # length ok
	{
	## retrieve family_id at level 1 down to 4 (if exists)
	for my $i ( 0 .. $#rep7 ) {
	my $family_id = $rep7[$i][2];

	if ( $found == 1 ) {

	#retrieve seq_id for the sequence to be inserted in seq_is_in
	my $sql = "SELECT seq_id FROM sequences WHERE seq_textid = '" . $id . "'";
	my @rep = $mysql->select( $sql );
	my $seq_id = $rep[0];

	## check if the sequence is not already classified
	my $sql1 = "SELECT SI.family_id
	FROM seq_is_in SI, found_in F
	WHERE SI.seq_id = '"
	. $seq_id . "' and F.family_id = SI.family_id and F.class_id = '" . $rep7[$i][3] . "'";
	my @rep2 = $mysql->select( $sql1 );

	##if classified
	if ( $rep2[0] ) {

	#Deja classifiee au niveau $rep7[$i][3] !!<br />\n";
	last;
	}

	#classify second hit
	else {
	print FAMILY "$family_id\n";
	print LOG "Insertion of the sequence $id in the $family_id family\n";
	%liste_pourcentage = &dataFamily( $family_id, %liste_pourcentage );
	my $sql2 = "INSERT INTO seq_is_in (seq_id, family_id) VALUES(\"$seq_id\",\"$family_id\")";
	$mysql->insert( $sql2 );
	$cpt_modif_family++;
	}
	}
	}
	}
	else #length not ok : query = orphan
	{
	$hit_while = 1; #last the loop
	next;
	}
	}

	# hit found est un orphan
	else {
	##recup seq_id de query
	my $sql = "SELECT seq_id FROM sequences WHERE seq_textid = '$id'";
	my @rep = $mysql->select( $sql );
	my $seq_id = $rep[0];

	## verifie que query deja pas classifié dans la base avec 1er orphan de la sortie du blast
	my $sql2 = "SELECT family_id FROM seq_is_in WHERE seq_id = '$seq_id'";
	my @rep2 = $mysql->select( $sql2 );
	my $family_query = $rep2[0];

	#pour premier orphan trouve cree une famille, voir else

	##deja classe:va voir si autre sorties de blast sont des orphans et si oui les classe dans meme famille
	if ( $family_query ) {
	##recup id des autres sortie du blast
	my $sql = "SELECT seq_id FROM sequences WHERE seq_textid = '$seq_textid'";
	my @rep = $mysql->select( $sql );
	my $seq_id = $rep[0];

	if ( $seq_id ) {
	## verifie les autres sortie blast
	my $sql = "SELECT family_id FROM seq_is_in WHERE seq_id = '$seq_id'";
	my @rep = $mysql->select( $sql );
	my $family_id2 = $rep[0];

	if ( $family_id2 ) {
	$hit_while = 1; #on va pas regarder les autres hits, on stop la boucle ici.
	}
	else #sortie est un orphan
	{
	$count_verif = &count( $family_query, $id ); #test la taille de la sequence

	if ( $count_verif == 1 ) #le test de taille est concluant
	{
	%liste_pourcentage = &dataFamily( $family_query, %liste_pourcentage );
	my $sql2 = "INSERT INTO seq_is_in (seq_id, family_id) VALUES(\"$seq_id\",\"$family_query\")";
	$mysql->insert( $sql2 );
	$dbh->do( "TRUNCATE go_family_cache;" );
	print FAMILY "$family_query\n";
	$cpt_new_family++;
	}
	}
	}
	}

	#pas de famille cree : premier passage dans la boucle : creation d'une nouvelle famille avec l'orphan
	#match avec un orphan
	else {
	print "new_family with $seq_textid\t";

	###compare sequence length between the query and the hit
	my $sql00 = "SELECT seq_length FROM sequences WHERE seq_textid = \"$id\""; #query
	my $sql01 = "SELECT seq_length FROM sequences WHERE seq_textid = \"$seq_textid\""; #hit
	my @rep00 = $mysql->select( $sql00 );
	my @rep01 = $mysql->select( $sql01 );

	# should be set at the begining of the script
	my $soixante = ( $rep01[0] * 60 ) / 100;
	my $centquarante = ( $rep01[0] * 140 ) / 100;

	# TODO: this size comparison is pretty broken and will need to be replaced once cleaning is done ( minimum to value is wider range than max to value )
	if ( ( $rep00[0] < $centquarante )
	and ( $rep00[0] > $soixante ) ) #if length control is ok
	{

	# va chercher max family_number pour creer nouvelle famille
	my $sql1 = 'SELECT MAX(family_number) FROM family';
	my @rep = $mysql->select( $sql1 );
	my $old_number = $rep[0] \|\| 0;
	my $family_number = $old_number + 1;

	## create a new family in the db table family
	my $new = "New created family by $species_name";

	my $sql = "INSERT INTO family (family_name, family_number, inference) VALUES(\"$new\", \"$family_number\", '')";

	$mysql->insert( $sql );
	$dbh->do( "TRUNCATE go_family_cache;" );

	## recup le family_id creer

	my $sql3 = "SELECT family_id FROM family WHERE family_number LIKE \"$family_number\"";

	my @rep1 = $mysql->select( $sql3 );
	my $family_id = $rep1[0];

	## new family is classify at level 1
	my $nb = 1;
	my $sql4 = "INSERT INTO found_in (family_id, class_id) values(\"$family_id\", \"$nb\")";

	$mysql->insert( $sql4 );
	$dbh->do( "TRUNCATE go_family_cache;" );

	print "New family level 1: Family_id: $family_id\n";

	## 2 sequences in the new family

	##Query
	my $sql7 = "SELECT seq_id FROM sequences WHERE seq_textid = '" . $seq_textid . "'";
	my @rep3 = $mysql->select( $sql7 );
	my $seq_id = $rep3[0];
	my $sql2 = "INSERT INTO seq_is_in (seq_id, family_id) values(\"$seq_id\",\"$family_id\")";
	$mysql->insert( $sql2 );
	$dbh->do( "TRUNCATE go_family_cache;" );

	##Ancien orphan
	my $sql6 = "SELECT seq_id FROM sequences WHERE seq_textid = '" . $id . "'";
	my @rep2 = $mysql->select( $sql6 );
	my $seq_id2 = $rep2[0];
	my $sql5 = "INSERT INTO seq_is_in (seq_id, family_id) values(\"$seq_id2\",\"$family_id\")";
	$mysql->insert( $sql5 );
	$dbh->do( "TRUNCATE go_family_cache;" );

	%liste_pourcentage = &dataFamily( $family_id, %liste_pourcentage );
	print FAMILY "$family_id\n";
	print NFAM "$family_id\n";

	}
	else { #taille est incorrecte = orphan
	print " too small ORPHAN \n";
	$hit_while = 1;
	}
	}
	}
	} #end control if hit diff query
	} # end boucle sur list HIT
	}
	}
	}
	}

	$dbh->do( "call countFamilySeqBySpecies()" ) if $ENV{TESTING}; # TODO: this needs to be removed and count_family_seq updated properly!

	print LOG "number of families amended : $cpt_modif_family \n";
	print LOG "number of families created : $cpt_new_family \n";

	close FAMILY;
	close AFF;

	&pourcentage( $species_name, \%liste_pourcentage, \%family_rem );
	return $cpt_new_family, $cpt_modif_family;
	}

	#track the number of sequence added to the family
	#by tracking the number at the beginning and added, we can calculate the number of sequence removed
	#allow to calculate the percentage of modification
	sub dataFamily {
	my ( $family_id, %liste_pourcentage ) = @_;

	if ( keys( %liste_pourcentage ) ) {

	if ( exists $liste_pourcentage{$family_id} ) #family was modified at least once
	{
	$liste_pourcentage{$family_id} = $liste_pourcentage{$family_id} + 1;
	}
	else {
	my $sql = "SELECT seq_id FROM seq_is_in WHERE family_id = $family_id";
	my @rep = $mysql->select( $sql );
	my $old_size = scalar( @rep );
	$liste_pourcentage{$family_id} = 1;
	}
	}

	else {
	my $sql = "SELECT seq_id FROM seq_is_in WHERE family_id = $family_id";
	my @rep = $mysql->select( $sql );
	$liste_pourcentage{$family_id} = 1;
	}

	return %liste_pourcentage;
	}

	#track the number of changes in a family (number of new and/or removed sequences)
	#calculate the percentage of modification for a family
	# %liste_family contains id of each famiily with new sequences members : key = family_id value = number of sequences added
	# %family contains id of each famiily where sequences were removed

	sub pourcentage (\%) #BAD way to specify paramter type
	{
	my ( $species_name, $ref_liste_pourcentage, $ref_family_rem ) = @_;
	my %liste_pourcentage = %$ref_liste_pourcentage;
	my %family_rem = %$ref_family_rem;

	my $pass = new Greenphyl::Path( $species_name );

	open( POURCENTAGE, "> $pass->{file_result}$Greenphyl::Path::pourcentage " )
	\|\| die( "error not create the file $pass->{file_result}$Greenphyl::Path::liste_family\n" );
	for my $key ( keys %liste_pourcentage ) #regarde toutes les familles modifiees avec des ADD
	{
	my $sql = "SELECT seq_id FROM seq_is_in WHERE family_id = \"$key\"";
	my @rep = $mysql->select( $sql );
	my $new_size = scalar( @rep ); #family size

	if ( exists $family_rem{$key} ) #famille a ete modifiee en + et -, on a une valeur de cle commune a %liste_family et %family_rem
	{
	my $old_size = $new_size + $liste_pourcentage{$key} - $family_rem{$key}; #ancienne valeur de la taille
	my $pourcentage = ( ( $liste_pourcentage{$key} + $family_rem{$key} ) / $old_size ) * 100;
	print POURCENTAGE "Family_id => " . $key . "\ttaille origine =>" . $old_size . "\t taille actuelle => " . $new_size . "\t Number of add= > " . $liste_pourcentage{$key} . "\tnumber of remove => " . $family_rem{$key} . "\t pourcentage of modification => " . $pourcentage . "\n";
	delete( $family_rem{$key} ); #efface de la table des familles - cette entree car deja traitee
	}

	else #famille a ete modifiee en + seulement
	{
	my $old_size = $new_size - $liste_pourcentage{$key}; #ancienne taille = taille actuelle - ce qui a ete ajoute
	my $pourcentage = ( $liste_pourcentage{$key} / $old_size ) * 100;
	print POURCENTAGE "Family_id => " . $key . "\ttaille origine =>" . $old_size . "\t taille actuelle = > " . $new_size . "\t Number of add => " . $liste_pourcentage{$key} . "\tnumber of remove = /\t pourcentage of modification => " . $pourcentage . "\n";
	}
	}

	for my $key ( keys %family_rem ) # que ce qui a ete modifie en - (car on a enleve ce qui etait commun avec %liste_family
	{
	my $sql = "SELECT seq_id FROM seq_is_in WHERE family_id = $key";
	my @rep = $mysql->select( $sql );
	my $new_size = scalar( @rep ); #taille de la famille
	my $old_size = $new_size + $family_rem{$key};
	my $pourcentage = ( $family_rem{$key} / $old_size ) * 100;
	print POURCENTAGE "Family_id => " . $key . "\ttaille origine => " . $old_size . "\t taille actuelle => " . $new_size . "\t Number of add => /\tnumber of remove => " . $family_rem{$key} . "\t pourcentage of modification => " . $pourcentage . "\n";
	}
	close POURCENTAGE;
	}

	# if new_size = 0 obsolete family was probably removed from the db!

	# function duplicated in many scripts, i don't know how to get rid of that function; TODO
	sub clipboard {
	my ( $clipboard, $file_temp ) = @_;
	my @donnees = ( split( /\n/, $clipboard ) );
	my ( @sequence, $name, $seq );
	$name \|\|= '';
	open( OUT, ">$file_temp" ) or die "cannot open file : $!";

	for my $i ( 0 .. $#donnees ) {

	if ( substr( $donnees[$i], 0, 1 ) eq ">" ) { $name = "$donnees[$i]"; }
	else {
	$seq .= $donnees[$i];
	}

	print OUT $name . $seq . "\n";
	$name = "";
	$seq = "";
	}
	close OUT;
	return $file_temp;

	}

	=pod

	=head2 count

	B<Description>: perform control length on the sequences. sequences are inserted in the classification if the pass the test.

	B<ArgsCount>: 2

	=over 4

	=item $seq_textid: (string)

	sequence name of the query

	=item $family_id: (int)

	family identifier in which the query sequence may be classified

	=back

	=cut

	sub count {
	my ( $family_id, $seq_textid ) = @_;

	my $cpt = 0;
	my $somme_length = 0;

	my %result;

	my $sql = "SELECT seq_length FROM sequences WHERE seq_textid = \"$seq_textid\"";
	my @rep = $mysql->select( $sql );

	if ( @rep ) {
	my $seq_length = $rep[0];

	#select les sequences qui appartiennent a la meme famille que la seq que l'on veut etudier (pour le niveau 1 uniquement)
	my $sql = "SELECT S.seq_textid, S.seq_length FROM sequences S, seq_is_in SI WHERE S.seq_id = SI.seq_id and SI.family_id = $family_id";
	my @rep = $mysql->select( $sql );

	if ( @rep ) #si trouve des sequences (si pas trouve alors que toutes les sequences de ALL sont dans la base = probleme !!!!!!)
	{
	for my $i ( 0 .. $#rep ) {
	$cpt++;
	$result{$cpt}{'length'} = $rep[$i][1]; #recupere pour chaque sequence de la famille dans une hashatable, la taille de chaque sequence
	}

	for my $test ( keys %result ) {
	$somme_length += $result{$test}{'length'}; #fait la somme des longueurs des sequences pour la famille
	}

	my $moy_length = $somme_length / $cpt; #fait la moyenne

	# should be more visible for configuration and should work with median
	my $soixante = ( $moy_length * 60 ) / 100;
	my $centquarante = ( $moy_length * 140 ) / 100;

	if ( $seq_length < $soixante ) {
	print "Too small sequence $seq_textid !!! $seq_length < $soixante \n";
	return 0; # orphan, too small, sequence not integrated
	}
	if ( $seq_length > $centquarante ) {
	print "Too large sequence $seq_textid !!! $seq_length > $centquarante \n";
	return 0; #mauvais c'est un orphan, il est trop grand, on integre pas cette sequence.
	}
	else {
	print "OK min $soixante max $centquarante for $seq_length\n";
	return 1; #bon, on integre cette sequence.
	}
	}
	else {
	print "ERROR: the sequence is not in the Database ! \n";
	exit;
	}
	}
	else {
	print "cannot find length for $seq_textid , maybe the sequence is not in the database\n";
	}
	}

	1;