avrilcoghlan · March 1, 2013 14:44
diff --git a/treefam_QC4.pl b/treefam_QC4.pl
 #!/usr/local/bin/perl

 # 
 # Perl script treefam_QC4.pl
 # Written by Avril Coghlan ([email protected])
 # 3-Feb-09.
 #
 # This perl script finds TreeFam proteins that have a match to a family
 # in the treefam mysql hmmer_matches table, but where the gene does not
 # appear in the 'genes' table.
 #
 # The command-line format is:
 # % perl <treefam_QC4.pl> <release> 
 #   where <release> is the release of the TreeFam database to use.
 #
 #
 #------------------------------------------------------------------#

 # CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:

 $num_args                  = $#ARGV + 1;
 if ($num_args != 1)
 {
   print "Usage of treefam_QC4.pl\n\n";
   print "perl treefam_QC4.pl <release>\n";
   print "where <release> is the release of the TreeFam database to use.\n";
   print "For example, >perl -w treefam_QC4.pl 7\n";
   exit;
 }

 #------------------------------------------------------------------#

 # DECLARE MYSQL USERNAME AND HOST:
 use DBI;

 # FIND WHICH RELEASE OF THE TREEFAM DATABASE TO USE:

 $release                   = $ARGV[0];

 #------------------------------------------------------------------#

 # READ IN THE PROTEINS CORRESPONDING TO EACH GENE:

 $database                  = "dbi:mysql:treefam_".$release.":db.treefam.org:3308";
 $dbh                       = DBI->connect("$database", 'anonymous', '') || return;
 $GENE                      = &read_proteins_for_genes($dbh);

 # GET A LIST OF ALL GENES THAT WERE PUT INTO FAMILIES:

 $FAMILY                    = &read_genes_in_families($dbh,$GENE);

 # READ IN THE HMMER MATCHES BETWEEN GENES AND HMMS FOR FAMILIES, AND
 # CHECK THAT THE GENES WITH HMMER MATCHES TO A FAMILY APPEAR IN THE
 # 'genes' TABLE:

 &read_hmmer_matches_to_families($dbh,$FAMILY,$GENE);

 # NOW DISCONNECT FROM THE DATABASE:

 $rc                        = $dbh->disconnect();
 $rc                        = "";

 #------------------------------------------------------------------#

 print STDERR "FINISHED.\n";

 #------------------------------------------------------------------#
 # READ IN THE PROTEINS CORRESPONDING TO EACH GENE:

 sub read_proteins_for_genes
 {
   my $dbh                 = $_[0];
   my $table_w;
   my $st;
   my $sth;
   my $rv;
   my @array;
   my $GID;  
   my $ID; 
   my %GENE                = ();

   $table_w                = 'genes';
   $st                     = "SELECT GID, ID from $table_w"; 
   $sth                    = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
   $rv                     = $sth->execute or die "Cannot execute the query: $sth->errstr";
   if ($rv >= 1)
   {
      while ((@array) = $sth->fetchrow_array)    
      {
         $GID              = $array[0];  
   $ID               = $array[1];
         
         if ($GENE{$ID})
 	 {
 	    print STDERR "WARNING: have multiple gids for $ID\n";
 	    $GENE{$ID}     = $GID; 
 	 } 
 	 else
 	 {
 	    $GENE{$ID}     = $GID;
 	 }
      }
   }

   print STDERR "Read in gene id for each protein id\n";
   return(\%GENE);
 }

 #------------------------------------------------------------------#
 # READ IN THE HMMER MATCHES BETWEEN GENES AND HMMS FOR FAMILIES, AND
 # CHECK THAT GENES WITH STRONG MATCHES TO A FAMILY APPEAR IN THE
 # 'genes' TABLE:

 sub read_hmmer_matches_to_families
 {
   my $dbh                 = $_[0];
   my $FAMILY              = $_[1];
   my $GENE                = $_[2];
   my $table_w;
   my $st;
   my $sth;
   my $rv;
   my @array;
   my $AC;
   my $ID;
   my $SCORE;
   my $EVALUE;
   my $family;
   my $GID;

   $table_w                = 'hmmer_matches';
   $st                     = "SELECT AC, ID, SCORE, EVALUE from $table_w"; 
   $sth                    = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
   $rv                     = $sth->execute or die "Cannot execute the query: $sth->errstr";
   if ($rv >= 1)
   {
      while ((@array) = $sth->fetchrow_array)    
      {
         $AC               = $array[0];  
         $ID               = $array[1]; # eg. ENSPPYT00000017479.1
 	 $SCORE            = $array[2];
 	 $EVALUE           = $array[3];
 	 # FIND THE GENE ID FOR PROTEIN $ID:
 	 if (!($GENE->{$ID})) 
         { 
            print "WARNING: do not know GID for protein $ID\n"; 
         }
         else
         {
   	    $GID           = $GENE->{$ID}; # eg. ENSPPYG00000015038
         }
      }
   }

 }

 #------------------------------------------------------------------#

 # READ THE GENES THAT WERE PUT INTO FAMILIES:

 sub read_genes_in_families
 {
   my $dbh                 = $_[0];
   my $GENE                = $_[1];
   my $table_w;
   my $st;
   my $sth;
   my $rv;
   my @array;
   my $FLAG;
   my $AC;
   my %FAMILY              = ();
   my $GID;

   $table_w                = 'fam_genes';
   $st                     = "SELECT AC, FLAG, ID from $table_w"; 
   $sth                    = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
   $rv                     = $sth->execute or die "Cannot execute the query: $sth->errstr";
   if ($rv >= 1)
   {
      while ((@array) = $sth->fetchrow_array)    
      {
         $AC               = $array[0];  
         $FLAG             = $array[1];
 	 $ID               = $array[2];
 	 
         # THIS TABLE 'fam_genes' HAS 'FULL'/'SEED' FAMILIES:
         if ($FLAG eq 'FULL') # IF THIS IS A 'FULL' TREEFAM FAMILY (NOT A 'SEED' FAMILY).
         {
 	    # FIND THE GENE ID CORRESPONDING TO THIS PROTEIN ID:
            if (!($GENE->{$ID})) 
            { 
               if (!($ID =~ /null/) && !($ID =~ /NULL/))
               {
                  print "WARNING: do not know gene for $ID AC $AC FLAG $FLAG\n"; 
               }
            }
 	    else
 	    {
 	       $GID        = $GENE->{$ID};
 	       if ($FAMILY{$GID}) 
 	       {
 	          $FAMILY{$GID} = $FAMILY{$GID}.",".$AC;
 	       }
 	       else
 	       {
 	          $FAMILY{$GID}= $AC;    
 	       }
 	    }
         }
      }
   }

   print STDERR "Read in genes in each family\n";
   return(\%FAMILY);
 }

 #------------------------------------------------------------------#
	#!/usr/local/bin/perl

	#
	# Perl script treefam_QC4.pl
	# Written by Avril Coghlan ([email protected])
	# 3-Feb-09.
	#
	# This perl script finds TreeFam proteins that have a match to a family
	# in the treefam mysql hmmer_matches table, but where the gene does not
	# appear in the 'genes' table.
	#
	# The command-line format is:
	# % perl <treefam_QC4.pl> <release>
	# where <release> is the release of the TreeFam database to use.
	#
	#
	#------------------------------------------------------------------#

	# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:

	$num_args = $#ARGV + 1;
	if ($num_args != 1)
	{
	print "Usage of treefam_QC4.pl\n\n";
	print "perl treefam_QC4.pl <release>\n";
	print "where <release> is the release of the TreeFam database to use.\n";
	print "For example, >perl -w treefam_QC4.pl 7\n";
	exit;
	}

	#------------------------------------------------------------------#

	# DECLARE MYSQL USERNAME AND HOST:
	use DBI;

	# FIND WHICH RELEASE OF THE TREEFAM DATABASE TO USE:

	$release = $ARGV[0];

	#------------------------------------------------------------------#

	# READ IN THE PROTEINS CORRESPONDING TO EACH GENE:

	$database = "dbi:mysql:treefam_".$release.":db.treefam.org:3308";
	$dbh = DBI->connect("$database", 'anonymous', '') \|\| return;
	$GENE = &read_proteins_for_genes($dbh);

	# GET A LIST OF ALL GENES THAT WERE PUT INTO FAMILIES:

	$FAMILY = &read_genes_in_families($dbh,$GENE);

	# READ IN THE HMMER MATCHES BETWEEN GENES AND HMMS FOR FAMILIES, AND
	# CHECK THAT THE GENES WITH HMMER MATCHES TO A FAMILY APPEAR IN THE
	# 'genes' TABLE:

	&read_hmmer_matches_to_families($dbh,$FAMILY,$GENE);

	# NOW DISCONNECT FROM THE DATABASE:

	$rc = $dbh->disconnect();
	$rc = "";

	#------------------------------------------------------------------#

	print STDERR "FINISHED.\n";

	#------------------------------------------------------------------#
	# READ IN THE PROTEINS CORRESPONDING TO EACH GENE:

	sub read_proteins_for_genes
	{
	my $dbh = $_[0];
	my $table_w;
	my $st;
	my $sth;
	my $rv;
	my @array;
	my $GID;
	my $ID;
	my %GENE = ();

	$table_w = 'genes';
	$st = "SELECT GID, ID from $table_w";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
	if ($rv >= 1)
	{
	while ((@array) = $sth->fetchrow_array)
	{
	$GID = $array[0];
	$ID = $array[1];

	if ($GENE{$ID})
	{
	print STDERR "WARNING: have multiple gids for $ID\n";
	$GENE{$ID} = $GID;
	}
	else
	{
	$GENE{$ID} = $GID;
	}
	}
	}

	print STDERR "Read in gene id for each protein id\n";
	return(\%GENE);
	}

	#------------------------------------------------------------------#
	# READ IN THE HMMER MATCHES BETWEEN GENES AND HMMS FOR FAMILIES, AND
	# CHECK THAT GENES WITH STRONG MATCHES TO A FAMILY APPEAR IN THE
	# 'genes' TABLE:

	sub read_hmmer_matches_to_families
	{
	my $dbh = $_[0];
	my $FAMILY = $_[1];
	my $GENE = $_[2];
	my $table_w;
	my $st;
	my $sth;
	my $rv;
	my @array;
	my $AC;
	my $ID;
	my $SCORE;
	my $EVALUE;
	my $family;
	my $GID;

	$table_w = 'hmmer_matches';
	$st = "SELECT AC, ID, SCORE, EVALUE from $table_w";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
	if ($rv >= 1)
	{
	while ((@array) = $sth->fetchrow_array)
	{
	$AC = $array[0];
	$ID = $array[1]; # eg. ENSPPYT00000017479.1
	$SCORE = $array[2];
	$EVALUE = $array[3];
	# FIND THE GENE ID FOR PROTEIN $ID:
	if (!($GENE->{$ID}))
	{
	print "WARNING: do not know GID for protein $ID\n";
	}
	else
	{
	$GID = $GENE->{$ID}; # eg. ENSPPYG00000015038
	}
	}
	}

	}

	#------------------------------------------------------------------#

	# READ THE GENES THAT WERE PUT INTO FAMILIES:

	sub read_genes_in_families
	{
	my $dbh = $_[0];
	my $GENE = $_[1];
	my $table_w;
	my $st;
	my $sth;
	my $rv;
	my @array;
	my $FLAG;
	my $AC;
	my %FAMILY = ();
	my $GID;

	$table_w = 'fam_genes';
	$st = "SELECT AC, FLAG, ID from $table_w";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
	if ($rv >= 1)
	{
	while ((@array) = $sth->fetchrow_array)
	{
	$AC = $array[0];
	$FLAG = $array[1];
	$ID = $array[2];

	# THIS TABLE 'fam_genes' HAS 'FULL'/'SEED' FAMILIES:
	if ($FLAG eq 'FULL') # IF THIS IS A 'FULL' TREEFAM FAMILY (NOT A 'SEED' FAMILY).
	{
	# FIND THE GENE ID CORRESPONDING TO THIS PROTEIN ID:
	if (!($GENE->{$ID}))
	{
	if (!($ID =~ /null/) && !($ID =~ /NULL/))
	{
	print "WARNING: do not know gene for $ID AC $AC FLAG $FLAG\n";
	}
	}
	else
	{
	$GID = $GENE->{$ID};
	if ($FAMILY{$GID})
	{
	$FAMILY{$GID} = $FAMILY{$GID}.",".$AC;
	}
	else
	{
	$FAMILY{$GID}= $AC;
	}
	}
	}
	}
	}

	print STDERR "Read in genes in each family\n";
	return(\%FAMILY);
	}

	#------------------------------------------------------------------#