avrilcoghlan · March 1, 2013 13:56
diff --git a/find_gene_pred_errors1.pl b/find_gene_pred_errors1.pl
 #!/usr/local/bin/perl

 # 
 # Perl script find_gene_pred_errors1.pl
 # Written by Avril Coghlan ([email protected])
 # 25-Feb-09.
 #
 # For the TreeFam project.
 #
 # This perl script finds cases where two adjacent genes in a species
 # (eg. C. elegans) should probably be merged, as one of them has its
 # best match to the first part of a TreeFam family alignment and the
 # second has its best match to the second part of the same TreeFam
 # family's alignment.
 #
 # The command-line format is:
 # % perl <find_gene_pred_errors1.pl> species release
 # where species is the species of interest,
 #       release is the release of TreeFam to use.
 #
 #------------------------------------------------------------------#

 # CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:

 $num_args                  = $#ARGV + 1;
 if ($num_args != 2)
 {
   print "Usage of find_gene_pred_errors1.pl\n\n";
   print "perl find_gene_pred_errors1.pl <species> <release>\n";
   print "where <species> is the species of interest (eg. C. elegans),\n";
   print "      <release> is the release of TreeFam to use.\n";
   print "For example, >perl find_gene_pred_errors1.pl CAEEL 7\n";
   exit;
 }

 use DBI;

 # FIND THE NAME OF THE SPECIES OF INTEREST:

 $the_species               = $ARGV[0];

 # FIND THE RELEASE OF TREEFAM TO USE:

 $release                   = $ARGV[1];

 #------------------------------------------------------------------#

 $database                  = "dbi:mysql:treefam_".$release.":db.treefam.org:3308";
 $dbh                       = DBI->connect("$database", 'anonymous', '') || return;

 # FIND ALL THE IDs FOR GENES FROM SPECIES $the_species:
 ($GID,$IDS)                = &get_genes_from_species($dbh,$the_species);

 # READ IN THE HMMER MATCHES BETWEEN TRANSCRIPTS AND FAMILIES:
 ($BEST)                    = &read_hmmer_matches_to_families($dbh,$GID);

 # FIND ADJACENT GENES IN SPECIES $the_species, AND CHECK IF THEY HAVE
 # THEIR BEST MATCH TO THE SAME TREEFAM FAMILY:
 &find_adjacent_genes($dbh,$GID,$IDS,$BEST);  

 # NOW DISCONNECT FROM THE DATABASE:

 $rc                        = $dbh->disconnect();
 $rc                        = "";

 #------------------------------------------------------------------#

 print STDERR "FINISHED.\n";

 #------------------------------------------------------------------#

 # FIND ADJACENT GENES IN THE SPECIES $the_species AND CHECK IF THEY
 # HAVE THEIR BEST MATCH TO THE SAME TREEFAM FAMILY:

 sub find_adjacent_genes
 {
   my $dbh                 = $_[0];
   my $GID                 = $_[1];
   my $IDS                 = $_[2];
   my $BEST                = $_[3];
   my $table_w;
   my $st;
   my $sth;
   my $rv;
   my @array;
   my $ID;
   my $start;
   my $chr; 
   my %GENES               = ();
   my $GID;
   my %SEEN                = ();
   %START                  = ();
   my $genes;
   my @genes;
   my $i;
   my $gene;
   my $prev_gene;
   my $gene_ids;
   my $prev_gene_ids;
   my @gene_ids;
   my @prev_gene_ids;
   my $prev_gene_id;
   my $gene_id;
   my $j;
   my $k;
   my $prev_gene_id_best;
   my $gene_id_best;

   foreach $ID (keys %{$GID})
   {
      $GID                 = $GID{$ID};
      $start               = "none";
      $chr                 = "none";
      $table_w             = 'map';
      $st                  = "SELECT T_START, TARGET from $table_w WHERE ID=\'$ID\'"; 
      $sth                 = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
      $rv                  = $sth->execute or die "Cannot execute the query: $sth->errstr";
      if ($rv >= 1)
      {
         while ((@array) = $sth->fetchrow_array)    
         {
            $start         = $array[0];  
      $chr           = $array[1];
         }
      }
      if ($start eq 'none' || $chr eq 'none') { print STDERR "ERROR: ID $ID start $start chr $chr\n"; exit;}
      # RECORD THE LOCATION OF GENE $GID: 
      if (!($SEEN{$chr."=".$GID}))
      {
 	 $SEEN{$chr."=".$GID} = 1;
         if (!($GENES{$chr}))
         {
            $GENES{$chr}   = $GID;
         }
 	 else
 	 {
 	    $GENES{$chr}   = $GENES{$chr}.",".$GID;
 	 }
 	 if (!($START{$GID}))
 	 {
 	    $START{$GID}   = $start;
 	 }
 	 else
 	 {
 	    if ($start < $START{$GID})
 	    {
 	       $START{$GID} = $start;
 	    }
 	 }
      }
   }
   
   # CHECK IF ADJACENT GENES HAVE BEST HMMER MATCHES TO THE SAME TREEFAM FAMILY:
   foreach $chr (keys %GENES)
   {
      $genes                = $GENES{$chr};
      @genes                = split(/\,/,$genes);
      @genes                = sort by_start @genes;
      for ($i = 1; $i <= $#genes; $i++)
      {
 	 $gene              = $genes[$i];
 	 $prev_gene         = $genes[$i-1];
 	 # CHECK IF $gene AND $prev_gene HAVE TRANSCRIPTS THAT HAVE BEST HMMER MATCHES
 	 # TO THE SAME TREEFAM FAMILY:
 	 if (!($IDS{$gene})) { print STDERR "ERROR: do not know transcripts in $gene\n"; exit;}
 	 if (!($IDS{$prev_gene})) { print STDERR "ERROR: do not know transcripts in $prev_gene\n"; exit;}
 	 $gene_ids          = $IDS->{$gene};
 	 $prev_gene_ids     = $IDS->{$prev_gene};
 	 @gene_ids          = split(/\,/,$gene_ids);
 	 @prev_gene_ids     = split(/\,/,$prev_gene_ids);
 	 for ($j = 0; $j <= $#gene_ids; $j++)
 	 {
 	    $gene_id        = $gene_ids[$j];
 	    if (!($BEST->{$gene_id})) { $gene_id_best = "none";           }
 	    else                      { $gene_id_best = $BEST->{$gene_id};}
 	    for ($k = 0; $k <= $#prev_gene_ids; $k++)
 	    {
 	       $prev_gene_id = $prev_gene_ids[$k];
 	       if (!($BEST->{$prev_gene_id})) { $prev_gene_id_best = "none";                }
 	       else                           { $prev_gene_id_best = $BEST->{$prev_gene_id};}
 	       if ($gene_id_best eq $prev_gene_id_best && $gene_id_best ne 'none')
 	       {
 		  print "$gene_id ($gene) and $prev_gene_id ($prev_gene) both have best match to $gene_id_best\n";
 	       }
 	    } 
 	 }
      }
   }
 }

 #------------------------------------------------------------------#

 # FIND ALL THE IDs FOR GENES FROM SPECIES $the_species:

 sub get_genes_from_species
 {
   my $dbh                 = $_[0];
   my $the_species         = $_[1];
   my %GID                 = ();
   my $taxid               = "none";
   my $table_w;
   my $st;
   my $sth;
   my $rv;
   my @array;
   my $ID;
   my $GID;
   my %IDS                 = (); 

   # FIRST GET THE NCBI TAXONOMY ID FOR SPECIES $the_species:
   $table_w                = 'species';
   $st                     = "SELECT TAX_ID from $table_w WHERE SWCODE=\'$the_species\'"; 
   $sth                    = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
   $rv                     = $sth->execute or die "Cannot execute the query: $sth->errstr";
   if ($rv >= 1)
   {
      while ((@array) = $sth->fetchrow_array)    
      {
         $taxid            = $array[0];  
      }
   }
   if ($taxid eq 'none') { print STDERR "ERROR: do not know taxid for $the_species\n"; exit;}

   # NOW GET THE IDs FOR ALL TRANSCRIPTS FROM SPECIES $the_species:
   $table_w                = 'genes';
   $st                     = "SELECT ID, GID from $table_w WHERE TAX_ID=\'$taxid\'"; 
   $sth                    = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
   $rv                     = $sth->execute or die "Cannot execute the query: $sth->errstr";
   if ($rv >= 1)
   {
      while ((@array) = $sth->fetchrow_array)    
      {
         $ID               = $array[0];
 	 $GID              = $array[1];
 	 if ($GID{$ID}) { print STDERR "WARNING: already have GID $GID{$ID} for ID $ID (GID $GID)\n";}
 	 $GID{$ID}         = $GID; 
 	 # RECORD THE IDs FOR TRANSCRIPTS IN THIS GENE:
 	 if (!($IDS{$GID})) { $IDS{$GID} = $ID;}
 	 else {$IDS{$GID} = $IDS{$GID}.",".$ID;}
      }
   }

   return(\%GID,\%IDS); 
 }

 #------------------------------------------------------------------#

 # READ IN THE HMMER MATCHES BETWEEN TRANSCRIPTS FROM SPECIES $the_species
 # AND FAMILIES, AND FIND THE BEST MATCH FOR EACH TRANSCRIPT:

 sub read_hmmer_matches_to_families
 {
   my $dbh                 = $_[0];
   my $GID                 = $_[1];
   my $table_w;
   my $st;
   my $sth;
   my $rv;
   my @array;
   my $ID;
   my %BEST                = ();
   my %BEST_EVALUE         = ();
   my %BEST_SCORE          = ();
   my $best;
   my $best_evalue;
   my $best_score;

   foreach $ID (keys %{$GID})
   {
      $table_w             = 'hmmer_matches';
      $st                  = "SELECT AC, SCORE, EVALUE from $table_w"; 
      $sth                 = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
      $rv                  = $sth->execute or die "Cannot execute the query: $sth->errstr";
      if ($rv >= 1)
      {
         while ((@array) = $sth->fetchrow_array)    
         {
            $AC            = $array[0];  
 	    $SCORE         = $array[1];
 	    $EVALUE        = $array[2];
            if (!($BEST{$ID}))
 	    {
 	       if ($EVALUE == 0) { $EVALUE = "zero";}
 	       if ($SCORE  == 0) { $SCORE  = "zero";}
 	       $BEST{$ID}  = $AC;
 	       $BEST_EVALUE{$ID} = $EVALUE;
 	       $BEST_SCORE{$ID} = $SCORE;
 	    }
 	    else
 	    {
 	       $best       = $BEST{$ID};
 	       $best_evalue= $BEST_EVALUE{$ID};
 	       $best_score = $BEST_SCORE{$ID};
 	       if ($best_evalue eq 'zero') { $best_evalue = 0;}
 	       if ($best_score  eq 'zero') { $best_score  = 0;}
 	       if ($EVALUE < $best_evalue || ($EVALUE == $best_evalue && $SCORE > $best_score))
 	       {
 	          if ($EVALUE == 0) { $EVALUE = "zero";}
 	          if ($SCORE  == 0) { $SCORE  = "zero";}
 	          $BEST{$ID}  = $AC;
 	          $BEST_EVALUE{$ID} = $EVALUE;
 	          $BEST_SCORE{$ID} = $SCORE;
 	       }
 	    }
         }
      }
   }

   return(\%BEST);
 }

 #------------------------------------------------------------------#

 # SUBROUTINE TO SORT GENES BY THEIR START-POINT:

 sub by_start
 {
   if (!($START{$a})) { print STDERR "ERROR: do not know start of gene a $a.\n"; exit;}
   if (!($START{$b})) { print STDERR "ERROR: do not know start of gene b $b.\n"; exit;}
   $START{$a} <=> $START{$b}
      or
   $a cmp $b;
 }

 #------------------------------------------------------------------
	#!/usr/local/bin/perl

	#
	# Perl script find_gene_pred_errors1.pl
	# Written by Avril Coghlan ([email protected])
	# 25-Feb-09.
	#
	# For the TreeFam project.
	#
	# This perl script finds cases where two adjacent genes in a species
	# (eg. C. elegans) should probably be merged, as one of them has its
	# best match to the first part of a TreeFam family alignment and the
	# second has its best match to the second part of the same TreeFam
	# family's alignment.
	#
	# The command-line format is:
	# % perl <find_gene_pred_errors1.pl> species release
	# where species is the species of interest,
	# release is the release of TreeFam to use.
	#
	#------------------------------------------------------------------#

	# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:

	$num_args = $#ARGV + 1;
	if ($num_args != 2)
	{
	print "Usage of find_gene_pred_errors1.pl\n\n";
	print "perl find_gene_pred_errors1.pl <species> <release>\n";
	print "where <species> is the species of interest (eg. C. elegans),\n";
	print " <release> is the release of TreeFam to use.\n";
	print "For example, >perl find_gene_pred_errors1.pl CAEEL 7\n";
	exit;
	}

	use DBI;

	# FIND THE NAME OF THE SPECIES OF INTEREST:

	$the_species = $ARGV[0];

	# FIND THE RELEASE OF TREEFAM TO USE:

	$release = $ARGV[1];

	#------------------------------------------------------------------#

	$database = "dbi:mysql:treefam_".$release.":db.treefam.org:3308";
	$dbh = DBI->connect("$database", 'anonymous', '') \|\| return;

	# FIND ALL THE IDs FOR GENES FROM SPECIES $the_species:
	($GID,$IDS) = &get_genes_from_species($dbh,$the_species);

	# READ IN THE HMMER MATCHES BETWEEN TRANSCRIPTS AND FAMILIES:
	($BEST) = &read_hmmer_matches_to_families($dbh,$GID);

	# FIND ADJACENT GENES IN SPECIES $the_species, AND CHECK IF THEY HAVE
	# THEIR BEST MATCH TO THE SAME TREEFAM FAMILY:
	&find_adjacent_genes($dbh,$GID,$IDS,$BEST);

	# NOW DISCONNECT FROM THE DATABASE:

	$rc = $dbh->disconnect();
	$rc = "";

	#------------------------------------------------------------------#

	print STDERR "FINISHED.\n";

	#------------------------------------------------------------------#

	# FIND ADJACENT GENES IN THE SPECIES $the_species AND CHECK IF THEY
	# HAVE THEIR BEST MATCH TO THE SAME TREEFAM FAMILY:

	sub find_adjacent_genes
	{
	my $dbh = $_[0];
	my $GID = $_[1];
	my $IDS = $_[2];
	my $BEST = $_[3];
	my $table_w;
	my $st;
	my $sth;
	my $rv;
	my @array;
	my $ID;
	my $start;
	my $chr;
	my %GENES = ();
	my $GID;
	my %SEEN = ();
	%START = ();
	my $genes;
	my @genes;
	my $i;
	my $gene;
	my $prev_gene;
	my $gene_ids;
	my $prev_gene_ids;
	my @gene_ids;
	my @prev_gene_ids;
	my $prev_gene_id;
	my $gene_id;
	my $j;
	my $k;
	my $prev_gene_id_best;
	my $gene_id_best;

	foreach $ID (keys %{$GID})
	{
	$GID = $GID{$ID};
	$start = "none";
	$chr = "none";
	$table_w = 'map';
	$st = "SELECT T_START, TARGET from $table_w WHERE ID=\'$ID\'";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
	if ($rv >= 1)
	{
	while ((@array) = $sth->fetchrow_array)
	{
	$start = $array[0];
	$chr = $array[1];
	}
	}
	if ($start eq 'none' \|\| $chr eq 'none') { print STDERR "ERROR: ID $ID start $start chr $chr\n"; exit;}
	# RECORD THE LOCATION OF GENE $GID:
	if (!($SEEN{$chr."=".$GID}))
	{
	$SEEN{$chr."=".$GID} = 1;
	if (!($GENES{$chr}))
	{
	$GENES{$chr} = $GID;
	}
	else
	{
	$GENES{$chr} = $GENES{$chr}.",".$GID;
	}
	if (!($START{$GID}))
	{
	$START{$GID} = $start;
	}
	else
	{
	if ($start < $START{$GID})
	{
	$START{$GID} = $start;
	}
	}
	}
	}

	# CHECK IF ADJACENT GENES HAVE BEST HMMER MATCHES TO THE SAME TREEFAM FAMILY:
	foreach $chr (keys %GENES)
	{
	$genes = $GENES{$chr};
	@genes = split(/\,/,$genes);
	@genes = sort by_start @genes;
	for ($i = 1; $i <= $#genes; $i++)
	{
	$gene = $genes[$i];
	$prev_gene = $genes[$i-1];
	# CHECK IF $gene AND $prev_gene HAVE TRANSCRIPTS THAT HAVE BEST HMMER MATCHES
	# TO THE SAME TREEFAM FAMILY:
	if (!($IDS{$gene})) { print STDERR "ERROR: do not know transcripts in $gene\n"; exit;}
	if (!($IDS{$prev_gene})) { print STDERR "ERROR: do not know transcripts in $prev_gene\n"; exit;}
	$gene_ids = $IDS->{$gene};
	$prev_gene_ids = $IDS->{$prev_gene};
	@gene_ids = split(/\,/,$gene_ids);
	@prev_gene_ids = split(/\,/,$prev_gene_ids);
	for ($j = 0; $j <= $#gene_ids; $j++)
	{
	$gene_id = $gene_ids[$j];
	if (!($BEST->{$gene_id})) { $gene_id_best = "none"; }
	else { $gene_id_best = $BEST->{$gene_id};}
	for ($k = 0; $k <= $#prev_gene_ids; $k++)
	{
	$prev_gene_id = $prev_gene_ids[$k];
	if (!($BEST->{$prev_gene_id})) { $prev_gene_id_best = "none"; }
	else { $prev_gene_id_best = $BEST->{$prev_gene_id};}
	if ($gene_id_best eq $prev_gene_id_best && $gene_id_best ne 'none')
	{
	print "$gene_id ($gene) and $prev_gene_id ($prev_gene) both have best match to $gene_id_best\n";
	}
	}
	}
	}
	}
	}

	#------------------------------------------------------------------#

	# FIND ALL THE IDs FOR GENES FROM SPECIES $the_species:

	sub get_genes_from_species
	{
	my $dbh = $_[0];
	my $the_species = $_[1];
	my %GID = ();
	my $taxid = "none";
	my $table_w;
	my $st;
	my $sth;
	my $rv;
	my @array;
	my $ID;
	my $GID;
	my %IDS = ();

	# FIRST GET THE NCBI TAXONOMY ID FOR SPECIES $the_species:
	$table_w = 'species';
	$st = "SELECT TAX_ID from $table_w WHERE SWCODE=\'$the_species\'";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
	if ($rv >= 1)
	{
	while ((@array) = $sth->fetchrow_array)
	{
	$taxid = $array[0];
	}
	}
	if ($taxid eq 'none') { print STDERR "ERROR: do not know taxid for $the_species\n"; exit;}

	# NOW GET THE IDs FOR ALL TRANSCRIPTS FROM SPECIES $the_species:
	$table_w = 'genes';
	$st = "SELECT ID, GID from $table_w WHERE TAX_ID=\'$taxid\'";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
	if ($rv >= 1)
	{
	while ((@array) = $sth->fetchrow_array)
	{
	$ID = $array[0];
	$GID = $array[1];
	if ($GID{$ID}) { print STDERR "WARNING: already have GID $GID{$ID} for ID $ID (GID $GID)\n";}
	$GID{$ID} = $GID;
	# RECORD THE IDs FOR TRANSCRIPTS IN THIS GENE:
	if (!($IDS{$GID})) { $IDS{$GID} = $ID;}
	else {$IDS{$GID} = $IDS{$GID}.",".$ID;}
	}
	}

	return(\%GID,\%IDS);
	}

	#------------------------------------------------------------------#

	# READ IN THE HMMER MATCHES BETWEEN TRANSCRIPTS FROM SPECIES $the_species
	# AND FAMILIES, AND FIND THE BEST MATCH FOR EACH TRANSCRIPT:

	sub read_hmmer_matches_to_families
	{
	my $dbh = $_[0];
	my $GID = $_[1];
	my $table_w;
	my $st;
	my $sth;
	my $rv;
	my @array;
	my $ID;
	my %BEST = ();
	my %BEST_EVALUE = ();
	my %BEST_SCORE = ();
	my $best;
	my $best_evalue;
	my $best_score;

	foreach $ID (keys %{$GID})
	{
	$table_w = 'hmmer_matches';
	$st = "SELECT AC, SCORE, EVALUE from $table_w";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
	if ($rv >= 1)
	{
	while ((@array) = $sth->fetchrow_array)
	{
	$AC = $array[0];
	$SCORE = $array[1];
	$EVALUE = $array[2];
	if (!($BEST{$ID}))
	{
	if ($EVALUE == 0) { $EVALUE = "zero";}
	if ($SCORE == 0) { $SCORE = "zero";}
	$BEST{$ID} = $AC;
	$BEST_EVALUE{$ID} = $EVALUE;
	$BEST_SCORE{$ID} = $SCORE;
	}
	else
	{
	$best = $BEST{$ID};
	$best_evalue= $BEST_EVALUE{$ID};
	$best_score = $BEST_SCORE{$ID};
	if ($best_evalue eq 'zero') { $best_evalue = 0;}
	if ($best_score eq 'zero') { $best_score = 0;}
	if ($EVALUE < $best_evalue \|\| ($EVALUE == $best_evalue && $SCORE > $best_score))
	{
	if ($EVALUE == 0) { $EVALUE = "zero";}
	if ($SCORE == 0) { $SCORE = "zero";}
	$BEST{$ID} = $AC;
	$BEST_EVALUE{$ID} = $EVALUE;
	$BEST_SCORE{$ID} = $SCORE;
	}
	}
	}
	}
	}

	return(\%BEST);
	}

	#------------------------------------------------------------------#

	# SUBROUTINE TO SORT GENES BY THEIR START-POINT:

	sub by_start
	{
	if (!($START{$a})) { print STDERR "ERROR: do not know start of gene a $a.\n"; exit;}
	if (!($START{$b})) { print STDERR "ERROR: do not know start of gene b $b.\n"; exit;}
	$START{$a} <=> $START{$b}
	or
	$a cmp $b;
	}

	#------------------------------------------------------------------