avrilcoghlan · March 1, 2013 13:52
diff --git a/find_closest_worm_paralogs4.pl b/find_closest_worm_paralogs4.pl
 #!/usr/local/bin/perl 
 #
 # 
 # Perl script find_closest_worm_paralogs4.pl
 # Written by Avril Coghlan ([email protected])
 # 16-Feb-09.   
 #
 # This perl script finds the bootstrap value for the
 # clade containing a pair of C. elegans paralogs in a family (where
 # there is no other gene in that clade, ie. they are each other's
 # closest relatives in the family).

 # The command-line format is:
 # % perl find_closest_worm_paralogs4.pl <paralogs>
 #   where <paralogs> is the file of C. elegans paralog pairs.
 #
 #------------------------------------------------------------------#

 # CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:

 $num_args                  = $#ARGV + 1;
 if ($num_args != 1)
 {
   print "Usage of find_closest_worm_paralogs4.pl\n\n";
   print "perl find_closest_worm_paralogs4.pl <paralogs>\n";
   print "where <paralogs> is the file of C. elegans paralog pairs.\n";
   print "For example, >perl find_closest_worm_paralogs4.pl Ce_closest_paralogs5b\n";
   exit;
 }

 use lib "/home/bcri/acoghlan/x86_64-linux-thread-multi";
 use Treefam::DBConnection; #xxx make sure it points to the right treefam release
 use DBI;

 # FIND THE INPUT FILE WITH THE C. ELEGANS PARALOG PAIRS:

 $input                     = $ARGV[0];

 #------------------------------------------------------------------#

 # FIRST RECORD THE TREEFAM GENE IDS (GIDs) FOR TRANSCRIPT IDS, FOR
 # C. ELEGANS GENES:

 $release                   = 6; #xxx
 $database                  = "dbi:mysql:treefam_".$release.":db.treefam.org:3308";
 $dbh                       = DBI->connect("$database", 'anonymous', '') || return;
 $GENE                      = &read_proteins_for_genes($dbh);

 #------------------------------------------------------------------#

 # CONNECT TO THE TREEFAM DATABASE:

 $dbc                       = Treefam::DBConnection->new(); 

 # READ IN THE INPUT FILE:

 open(INPUT,"$input") || die "ERROR: cannot open $input\n";
 while(<INPUT>)
 {
   $line                   = $_;
   chomp $line;
   @temp                   = split(/\s+/,$line);
   $treefam_family         = $temp[0];
   $ce1                    = $temp[1];
   $ce2                    = $temp[2];
   # GET THE GENE IDs FOR THESE C. ELEGANS GENES:
   if (!($GENE->{$ce1})) { print STDERR "ERROR: do not know gene id for $ce1\n"; exit;}
   if (!($GENE->{$ce2})) { print STDERR "ERROR: do not know gene id for $ce2\n"; exit;}
   $ce1_gene               = $GENE->{$ce1};
   $ce2_gene               = $GENE->{$ce2};
   print STDERR "Looking at family $treefam_family... \n";
   $famh                   = $dbc->get_FamilyHandle();
   if (!(defined($famh->get_by_id($treefam_family))))
   {
      print "WARNING: there is no family $treefam_family...\n";
      goto NEXT_FAMILY;
   }
   $family                 = $famh->get_by_id($treefam_family);
   if (!(defined($family->ID())))
   {
      print "WARNING: do not have ID stored for family $treefam_family...\n";
      goto NEXT_FAMILY;
   }
   $AC                     = $family->ID();         # GET THE FAMILY ID.
   if ($AC ne $treefam_family) { print STDERR "ERROR: AC $AC treefam_family $treefam_family\n"; exit;}

   # GET THE TREE FOR THE FAMILY:
   if (!(defined($family->get_tree('clean'))))
   {
      print "WARNING: AC $AC: there is no tree!\n";
      goto NEXT_FAMILY;
   }
   $tree                   = $family->get_tree('clean'); # GET THE TREEFAM CLEAN TREE. 
   
   # GET ALL THE TRANSCRIPTS IN THIS FAMILY FROM SPECIES $the_species:
   @leaves                 = $tree->get_leaves;
   $leaf1                  = "none";
   $leaf2                  = "none";
   $num_ce_genes           = 0;
   foreach $leaf(@leaves)                 
   {
      # FIND THE GENE FOR THIS TRANSCRIPT:
      $gh                  = $leaf->gene;
      $gene                = $gh->ID(); # eg. ENSMUSG00000022770 OR WBGene00020391
      if    ($gene eq $ce1_gene) { $leaf1 = $leaf;}
      elsif ($gene eq $ce2_gene) { $leaf2 = $leaf;}
      # CHECK IF THIS IS A CE. ELEGANS GENE:
      if (defined($leaf->get_tag_value('S'))) 
      {
         $species          = $leaf->get_tag_value('S'); 
   if ($species eq 'CAEEL') { $num_ce_genes++;}
      }
      else
      {
         print STDERR "ERROR: AC $AC: do not know species of node $leaf\n";
 	 exit;
      }
   }
   if ($leaf1 eq 'none') { print STDERR "ERROR: did not find $ce1_gene ($ce1) in $AC tree\n"; exit;}
   if ($leaf2 eq 'none') { print STDERR "ERROR: did not find $ce2_gene ($ce2) in $AC tree\n"; exit;}
   # FIND THE TREE BOOTSTRAP VALUE FOR THE CLADE CONTAINING $ce1 AND $ce2: 
   if (defined($tree->get_last_common_ancestor($leaf1,$leaf2)))
   {
      $lca                 = $tree->get_last_common_ancestor($leaf1,$leaf2);
      if (defined($lca->get_tag_value('B')))
      {
 	 $bootstrap        = $lca->get_tag_value('B');
 	 print "$AC $ce1 $ce1_gene and $ce2 $ce2_gene : bootstrap $bootstrap, $num_ce_genes Ce genes in tree\n";
      }
      else
      {
 	 print "WARNING: bootstrap value not defined for $ce1, $ce2 - $num_ce_genes Ce genes in tree\n";
 	 goto NEXT_FAMILY;
      }
   }
   else
   {
      print STDERR "ERROR: lca of $ce1 and $ce2 is not defined\n";
      exit;
   }

   NEXT_FAMILY:
 }
 close(INPUT);

 print STDERR "FINISHED\n";

 #------------------------------------------------------------------#
 # READ IN THE WORM PROTEINS CORRESPONDING TO EACH C. ELEGANS GENE:

 sub read_proteins_for_genes
 {
   my $dbh                 = $_[0];
   my $table_w;
   my $st;
   my $sth;
   my $rv;
   my @array;
   my $GID;  
   my $ID; 
   my %GENE                = ();

   $table_w                = 'genes';
   $st                     = "SELECT GID, ID from $table_w WHERE TAX_ID=\'6239\'"; 
   $sth                    = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
   $rv                     = $sth->execute or die "Cannot execute the query: $sth->errstr";
   if ($rv >= 1)
   {
      while ((@array) = $sth->fetchrow_array)    
      {
         $GID              = $array[0];  
 	 $ID               = $array[1];
         
         if ($GENE{$ID})
 	 {
 	    print STDERR "WARNING: have multiple gids for $ID\n";
 	    $GENE{$ID}     = $GID; 
 	 } 
 	 else
 	 {
 	    $GENE{$ID}     = $GID;
 	 }
      }
   }

   print STDERR "Read in gene id for each protein id\n";
   return(\%GENE);
 }

 #------------------------------------------------------------------#
	#!/usr/local/bin/perl
	#
	#
	# Perl script find_closest_worm_paralogs4.pl
	# Written by Avril Coghlan ([email protected])
	# 16-Feb-09.
	#
	# This perl script finds the bootstrap value for the
	# clade containing a pair of C. elegans paralogs in a family (where
	# there is no other gene in that clade, ie. they are each other's
	# closest relatives in the family).

	# The command-line format is:
	# % perl find_closest_worm_paralogs4.pl <paralogs>
	# where <paralogs> is the file of C. elegans paralog pairs.
	#
	#------------------------------------------------------------------#

	# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:

	$num_args = $#ARGV + 1;
	if ($num_args != 1)
	{
	print "Usage of find_closest_worm_paralogs4.pl\n\n";
	print "perl find_closest_worm_paralogs4.pl <paralogs>\n";
	print "where <paralogs> is the file of C. elegans paralog pairs.\n";
	print "For example, >perl find_closest_worm_paralogs4.pl Ce_closest_paralogs5b\n";
	exit;
	}

	use lib "/home/bcri/acoghlan/x86_64-linux-thread-multi";
	use Treefam::DBConnection; #xxx make sure it points to the right treefam release
	use DBI;

	# FIND THE INPUT FILE WITH THE C. ELEGANS PARALOG PAIRS:

	$input = $ARGV[0];

	#------------------------------------------------------------------#

	# FIRST RECORD THE TREEFAM GENE IDS (GIDs) FOR TRANSCRIPT IDS, FOR
	# C. ELEGANS GENES:

	$release = 6; #xxx
	$database = "dbi:mysql:treefam_".$release.":db.treefam.org:3308";
	$dbh = DBI->connect("$database", 'anonymous', '') \|\| return;
	$GENE = &read_proteins_for_genes($dbh);

	#------------------------------------------------------------------#

	# CONNECT TO THE TREEFAM DATABASE:

	$dbc = Treefam::DBConnection->new();

	# READ IN THE INPUT FILE:

	open(INPUT,"$input") \|\| die "ERROR: cannot open $input\n";
	while(<INPUT>)
	{
	$line = $_;
	chomp $line;
	@temp = split(/\s+/,$line);
	$treefam_family = $temp[0];
	$ce1 = $temp[1];
	$ce2 = $temp[2];
	# GET THE GENE IDs FOR THESE C. ELEGANS GENES:
	if (!($GENE->{$ce1})) { print STDERR "ERROR: do not know gene id for $ce1\n"; exit;}
	if (!($GENE->{$ce2})) { print STDERR "ERROR: do not know gene id for $ce2\n"; exit;}
	$ce1_gene = $GENE->{$ce1};
	$ce2_gene = $GENE->{$ce2};
	print STDERR "Looking at family $treefam_family... \n";
	$famh = $dbc->get_FamilyHandle();
	if (!(defined($famh->get_by_id($treefam_family))))
	{
	print "WARNING: there is no family $treefam_family...\n";
	goto NEXT_FAMILY;
	}
	$family = $famh->get_by_id($treefam_family);
	if (!(defined($family->ID())))
	{
	print "WARNING: do not have ID stored for family $treefam_family...\n";
	goto NEXT_FAMILY;
	}
	$AC = $family->ID(); # GET THE FAMILY ID.
	if ($AC ne $treefam_family) { print STDERR "ERROR: AC $AC treefam_family $treefam_family\n"; exit;}

	# GET THE TREE FOR THE FAMILY:
	if (!(defined($family->get_tree('clean'))))
	{
	print "WARNING: AC $AC: there is no tree!\n";
	goto NEXT_FAMILY;
	}
	$tree = $family->get_tree('clean'); # GET THE TREEFAM CLEAN TREE.

	# GET ALL THE TRANSCRIPTS IN THIS FAMILY FROM SPECIES $the_species:
	@leaves = $tree->get_leaves;
	$leaf1 = "none";
	$leaf2 = "none";
	$num_ce_genes = 0;
	foreach $leaf(@leaves)
	{
	# FIND THE GENE FOR THIS TRANSCRIPT:
	$gh = $leaf->gene;
	$gene = $gh->ID(); # eg. ENSMUSG00000022770 OR WBGene00020391
	if ($gene eq $ce1_gene) { $leaf1 = $leaf;}
	elsif ($gene eq $ce2_gene) { $leaf2 = $leaf;}
	# CHECK IF THIS IS A CE. ELEGANS GENE:
	if (defined($leaf->get_tag_value('S')))
	{
	$species = $leaf->get_tag_value('S');
	if ($species eq 'CAEEL') { $num_ce_genes++;}
	}
	else
	{
	print STDERR "ERROR: AC $AC: do not know species of node $leaf\n";
	exit;
	}
	}
	if ($leaf1 eq 'none') { print STDERR "ERROR: did not find $ce1_gene ($ce1) in $AC tree\n"; exit;}
	if ($leaf2 eq 'none') { print STDERR "ERROR: did not find $ce2_gene ($ce2) in $AC tree\n"; exit;}
	# FIND THE TREE BOOTSTRAP VALUE FOR THE CLADE CONTAINING $ce1 AND $ce2:
	if (defined($tree->get_last_common_ancestor($leaf1,$leaf2)))
	{
	$lca = $tree->get_last_common_ancestor($leaf1,$leaf2);
	if (defined($lca->get_tag_value('B')))
	{
	$bootstrap = $lca->get_tag_value('B');
	print "$AC $ce1 $ce1_gene and $ce2 $ce2_gene : bootstrap $bootstrap, $num_ce_genes Ce genes in tree\n";
	}
	else
	{
	print "WARNING: bootstrap value not defined for $ce1, $ce2 - $num_ce_genes Ce genes in tree\n";
	goto NEXT_FAMILY;
	}
	}
	else
	{
	print STDERR "ERROR: lca of $ce1 and $ce2 is not defined\n";
	exit;
	}

	NEXT_FAMILY:
	}
	close(INPUT);

	print STDERR "FINISHED\n";

	#------------------------------------------------------------------#
	# READ IN THE WORM PROTEINS CORRESPONDING TO EACH C. ELEGANS GENE:

	sub read_proteins_for_genes
	{
	my $dbh = $_[0];
	my $table_w;
	my $st;
	my $sth;
	my $rv;
	my @array;
	my $GID;
	my $ID;
	my %GENE = ();

	$table_w = 'genes';
	$st = "SELECT GID, ID from $table_w WHERE TAX_ID=\'6239\'";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
	if ($rv >= 1)
	{
	while ((@array) = $sth->fetchrow_array)
	{
	$GID = $array[0];
	$ID = $array[1];

	if ($GENE{$ID})
	{
	print STDERR "WARNING: have multiple gids for $ID\n";
	$GENE{$ID} = $GID;
	}
	else
	{
	$GENE{$ID} = $GID;
	}
	}
	}

	print STDERR "Read in gene id for each protein id\n";
	return(\%GENE);
	}

	#------------------------------------------------------------------#