avrilcoghlan · March 1, 2013 14:51
diff --git a/treefam_QC8.pl b/treefam_QC8.pl
 #!/usr/local/bin/perl

 # 
 # Perl script treefam_QC8.pl
 # Written by Avril Coghlan ([email protected])
 # 4-Feb-09.
 #
 # This perl script finds cases where a transript listed in the 'genes' table
 # is missing an amino acid sequence in the 'aa_seq' table or missing a
 # DNA sequence in the 'nt_seq' table.
 #
 # The command-line format is:
 # % perl <treefam_QC8.pl> <release> 
 #   where <release> is the release of the TreeFam database to use.
 #
 #
 #------------------------------------------------------------------#

 # CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:

 $num_args                  = $#ARGV + 1;
 if ($num_args != 1)
 {
   print "Usage of treefam_QC8.pl\n\n";
   print "perl treefam_QC8.pl <release>\n";
   print "where <release> is the release of the TreeFam database to use.\n";
   print "For example, >perl -w treefam_QC8.pl 7\n";
   exit;
 }

 #------------------------------------------------------------------#

 # DECLARE MYSQL USERNAME AND HOST:
 use DBI;

 # FIND WHICH RELEASE OF THE TREEFAM DATABASE TO USE:

 $release                   = $ARGV[0];

 $VERBOSE                   = 1;

 #------------------------------------------------------------------#

 $database                  = "dbi:mysql:treefam_".$release.":db.treefam.org:3308";
 $dbh                       = DBI->connect("$database", 'anonymous', '') || return;

 # READ IN ALL THE TRANSCRIPTS THAT HAVE AMINO ACID SEQUENCES IN THE
 # aa_seq TABLE:

 $AASEQ                     = &read_aaseq($dbh);

 # READ IN ALL THE TRANSCRIPTS THAT HAVE DNA SEQUENCES IN THE
 # nt_seq TABLE:

 $DNASEQ                    = &read_dnaseq($dbh);

 # CHECK THAT ALL OF THE GENES IN THE genes TABLE HAVE DNA AND AMINO
 # ACID SEQUENCES:

 &read_genes($dbh,$AASEQ,$DNASEQ);

 # NOW DISCONNECT FROM THE DATABASE:

 $rc                        = $dbh->disconnect();
 $rc                        = "";

 #------------------------------------------------------------------#

 print STDERR "FINISHED.\n";

 #------------------------------------------------------------------#

 # READ IN ALL THE TRANSCRIPTS THAT HAVE DNA SEQUENCES IN THE
 # nt_seq TABLE:

 sub read_dnaseq
 {
   my $dbh                 = $_[0];
   my $table_w;
   my $st;
   my $sth;
   my $rv;
   my @array;
   my $ID;
   my %DNASEQ              = ();

   $table_w                = 'nt_seq';
   $st                     = "SELECT ID from $table_w";
   $sth                    = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
   $rv                     = $sth->execute or die "Cannot execute the query: $sth->errstr";
   if ($rv >= 1)
   {
      while ((@array) = $sth->fetchrow_array)
      {
         $ID               = $array[0];
         $DNASEQ{$ID}      = 1;
      }
   }

   print STDERR "Read in transcripts with DNA sequences\n";
   print "Read in transcripts with DNA sequences\n";

   return(\%DNASEQ);
 }

 #------------------------------------------------------------------#

 # READ IN ALL THE TRANSCRIPTS THAT HAVE AMINO ACID SEQUENCES IN THE
 # aa_seq TABLE:

 sub read_aaseq
 {
   my $dbh                 = $_[0];
   my $table_w;
   my $st;
   my $sth;
   my $rv;
   my @array;
   my $ID;
   my %AASEQ               = ();

   $table_w                = 'aa_seq';
   $st                     = "SELECT ID from $table_w";
   $sth                    = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
   $rv                     = $sth->execute or die "Cannot execute the query: $sth->errstr";
   if ($rv >= 1)
   {
      while ((@array) = $sth->fetchrow_array)
      {
         $ID               = $array[0];
         $AASEQ{$ID}       = 1;
      }
   }

   print STDERR "Read in transcripts with amino acid sequences\n";
   print "Read in transcripts with amino acid sequences\n";

   return(\%AASEQ);
 }

 #------------------------------------------------------------------#

 # CHECK THAT ALL OF THE TRANSCRIPTS IN THE genes TABLE HAVE DNA AND
 # AMINO ACID SEQUENCES:

 sub read_genes
 {
   my $dbh                 = $_[0];
   my $AASEQ               = $_[1];
   my $DNASEQ              = $_[2];
   my $table_w;
   my $st;
   my $sth;
   my $rv;
   my @array;
   my $ID;

   $table_w                = 'genes';
   $st                     = "SELECT ID from $table_w";
   $sth                    = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
   $rv                     = $sth->execute or die "Cannot execute the query: $sth->errstr";
   if ($rv >= 1)
   {
      while ((@array) = $sth->fetchrow_array)
      {
         $ID               = $array[0];
         if (!($AASEQ->{$ID}))
         {
            print "WARNING: ID $ID appears in the genes table, but has no amino acid sequence in the aa_seq table\n";
         }
         if (!($DNASEQ->{$ID}))
         {
            print "WARNING: ID $ID appears in the genes table, but has no DNA sequence in the nt_seq table\n";
         }
      }
   }
 }

 #------------------------------------------------------------------#
	#!/usr/local/bin/perl

	#
	# Perl script treefam_QC8.pl
	# Written by Avril Coghlan ([email protected])
	# 4-Feb-09.
	#
	# This perl script finds cases where a transript listed in the 'genes' table
	# is missing an amino acid sequence in the 'aa_seq' table or missing a
	# DNA sequence in the 'nt_seq' table.
	#
	# The command-line format is:
	# % perl <treefam_QC8.pl> <release>
	# where <release> is the release of the TreeFam database to use.
	#
	#
	#------------------------------------------------------------------#

	# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS:

	$num_args = $#ARGV + 1;
	if ($num_args != 1)
	{
	print "Usage of treefam_QC8.pl\n\n";
	print "perl treefam_QC8.pl <release>\n";
	print "where <release> is the release of the TreeFam database to use.\n";
	print "For example, >perl -w treefam_QC8.pl 7\n";
	exit;
	}

	#------------------------------------------------------------------#

	# DECLARE MYSQL USERNAME AND HOST:
	use DBI;

	# FIND WHICH RELEASE OF THE TREEFAM DATABASE TO USE:

	$release = $ARGV[0];

	$VERBOSE = 1;

	#------------------------------------------------------------------#

	$database = "dbi:mysql:treefam_".$release.":db.treefam.org:3308";
	$dbh = DBI->connect("$database", 'anonymous', '') \|\| return;

	# READ IN ALL THE TRANSCRIPTS THAT HAVE AMINO ACID SEQUENCES IN THE
	# aa_seq TABLE:

	$AASEQ = &read_aaseq($dbh);

	# READ IN ALL THE TRANSCRIPTS THAT HAVE DNA SEQUENCES IN THE
	# nt_seq TABLE:

	$DNASEQ = &read_dnaseq($dbh);

	# CHECK THAT ALL OF THE GENES IN THE genes TABLE HAVE DNA AND AMINO
	# ACID SEQUENCES:

	&read_genes($dbh,$AASEQ,$DNASEQ);

	# NOW DISCONNECT FROM THE DATABASE:

	$rc = $dbh->disconnect();
	$rc = "";

	#------------------------------------------------------------------#

	print STDERR "FINISHED.\n";

	#------------------------------------------------------------------#

	# READ IN ALL THE TRANSCRIPTS THAT HAVE DNA SEQUENCES IN THE
	# nt_seq TABLE:

	sub read_dnaseq
	{
	my $dbh = $_[0];
	my $table_w;
	my $st;
	my $sth;
	my $rv;
	my @array;
	my $ID;
	my %DNASEQ = ();

	$table_w = 'nt_seq';
	$st = "SELECT ID from $table_w";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
	if ($rv >= 1)
	{
	while ((@array) = $sth->fetchrow_array)
	{
	$ID = $array[0];
	$DNASEQ{$ID} = 1;
	}
	}

	print STDERR "Read in transcripts with DNA sequences\n";
	print "Read in transcripts with DNA sequences\n";

	return(\%DNASEQ);
	}

	#------------------------------------------------------------------#

	# READ IN ALL THE TRANSCRIPTS THAT HAVE AMINO ACID SEQUENCES IN THE
	# aa_seq TABLE:

	sub read_aaseq
	{
	my $dbh = $_[0];
	my $table_w;
	my $st;
	my $sth;
	my $rv;
	my @array;
	my $ID;
	my %AASEQ = ();

	$table_w = 'aa_seq';
	$st = "SELECT ID from $table_w";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
	if ($rv >= 1)
	{
	while ((@array) = $sth->fetchrow_array)
	{
	$ID = $array[0];
	$AASEQ{$ID} = 1;
	}
	}

	print STDERR "Read in transcripts with amino acid sequences\n";
	print "Read in transcripts with amino acid sequences\n";

	return(\%AASEQ);
	}

	#------------------------------------------------------------------#

	# CHECK THAT ALL OF THE TRANSCRIPTS IN THE genes TABLE HAVE DNA AND
	# AMINO ACID SEQUENCES:

	sub read_genes
	{
	my $dbh = $_[0];
	my $AASEQ = $_[1];
	my $DNASEQ = $_[2];
	my $table_w;
	my $st;
	my $sth;
	my $rv;
	my @array;
	my $ID;

	$table_w = 'genes';
	$st = "SELECT ID from $table_w";
	$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n";
	$rv = $sth->execute or die "Cannot execute the query: $sth->errstr";
	if ($rv >= 1)
	{
	while ((@array) = $sth->fetchrow_array)
	{
	$ID = $array[0];
	if (!($AASEQ->{$ID}))
	{
	print "WARNING: ID $ID appears in the genes table, but has no amino acid sequence in the aa_seq table\n";
	}
	if (!($DNASEQ->{$ID}))
	{
	print "WARNING: ID $ID appears in the genes table, but has no DNA sequence in the nt_seq table\n";
	}
	}
	}
	}

	#------------------------------------------------------------------#