Created
March 1, 2013 14:51
-
-
Save avrilcoghlan/5065139 to your computer and use it in GitHub Desktop.
Perl script that finds cases where a transcript listed in the 'genes' table of the TreeFam mysql database lacks any amino acid sequence in the 'aa_seq' table, or lacks a DNA sequence in the 'nt_seq' table.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script treefam_QC8.pl | |
# Written by Avril Coghlan ([email protected]) | |
# 4-Feb-09. | |
# | |
# This perl script finds cases where a transript listed in the 'genes' table | |
# is missing an amino acid sequence in the 'aa_seq' table or missing a | |
# DNA sequence in the 'nt_seq' table. | |
# | |
# The command-line format is: | |
# % perl <treefam_QC8.pl> <release> | |
# where <release> is the release of the TreeFam database to use. | |
# | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 1) | |
{ | |
print "Usage of treefam_QC8.pl\n\n"; | |
print "perl treefam_QC8.pl <release>\n"; | |
print "where <release> is the release of the TreeFam database to use.\n"; | |
print "For example, >perl -w treefam_QC8.pl 7\n"; | |
exit; | |
} | |
#------------------------------------------------------------------# | |
# DECLARE MYSQL USERNAME AND HOST: | |
use DBI; | |
# FIND WHICH RELEASE OF THE TREEFAM DATABASE TO USE: | |
$release = $ARGV[0]; | |
$VERBOSE = 1; | |
#------------------------------------------------------------------# | |
$database = "dbi:mysql:treefam_".$release.":db.treefam.org:3308"; | |
$dbh = DBI->connect("$database", 'anonymous', '') || return; | |
# READ IN ALL THE TRANSCRIPTS THAT HAVE AMINO ACID SEQUENCES IN THE | |
# aa_seq TABLE: | |
$AASEQ = &read_aaseq($dbh); | |
# READ IN ALL THE TRANSCRIPTS THAT HAVE DNA SEQUENCES IN THE | |
# nt_seq TABLE: | |
$DNASEQ = &read_dnaseq($dbh); | |
# CHECK THAT ALL OF THE GENES IN THE genes TABLE HAVE DNA AND AMINO | |
# ACID SEQUENCES: | |
&read_genes($dbh,$AASEQ,$DNASEQ); | |
# NOW DISCONNECT FROM THE DATABASE: | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# READ IN ALL THE TRANSCRIPTS THAT HAVE DNA SEQUENCES IN THE | |
# nt_seq TABLE: | |
sub read_dnaseq | |
{ | |
my $dbh = $_[0]; | |
my $table_w; | |
my $st; | |
my $sth; | |
my $rv; | |
my @array; | |
my $ID; | |
my %DNASEQ = (); | |
$table_w = 'nt_seq'; | |
$st = "SELECT ID from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$ID = $array[0]; | |
$DNASEQ{$ID} = 1; | |
} | |
} | |
print STDERR "Read in transcripts with DNA sequences\n"; | |
print "Read in transcripts with DNA sequences\n"; | |
return(\%DNASEQ); | |
} | |
#------------------------------------------------------------------# | |
# READ IN ALL THE TRANSCRIPTS THAT HAVE AMINO ACID SEQUENCES IN THE | |
# aa_seq TABLE: | |
sub read_aaseq | |
{ | |
my $dbh = $_[0]; | |
my $table_w; | |
my $st; | |
my $sth; | |
my $rv; | |
my @array; | |
my $ID; | |
my %AASEQ = (); | |
$table_w = 'aa_seq'; | |
$st = "SELECT ID from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$ID = $array[0]; | |
$AASEQ{$ID} = 1; | |
} | |
} | |
print STDERR "Read in transcripts with amino acid sequences\n"; | |
print "Read in transcripts with amino acid sequences\n"; | |
return(\%AASEQ); | |
} | |
#------------------------------------------------------------------# | |
# CHECK THAT ALL OF THE TRANSCRIPTS IN THE genes TABLE HAVE DNA AND | |
# AMINO ACID SEQUENCES: | |
sub read_genes | |
{ | |
my $dbh = $_[0]; | |
my $AASEQ = $_[1]; | |
my $DNASEQ = $_[2]; | |
my $table_w; | |
my $st; | |
my $sth; | |
my $rv; | |
my @array; | |
my $ID; | |
$table_w = 'genes'; | |
$st = "SELECT ID from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$ID = $array[0]; | |
if (!($AASEQ->{$ID})) | |
{ | |
print "WARNING: ID $ID appears in the genes table, but has no amino acid sequence in the aa_seq table\n"; | |
} | |
if (!($DNASEQ->{$ID})) | |
{ | |
print "WARNING: ID $ID appears in the genes table, but has no DNA sequence in the nt_seq table\n"; | |
} | |
} | |
} | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment