Created
March 1, 2013 14:47
-
-
Save avrilcoghlan/5065111 to your computer and use it in GitHub Desktop.
Perl script that finds cases where more than one alternative splice form from the same gene was added to a family
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script treefam_QC6.pl | |
# Written by Avril Coghlan ([email protected]) | |
# 4-Feb-09. | |
# | |
# This perl script finds cases where more than one alternative splice form | |
# from the same gene was added to a family. | |
# | |
# The command-line format is: | |
# % perl <treefam_QC6.pl> <release> | |
# where <release> is the release of the TreeFam database to use. | |
# | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 1) | |
{ | |
print "Usage of treefam_QC6.pl\n\n"; | |
print "perl treefam_QC6.pl <release>\n"; | |
print "where <release> is the release of the TreeFam database to use.\n"; | |
print "For example, >perl -w treefam_QC6.pl 7\n"; | |
exit; | |
} | |
#------------------------------------------------------------------# | |
# DECLARE MYSQL USERNAME AND HOST: | |
use DBI; | |
# FIND WHICH RELEASE OF THE TREEFAM DATABASE TO USE: | |
$release = $ARGV[0]; | |
$VERBOSE = 1; | |
#------------------------------------------------------------------# | |
$database = "dbi:mysql:treefam_".$release.":db.treefam.org:3308"; | |
$dbh = DBI->connect("$database", 'anonymous', '') || return; | |
# READ IN THE GENE ID FOR EVERY TRANSCRIPT ID: | |
$GID = &read_geneids($dbh); | |
# CHECK IF THERE IS MORE THAN ONE TRANSCRIPT FROM ANY GENE IN A FAMILY: | |
&read_genes_in_families($dbh,$GID); | |
# NOW DISCONNECT FROM THE DATABASE: | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# READ THE GENE ID FOR EVERY TRANSCRIPT ID: | |
sub read_geneids | |
{ | |
my $dbh = $_[0]; | |
my $table_w; | |
my $st; | |
my $sth; | |
my $rv; | |
my @array; | |
my $ID; | |
my $GID; | |
my %GID = (); | |
$table_w = 'genes'; | |
$st = "SELECT ID, GID from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$ID = $array[0]; | |
$GID = $array[1]; | |
if (!($GID{$ID})) | |
{ | |
$GID{$ID} = $GID; | |
} | |
else | |
{ | |
if ($GID{$ID} ne $GID) { print STDERR "ERROR: id $ID GID $GID GID(ID) $GID{$ID}\n"; exit;} | |
} | |
} | |
} | |
print STDERR "Read in gene ids\n"; | |
return(\%GID); | |
} | |
#------------------------------------------------------------------# | |
# CHECK IF MORE THAN ONE TRANSCRIPT FROM A GENE WAS PUT INTO A FAMILY: | |
sub read_genes_in_families | |
{ | |
my $dbh = $_[0]; | |
my $GID = $_[1]; | |
my $table_w; | |
my $st; | |
my $sth; | |
my $rv; | |
my @array; | |
my $FLAG; | |
my %FAMILY = (); | |
my $geneid; | |
my $ids; | |
$table_w = 'fam_genes'; | |
$st = "SELECT AC, ID, FLAG from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$AC = $array[0]; | |
$ID = $array[1]; | |
$FLAG = $array[2]; | |
# GET THE GENE ID FOR $ID: | |
if (!($GID->{$ID})) | |
{ | |
# THERE ARE SOME IDs IN THE fam_genes TABLE THAT DO NOT APPEAR IN THE genes | |
# TABLE BECAUSE THEY BELONGED TO AN OLD TREEFAM-A SEED FAMILY AND ARE NOT IN | |
# THE LATEST GENE SET FOR THAT SPECIES LOADED INTO THE genes TABLE. | |
print STDERR "WARNING: do not know gene id for $ID (AC $AC FLAG $FLAG)\n"; | |
} | |
else | |
{ | |
$geneid = $GID->{$ID}; | |
if (!($FAMILY{$AC."=".$FLAG."=".$geneid})) | |
{ | |
$FAMILY{$AC."=".$FLAG."=".$geneid} = $ID; | |
} | |
else | |
{ | |
$FAMILY{$AC."=".$FLAG."=".$geneid} = $FAMILY{$AC."=".$FLAG."=".$geneid}.",".$ID; | |
$ids = $FAMILY{$AC."=".$FLAG."=".$geneid}; | |
print "WARNING: gene $geneid has transcripts $ids in family $AC ($FLAG)\n"; | |
} | |
} | |
} | |
} | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment