Created
March 1, 2013 13:56
-
-
Save avrilcoghlan/5064814 to your computer and use it in GitHub Desktop.
Perl script that uses TreeFam to find cases where two adjacent genes in a species (eg. Caenorhabditis elegans) should probably be merged, as one of them has its best match to the first part of a TreeFam family alignment, and the second has its best match to the second part of the same TreeFam family's alignment.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script find_gene_pred_errors1.pl | |
# Written by Avril Coghlan ([email protected]) | |
# 25-Feb-09. | |
# | |
# For the TreeFam project. | |
# | |
# This perl script finds cases where two adjacent genes in a species | |
# (eg. C. elegans) should probably be merged, as one of them has its | |
# best match to the first part of a TreeFam family alignment and the | |
# second has its best match to the second part of the same TreeFam | |
# family's alignment. | |
# | |
# The command-line format is: | |
# % perl <find_gene_pred_errors1.pl> species release | |
# where species is the species of interest, | |
# release is the release of TreeFam to use. | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 2) | |
{ | |
print "Usage of find_gene_pred_errors1.pl\n\n"; | |
print "perl find_gene_pred_errors1.pl <species> <release>\n"; | |
print "where <species> is the species of interest (eg. C. elegans),\n"; | |
print " <release> is the release of TreeFam to use.\n"; | |
print "For example, >perl find_gene_pred_errors1.pl CAEEL 7\n"; | |
exit; | |
} | |
use DBI; | |
# FIND THE NAME OF THE SPECIES OF INTEREST: | |
$the_species = $ARGV[0]; | |
# FIND THE RELEASE OF TREEFAM TO USE: | |
$release = $ARGV[1]; | |
#------------------------------------------------------------------# | |
$database = "dbi:mysql:treefam_".$release.":db.treefam.org:3308"; | |
$dbh = DBI->connect("$database", 'anonymous', '') || return; | |
# FIND ALL THE IDs FOR GENES FROM SPECIES $the_species: | |
($GID,$IDS) = &get_genes_from_species($dbh,$the_species); | |
# READ IN THE HMMER MATCHES BETWEEN TRANSCRIPTS AND FAMILIES: | |
($BEST) = &read_hmmer_matches_to_families($dbh,$GID); | |
# FIND ADJACENT GENES IN SPECIES $the_species, AND CHECK IF THEY HAVE | |
# THEIR BEST MATCH TO THE SAME TREEFAM FAMILY: | |
&find_adjacent_genes($dbh,$GID,$IDS,$BEST); | |
# NOW DISCONNECT FROM THE DATABASE: | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# FIND ADJACENT GENES IN THE SPECIES $the_species AND CHECK IF THEY | |
# HAVE THEIR BEST MATCH TO THE SAME TREEFAM FAMILY: | |
sub find_adjacent_genes | |
{ | |
my $dbh = $_[0]; | |
my $GID = $_[1]; | |
my $IDS = $_[2]; | |
my $BEST = $_[3]; | |
my $table_w; | |
my $st; | |
my $sth; | |
my $rv; | |
my @array; | |
my $ID; | |
my $start; | |
my $chr; | |
my %GENES = (); | |
my $GID; | |
my %SEEN = (); | |
%START = (); | |
my $genes; | |
my @genes; | |
my $i; | |
my $gene; | |
my $prev_gene; | |
my $gene_ids; | |
my $prev_gene_ids; | |
my @gene_ids; | |
my @prev_gene_ids; | |
my $prev_gene_id; | |
my $gene_id; | |
my $j; | |
my $k; | |
my $prev_gene_id_best; | |
my $gene_id_best; | |
foreach $ID (keys %{$GID}) | |
{ | |
$GID = $GID{$ID}; | |
$start = "none"; | |
$chr = "none"; | |
$table_w = 'map'; | |
$st = "SELECT T_START, TARGET from $table_w WHERE ID=\'$ID\'"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$start = $array[0]; | |
$chr = $array[1]; | |
} | |
} | |
if ($start eq 'none' || $chr eq 'none') { print STDERR "ERROR: ID $ID start $start chr $chr\n"; exit;} | |
# RECORD THE LOCATION OF GENE $GID: | |
if (!($SEEN{$chr."=".$GID})) | |
{ | |
$SEEN{$chr."=".$GID} = 1; | |
if (!($GENES{$chr})) | |
{ | |
$GENES{$chr} = $GID; | |
} | |
else | |
{ | |
$GENES{$chr} = $GENES{$chr}.",".$GID; | |
} | |
if (!($START{$GID})) | |
{ | |
$START{$GID} = $start; | |
} | |
else | |
{ | |
if ($start < $START{$GID}) | |
{ | |
$START{$GID} = $start; | |
} | |
} | |
} | |
} | |
# CHECK IF ADJACENT GENES HAVE BEST HMMER MATCHES TO THE SAME TREEFAM FAMILY: | |
foreach $chr (keys %GENES) | |
{ | |
$genes = $GENES{$chr}; | |
@genes = split(/\,/,$genes); | |
@genes = sort by_start @genes; | |
for ($i = 1; $i <= $#genes; $i++) | |
{ | |
$gene = $genes[$i]; | |
$prev_gene = $genes[$i-1]; | |
# CHECK IF $gene AND $prev_gene HAVE TRANSCRIPTS THAT HAVE BEST HMMER MATCHES | |
# TO THE SAME TREEFAM FAMILY: | |
if (!($IDS{$gene})) { print STDERR "ERROR: do not know transcripts in $gene\n"; exit;} | |
if (!($IDS{$prev_gene})) { print STDERR "ERROR: do not know transcripts in $prev_gene\n"; exit;} | |
$gene_ids = $IDS->{$gene}; | |
$prev_gene_ids = $IDS->{$prev_gene}; | |
@gene_ids = split(/\,/,$gene_ids); | |
@prev_gene_ids = split(/\,/,$prev_gene_ids); | |
for ($j = 0; $j <= $#gene_ids; $j++) | |
{ | |
$gene_id = $gene_ids[$j]; | |
if (!($BEST->{$gene_id})) { $gene_id_best = "none"; } | |
else { $gene_id_best = $BEST->{$gene_id};} | |
for ($k = 0; $k <= $#prev_gene_ids; $k++) | |
{ | |
$prev_gene_id = $prev_gene_ids[$k]; | |
if (!($BEST->{$prev_gene_id})) { $prev_gene_id_best = "none"; } | |
else { $prev_gene_id_best = $BEST->{$prev_gene_id};} | |
if ($gene_id_best eq $prev_gene_id_best && $gene_id_best ne 'none') | |
{ | |
print "$gene_id ($gene) and $prev_gene_id ($prev_gene) both have best match to $gene_id_best\n"; | |
} | |
} | |
} | |
} | |
} | |
} | |
#------------------------------------------------------------------# | |
# FIND ALL THE IDs FOR GENES FROM SPECIES $the_species: | |
sub get_genes_from_species | |
{ | |
my $dbh = $_[0]; | |
my $the_species = $_[1]; | |
my %GID = (); | |
my $taxid = "none"; | |
my $table_w; | |
my $st; | |
my $sth; | |
my $rv; | |
my @array; | |
my $ID; | |
my $GID; | |
my %IDS = (); | |
# FIRST GET THE NCBI TAXONOMY ID FOR SPECIES $the_species: | |
$table_w = 'species'; | |
$st = "SELECT TAX_ID from $table_w WHERE SWCODE=\'$the_species\'"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$taxid = $array[0]; | |
} | |
} | |
if ($taxid eq 'none') { print STDERR "ERROR: do not know taxid for $the_species\n"; exit;} | |
# NOW GET THE IDs FOR ALL TRANSCRIPTS FROM SPECIES $the_species: | |
$table_w = 'genes'; | |
$st = "SELECT ID, GID from $table_w WHERE TAX_ID=\'$taxid\'"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$ID = $array[0]; | |
$GID = $array[1]; | |
if ($GID{$ID}) { print STDERR "WARNING: already have GID $GID{$ID} for ID $ID (GID $GID)\n";} | |
$GID{$ID} = $GID; | |
# RECORD THE IDs FOR TRANSCRIPTS IN THIS GENE: | |
if (!($IDS{$GID})) { $IDS{$GID} = $ID;} | |
else {$IDS{$GID} = $IDS{$GID}.",".$ID;} | |
} | |
} | |
return(\%GID,\%IDS); | |
} | |
#------------------------------------------------------------------# | |
# READ IN THE HMMER MATCHES BETWEEN TRANSCRIPTS FROM SPECIES $the_species | |
# AND FAMILIES, AND FIND THE BEST MATCH FOR EACH TRANSCRIPT: | |
sub read_hmmer_matches_to_families | |
{ | |
my $dbh = $_[0]; | |
my $GID = $_[1]; | |
my $table_w; | |
my $st; | |
my $sth; | |
my $rv; | |
my @array; | |
my $ID; | |
my %BEST = (); | |
my %BEST_EVALUE = (); | |
my %BEST_SCORE = (); | |
my $best; | |
my $best_evalue; | |
my $best_score; | |
foreach $ID (keys %{$GID}) | |
{ | |
$table_w = 'hmmer_matches'; | |
$st = "SELECT AC, SCORE, EVALUE from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$AC = $array[0]; | |
$SCORE = $array[1]; | |
$EVALUE = $array[2]; | |
if (!($BEST{$ID})) | |
{ | |
if ($EVALUE == 0) { $EVALUE = "zero";} | |
if ($SCORE == 0) { $SCORE = "zero";} | |
$BEST{$ID} = $AC; | |
$BEST_EVALUE{$ID} = $EVALUE; | |
$BEST_SCORE{$ID} = $SCORE; | |
} | |
else | |
{ | |
$best = $BEST{$ID}; | |
$best_evalue= $BEST_EVALUE{$ID}; | |
$best_score = $BEST_SCORE{$ID}; | |
if ($best_evalue eq 'zero') { $best_evalue = 0;} | |
if ($best_score eq 'zero') { $best_score = 0;} | |
if ($EVALUE < $best_evalue || ($EVALUE == $best_evalue && $SCORE > $best_score)) | |
{ | |
if ($EVALUE == 0) { $EVALUE = "zero";} | |
if ($SCORE == 0) { $SCORE = "zero";} | |
$BEST{$ID} = $AC; | |
$BEST_EVALUE{$ID} = $EVALUE; | |
$BEST_SCORE{$ID} = $SCORE; | |
} | |
} | |
} | |
} | |
} | |
return(\%BEST); | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE TO SORT GENES BY THEIR START-POINT: | |
sub by_start | |
{ | |
if (!($START{$a})) { print STDERR "ERROR: do not know start of gene a $a.\n"; exit;} | |
if (!($START{$b})) { print STDERR "ERROR: do not know start of gene b $b.\n"; exit;} | |
$START{$a} <=> $START{$b} | |
or | |
$a cmp $b; | |
} | |
#------------------------------------------------------------------ | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment