Created
March 1, 2013 15:42
-
-
Save avrilcoghlan/5065479 to your computer and use it in GitHub Desktop.
Perl script that, given a gff file of Caenorhabditis elegans genes, finds their C. briggsae, human and yeast orthologs from TreeFam.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script check_if_have_treefam_ortholog.pl | |
# Written by Avril Coghlan ([email protected]) | |
# 4-Dec-07. | |
# | |
# This perls scripts finds the TreeFam ortholog in C. briggsae, human, yeast | |
# each gene in the ngasp benchmark gene set. | |
# | |
# The command-line format is: | |
# % perl <check_if_have_treefam_ortholog.pl> gff | |
# where gff is the input benchmark gff file. | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 1) | |
{ | |
print "Usage of check_if_have_treefam_ortholog.pl\n\n"; | |
print "perl check_if_have_treefam_ortholog.pl <gff>\n"; | |
print "where <gff> is the input benchmark gff file.\n"; | |
print "For example, >perl check_if_have_treefam_ortholog.pl phase1_confirmed_isoforms_A.gff.1\n"; | |
exit; | |
} | |
# READ IN MY PERL MODULES: | |
BEGIN { | |
unshift (@INC, '/nfs/team71/phd/alc/perl/modules'); | |
} | |
use Avril_modules; | |
use DBI; | |
# FIND THE NAME OF THE INPUT GFF FILE: | |
$gff = $ARGV[0]; | |
#------------------------------------------------------------------# | |
# READ IN THE INPUT GFF FILE: | |
%TAKE = (); | |
open(GFF,"$gff") || die "ERROR: cannot open $gff.\n"; | |
while(<GFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\t+/,$line); | |
$exon = $temp[8]; | |
# FIND THE GENE NAME: | |
# eg. ID=CDS:Transcript:C49A1.4b.1;Name=C49A1.4b.1;Parent=Transcript:C49A1.4b.1 | |
@temp = split(/\;/,$exon); | |
$gene = $temp[1]; # Name=C49A1.4b.1 THIS IS THE TRANSCRIPT NAME. | |
@temp = split(/\./,$gene); | |
$gene = $temp[0].".".$temp[1]; # eg. Name=C49A1.4b | |
@temp = split(/=/,$gene); | |
$gene = $temp[1]; # eg. C49A1.4b | |
$gene =~ tr/[a-z]/[A-Z]/; | |
if (substr($gene,length($gene)-1,1) =~ /[A-Z]/) { chop($gene);} # eg. C49A1.4 THIS IS THE GENE NAME. | |
$TAKE{$gene} = 1; | |
} | |
close(GFF); | |
#------------------------------------------------------------------# | |
# FIND THE TREEFAM ID FOR ALL THE GENES FROM C. ELEGANS THAT I AM INTERESTED IN: | |
# HASH TABLES TO REMEMBER THE SPECIES OF GENES: | |
$dbh = DBI->connect("dbi:mysql:treefam_5:db.treefam.org:3308", 'anonymous', '') || return; | |
# SPECIFY THE TABLE: | |
$table_w = 'genes'; | |
# THE FIRST COLUMN IN THIS TABLE IS AN IDENTIFIER (IDX), | |
# ANOTHER COLUMN IS THE TRANSCRIPT NAME, AND THE LAST COLUMN | |
# IS THE TAXONOMY ID, eg. 1 ENSANGT00000032162.1 7165 | |
$st = "SELECT IDX, TAX_ID, GID from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
%IDX = (); | |
%YEAST = (); | |
%CB = (); | |
%CR = (); | |
%HUMAN = (); | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) { | |
$IDX = $array[0]; # eg., 1 | |
$TAXID = $array[1]; # eg 7165 | |
$GID = $array[2]; | |
# CHECK IF THIS IS A C. ELEGANS GENE THAT WE WANT TO TAKE: | |
if ($TAKE{$GID}) | |
{ | |
if (!($IDX{$GID})) { $IDX{$GID} = $IDX;} | |
else {$IDX{$GID} = $IDX{$GID}.",".$IDX;} | |
} | |
# RECORD IF IT IS A YEAST GENE: | |
if ($TAXID eq '4932') | |
{ | |
$YEAST{$IDX} = $GID; | |
} | |
# RECORD IF IT IS A C. BRIGGSAE GENE: | |
if ($TAXID eq '6238') | |
{ | |
$CB{$IDX} = $GID; | |
} | |
# RECORD IF IT IS A C. REMANEI GENE: | |
if ($TAXID eq '31234') | |
{ | |
$CR{$IDX} = $GID; | |
} | |
# RECORD IF IT IS A HUMAN GENE: | |
if ($TAXID eq '9606') | |
{ | |
$HUMAN{$IDX} = $GID; | |
} | |
} | |
} | |
# NOW FIND ALL THE ORTHOLOGS FOR THE C. ELEGANS GENES OF INTEREST: | |
%YEAST_ORTH = (); | |
%CB_ORTH = (); | |
%CR_ORTH = (); | |
%HUMAN_ORTH = (); | |
foreach $GID (keys %IDX) | |
{ | |
$IDXS = $IDX{$GID}; | |
@IDXS = split(/\,/,$IDXS); | |
for ($i = 0; $i <= $#IDXS; $i++) | |
{ | |
$IDX = $IDXS[$i]; | |
# GET ALL THE ORTHOLGOS FOR $IDX: | |
$table_w = 'ortholog'; | |
$st = "SELECT idx1,idx2 from $table_w WHERE idx1=?"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute($IDX) or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$idx1 = $array[0]; | |
$idx2 = $array[1]; | |
if ($YEAST{$idx2}) | |
{ | |
if (!($YEAST_ORTH{$GID})) { $YEAST_ORTH{$GID} = $YEAST{$idx2};} | |
else {$YEAST_ORTH{$GID} = $YEAST_ORTH{$GID}.",".$YEAST{$idx2};} | |
} | |
if ($CB{$idx2}) | |
{ | |
if (!($CB_ORTH{$GID})) { $CB_ORTH{$GID} = $CB{$idx2};} | |
else {$CB_ORTH{$GID} = $CB_ORTH{$GID}.",".$CB{$idx2};} | |
} | |
if ($CR{$idx2}) | |
{ | |
if (!($CR_ORTH{$GID})) { $CR_ORTH{$GID} = $CR{$idx2};} | |
else {$CR_ORTH{$GID} = $CR_ORTH{$GID}.",".$CR{$idx2};} | |
} | |
if ($HUMAN{$idx2}) | |
{ | |
if (!($HUMAN_ORTH{$GID})) { $HUMAN_ORTH{$GID} = $HUMAN{$idx2};} | |
else {$HUMAN_ORTH{$GID} = $HUMAN_ORTH{$GID}.",".$HUMAN{$idx2};} | |
} | |
} | |
} | |
$st = "SELECT idx1,idx2 from $table_w WHERE idx2=?"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute($IDX) or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$idx1 = $array[0]; | |
$idx2 = $array[1]; | |
if ($YEAST{$idx1}) | |
{ | |
if (!($YEAST_ORTH{$GID})) { $YEAST_ORTH{$GID} = $YEAST{$idx1};} | |
else {$YEAST_ORTH{$GID} = $YEAST_ORTH{$GID}.",".$YEAST{$idx1};} | |
} | |
if ($CB{$idx1}) | |
{ | |
if (!($CB_ORTH{$GID})) { $CB_ORTH{$GID} = $CB{$idx1};} | |
else {$CB_ORTH{$GID} = $CB_ORTH{$GID}.",".$CB{$idx1};} | |
} | |
if ($CR{$idx1}) | |
{ | |
if (!($CR_ORTH{$GID})) { $CR_ORTH{$GID} = $CR{$idx1};} | |
else {$CR_ORTH{$GID} = $CR_ORTH{$GID}.",".$CR{$idx1};} | |
} | |
if ($HUMAN{$idx1}) | |
{ | |
if (!($HUMAN_ORTH{$GID})) { $HUMAN_ORTH{$GID} = $HUMAN{$idx1};} | |
else {$HUMAN_ORTH{$GID} = $HUMAN_ORTH{$GID}.",".$HUMAN{$idx1};} | |
} | |
} | |
} | |
} | |
} | |
#------------------------------------------------------------------# | |
# READ IN THE INPUT GFF FILE: | |
print "GENE CBORTH CRORTH HUMORTH YEASTORTH\n"; | |
%SEEN = (); | |
open(GFF,"$gff") || die "ERROR: cannot open $gff.\n"; | |
while(<GFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\t+/,$line); | |
$exon = $temp[8]; | |
# FIND THE GENE NAME: | |
# eg. ID=CDS:Transcript:C49A1.4b.1;Name=C49A1.4b.1;Parent=Transcript:C49A1.4b.1 | |
@temp = split(/\;/,$exon); | |
$gene = $temp[1]; # Name=C49A1.4b.1 THIS IS THE TRANSCRIPT NAME. | |
@temp = split(/\./,$gene); | |
$gene = $temp[0].".".$temp[1]; # eg. Name=C49A1.4b | |
@temp = split(/=/,$gene); | |
$gene = $temp[1]; # eg. C49A1.4b | |
$gene =~ tr/[a-z]/[A-Z]/; | |
if (substr($gene,length($gene)-1,1) =~ /[A-Z]/) { chop($gene);} # eg. C49A1.4 THIS IS THE GENE NAME. | |
# SEE IF THIS GENE HAS C. BRIGGSAE, C. REMANEI, HUMAN AND YEAST ORTHOLOGS: | |
if ($CB_ORTH{$gene}) { $cb_orth = $CB_ORTH{$gene};} | |
else { $cb_orth = "none"; } | |
if ($CR_ORTH{$gene}) { $cr_orth = $CR_ORTH{$gene};} | |
else { $cr_orth = "none"; } | |
if ($HUMAN_ORTH{$gene}) { $human_orth = $HUMAN_ORTH{$gene};} | |
else { $human_orth = "none"; } | |
if ($YEAST_ORTH{$gene}) { $yeast_orth = $YEAST_ORTH{$gene};} | |
else { $yeast_orth = "none"; } | |
if (!($SEEN{$gene})) | |
{ | |
$SEEN{$gene} = 1; | |
print "$gene $cb_orth $cr_orth $human_orth $yeast_orth\n"; | |
} | |
} | |
close(GFF); | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment