Created
March 1, 2013 15:40
-
-
Save avrilcoghlan/5065465 to your computer and use it in GitHub Desktop.
Perl script that, given a gff file of Caenorhabditis elegans genes, uses TreeFam to check whether adjacent gens are paralogs.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script check_if_adjacent_genes_are_paralogs.pl | |
# Written by Avril Coghlan ([email protected]) | |
# 10-Dec-07. | |
# | |
# This perls scripts checks whether adjacent genes are paralogs for | |
# each gene in the ngasp benchmark gene set. | |
# | |
# The command-line format is: | |
# % perl <check_if_adjacent_genes_are_paralogs.pl> gff | |
# where gff is the input benchmark gff file. | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 1) | |
{ | |
print "Usage of check_if_adjacent_genes_are_paralogs.pl\n\n"; | |
print "perl check_if_adjacent_genes_are_paralogs.pl <gff>\n"; | |
print "where <gff> is the input benchmark gff file.\n"; | |
print "For example, >perl check_if_adjacent_genes_are_paralogs.pl phase1_confirmed_isoforms_C.gff.1\n"; | |
exit; | |
} | |
# READ IN MY PERL MODULES: | |
BEGIN { | |
unshift (@INC, '/nfs/team71/phd/alc/perl/modules'); | |
} | |
use lib "/nfs/team71/phd/alc/perl/treefam/Treefam_api_3may07/"; | |
use Avril_modules; | |
use Treefam::DBConnection; | |
use DBI; | |
# FIND THE NAME OF THE INPUT GFF FILE: | |
$gff = $ARGV[0]; | |
#------------------------------------------------------------------# | |
# FIND ALL PARALOGS OF C. ELEGANS GENES: | |
# GET A LIST OF ALL FAMILIES IN TREEFAM, INCLUDING TF1 (TREEFAM-A), TF3 | |
# (TREEFAM-B) AND TF5 (TREEFAM-C) FAMILIES: | |
print STDERR "Finding TreeFam-A families... \n"; | |
$dbh = DBI->connect("dbi:mysql:treefam_5:db.treefam.org:3308", 'anonymous', '') || return; | |
# GET ALL THE TREEFAM-A FAMILIES: | |
$table_w = "familyA"; | |
$st = "SELECT AC from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$AC = $array[0]; | |
@families = (@families,$AC); | |
if ($#families % 100 == 0) { print STDERR "Have $#families now ...\n"; } | |
} | |
} | |
print STDERR "Finding TreeFam-B and TreeFam-C families... \n"; | |
# GET ALL THE TREEFAM-B AND TREEFAM-C FAMILIES: | |
$table_w = "familyB"; | |
$st = "SELECT AC from $table_w"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$AC = $array[0]; | |
if ($AC ne 'TF343806') # SOMETHING IS WRONG WITH THIS FAMILY IN TREEFAM-5. xxx | |
{ | |
@families = (@families,$AC); | |
} | |
if ($#families % 100 == 0) { print STDERR "Have $#families now ...\n"; } | |
} | |
} | |
#------------------------------------------------------------------# | |
$dbc = Treefam::DBConnection->new(); | |
%PARALOG = (); | |
# FIND PARALOGS WITHIN EACH FAMILY: | |
foreach my $treefam_family(@families) | |
{ | |
print STDERR "Looking at family $treefam_family... \n"; | |
$famh = $dbc->get_FamilyHandle(); | |
if (!(defined($famh->get_by_id($treefam_family)))) | |
{ | |
#xxx print "xxx there is no family $treefam_family...\n"; | |
goto NEXT_FAMILY; | |
} | |
$family = $famh->get_by_id($treefam_family); | |
if (!(defined($family->ID()))) | |
{ | |
#xxx print "xxx do not have ID stored for family $treefam_family...\n"; | |
goto NEXT_FAMILY; | |
} | |
$AC = $family->ID(); # GET THE FAMILY ID. | |
if ($AC ne $treefam_family) { print STDERR "ERROR: AC $AC treefam_family $treefam_family\n"; exit;} | |
$no_families++; | |
print "\n$no_families: reading family $AC...\n"; | |
# GET THE TREE FOR THE FAMILY: | |
if (!(defined($family->get_tree('clean')))) | |
{ | |
#xxx print "xxx AC $AC: there is no tree!\n"; | |
goto NEXT_FAMILY; | |
} | |
$tree = $family->get_tree('clean'); # GET THE TREEFAM CLEAN TREE. | |
# GET ALL THE TRANSCRIPTS IN THIS FAMILY FROM C. ELEGANS: | |
@species_nodes = &Avril_modules::empty_array(@species_nodes); | |
@species_genes = &Avril_modules::empty_array(@species_genes); | |
@nodes = $tree->get_leaves; | |
foreach $node(@nodes) | |
{ | |
# FIND THE SPECIES FOR THIS TRANSCRIPT: | |
if (defined($node->get_tag_value('S'))) | |
{ | |
$species = $node->get_tag_value('S'); | |
} | |
else | |
{ | |
#xxx print "xxx AC $AC: do not know species of node\n"; | |
$species = "none"; | |
} | |
$species =~ tr/[a-z]/[A-Z]/; | |
if ($species eq 'CAEEL') # IF IT IS A C. ELEGANS GENE. | |
{ | |
@species_nodes = (@species_nodes,$node); | |
# FIND THE GENE FOR THIS TRANSCRIPT: | |
$gh = $node->gene; | |
$gene = $gh->ID(); # eg. ENSMUSG00000022770 | |
@species_genes = (@species_genes,$gene); | |
} | |
} | |
# FOR EACH PAIR OF C. ELEGANS GENES IN THE TREE, SEE | |
# WHEN THE DUPLICATION OCCURRED: | |
for ($i = 0; $i <= $#species_nodes; $i++) | |
{ | |
$species_gene_i = $species_genes[$i]; | |
for ($j = $i+1; $j <= $#species_nodes; $j++) | |
{ | |
$species_gene_j = $species_genes[$j]; | |
$pair1 = $species_gene_i."_".$species_gene_j; | |
$pair2 = $species_gene_j."_".$species_gene_i; | |
$PARALOG{$pair1} = 1; | |
$PARALOG{$pair2} = 1; | |
} | |
} | |
NEXT_FAMILY: | |
} | |
#------------------------------------------------------------------# | |
# READ IN THE INPUT GFF FILE: | |
print "GENE PREVGENE NEXTGENE\n"; | |
%GENES = (); | |
%SEEN = (); | |
%START = (); | |
open(GFF,"$gff") || die "ERROR: cannot open $gff.\n"; | |
while(<GFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\t+/,$line); | |
$chr = $temp[0]; | |
$start = $temp[3]; | |
$exon = $temp[8]; | |
# FIND THE GENE NAME: | |
# eg. ID=CDS:Transcript:C49A1.4b.1;Name=C49A1.4b.1;Parent=Transcript:C49A1.4b.1 | |
@temp = split(/\;/,$exon); | |
$gene = $temp[1]; # Name=C49A1.4b.1 THIS IS THE TRANSCRIPT NAME. | |
@temp = split(/\./,$gene); | |
$gene = $temp[0].".".$temp[1]; # eg. Name=C49A1.4b | |
@temp = split(/=/,$gene); | |
$gene = $temp[1]; # eg. C49A1.4b | |
$gene =~ tr/[a-z]/[A-Z]/; | |
if (substr($gene,length($gene)-1,1) =~ /[A-Z]/) { chop($gene);} # eg. C49A1.4 THIS IS THE GENE NAME. | |
# RECORD THE GENE START: | |
if (!($START{$gene})) { $START{$gene} = $start;} | |
else { if ($start < $START{$gene}) { $START{$gene} = $start;}} | |
# RECORD THE GENES ON THIS CHROMOSOME: | |
if (!($SEEN{$gene})) | |
{ | |
$SEEN{$gene} = 1; | |
if (!($GENES{$chr})) { $GENES{$chr} = $gene;} | |
else {$GENES{$chr} = $GENES{$chr}.",".$gene;} | |
} | |
} | |
close(GFF); | |
# LOOK AT THE GENES ON EACH CHROMOSOME AT A TIME: | |
foreach $chr (keys %GENES) | |
{ | |
$genes = $GENES{$chr}; | |
@genes = split(/\,/,$genes); | |
@genes = sort by_start @genes; | |
$prev_gene = "none"; | |
# FOR EACH GENE, FIND THE STRAND OF THE ADJACENT GENES: | |
for ($i = 0; $i <= $#genes; $i++) | |
{ | |
$gene = $genes[$i]; | |
# CHECK IF THE PREVIOUS GENE IS A PARALOG OF THIS GENE: | |
$pair1 = $gene."_".$prev_gene; | |
$pair2 = $prev_gene."_".$gene; | |
$prev_paralog = 0; | |
if ($PARALOG{$pair1}) { $prev_paralog = 1;} | |
if ($PARALOG{$pair2}) { $prev_paralog = 1;} | |
# CHECK IF THE NEXT GENE IS A PARALOG OF THIS GENE: | |
$next_paralog = 0; | |
if ($i != $#genes) | |
{ | |
$next_gene = $genes[$i+1]; | |
$pair1 = $gene."_".$next_gene; | |
$pair2 = $next_gene."_".$gene; | |
if ($PARALOG{$pair1}) { $next_paralog = 1;} | |
if ($PARALOG{$pair2}) { $next_paralog = 1;} | |
} | |
print "$gene $prev_paralog $next_paralog\n"; | |
$prev_gene = $gene; | |
} | |
} | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# SUBROUTINE TO SORT EXONS BY START-POINT: | |
sub by_start | |
{ | |
if (!($START{$a})) { print STDERR "ERROR: do not know start of $a.\n"; exit;} | |
if (!($START{$b})) { print STDERR "ERROR: do not know start of $b.\n"; exit;} | |
$START{$a} <=> $START{$b} | |
or | |
$a cmp $b; | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment