Created
March 1, 2013 13:48
-
-
Save avrilcoghlan/5064774 to your computer and use it in GitHub Desktop.
Perl script that, given a file of Caenorhabditis elegans paralog pairs, analyses TreeFam trees to find the pairs of C. elegans paralogs in families that are separated by the least number of edges
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script find_closest_worm_paralogs3.pl | |
# Written by Avril Coghlan ([email protected]) | |
# 7-Jan-09. | |
# | |
# This perl script finds the pairs of C. elegans paralogs in a family | |
# that are separated by the least number of edges. | |
# The command-line format is: | |
# % perl <find_closest_worm_paralogs3.pl> <input> | |
# where <input> is the file with the number of edges between paralog pairs. | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 1) | |
{ | |
print "Usage of find_closest_worm_paralogs3.pl\n\n"; | |
print "perl find_closest_worm_paralogs3.pl <input>\n"; | |
print "where <input> is the number of edges between paralog pairs.\n"; | |
print "For example, >perl find_closest_worm_paralogs3.pl Ce_closest_paralogs4\n"; | |
exit; | |
} | |
# FIND THE INPUT FILE WITH THE NUMBER OF EDGES BETWEEN PARALOG PAIRS: | |
$input = $ARGV[0]; | |
#------------------------------------------------------------------# | |
# READ IN THE INPUT FILE: | |
%MAXLCA = (); | |
%MINDIST = (); | |
%MINDISTGENE = (); | |
open(INPUT,"$input") || die "ERROR: cannot open $input\n"; | |
while(<INPUT>) | |
{ | |
$line = $_; | |
chomp $line; | |
if ($line =~ /dist between/) | |
{ | |
@temp = split(/\s+/,$line); | |
$fam = $temp[0]; # eg. TF318673: | |
chop($fam); | |
$gene1 = $temp[3]; | |
$gene2 = $temp[4]; | |
if ($gene1 eq '' || $gene2 eq '') { print STDERR "ERROR: gene1 $gene1 gene2 $gene2 line $line\n"; exit;} | |
$gene1 = $fam."=".$gene1; | |
$gene2 = $fam."=".$gene2; | |
$dist = $temp[6]; | |
$lca = $temp[8]; | |
if ($lca == 0) { $lca = 0.1;} | |
# THE MINIMUM POSSIBLE NUMBER OF EDGES BETWEEN SEQUENCES IS 2: | |
if ($dist == 0 || $dist == 1) { print STDERR "ERROR: $line : dist $dist\n"; exit;} | |
# RECORD THE DISTANCE FOR EACH GENE: | |
if (!($MINDISTGENE{$gene1})) | |
{ | |
$MINDISTGENE{$gene1} = $gene2; | |
$MINDIST{$gene1} = $dist; | |
$MAXLCA{$gene1} = $lca; | |
} | |
else | |
{ | |
if ($dist < $MINDIST{$gene1}) | |
{ | |
$MINDISTGENE{$gene1} = $gene2; | |
$MINDIST{$gene1} = $dist; | |
$MAXLCA{$gene1}= $lca; | |
} | |
elsif ($dist == $MINDIST{$gene1}) | |
{ | |
if (!($MAXLCA{$gene1})) { print STDERR "ERROR: do not know maxlca for $gene1\n"; exit;} | |
if ($lca > $MAXLCA{$gene1}) | |
{ | |
$MINDISTGENE{$gene1} = $gene2; | |
$MINDIST{$gene1} = $dist; | |
$MAXLCA{$gene1} = $lca; | |
} | |
elsif ($lca == $MAXLCA{$gene1}) | |
{ | |
$MINDISTGENE{$gene1} = $MINDISTGENE{$gene1}.",".$gene2; | |
$MINDIST{$gene1} = $dist; | |
$MAXLCA{$gene1} = $lca; | |
} | |
# if $lca is less than $MAXLCA{$gene1}, then $gene2 is further away from the | |
# current closest gene to $gene1. | |
} | |
} | |
if (!($MINDISTGENE{$gene2})) | |
{ | |
$MINDISTGENE{$gene2} = $gene1; | |
$MINDIST{$gene2} = $dist; | |
$MAXLCA{$gene2} = $lca; | |
} | |
else | |
{ | |
if ($dist < $MINDIST{$gene2}) | |
{ | |
$MINDISTGENE{$gene2} = $gene1; | |
$MINDIST{$gene2} = $dist; | |
$MAXLCA{$gene2} = $lca; | |
} | |
elsif ($dist == $MINDIST{$gene2}) | |
{ | |
if (!($MAXLCA{$gene2})) { print STDERR "ERROR: do not know maxlca for $gene2\n"; exit;} | |
if ($lca > $MAXLCA{$gene2}) | |
{ | |
$MINDISTGENE{$gene2} = $gene1; | |
$MINDIST{$gene2} = $dist; | |
$MAXLCA{$gene2} = $lca; | |
} | |
elsif ($lca == $MAXLCA{$gene2}) | |
{ | |
$MINDISTGENE{$gene2} = $MINDISTGENE{$gene2}.",".$gene2; | |
$MINDIST{$gene2} = $dist; | |
$MAXLCA{$gene2} = $lca; | |
} | |
} | |
} | |
} | |
} | |
close(INPUT); | |
# NOW FIND THE PAIRS OF GENES THAT ARE EACH OTHER'S CLOSEST GENES: | |
foreach $gene1 (keys %MINDISTGENE) | |
{ | |
$gene2 = $MINDISTGENE{$gene1}; | |
$distA = $MINDIST{$gene1}; | |
if ($gene2 ne 'none') | |
{ | |
# CHECK THAT THE CLOSEST GENE TO $gene2 IS $gene1: | |
if ($MINDISTGENE{$gene2}) | |
{ | |
$gene2_mindistgene= $MINDISTGENE{$gene2}; | |
$distB = $MINDIST{$gene2}; | |
if ($gene1 eq $gene2_mindistgene) | |
{ | |
if ($distA != $distB) | |
{ | |
print STDERR "ERROR: $gene1 $gene2 distA $distA distB $distB\n"; | |
exit; | |
} | |
@temp = split(/=/,$gene1); | |
$fam = $temp[0]; | |
$gene1 = $temp[1]; | |
@temp = split(/=/,$gene2); | |
$gene2 = $temp[1]; | |
print "$fam $gene1 $gene2 $distA\n"; | |
} | |
} | |
} | |
} | |
print STDERR "FINISHED\n"; | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment