Created
March 1, 2013 16:00
-
-
Save avrilcoghlan/5065591 to your computer and use it in GitHub Desktop.
Perl script that, for a particular TreeFam family of interest (that has human paralogs), prints out the DNA alignment for the family, with the position of introns shown with respect to the DNA alignment.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script find_human_paralogs_alns.pl | |
# Written by Avril Coghlan ([email protected]) | |
# 29-Aug-07. | |
# | |
# For Christine Bird. | |
# This gets the DNA alignments for TreeFam families | |
# that have within-species paralogs. | |
# | |
# The command-line format is: | |
# % perl <find_human_paralogs_alns.pl> <families> <the_family> | |
# where <families> is the file with the families to take, | |
# <the_family> is the family we are interested in. | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 2) | |
{ | |
print "Usage of find_human_paralogs_alns.pl\n\n"; | |
print "perl find_human_paralogs_alns.pl <families> <the_family>\n"; | |
print "where <families> is the file with the families to take,\n"; | |
print " <the_family> is the family we are interested in.\n"; | |
print "For example, >perl find_human_paralogs_alns.pl\n"; | |
print "human_paralogs_3aug07 TF101011\n"; | |
exit; | |
} | |
# READ IN MY PERL MODULES: | |
BEGIN { | |
unshift (@INC, '/nfs/team71/phd/alc/perl/modules'); | |
} | |
use lib "/nfs/team71/phd/alc/perl/treefam/Treefam_api_3may07/"; | |
use Avril_modules; | |
use Treefam::DBConnection; | |
use DBI; | |
# FIND THE FILE WITH THE FAMILIES TO TAKE: | |
$families = $ARGV[0]; | |
# FIND THE FAMILY WE ARE INTERESTED IN: | |
$the_family = $ARGV[1]; | |
#------------------------------------------------------------------# | |
# IF $VERBOSE == 1, PRINT OUT EXTRA INFORMATION: | |
$VERBOSE = 0; | |
$dbc = Treefam::DBConnection->new(); | |
%SEEN = (); | |
# READ IN THE LIST OF FAMILIES THAT WE ARE INTERESTED IN: | |
open(FAMILIES,"$families") || die "ERROR: cannot open $families\n"; | |
while(<FAMILIES>) | |
{ | |
$line = $_; | |
chomp $line; | |
if ($line =~ /TAKE:/) | |
{ | |
@temp = split(/\s+/,$line); | |
$family = $temp[2]; | |
if ($SEEN{$family} || $family ne $the_family) { goto NEXT_FAMILY;} | |
$SEEN{$family} = 1; | |
print "==================================================================\n"; | |
print "==================================================================\n"; | |
print "FAMILY $family:\n"; | |
$famh = $dbc->get_FamilyHandle(); | |
$family1 = $famh->get_by_id($family); | |
$tree = $family1->get_tree('clean'); # GET THE TREEFAM CLEAN TREE. | |
# GET THE ALIGNMENT FOR THIS FAMILY: | |
$aln = $tree->get_alignment('nt'); | |
# MAP THE INTRON POSITIONS ONTO THE ALIGNMENT FOR THIS FAMILY: | |
&map_introns_to_alns($aln,$family); | |
} | |
NEXT_FAMILY: | |
} | |
close(FAMILIES); | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# MAP THE INTRON POSITIONS ONTO THE ALIGNMENT FOR THIS FAMILY: | |
sub map_introns_to_alns | |
{ | |
my $aln = $_[0]; | |
my $family = $_[1]; | |
my @temp; | |
my $i; | |
my $gene; | |
my $line; | |
my @temp2; | |
my @genes; | |
my $all_introns; | |
my %ALN = (); | |
# FIND THE NAMES OF THE GENES IN THE ALIGNMENT: | |
@temp = split(/\n+/,$aln); | |
for ($i = 0; $i <= $#temp; $i++) | |
{ | |
$line = $temp[$i]; | |
if (substr($line,0,1) eq '>') | |
{ | |
@temp2 = split(/\s+/,$line); | |
$gene = $temp2[0]; | |
$gene = substr($gene,1,length($gene)-1); | |
@genes = (@genes,$gene); | |
} | |
else | |
{ | |
if (!($ALN{$gene})) { $ALN{$gene} = $line."\n"; } | |
else { $ALN{$gene} = $ALN{$gene}.$line."\n";} | |
} | |
} | |
# GET THE TREEFAM GENE NAMES FOR THE TRANSCRIPTS: | |
$TREEFAM_ID = &get_treefam_ids(\@genes); | |
# READ THE ALIGNMENT, TO GET THE POSITIONS OF EACH AMINO ACID: | |
($ALN_POS) = &read_aln($aln); | |
# GET THE POSITIONS OF INTRONS IN THE GENES: | |
$all_introns = &get_intron_positions(\@genes,$ALN_POS); | |
# PRINT OUT THE ALIGNMENT WITH THE INTRONS SHOWN: | |
&print_aln($aln,$all_introns,\@genes,\%ALN,$TREEFAM_ID); | |
} | |
#------------------------------------------------------------------# | |
# PRINT OUT THE ALIGNMENT WITH THE INTRONS SHOWN: | |
sub print_aln | |
{ | |
my $aln = $_[0]; | |
my $introns = $_[1]; | |
my $genes = $_[2]; | |
my $ALN = $_[3]; | |
my $TREEFAM_ID = $_[4]; | |
my $length; | |
my $i; | |
my $j; | |
my $gene; | |
my $gene_aln; | |
my $k; | |
my @temp; | |
my $line; | |
my $char; | |
my @introns = split(/\,/,$introns); | |
my %INTRON = (); | |
my %POS = (); | |
my $pos; | |
my $geneid; | |
# RECORD WHERE TO PUT INTRONS: | |
for ($i = 0; $i <= $#introns; $i++) | |
{ | |
$intron = $introns[$i]; | |
@temp = split(/_/,$intron); | |
$gene = $temp[0]; | |
$pos = $temp[1]; | |
$INTRON{$gene."_".$pos} = 1; | |
} | |
# FIND OUT THE NUMBER OF LINES IN THE ALIGNMENT: | |
if (!($ALN->{$genes->[0]})) { print STDERR "ERROR: no alignment for $genes->[0]\n"; exit;} | |
$gene_aln = $ALN->{$genes->[0]}; | |
@temp = split(/\n+/,$gene_aln); | |
$length = $#temp + 1; | |
# PRINT OUT THE ALIGNMENT LINE BY LINE: | |
for ($i = 0; $i < $length; $i++) | |
{ | |
# PRINT OUT EACH GENE: | |
for ($j = 0; $j < @$genes; $j++) | |
{ | |
$gene = $genes->[$j]; | |
if (substr($gene,0,4) ne 'ENST') { goto NEXT_GENE1;} | |
if (!($TREEFAM_ID->{$gene})) { print STDERR "ERROR: do not know gene name for $gene\n"; exit;} | |
$geneid = $TREEFAM_ID->{$gene}; | |
if (!($ALN->{$gene})) { print STDERR "ERROR: no aln for $gene\n"; exit;} | |
$gene_aln = $ALN->{$gene}; | |
@temp = split(/\n+/,$gene_aln); | |
if ($i <= $#temp) | |
{ | |
$line = $temp[$i]; | |
if ($line ne '') | |
{ | |
for ($k = 1; $k <= length($line); $k++) | |
{ | |
$char = substr($line,$k-1,1); | |
if ($char ne '') | |
{ | |
print "$char"; | |
if (!($POS{$gene})) { $POS{$gene} = 1;} | |
else { $POS{$gene}++; } | |
$pos = $POS{$gene}; | |
if ($INTRON{$gene."_".$pos}) | |
{ | |
print "*"; | |
} | |
else | |
{ | |
print " "; | |
} | |
} | |
} | |
} | |
print " : $geneid\n"; | |
} | |
NEXT_GENE1: | |
} | |
print "\n"; | |
} | |
} | |
#------------------------------------------------------------------# | |
# READ IN THE INPUT ALIGNMENT FILE, TO GET THE NAMES OF THE PROTEINS: | |
sub read_aln | |
{ | |
my $aln = $_[0]; | |
my $line; | |
my @temp; | |
my $gene; | |
my @genes; | |
my $length; | |
my $i; | |
my $char; | |
my $prot_pos; | |
my $aln_pos; | |
my %ALN_POS = (); | |
my $no_genes = 0; | |
my @aln = split(/\n+/,$aln); | |
my $j; | |
for ($j = 0; $j <= $#aln; $j++) | |
{ | |
$line = $aln[$j]; | |
chomp $line; | |
if (substr($line,0,1) eq '>') | |
{ | |
@temp = split(/\s+/,$line); | |
$gene = $temp[0]; | |
$gene = substr($gene,1,length($gene)-1); | |
@genes = (@genes,$gene); | |
$no_genes++; | |
$prot_pos = 0; | |
$aln_pos = 0; | |
} | |
else | |
{ | |
$length = length($line); | |
for ($i = 0; $i < $length; $i++) | |
{ | |
$char = substr($line,$i,1); | |
$aln_pos++; | |
if ($char ne '-') | |
{ | |
$prot_pos++; | |
if ($ALN_POS{$gene."_".$prot_pos}) { print STDERR "ERROR: already have aln pos for $gene $prot_pos.\n"; exit;} | |
$ALN_POS{$gene."_".$prot_pos} = $aln_pos; | |
} | |
} | |
} | |
} | |
return(\%ALN_POS); | |
} | |
#------------------------------------------------------------------# | |
# GET THE TREEFAM IDs FOR THE GENES: | |
sub get_treefam_ids | |
{ | |
my $genes = $_[0]; | |
my %ID = (); | |
my $i; | |
my $gene; | |
my $dbh; | |
my $table_w; | |
my $st; | |
my $sth; | |
my $rv; | |
my $id; | |
my @array; | |
my $rc; | |
my $gid; | |
# FIND THE ID OF EACH GENE IN THE TREEFAM MYSQL DATABASE: | |
$dbh = DBI->connect("dbi:mysql:treefam_4:db.treefam.org:3308", 'anonymous', '') || return; | |
# SPECIFY THE TABLE, FOR READING IN THE POSITIONS OF INTRONS: | |
$table_w = 'genes'; | |
for ($i = 0; $i < @$genes; $i++) | |
{ | |
$gene = $genes->[$i]; | |
$st = "SELECT ID,GID from $table_w WHERE ID=?"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute($gene) or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$id = $array[0]; # eg., ENSMUST00000049178.2 | |
$gid = $array[1]; | |
$ID{$gene} = $gid; | |
} | |
} | |
if (!($ID{$gene})) | |
{ | |
$st = "SELECT ID from $table_w WHERE TID=?"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute($gene) or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$id = $array[0]; # eg., ENSMUST00000049178.2 | |
$ID{$gene} = $id; | |
} | |
} | |
} | |
} | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
return(\%ID); | |
} | |
#------------------------------------------------------------------# | |
# GET THE POSITIONS OF INTRONS IN THE GENES: | |
sub get_intron_positions | |
{ | |
my $genes = $_[0]; | |
my $ALN_POS = $_[1]; | |
my $dbh; | |
my $table_w; | |
my $i; | |
my $gene; | |
my $id; | |
my $st; | |
my $sth; | |
my $rv; | |
my @array; | |
my $strand; | |
my $start; | |
my $stop; | |
my $introns; | |
my $all_introns = ""; | |
my $rc; | |
my $cstart; | |
my $cstop; | |
# FIND THE POSITION OF EACH GENE IN THE TREEFAM MYSQL DATABASE: | |
$dbh = DBI->connect("dbi:mysql:treefam_4:db.treefam.org:3308", 'anonymous', '') || return; | |
# SPECIFY THE TABLE, FOR READING IN THE POSITIONS OF INTRONS: | |
$table_w = 'map'; | |
for ($i = 0; $i < @$genes; $i++) | |
{ | |
$gene = $genes->[$i]; | |
if (substr($gene,0,4) ne 'ENST') { goto NEXT_GENE;} # ONLY LOOK AT HUMAN GENES. | |
$id = $gene; | |
$st = "SELECT STRAND,START,STOP,C_START,C_STOP from $table_w WHERE ID=?"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute($id) or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$strand = $array[0]; # eg., + | |
$start = $array[1]; # eg., 144225,144806,145261,145497, | |
$stop = $array[2]; # eg., 144318,145216,145444,145804, | |
$cstart = $array[3]; | |
$cstop = $array[4]; | |
$start = &remove_noncoding($start,$cstart,$cstop,"start"); | |
$stop = &remove_noncoding($stop,$cstart,$cstop,"stop"); | |
# FIND THE INTRONS IN THIS GENE: | |
$introns = &infer_intron_positions($strand,$start,$stop,$gene,$ALN_POS); | |
if ($introns ne "") { $all_introns = $all_introns.",".$introns;} | |
} | |
} | |
NEXT_GENE: | |
} | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
$all_introns = substr($all_introns,1,length($all_introns)-1); | |
return($all_introns); | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE TO REMOVE NON-CODING REGIONS FROM THE EXONS: | |
sub remove_noncoding | |
{ | |
my $exons = $_[0]; | |
my $cstart = $_[1]; | |
my $cstop = $_[2]; | |
my $type = $_[3]; | |
my $new_exons = ""; | |
chop($exons); | |
my @exons = split(/\,/,$exons); | |
my $i; | |
my $seen_start = 0; | |
my $seen_end = 0; | |
for ($i = 0; $i <= $#exons; $i++) | |
{ | |
if ($exons[$i] >= $cstart && $exons[$i] <= $cstop) | |
{ | |
if ($i == 0) { $seen_start = 1;} | |
elsif ($i == $#exons) { $seen_end = 1;} | |
$new_exons = $new_exons.",".$exons[$i]; | |
if ($exons[$i] == $cstart && $seen_start == 0) { $seen_start = 1;} | |
if ($exons[$i] == $cstop && $seen_end == 0) { $seen_end = 1;} | |
} | |
elsif ($exons[$i] <= $cstart) | |
{ | |
# ignore this exon. | |
} | |
elsif ($exons[$i] >= $cstop) | |
{ | |
# ignore this exon. | |
} | |
} | |
$new_exons = substr($new_exons,1,length($new_exons)-1); | |
if ($seen_start == 0 && $type eq 'start') { $new_exons = $cstart.",".$new_exons;} | |
if ($seen_end == 0 && $type eq 'stop') { $new_exons = $new_exons.",".$cstop; } | |
if (substr($new_exons,0,1) eq ',') { $new_exons = substr($new_exons,1,length($new_exons)-1);} | |
if (substr($new_exons,length($new_exons)-1,1) eq ',') { chop($new_exons); } | |
return($new_exons); | |
} | |
#------------------------------------------------------------------# | |
# INFER THE INTRON POSITIONS IN A GENE: | |
sub infer_intron_positions | |
{ | |
my $strand = $_[0]; | |
my $start = $_[1]; # 144225,144806,145261,145497, | |
my $stop = $_[2]; # 144318,145216,145444,145804, | |
my $gene = $_[3]; | |
my $ALN_POS = $_[4]; | |
my @start = split(/\,/,$start); | |
my @stop = split(/\,/,$stop); | |
my $i; | |
my $exon_start; | |
my $exon_stop; | |
my $coding_length; | |
my $hangover; | |
my $intron_phase; | |
my $prev_aa_pos = 0; | |
my @temp; | |
my $aln_pos; | |
my $hmm_pos; | |
my $intron; | |
my $introns = ""; | |
if ($#start != $#stop) | |
{ | |
print STDERR "ERROR: gene $gene : start $#start ($start) stop $#stop ($stop).\n"; | |
exit; | |
} | |
if ($VERBOSE == 1) { print "$gene-------------------------------------------------------\n";} | |
if ($VERBOSE == 1) { print "gene $gene : start $start stop $stop\n";} | |
if ($strand eq '+') | |
{ | |
$coding_length = 0; | |
for ($i = 0; $i <= $#start; $i++) | |
{ | |
# FIND THE COORDINATES OF EXON $i: | |
$exon_start = $start[$i] + 1; # +1 TO CORRECT FOR THE WAY EXONS ARE STORED IN THE TREEFAM MYSQL DB. | |
$exon_stop = $stop[$i]; | |
# CALCULATE THE CODING LENGTH SO FAR: | |
$coding_length = $coding_length + ($exon_stop - $exon_start + 1); | |
# WORK OUT THE POSITION OF THE PREVIOUS AMINO ACID: | |
$prev_aa_pos = $coding_length; | |
@temp = split(/\./,$prev_aa_pos); | |
$prev_aa_pos = $temp[0]; | |
# LEFT-OVER BASES AFTER THE CODING LENGTH SO FAR HAS BEEN TRANSLATED INTO CODONS: | |
$hangover = 0; | |
if ($hangover == 0) { $intron_phase = 0;} | |
elsif ($hangover == 1) { $intron_phase = 1;} # IF 1 LEFT OVER, THE INTRON LIES AFTER THE 1ST BASE OF CODON (PHASE 1). | |
elsif ($hangover == 2) { $intron_phase = 2;} # IF 2 LEFT OVER, THE INTRON LIES AFTER THE 2ND BASE OF CODON (PHASE 2). | |
if ($i != $#start) | |
{ | |
# FIND THE POSITION OF AMINO ACID $prev_aa_pos IN THE ALIGNMENT: | |
if (!($ALN_POS->{$gene."_".$prev_aa_pos})) { print STDERR "ERROR: do not know aln pos of $gene $prev_aa_pos.\n"; exit;} | |
$aln_pos = $ALN_POS->{$gene."_".$prev_aa_pos}; | |
$intron = $gene."_".$aln_pos."_".$intron_phase; | |
$introns = $introns.",".$intron; | |
if ($VERBOSE == 1) { print "strand $strand : $exon_start $exon_stop and aa $prev_aa_pos aln_pos $aln_pos intron_phase $intron_phase\n";} | |
} | |
} | |
} | |
elsif ($strand eq '-') | |
{ | |
$coding_length = 0; | |
for ($i = $#start; $i >= 0; $i--) | |
{ | |
# FIND THE COORDINATES OF EXON $i: | |
$exon_start = $start[$i] + 1; # +1 TO CORRECT FOR THE WAY EXONS ARE STORED IN THE TREEFAM MYSQL DB. | |
$exon_stop = $stop[$i]; | |
# CALCULATE THE CODING LENGTH SO FAR: | |
$coding_length = $coding_length + ($exon_stop - $exon_start + 1); | |
# WORK OUT THE POSITION OF THE PREVIOUS AMINO ACID: | |
$prev_aa_pos = $coding_length; | |
@temp = split(/\./,$prev_aa_pos); | |
$prev_aa_pos = $temp[0]; | |
# LEFT-OVER BASES AFTER THE CODING LENGTH SO FAR HAS BEEN TRANSLATED INTO CODONS: | |
$hangover = 0; | |
if ($hangover == 0) { $intron_phase = 0;} | |
elsif ($hangover == 1) { $intron_phase = 1;} # IF 1 LEFT OVER, THE INTRON LIES AFTER THE 1ST BASE OF CODON (PHASE 1). | |
elsif ($hangover == 2) { $intron_phase = 2;} # IF 2 LEFT OVER, THE INTRON LIES AFTER THE 2ND BASE OF CODON (PHASE 2). | |
if ($i != 0) | |
{ | |
# FIND THE POSITION OF AMINO ACID $prev_aa_pos IN THE ALIGNMENT: | |
if (!($ALN_POS->{$gene."_".$prev_aa_pos})) { print STDERR "ERROR: do not know aln pos of $gene $prev_aa_pos.\n"; exit;} | |
$aln_pos = $ALN_POS->{$gene."_".$prev_aa_pos}; | |
$intron = $gene."_".$aln_pos."_".$intron_phase; | |
$introns = $introns.",".$intron; | |
if ($VERBOSE == 1) { print "strand $strand : $exon_start $exon_stop and aa $prev_aa_pos aln_pos $aln_pos intron_phase $intron_phase\n";} | |
} | |
} | |
} | |
else | |
{ | |
print STDERR "ERROR: strand $strand.\n"; exit; | |
} | |
if ($introns ne '') { $introns = substr($introns,1,length($introns)-1);} | |
return($introns); | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment