Created
March 1, 2013 16:24
-
-
Save avrilcoghlan/5065775 to your computer and use it in GitHub Desktop.
Perl script that reads in an alignment file corresponding to a HMM, and retrieves the positions of introns in the genes from the TreeFam database, and figures out the positions of introns with respect to the HMM columns.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
# | |
# Perl script map_introns_to_HMM.pl | |
# Written by Avril Coghlan ([email protected]) | |
# 26-Apr-07. | |
# | |
# For the TreeFam project. | |
# | |
# This perl script reads in an alignment file corresponding | |
# to a HMM: | |
# (i) gets the positions of introns in each gene, by | |
# reading them from the TreeFam mysql database, | |
# (ii) figures the positions of introns in the alignment, | |
# (iii) figures out the positions of introns in the HMM, | |
# by using the mapping file produced by hmmbuild. | |
# | |
# The command-line format is: | |
# % perl <map_introns_to_HMM.pl> aln map output | |
# where aln is the input alignment file, | |
# map is the mapping file produced by hmmbuild, | |
# output is the output file name. | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
$num_args = $#ARGV + 1; | |
if ($num_args != 3) | |
{ | |
print "Usage of map_introns_to_HMM.pl\n\n"; | |
print "perl map_introns_to_HMM.pl <aln> <map> <output>\n"; | |
print "where <aln> is the input alignment file,\n"; | |
print " <map> is the mapping file produced by hmmbuild,\n"; | |
print " <output> is the output file name.\n"; | |
print "For example, >perl map_introns_to_HMM.pl new_TF101002_fam.aln.pep\n"; | |
print "new_TF101002_fam.aln.pep.map new_TF101002_fam.introns\n"; | |
exit; | |
} | |
# READ IN MY PERL MODULES: | |
BEGIN { | |
unshift (@INC, '/nfs/team71/phd/alc/perl/modules'); | |
} | |
use Avril_modules; | |
use DBI; | |
# FIND THE NAME OF THE ALIGNMENT FILE: | |
$aln = $ARGV[0]; | |
# FIND THE NAME OF THE MAPPING FILE: | |
$map = $ARGV[1]; | |
# FIND THE OUTPUT FILE NAME: | |
$output = $ARGV[2]; | |
open(OUTPUT,">$output") || die "ERROR: cannot open $output.\n"; | |
$VERBOSE = 0; | |
#------------------------------------------------------------------# | |
# READ IN THE INPUT ALIGNMENT FILE, TO GET THE NAMES OF THE PROTEINS: | |
($genes,$ALN_POS,$no_genes)= &read_aln($aln); | |
# READ IN THE MAPPING OF THE ALIGNMENT TO THE HMM: | |
$HMM_POS = &read_map($map); | |
# GET THE TREEFAM IDs FOR THESE GENES: | |
$TREEFAM_ID = &get_treefam_ids($genes); | |
# GET THE POSITIONS OF INTRONS IN EACH OF THE GENES: | |
$all_introns = &get_intron_positions($genes,$TREEFAM_ID,$ALN_POS,$HMM_POS); | |
# PRINT OUT THE INTRON POSITION FILE: | |
&print_intron_file($all_introns,$no_genes); | |
close(OUTPUT); | |
#------------------------------------------------------------------# | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# SORT INTRONS BY POSITION IN THE HMM: | |
sub by_pos | |
{ | |
$POS{$a} <=> $POS{$b} | |
or | |
$a cmp $b; | |
} | |
#------------------------------------------------------------------# | |
# PRINT OUT THE INTRON POSITION FILE: | |
sub print_intron_file | |
{ | |
my $all_introns = $_[0]; | |
my $no_genes = $_[1]; | |
my @introns; | |
my $intron; | |
my $i; | |
my @temp; | |
my $pos; | |
my $phase; | |
%POS = (); | |
my %SEEN = (); | |
my %COUNT = (); | |
my $count; | |
my @introns2; | |
my $fraction; | |
my $last_intron; | |
my $last_pos; | |
my $intron0; | |
my $intron1; | |
my $intron2; | |
my $count0; | |
my $count1; | |
my $count2; | |
my $frac0; | |
my $frac1; | |
my $frac2; | |
if ($no_genes <= 0) { print STDERR "ERROR: no_genes $no_genes.\n"; exit;} | |
print OUTPUT "# intron position file\n"; | |
print OUTPUT "# columns: position phase 0 phase 1 phase 2\n"; | |
if ($all_introns ne '') | |
{ | |
$all_introns = substr($all_introns,1,length($all_introns)-1); | |
@introns = split(/\,/,$all_introns); | |
for ($i = 0; $i <= $#introns; $i++) | |
{ | |
$intron = $introns[$i]; | |
@temp = split(/_/,$intron); | |
$pos = $temp[0]; # POSITION OF INTRON WRT HMMER HMM. | |
$POS{$intron} = $pos; | |
if (!($SEEN{$intron})) | |
{ | |
$SEEN{$intron} = 1; | |
$COUNT{$intron} = 1; | |
@introns2 = (@introns2,$intron); | |
} | |
else | |
{ | |
$COUNT{$intron}++; | |
} | |
} | |
@introns2 = sort by_pos @introns2; | |
$last_intron = $introns2[$#introns2]; | |
@temp = split(/_/,$last_intron); | |
$last_pos = $temp[0]; | |
# PRINT OUT THE INTRONS AT EACH POSITION IN THE HMM: | |
for ($i = 1; $i <= $last_pos; $i++) | |
{ | |
$intron0 = $i."_0"; | |
$intron1 = $i."_1"; | |
$intron2 = $i."_2"; | |
if ($COUNT{$intron0} || $COUNT{$intron1} || $COUNT{$intron2}) | |
{ | |
if ($COUNT{$intron0}) { $count0 = $COUNT{$intron0};} else { $count0 = 0;} | |
if ($COUNT{$intron1}) { $count1 = $COUNT{$intron1};} else { $count1 = 0;} | |
if ($COUNT{$intron2}) { $count2 = $COUNT{$intron2};} else { $count2 = 0;} | |
$frac0 = $count0/$no_genes; | |
$frac1 = $count1/$no_genes; | |
$frac2 = $count2/$no_genes; | |
print OUTPUT "$i $frac0 $frac1 $frac2\n"; | |
} | |
} | |
} | |
else | |
{ | |
print OUTPUT "no introns\n"; | |
} | |
} | |
#------------------------------------------------------------------# | |
# READ IN THE MAPPING OF THE ALIGNMENT TO THE HMM: | |
sub read_map | |
{ | |
my $map = $_[0]; | |
my $line; | |
my @temp; | |
my $aln; | |
my $length; | |
my $i; | |
my $char; | |
my $aln_pos = 0; | |
my $hmm_pos = 0; | |
my %HMM_POS = (); | |
open(MAP,"$map") || die "ERROR: cannot open $map.\n"; | |
while(<MAP>) | |
{ | |
$line = $_; | |
chomp $line; | |
if ($line =~ /#=GC RF/) | |
{ | |
@temp = split(/\s+/,$line); | |
$aln = $temp[2]; | |
$length = length($aln); | |
for ($i = 0; $i < $length; $i++) | |
{ | |
$char = substr($aln,$i,1); | |
$aln_pos++; | |
if ($char eq 'x') | |
{ | |
$hmm_pos++; | |
} | |
# POSITION $aln_pos IN THE ALIGNMENT MAPS TO POSITION $hmm_pos IN THE HMM: | |
if ($HMM_POS{$aln_pos}) { print STDERR "ERROR: already have hmm_pos for $aln_pos.\n"; exit;} | |
$HMM_POS{$aln_pos} = $hmm_pos; | |
} | |
} | |
} | |
close(MAP); | |
return(\%HMM_POS); | |
} | |
#------------------------------------------------------------------# | |
# GET THE TREEFAM IDs FOR THE GENES: | |
sub get_treefam_ids | |
{ | |
my $genes = $_[0]; | |
my %ID = (); | |
my $i; | |
my $gene; | |
my $dbh; | |
my $table_w; | |
my $st; | |
my $sth; | |
my $rv; | |
my $id; | |
my @array; | |
my $rc; | |
# FIND THE ID OF EACH GENE IN THE TREEFAM MYSQL DATABASE: | |
$dbh = DBI->connect("dbi:mysql:treefam_4:db.treefam.org:3308", 'anonymous', '') || return; | |
# SPECIFY THE TABLE, FOR READING IN THE POSITIONS OF INTRONS: | |
$table_w = 'genes'; | |
for ($i = 0; $i < @$genes; $i++) | |
{ | |
$gene = $genes->[$i]; | |
$st = "SELECT ID from $table_w WHERE GID=?"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute($gene) or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$id = $array[0]; # eg., ENSMUST00000049178.2 | |
$ID{$gene} = $id; | |
} | |
} | |
if (!($ID{$gene})) | |
{ | |
$st = "SELECT ID from $table_w WHERE TID=?"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute($gene) or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$id = $array[0]; # eg., ENSMUST00000049178.2 | |
$ID{$gene} = $id; | |
} | |
} | |
} | |
} | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
return(\%ID); | |
} | |
#------------------------------------------------------------------# | |
# GET THE POSITIONS OF INTRONS IN EACH OF THE GENES: | |
sub get_intron_positions | |
{ | |
my $genes = $_[0]; | |
my $TREEFAM_ID = $_[1]; | |
my $ALN_POS = $_[2]; | |
my $HMM_POS = $_[3]; | |
my $i; | |
my $gene; | |
my $dbh; | |
my $table_w; | |
my $st; | |
my $sth; | |
my $rv; | |
my $id; | |
my @array; | |
my $rc; | |
my $strand; | |
my $start; | |
my $stop; | |
my $introns; | |
my $all_introns = ""; | |
# FIND THE POSITION OF EACH GENE IN THE TREEFAM MYSQL DATABASE: | |
$dbh = DBI->connect("dbi:mysql:treefam_4:db.treefam.org:3308", 'anonymous', '') || return; | |
# SPECIFY THE TABLE, FOR READING IN THE POSITIONS OF INTRONS: | |
$table_w = 'map'; | |
for ($i = 0; $i < @$genes; $i++) | |
{ | |
$gene = $genes->[$i]; | |
if (!($TREEFAM_ID->{$gene})) { print STDERR "ERROR: do not know id for gene $gene.\n"; exit;} | |
$id = $TREEFAM_ID->{$gene}; | |
$st = "SELECT STRAND,START,STOP from $table_w WHERE ID=?"; | |
$sth = $dbh->prepare($st) or die "Cannot prepare $st: $dbh->errstr\n"; | |
$rv = $sth->execute($id) or die "Cannot execute the query: $sth->errstr"; | |
if ($rv >= 1) | |
{ | |
while ((@array) = $sth->fetchrow_array) | |
{ | |
$strand = $array[0]; # eg., + | |
$start = $array[1]; # eg., 144225,144806,145261,145497, | |
$stop = $array[2]; # eg., 144318,145216,145444,145804, | |
# FIND THE INTRONS IN THIS GENE: | |
$introns = &infer_intron_positions($strand,$start,$stop,$gene,$ALN_POS,$HMM_POS); | |
if ($introns ne "") { $all_introns = $all_introns.",".$introns;} | |
} | |
} | |
} | |
$rc = $dbh->disconnect(); | |
$rc = ""; | |
return($all_introns); | |
} | |
#------------------------------------------------------------------# | |
# INFER THE INTRON POSITIONS IN A GENE: | |
sub infer_intron_positions | |
{ | |
my $strand = $_[0]; | |
my $start = $_[1]; # 144225,144806,145261,145497, | |
my $stop = $_[2]; # 144318,145216,145444,145804, | |
my $gene = $_[3]; | |
my $ALN_POS = $_[4]; | |
my $HMM_POS = $_[5]; | |
my @start = split(/\,/,$start); | |
my @stop = split(/\,/,$stop); | |
my $i; | |
my $exon_start; | |
my $exon_stop; | |
my $coding_length; | |
my $hangover; | |
my $intron_phase; | |
my $prev_aa_pos = 0; | |
my @temp; | |
my $aln_pos; | |
my $hmm_pos; | |
my $intron; | |
my $introns = ""; | |
if ($#start != $#stop) { print STDERR "ERROR: start $#start stop $#stop.\n"; exit;} | |
if ($VERBOSE == 1) { print "$gene-------------------------------------------------------\n";} | |
if ($strand eq '+') | |
{ | |
$coding_length = 0; | |
for ($i = 0; $i <= $#start; $i++) | |
{ | |
# FIND THE COORDINATES OF EXON $i: | |
$exon_start = $start[$i] + 1; # +1 TO CORRECT FOR THE WAY EXONS ARE STORED IN THE TREEFAM MYSQL DB. | |
$exon_stop = $stop[$i]; | |
# CALCULATE THE CODING LENGTH SO FAR: | |
$coding_length = $coding_length + ($exon_stop - $exon_start + 1); | |
# WORK OUT THE POSITION OF THE PREVIOUS AMINO ACID: | |
$prev_aa_pos = $coding_length/3; | |
@temp = split(/\./,$prev_aa_pos); | |
$prev_aa_pos = $temp[0]; | |
# LEFT-OVER BASES AFTER THE CODING LENGTH SO FAR HAS BEEN TRANSLATED INTO CODONS: | |
$hangover = $coding_length % 3; | |
if ($hangover == 0) { $intron_phase = 0;} | |
elsif ($hangover == 1) { $intron_phase = 1;} # IF 1 LEFT OVER, THE INTRON LIES AFTER THE 1ST BASE OF CODON (PHASE 1). | |
elsif ($hangover == 2) { $intron_phase = 2;} # IF 2 LEFT OVER, THE INTRON LIES AFTER THE 2ND BASE OF CODON (PHASE 2). | |
if ($i != $#start) | |
{ | |
# FIND THE POSITION OF AMINO ACID $prev_aa_pos IN THE ALIGNMENT: | |
if (!($ALN_POS->{$gene."_".$prev_aa_pos})) { print STDERR "ERROR: do not know aln pos of $gene $prev_aa_pos.\n"; exit;} | |
$aln_pos = $ALN_POS->{$gene."_".$prev_aa_pos}; | |
if ($HMM_POS->{$aln_pos}) # NOTE THAT THERE WILL NOT BE A HMM POS IF THE INTRON IS BEFORE THE HMM STARTS. | |
{ | |
$hmm_pos = $HMM_POS->{$aln_pos}; | |
$intron = $hmm_pos."_".$intron_phase; | |
$introns = $introns.",".$intron; | |
if ($VERBOSE == 1) { print "strand $strand : $exon_start $exon_stop and aa $prev_aa_pos aln_pos $aln_pos hmm_pos $hmm_pos intron_phase $intron_phase\n"; } | |
} | |
} | |
} | |
} | |
elsif ($strand eq '-') | |
{ | |
$coding_length = 0; | |
for ($i = $#start; $i >= 0; $i--) | |
{ | |
# FIND THE COORDINATES OF EXON $i: | |
$exon_start = $start[$i] + 1; # +1 TO CORRECT FOR THE WAY EXONS ARE STORED IN THE TREEFAM MYSQL DB. | |
$exon_stop = $stop[$i]; | |
# CALCULATE THE CODING LENGTH SO FAR: | |
$coding_length = $coding_length + ($exon_stop - $exon_start + 1); | |
# WORK OUT THE POSITION OF THE PREVIOUS AMINO ACID: | |
$prev_aa_pos = $coding_length/3; | |
@temp = split(/\./,$prev_aa_pos); | |
$prev_aa_pos = $temp[0]; | |
# LEFT-OVER BASES AFTER THE CODING LENGTH SO FAR HAS BEEN TRANSLATED INTO CODONS: | |
$hangover = $coding_length % 3; | |
if ($hangover == 0) { $intron_phase = 0;} | |
elsif ($hangover == 1) { $intron_phase = 1;} # IF 1 LEFT OVER, THE INTRON LIES AFTER THE 1ST BASE OF CODON (PHASE 1). | |
elsif ($hangover == 2) { $intron_phase = 2;} # IF 2 LEFT OVER, THE INTRON LIES AFTER THE 2ND BASE OF CODON (PHASE 2). | |
if ($i != 0) | |
{ | |
# FIND THE POSITION OF AMINO ACID $prev_aa_pos IN THE ALIGNMENT: | |
if (!($ALN_POS->{$gene."_".$prev_aa_pos})) { print STDERR "ERROR: do not know aln pos of $gene $prev_aa_pos.\n"; exit;} | |
$aln_pos = $ALN_POS->{$gene."_".$prev_aa_pos}; | |
if ($HMM_POS->{$aln_pos}) # NOTE THAT THERE WILL NOT BE A HMM POS IF THE INTRON IS BEFORE THE HMM STARTS. | |
{ | |
$hmm_pos = $HMM_POS->{$aln_pos}; | |
$intron = $hmm_pos."_".$intron_phase; | |
$introns = $introns.",".$intron; | |
if ($VERBOSE == 1) { print "strand $strand : $exon_start $exon_stop and aa $prev_aa_pos aln_pos $aln_pos hmm_pos $hmm_pos intron_phase $intron_phase\n";} | |
} | |
} | |
} | |
} | |
else | |
{ | |
print STDERR "ERROR: strand $strand.\n"; exit; | |
} | |
if ($introns ne '') { $introns = substr($introns,1,length($introns)-1);} | |
return($introns); | |
} | |
#------------------------------------------------------------------# | |
# READ IN THE INPUT ALIGNMENT FILE, TO GET THE NAMES OF THE PROTEINS: | |
sub read_aln | |
{ | |
my $aln = $_[0]; | |
my $line; | |
my @temp; | |
my $gene; | |
my @genes; | |
my $length; | |
my $i; | |
my $char; | |
my $prot_pos; | |
my $aln_pos; | |
my %ALN_POS = (); | |
my $no_genes = 0; | |
open(ALN,"$aln") || die "ERROR: cannot open $aln.\n"; | |
while(<ALN>) | |
{ | |
$line = $_; | |
chomp $line; | |
if (substr($line,0,1) eq '>') | |
{ | |
@temp = split(/\s+/,$line); | |
$gene = $temp[0]; | |
$gene = substr($gene,1,length($gene)-1); | |
@genes = (@genes,$gene); | |
$no_genes++; | |
$prot_pos = 0; | |
$aln_pos = 0; | |
print "xxx gene $gene\n"; | |
} | |
else | |
{ | |
$length = length($line); | |
for ($i = 0; $i < $length; $i++) | |
{ | |
$char = substr($line,$i,1); | |
$aln_pos++; | |
if ($char ne '-') | |
{ | |
$prot_pos++; | |
if ($ALN_POS{$gene."_".$prot_pos}) { print STDERR "ERROR: already have aln pos for $gene $prot_pos.\n"; exit;} | |
$ALN_POS{$gene."_".$prot_pos} = $aln_pos; | |
} | |
print "xxx gene $gene aln_pos $aln_pos prot_pos $prot_pos\n"; | |
} | |
} | |
} | |
close(ALN); | |
return(\@genes,\%ALN_POS,$no_genes); | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment