Created
October 21, 2013 08:22
-
-
Save avrilcoghlan/7080383 to your computer and use it in GitHub Desktop.
Perl module with functions to process gene-finding data.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package HelminthGenomeAnalysis::AvrilGenefindingUtils; | |
use strict; | |
use warnings; | |
use Bio::Seq; | |
use Bio::SeqIO; | |
use Moose; | |
use Math::Round; # HAS THE nearest() FUNCTION | |
use Carp::Assert; # HAS THE assert() FUNCTION | |
use Scalar::Util qw(looks_like_number); | |
use Math::Round; # CONTAINS FUNCTION nearest() FOR ROUNDING | |
use base 'Exporter'; | |
our @EXPORT_OK = qw( make_exonerate_hints read_eval_stats ); | |
#------------------------------------------------------------------# | |
# SUBROUTINE SYNOPSIS: make_exonerate_hints(): GIVEN AN EXONERATE GFF FILE, MAKE A HINTS FILE FOR AUGUSTUS (NOTE: THE HINTS ARE GIVEN PRIORITY 4): | |
# NOTE: /nfs/users/nfs_a/alc/Documents/bin/augustus/scripts/blat2hints.pl COMES WITH AUGUSTUS AND CONVERTS A BLAT FILE (OF mRNA/EST ALIGNMENTS) | |
# TO A HINTS FILE, AND CUTS 10 bp OFF EACH END OF THE ALIGNMENT TO MAKE THE exonpart HINT. | |
# THE SCRIPT /nfs/users/nfs_a/alc/Documents/bin/augustus/scripts/exonerate2hints.pl COMES WITH AUGUSTUS AND CONVERTS AN exonerate OUTPUT FILE | |
# TO A HINTS FILE, AND CUTS 9 bp OFF EACH END OF EACH exonerate CDS TO MAKE CDSpart HINTS. | |
sub make_exonerate_hints | |
{ | |
my $input_gff = $_[0]; # EXONERATE GFF FILE | |
my $hints_file = $_[1]; # OUTPUT HINTS FILE | |
my $type = $_[2]; # TYPE OF EXONERATE ALIGNMENT ('est'/'prot') | |
my $minintronlen = $_[3]; # MINIMUM INTRON LENGTH | |
my $maxintronlen = $_[4]; # MAXIMUM INTRON LENGTH | |
my $CDSpart_cutoff = $_[5]; # NUMBER OF BASE-PAIRS TO CUT OFF EACH exonerate CDS/exon TO MAKE CDSpart/exonpart HINTS. | |
my $line; # | |
my @temp; # | |
my $scaffold; # SCAFFOLD NAME | |
my $feature_type; # FEATURE TYPE | |
my $start; # FEATURE START | |
my $end; # FEATURE END | |
my $strand; # FEATURE STRAND | |
my $score; # FEATURE SCORE | |
my $gene; # GENE NAME | |
my $name; # FEATURE NAME | |
my $input_gff_file; # NAME OF THE INPUT GFF FILE | |
my $intronlen; # INTRON LENGTH | |
my $tmp; # VARIABLE FOR STORING TEMPORARY VALUE | |
# CHECK THAT $type IS 'prot' OR 'est': | |
throw Error::Simple("ERRORCODE=2: make_exonerate_hints: type $type - should be prot/est") if ($type ne 'prot' && $type ne 'est'); # TESTED FOR | |
# FIND THE NAME OF THE INPUT GFF FILE: | |
@temp = split(/\//,$input_gff); | |
$input_gff_file = $temp[$#temp]; | |
# OPEN THE OUTPUT FILE: | |
open(HINTSFILE,">$hints_file") || die "ERROR: make_exonerate_hints: cannot open $hints_file\n"; | |
# READ IN THE INPUT GFF FILE: | |
open(INPUT_GFF,"$input_gff") || die "ERROR: make_exonerate_hints: cannot open $input_gff\n"; | |
while(<INPUT_GFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\t+/,$line); | |
# THROW AN ERROR IF THERE ARE NOT 9 COLUMNS: | |
throw Error::Simple("ERRORCODE=1: make_exonerate_hints: gff $input_gff does not have 9 columns") if ($#temp != 8); # TESTED FOR | |
# PTRK_scaffold1_size6144505 exonerate:protein2genome:local gene 610226 611381 1295 + . ID=gene1;Parent=gene1 | |
# PTRK_scaffold1_size6144505 exonerate:protein2genome:local CDS 610226 610399 . + . ID=gene1:cds;Parent=gene1 | |
# PTRK_scaffold1_size6144505 exonerate:protein2genome:local exon 610226 610399 . + . ID=gene1:exon:1;Parent=gene1 | |
# PTRK_scaffold1_size6144505 exonerate:protein2genome:local splice5 610400 610401 . + . ID=gene1:splice5;Parent=gene1 | |
# PTRK_scaffold1_size6144505 exonerate:protein2genome:local intron 610400 610445 . + . ID=gene1:intron;Parent=gene1 | |
# PTRK_scaffold1_size6144505 exonerate:protein2genome:local splice3 610444 610445 . + . ID=gene1:splice3;Parent=gene1 | |
# PTRK_scaffold1_size6144505 exonerate:protein2genome:local CDS 610446 610521 . + . ID=gene1:cds;Parent=gene1 | |
# PTRK_scaffold1_size6144505 exonerate:protein2genome:local exon 610446 610521 . + . ID=gene1:exon:2;Parent=gene1 | |
# PTRK_scaffold1_size6144505 exonerate:protein2genome:local splice5 610522 610523 . + . ID=gene1:splice5;Parent=gene1 | |
# PTRK_scaffold1_size6144505 exonerate:protein2genome:local intron 610522 610623 . + . ID=gene1:intron;Parent=gene1 | |
# PTRK_scaffold1_size6144505 exonerate:protein2genome:local splice3 610622 610623 . + . ID=gene1:splice3;Parent=gene1 | |
# PTRK_scaffold1_size6144505 exonerate:protein2genome:local CDS 610624 611381 . + . ID=gene1:cds;Parent=gene1 | |
# PTRK_scaffold1_size6144505 exonerate:protein2genome:local exon 610624 611381 . + . ID=gene1:exon:3;Parent=gene1 | |
# PTRK_scaffold1_size6144505 exonerate:protein2genome:local similarity 610226 611381 1295 + . alignment_id 0 ; Query SRAE_2000311000.t1:mRNA ; Align 24781 1 108 ; Align 24895 37 60 ; Align 25001 57 75 ; Align 25181 83 756 | |
$scaffold = $temp[0]; | |
$feature_type = $temp[2]; | |
$start = $temp[3]; | |
$end = $temp[4]; | |
if ($feature_type eq 'CDS' || $feature_type eq 'exon' || $feature_type eq 'intron') | |
{ | |
# $start SHOULD BE BEFORE $end: | |
assert($start <= $end); | |
$score = $temp[5]; | |
$strand = $temp[6]; | |
$name = $temp[8]; | |
# GET THE GENE NAME: | |
@temp = split(/Parent=/,$name); | |
$gene = $temp[1]; # eg. gene1 | |
$gene = $input_gff_file."_".$gene; | |
if ($feature_type eq 'CDS') # CODING EXON | |
{ | |
# IF IT IS A PROTEIN ALIGNMENT, WE WANT TO USE THE CODING EXONS AS 'CDSpart' HINTS: | |
if ($type eq 'prot') | |
{ | |
$start = $start + $CDSpart_cutoff; | |
$end = $end - $CDSpart_cutoff; | |
if ($start > $end) | |
{ | |
$tmp = int(($start + $end)/2); | |
$start = $tmp; | |
$end = $tmp; | |
} | |
print HINTSFILE "$scaffold\texoneratehints\tCDSpart\t$start\t$end\t.\t$strand\t.\tgroup=$gene; source=P; priority=4\n"; | |
} | |
} | |
elsif ($feature_type eq 'exon') # EXON | |
{ | |
# IF IT IS AN EST ALIGNMENT, WE WANT TO USE THE 'exon' FEATURES AS 'exonpart' HINTS: | |
if ($type eq 'est') | |
{ | |
$start = $start + $CDSpart_cutoff; | |
$end = $end - $CDSpart_cutoff; | |
if ($start > $end) | |
{ | |
$tmp = int(($start + $end)/2); | |
$start = $tmp; | |
$end = $tmp; | |
} | |
print HINTSFILE "$scaffold\texoneratehints\texonpart\t$start\t$end\t.\t$strand\t.\tgroup=$gene; source=E; priority=4\n"; | |
} | |
} | |
elsif ($feature_type eq 'intron') # INTRON | |
{ | |
$intronlen = $end - $start + 1; | |
if ($intronlen >= $minintronlen && $intronlen <= $maxintronlen) | |
{ | |
# IF IT IS A PROTEIN OR EST ALIGNMENT, WE WANT TO USE THE INTRONS AS 'intron' HINTS: | |
if ($type eq 'prot') | |
{ | |
print HINTSFILE "$scaffold\texoneratehints\tintron\t$start\t$end\t0\t$strand\t.\tgroup=$gene; source=P; priority=4\n"; | |
} | |
elsif ($type eq 'est') | |
{ | |
print HINTSFILE "$scaffold\texoneratehints\tintron\t$start\t$end\t0\t$strand\t.\tgroup=$gene; source=E; priority=4\n"; | |
} | |
} | |
} | |
} | |
} | |
close(INPUT_GFF); | |
close(HINTSFILE); | |
return(1); | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE SYNOPSIS: read_eval_stats: GIVEN AN EVAL OUTPUT FILE, RETURN A LOT OF STATISTICS: | |
sub read_eval_stats | |
{ | |
my $eval_file = $_[0]; # EVAL FILE | |
my $num_genes = -1; # NUMBER OF GENES | |
my $num_transcripts = -1; # NUMBER OF TRANSCRIPTS | |
my $av_prot_len = -1; # AVERAGE PROTEIN LENGTH IN AMINO ACIDS | |
my $av_exon_len = -1; # AVERAGE EXON LENGTH IN BASE-PAIRS | |
my $median_exon_len = -1; # MEDIAN EXON LENGTH IN BASE-PAIRS | |
my $av_exons = -1; # AVERAGE EXONS PER TRANSCRIPT | |
my $median_exons = -1; # MEDIAN EXONS PER TRANSCRIPT | |
my $av_intron_len = -1; # AVERAGE INTRON LENGTH | |
my $median_intron_len = -1; # MEDIAN INTRON LENGTH | |
my $proteome_len = -1; # PROTEOME LENGTH IN AMINO ACIDS | |
my $line; # | |
my @temp; # | |
my $type = "NA"; # TYPE OF FEATURE | |
my $subtype = "NA"; # SUBTYPE OF FEATURE | |
open(EVAL,"$eval_file") || die "ERROR: read_eval_stats: cannot open $eval_file\n"; | |
while(<EVAL>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\s+/,$line); | |
if (substr($line,0,4) eq 'Gene') { $type = 'Gene'; } | |
elsif (substr($line,0,10) eq 'Transcript') { $type = 'Transcript';} | |
elsif (substr($line,0,4) eq 'Exon') { $type = 'Exon'; } | |
elsif (substr($line,0,3) eq 'Nuc') { $type = 'Nuc'; } | |
elsif (substr($line,0,6) eq 'Signal') { $type = 'Signal'; } | |
# GET THE NUMBER OF GENES AND TRANSCRIPTS: | |
if ($type eq 'Gene') | |
{ | |
# Count 27189.00 | |
if ($line =~ /Count/) | |
{ | |
assert($num_genes == -1); | |
$num_genes = $temp[$#temp]; | |
} | |
# Total Transcripts 27189.00 | |
if ($line =~ /Total Transcripts/) | |
{ | |
assert($num_transcripts == -1); | |
$num_transcripts = $temp[$#temp]; | |
} | |
} | |
# GET THE AVERAGE PROTEIN LENGTH, AND TOTAL PROTEOME LENGTH, AND AVERAGE AND MEDIAN EXONS PER GENE: | |
if ($type eq 'Transcript') | |
{ | |
if ($line =~ /All/) { $subtype = 'All'; } | |
elsif ($line =~ /Complete/) { $subtype = 'Complete'; } | |
# Average Coding Length 2135.66 | |
if ($line =~ /Average Coding Length/ && $subtype eq 'All') | |
{ | |
assert($av_prot_len == -1); | |
$av_prot_len = $temp[$#temp] / 3; # DIVIDE BY 3 TO GET AMINO ACIDS | |
$av_prot_len = nearest(0.01, $av_prot_len); | |
} | |
# Total Coding Length 14689651.00 | |
if ($line =~ /Total Coding Length/ && $subtype eq 'All') | |
{ | |
assert($proteome_len == -1); | |
$proteome_len = $temp[$#temp] / 3; # DIVIDE BY 3 TO GET AMINO ACIDS | |
$proteome_len = nearest(0.01, $proteome_len); | |
} | |
# Ave Exons Per 3.31 | |
if ($line =~ /Ave Exons Per/ && $subtype eq 'All') | |
{ | |
assert($av_exons == -1); | |
$av_exons = $temp[$#temp]; | |
} | |
# Med Exons Per 3.00 | |
if ($line =~ /Med Exons Per/ && $subtype eq 'All') | |
{ | |
assert($median_exons == -1); | |
$median_exons = $temp[$#temp]; | |
} | |
} | |
# GET THE AVERAGE EXON LENGTH, MEDIAN EXON LENGTH, AND AVERAGE AND MEDIAN INTRON LENGTH: | |
if ($type eq 'Exon') | |
{ | |
if ($line =~ /All/) { $subtype = 'All'; } | |
elsif ($line =~ /Initial/) { $subtype = 'Initial'; } | |
elsif ($line =~ /Intron/) { $subtype = 'Intron'; } | |
elsif ($line =~ /InframeOptional/) { $subtype = 'InframeOptional';} | |
# Average Length 163.56 | |
if ($line =~ /Average Length/ && $subtype eq 'All') | |
{ | |
assert($av_exon_len == -1); | |
$av_exon_len = $temp[$#temp]; # IN BASE-PAIRS | |
} | |
# Median Length 145.00 | |
if ($line =~ /Median Length/ && $subtype eq 'All') | |
{ | |
assert($median_exon_len == -1); | |
$median_exon_len = $temp[$#temp]; # IN BASE-PAIRS | |
} | |
# Average Length 683.62 | |
if ($line =~ /Average Length/ && $subtype eq 'Intron') | |
{ | |
assert($av_intron_len == -1); | |
$av_intron_len = $temp[$#temp]; | |
} | |
# Median Length 520.00 | |
if ($line =~ /Median Length/ && $subtype eq 'Intron') | |
{ | |
assert($median_intron_len == -1); | |
$median_intron_len = $temp[$#temp]; | |
} | |
} | |
} | |
close(EVAL); | |
assert(looks_like_number($num_genes)); assert($num_genes != -1); | |
assert(looks_like_number($num_transcripts)); assert($num_transcripts != -1); | |
assert(looks_like_number($av_prot_len)); assert($av_prot_len != -1); | |
assert(looks_like_number($av_exon_len)); assert($av_exon_len != -1); | |
assert(looks_like_number($median_exon_len)); assert($median_exon_len != -1); | |
assert(looks_like_number($av_exons)); assert($av_exons != -1); | |
assert(looks_like_number($median_exons)); assert($median_exons != -1); | |
assert(looks_like_number($av_intron_len)); assert($av_intron_len != -1); | |
assert(looks_like_number($median_intron_len)); assert($median_intron_len != -1); | |
assert(looks_like_number($proteome_len)); assert($proteome_len != -1); | |
return($num_genes,$num_transcripts,$av_prot_len,$av_exon_len,$median_exon_len,$av_exons,$median_exons,$av_intron_len,$median_intron_len,$proteome_len); | |
} | |
#------------------------------------------------------------------# | |
1; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment