Skip to content

Instantly share code, notes, and snippets.

@avrilcoghlan
Created October 21, 2013 08:22
Show Gist options
  • Save avrilcoghlan/7080383 to your computer and use it in GitHub Desktop.
Save avrilcoghlan/7080383 to your computer and use it in GitHub Desktop.
Perl module with functions to process gene-finding data.
package HelminthGenomeAnalysis::AvrilGenefindingUtils;
use strict;
use warnings;
use Bio::Seq;
use Bio::SeqIO;
use Moose;
use Math::Round; # HAS THE nearest() FUNCTION
use Carp::Assert; # HAS THE assert() FUNCTION
use Scalar::Util qw(looks_like_number);
use Math::Round; # CONTAINS FUNCTION nearest() FOR ROUNDING
use base 'Exporter';
our @EXPORT_OK = qw( make_exonerate_hints read_eval_stats );
#------------------------------------------------------------------#
# SUBROUTINE SYNOPSIS: make_exonerate_hints(): GIVEN AN EXONERATE GFF FILE, MAKE A HINTS FILE FOR AUGUSTUS (NOTE: THE HINTS ARE GIVEN PRIORITY 4):
# NOTE: /nfs/users/nfs_a/alc/Documents/bin/augustus/scripts/blat2hints.pl COMES WITH AUGUSTUS AND CONVERTS A BLAT FILE (OF mRNA/EST ALIGNMENTS)
# TO A HINTS FILE, AND CUTS 10 bp OFF EACH END OF THE ALIGNMENT TO MAKE THE exonpart HINT.
# THE SCRIPT /nfs/users/nfs_a/alc/Documents/bin/augustus/scripts/exonerate2hints.pl COMES WITH AUGUSTUS AND CONVERTS AN exonerate OUTPUT FILE
# TO A HINTS FILE, AND CUTS 9 bp OFF EACH END OF EACH exonerate CDS TO MAKE CDSpart HINTS.
sub make_exonerate_hints
{
my $input_gff = $_[0]; # EXONERATE GFF FILE
my $hints_file = $_[1]; # OUTPUT HINTS FILE
my $type = $_[2]; # TYPE OF EXONERATE ALIGNMENT ('est'/'prot')
my $minintronlen = $_[3]; # MINIMUM INTRON LENGTH
my $maxintronlen = $_[4]; # MAXIMUM INTRON LENGTH
my $CDSpart_cutoff = $_[5]; # NUMBER OF BASE-PAIRS TO CUT OFF EACH exonerate CDS/exon TO MAKE CDSpart/exonpart HINTS.
my $line; #
my @temp; #
my $scaffold; # SCAFFOLD NAME
my $feature_type; # FEATURE TYPE
my $start; # FEATURE START
my $end; # FEATURE END
my $strand; # FEATURE STRAND
my $score; # FEATURE SCORE
my $gene; # GENE NAME
my $name; # FEATURE NAME
my $input_gff_file; # NAME OF THE INPUT GFF FILE
my $intronlen; # INTRON LENGTH
my $tmp; # VARIABLE FOR STORING TEMPORARY VALUE
# CHECK THAT $type IS 'prot' OR 'est':
throw Error::Simple("ERRORCODE=2: make_exonerate_hints: type $type - should be prot/est") if ($type ne 'prot' && $type ne 'est'); # TESTED FOR
# FIND THE NAME OF THE INPUT GFF FILE:
@temp = split(/\//,$input_gff);
$input_gff_file = $temp[$#temp];
# OPEN THE OUTPUT FILE:
open(HINTSFILE,">$hints_file") || die "ERROR: make_exonerate_hints: cannot open $hints_file\n";
# READ IN THE INPUT GFF FILE:
open(INPUT_GFF,"$input_gff") || die "ERROR: make_exonerate_hints: cannot open $input_gff\n";
while(<INPUT_GFF>)
{
$line = $_;
chomp $line;
@temp = split(/\t+/,$line);
# THROW AN ERROR IF THERE ARE NOT 9 COLUMNS:
throw Error::Simple("ERRORCODE=1: make_exonerate_hints: gff $input_gff does not have 9 columns") if ($#temp != 8); # TESTED FOR
# PTRK_scaffold1_size6144505 exonerate:protein2genome:local gene 610226 611381 1295 + . ID=gene1;Parent=gene1
# PTRK_scaffold1_size6144505 exonerate:protein2genome:local CDS 610226 610399 . + . ID=gene1:cds;Parent=gene1
# PTRK_scaffold1_size6144505 exonerate:protein2genome:local exon 610226 610399 . + . ID=gene1:exon:1;Parent=gene1
# PTRK_scaffold1_size6144505 exonerate:protein2genome:local splice5 610400 610401 . + . ID=gene1:splice5;Parent=gene1
# PTRK_scaffold1_size6144505 exonerate:protein2genome:local intron 610400 610445 . + . ID=gene1:intron;Parent=gene1
# PTRK_scaffold1_size6144505 exonerate:protein2genome:local splice3 610444 610445 . + . ID=gene1:splice3;Parent=gene1
# PTRK_scaffold1_size6144505 exonerate:protein2genome:local CDS 610446 610521 . + . ID=gene1:cds;Parent=gene1
# PTRK_scaffold1_size6144505 exonerate:protein2genome:local exon 610446 610521 . + . ID=gene1:exon:2;Parent=gene1
# PTRK_scaffold1_size6144505 exonerate:protein2genome:local splice5 610522 610523 . + . ID=gene1:splice5;Parent=gene1
# PTRK_scaffold1_size6144505 exonerate:protein2genome:local intron 610522 610623 . + . ID=gene1:intron;Parent=gene1
# PTRK_scaffold1_size6144505 exonerate:protein2genome:local splice3 610622 610623 . + . ID=gene1:splice3;Parent=gene1
# PTRK_scaffold1_size6144505 exonerate:protein2genome:local CDS 610624 611381 . + . ID=gene1:cds;Parent=gene1
# PTRK_scaffold1_size6144505 exonerate:protein2genome:local exon 610624 611381 . + . ID=gene1:exon:3;Parent=gene1
# PTRK_scaffold1_size6144505 exonerate:protein2genome:local similarity 610226 611381 1295 + . alignment_id 0 ; Query SRAE_2000311000.t1:mRNA ; Align 24781 1 108 ; Align 24895 37 60 ; Align 25001 57 75 ; Align 25181 83 756
$scaffold = $temp[0];
$feature_type = $temp[2];
$start = $temp[3];
$end = $temp[4];
if ($feature_type eq 'CDS' || $feature_type eq 'exon' || $feature_type eq 'intron')
{
# $start SHOULD BE BEFORE $end:
assert($start <= $end);
$score = $temp[5];
$strand = $temp[6];
$name = $temp[8];
# GET THE GENE NAME:
@temp = split(/Parent=/,$name);
$gene = $temp[1]; # eg. gene1
$gene = $input_gff_file."_".$gene;
if ($feature_type eq 'CDS') # CODING EXON
{
# IF IT IS A PROTEIN ALIGNMENT, WE WANT TO USE THE CODING EXONS AS 'CDSpart' HINTS:
if ($type eq 'prot')
{
$start = $start + $CDSpart_cutoff;
$end = $end - $CDSpart_cutoff;
if ($start > $end)
{
$tmp = int(($start + $end)/2);
$start = $tmp;
$end = $tmp;
}
print HINTSFILE "$scaffold\texoneratehints\tCDSpart\t$start\t$end\t.\t$strand\t.\tgroup=$gene; source=P; priority=4\n";
}
}
elsif ($feature_type eq 'exon') # EXON
{
# IF IT IS AN EST ALIGNMENT, WE WANT TO USE THE 'exon' FEATURES AS 'exonpart' HINTS:
if ($type eq 'est')
{
$start = $start + $CDSpart_cutoff;
$end = $end - $CDSpart_cutoff;
if ($start > $end)
{
$tmp = int(($start + $end)/2);
$start = $tmp;
$end = $tmp;
}
print HINTSFILE "$scaffold\texoneratehints\texonpart\t$start\t$end\t.\t$strand\t.\tgroup=$gene; source=E; priority=4\n";
}
}
elsif ($feature_type eq 'intron') # INTRON
{
$intronlen = $end - $start + 1;
if ($intronlen >= $minintronlen && $intronlen <= $maxintronlen)
{
# IF IT IS A PROTEIN OR EST ALIGNMENT, WE WANT TO USE THE INTRONS AS 'intron' HINTS:
if ($type eq 'prot')
{
print HINTSFILE "$scaffold\texoneratehints\tintron\t$start\t$end\t0\t$strand\t.\tgroup=$gene; source=P; priority=4\n";
}
elsif ($type eq 'est')
{
print HINTSFILE "$scaffold\texoneratehints\tintron\t$start\t$end\t0\t$strand\t.\tgroup=$gene; source=E; priority=4\n";
}
}
}
}
}
close(INPUT_GFF);
close(HINTSFILE);
return(1);
}
#------------------------------------------------------------------#
# SUBROUTINE SYNOPSIS: read_eval_stats: GIVEN AN EVAL OUTPUT FILE, RETURN A LOT OF STATISTICS:
sub read_eval_stats
{
my $eval_file = $_[0]; # EVAL FILE
my $num_genes = -1; # NUMBER OF GENES
my $num_transcripts = -1; # NUMBER OF TRANSCRIPTS
my $av_prot_len = -1; # AVERAGE PROTEIN LENGTH IN AMINO ACIDS
my $av_exon_len = -1; # AVERAGE EXON LENGTH IN BASE-PAIRS
my $median_exon_len = -1; # MEDIAN EXON LENGTH IN BASE-PAIRS
my $av_exons = -1; # AVERAGE EXONS PER TRANSCRIPT
my $median_exons = -1; # MEDIAN EXONS PER TRANSCRIPT
my $av_intron_len = -1; # AVERAGE INTRON LENGTH
my $median_intron_len = -1; # MEDIAN INTRON LENGTH
my $proteome_len = -1; # PROTEOME LENGTH IN AMINO ACIDS
my $line; #
my @temp; #
my $type = "NA"; # TYPE OF FEATURE
my $subtype = "NA"; # SUBTYPE OF FEATURE
open(EVAL,"$eval_file") || die "ERROR: read_eval_stats: cannot open $eval_file\n";
while(<EVAL>)
{
$line = $_;
chomp $line;
@temp = split(/\s+/,$line);
if (substr($line,0,4) eq 'Gene') { $type = 'Gene'; }
elsif (substr($line,0,10) eq 'Transcript') { $type = 'Transcript';}
elsif (substr($line,0,4) eq 'Exon') { $type = 'Exon'; }
elsif (substr($line,0,3) eq 'Nuc') { $type = 'Nuc'; }
elsif (substr($line,0,6) eq 'Signal') { $type = 'Signal'; }
# GET THE NUMBER OF GENES AND TRANSCRIPTS:
if ($type eq 'Gene')
{
# Count 27189.00
if ($line =~ /Count/)
{
assert($num_genes == -1);
$num_genes = $temp[$#temp];
}
# Total Transcripts 27189.00
if ($line =~ /Total Transcripts/)
{
assert($num_transcripts == -1);
$num_transcripts = $temp[$#temp];
}
}
# GET THE AVERAGE PROTEIN LENGTH, AND TOTAL PROTEOME LENGTH, AND AVERAGE AND MEDIAN EXONS PER GENE:
if ($type eq 'Transcript')
{
if ($line =~ /All/) { $subtype = 'All'; }
elsif ($line =~ /Complete/) { $subtype = 'Complete'; }
# Average Coding Length 2135.66
if ($line =~ /Average Coding Length/ && $subtype eq 'All')
{
assert($av_prot_len == -1);
$av_prot_len = $temp[$#temp] / 3; # DIVIDE BY 3 TO GET AMINO ACIDS
$av_prot_len = nearest(0.01, $av_prot_len);
}
# Total Coding Length 14689651.00
if ($line =~ /Total Coding Length/ && $subtype eq 'All')
{
assert($proteome_len == -1);
$proteome_len = $temp[$#temp] / 3; # DIVIDE BY 3 TO GET AMINO ACIDS
$proteome_len = nearest(0.01, $proteome_len);
}
# Ave Exons Per 3.31
if ($line =~ /Ave Exons Per/ && $subtype eq 'All')
{
assert($av_exons == -1);
$av_exons = $temp[$#temp];
}
# Med Exons Per 3.00
if ($line =~ /Med Exons Per/ && $subtype eq 'All')
{
assert($median_exons == -1);
$median_exons = $temp[$#temp];
}
}
# GET THE AVERAGE EXON LENGTH, MEDIAN EXON LENGTH, AND AVERAGE AND MEDIAN INTRON LENGTH:
if ($type eq 'Exon')
{
if ($line =~ /All/) { $subtype = 'All'; }
elsif ($line =~ /Initial/) { $subtype = 'Initial'; }
elsif ($line =~ /Intron/) { $subtype = 'Intron'; }
elsif ($line =~ /InframeOptional/) { $subtype = 'InframeOptional';}
# Average Length 163.56
if ($line =~ /Average Length/ && $subtype eq 'All')
{
assert($av_exon_len == -1);
$av_exon_len = $temp[$#temp]; # IN BASE-PAIRS
}
# Median Length 145.00
if ($line =~ /Median Length/ && $subtype eq 'All')
{
assert($median_exon_len == -1);
$median_exon_len = $temp[$#temp]; # IN BASE-PAIRS
}
# Average Length 683.62
if ($line =~ /Average Length/ && $subtype eq 'Intron')
{
assert($av_intron_len == -1);
$av_intron_len = $temp[$#temp];
}
# Median Length 520.00
if ($line =~ /Median Length/ && $subtype eq 'Intron')
{
assert($median_intron_len == -1);
$median_intron_len = $temp[$#temp];
}
}
}
close(EVAL);
assert(looks_like_number($num_genes)); assert($num_genes != -1);
assert(looks_like_number($num_transcripts)); assert($num_transcripts != -1);
assert(looks_like_number($av_prot_len)); assert($av_prot_len != -1);
assert(looks_like_number($av_exon_len)); assert($av_exon_len != -1);
assert(looks_like_number($median_exon_len)); assert($median_exon_len != -1);
assert(looks_like_number($av_exons)); assert($av_exons != -1);
assert(looks_like_number($median_exons)); assert($median_exons != -1);
assert(looks_like_number($av_intron_len)); assert($av_intron_len != -1);
assert(looks_like_number($median_intron_len)); assert($median_intron_len != -1);
assert(looks_like_number($proteome_len)); assert($proteome_len != -1);
return($num_genes,$num_transcripts,$av_prot_len,$av_exon_len,$median_exon_len,$av_exons,$median_exons,$av_intron_len,$median_intron_len,$proteome_len);
}
#------------------------------------------------------------------#
1;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment