Created
August 15, 2013 11:10
-
-
Save avrilcoghlan/6240052 to your computer and use it in GitHub Desktop.
AvrilGffUtils.pm
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package HelminthGenomeAnalysis::AvrilGffUtils; | |
use strict; | |
use warnings; | |
use Math::Round; # HAS THE nearest() FUNCTION | |
use Carp::Assert; # HAS THE assert() FUNCTION | |
use Scalar::Util qw(looks_like_number); | |
use base 'Exporter'; | |
our @EXPORT_OK = qw( read_gene_ids replace_gene_ids_in_gff get_gff_lines_for_feature_types add_flanking_region_to_gff_features make_fasta_for_gff make_gff_for_features_in_subsequences read_genenames_for_transcripts get_gene_name add_gene_features_to_gff_for_subsequences make_gff_for_features_in_sequences sort_gff merge_overlapping_cds sort_gff_lines_for_genes count_features_in_region convert_exonerate_gff_to_standard_gff ); | |
#------------------------------------------------------------------# | |
# SUBROUTINE SYNOPSIS: make_fasta_for_gff(): MAKE A FASTA FILE OF SEQUENCES SPECIFIED BY A GFF FILE. | |
sub make_fasta_for_gff | |
{ | |
my $input_gff = $_[0]; # INPUT GFF FILE | |
my $input_fasta = $_[1]; # INPUT FASTA FILE | |
my $output_fasta = $_[2]; # OUTPUT FASTA FILE | |
my $outputdir = $_[3]; # DIRECTORY TO WRITE OUTPUT FILES IN | |
my $cmd; # COMMAND TO RUN | |
my $systemcall; # RETURN VALUE FROM SYSTEM CALL | |
my $temp_output_fasta; # TEMPORARY OUTPUT FASTA FILE | |
my $line; # | |
my @temp; # | |
my $region; # NAME OF REGION IN FASTA FILE | |
my $scaffold; # SCAFFOLD OF REGION IN FASTA FILE | |
my $start; # START OF REGION IN FASTA FILE IN $scaffold | |
my $end; # END OF REGION IN FASTA FILE IN $scaffold | |
$temp_output_fasta = HelminthGenomeAnalysis::AvrilFileUtils::make_filename($outputdir); # MAKE A TEMPORARY FILE IN THE CURRENT DIRECTORY | |
# MAKE A FASTA FILE WITH THE SEQUENCES SPECIFIED BY $input_gff: | |
$cmd = "bedtools getfasta -fi $input_fasta -bed $input_gff -fo $temp_output_fasta"; | |
$systemcall = system "$cmd"; | |
# THROW AN ERROR IF THE RETURN VALUE FROM THE SYSTEM CALL IS NOT 0: | |
throw Error::Simple("ERRORCODE=1: add_flanking_region_to_gff_features: return value $systemcall when tried to run $cmd") if ($systemcall != 0); # TESTED FOR THIS | |
system "sleep 1"; # SLEEP FOR 1 SECOND TO MAKE SURE THE OUTPUT IS COMPLETED | |
# NOTE: THE OUTPUT FASTA FILE $output_fasta HAS THE COORDINATES LIKE IN BED FILES: | |
# BED FILES ARE ZERO-BASED, HALF-OPEN: http://code.google.com/p/bedtools/wiki/FAQ#What_does_zero-based,_half-open_mean? | |
# WE WANT TO RE-WRITE THESE COORDINATES SO THEY ARE 1-BASED: | |
open(OUTPUT_FASTA,">$output_fasta") || die "ERROR: make_fasta_for_gff: cannot open $output_fasta\n"; | |
open(TEMP_OUTPUT_FASTA,"$temp_output_fasta") || die "ERROR: make_fasta_for_gff: cannot open $temp_output_fasta\n"; | |
while(<TEMP_OUTPUT_FASTA>) | |
{ | |
$line = $_; | |
chomp $line; | |
if (substr($line,0,1) eq '>') | |
{ | |
@temp = split(/\s+/,$line); | |
$region = $temp[0]; | |
$region = substr($region,1,length($region)-1); # eg. scaff1:0-7 | |
@temp = split(/:/,$region); | |
$scaffold = $temp[0]; # eg. scaff1 | |
$start = $temp[1]; # eg. 0-7 | |
@temp = split(/-/,$start); | |
$start = $temp[0]; # eg. 0 | |
$end = $temp[1]; # eg. 7 | |
# CHANGE 0-BASED START COORDINATE TO A 1-BASED START COORDINATE: | |
$start = $start + 1; | |
print OUTPUT_FASTA ">$scaffold:$start-$end\n"; | |
} | |
else { print OUTPUT_FASTA "$line\n";} | |
} | |
close(TEMP_OUTPUT_FASTA); | |
return(1); | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE SYNOPSIS: make_gff_for_features_in_subsequences(): GIVEN AN INPUT GFF OF FEATURES (eg. GENES) IN SCAFFOLDS, AND A SECOND INPUT GFF OF SUBSEQUENCES OF THE SCAFFOLDS, MAKES AN OUTPUT GFF OF FEATURES (eg. GENES) IN THE SUBSEQUENCES. | |
sub make_gff_for_features_in_subsequences | |
{ | |
my $input_gff = $_[0]; # INPUT GFF FILE OF FEATURES IN SCAFFOLDS | |
my $input_gff2 = $_[1]; # INPUT GFF FILE OF SUBSEQUENCES OF SCAFFOLDS | |
my $output_gff = $_[2]; # OUTPUT GFF FILE OF FEATURES IN SUBSEQUENCES | |
my $line; # | |
my @temp; # | |
my $feature; # FEATURE NAME | |
my %START = (); # START POSITIONS OF FEATURES IN SCAFFOLDS | |
my %END = (); # END POSITIONS OF FEATURES IN SCAFFOLDS | |
my $feature_start_in_scaff; # START OF FEATURE $feature IN THE SCAFFOLD | |
my $feature_end_in_scaff; # END OF FEATURE $feature IN THE SCAFFOLD | |
my $subsequence_start_in_scaff; # START OF THE SUBSEQUENCE IN THE SCAFFOLD | |
my $subsequence_end_in_scaff; # END OF THE SUBSEQUENCE IN THE SCAFFOLD | |
my $feature_start_in_subsequence;# START OF THE FEATURE $feature IN THE SUBSEQUENCE | |
my $feature_end_in_subsequence; # END OF THE FEATURE $feature IN THE SUBSEQUENCE | |
my $scaffold; # SCAFFOLD NAME | |
my $feature_length; # FEATURE LENGTH | |
my $diff_in_start; # DIFFERENCE IN THE START OF A FEATURE BETWEEN $input_gff AND $output_gff | |
my %DIFF_IN_START = (); # HASH TABLE TO STORE $diff_in_start | |
# READ IN THE GFF FILE OF FEATURES IN SCAFFOLDS: | |
open(INPUT_GFF,"$input_gff") || die "ERROR: make_gff_for_features_in_subsequences: cannot open $input_gff\n"; | |
while(<INPUT_GFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
# eg. scaff1 source feature 20 30 . + . feature1 | |
@temp = split(/\t+/,$line); | |
$feature_start_in_scaff = $temp[3]; | |
$feature_end_in_scaff= $temp[4]; | |
$feature = $temp[8]; | |
# THROW AN ERROR IF WE ALREADY HAVE STORED THE START POSITION OF $feature: | |
throw Error::Simple("ERRORCODE=1: make_gff_for_features_in_subsequences: already stored start for $feature") if (defined($START{$feature})); # TESTED FOR | |
$START{$feature} = $feature_start_in_scaff; | |
# THROW AN ERROR IF WE ALREADY HAVE STORED THE END POSITION OF $feature: | |
throw Error::Simple("ERRORCODE=2: make_gff_for_features_in_subsequences: already stored end for $feature") if (defined($END{$feature})); # TESTED FOR | |
$END{$feature} = $feature_end_in_scaff; | |
} | |
close(INPUT_GFF); | |
# OPEN THE OUTPUT GFF FILE: | |
open(OUTPUT_GFF,">$output_gff") || die "ERROR: make_gff_for_features_in_subsequences: cannot open $output_gff\n"; | |
# READ IN THE GFF FILE OF SUBSEQUENCES IN SCAFFOLDS: | |
open(INPUT_GFF2,"$input_gff2") || die "ERROR: make_gff_for_features_in_subsequences: cannot open $input_gff2\n"; | |
while(<INPUT_GFF2>) | |
{ | |
$line = $_; | |
chomp $line; | |
# eg. scaff1 source feature 15 35 . + . feature1 | |
@temp = split(/\t+/,$line); | |
$scaffold = $temp[0]; | |
$subsequence_start_in_scaff = $temp[3]; | |
$subsequence_end_in_scaff = $temp[4]; | |
$feature = $temp[8]; | |
# THROW AN ERROR IF WE DO NOT KNOW THE START OF THIS FEATURE IN THE SCAFFOLD: | |
assert(defined($START{$feature})); # THIS SHOULD NEVER HAPPEN, PROGRAM WILL DIE IF IT DOES. | |
$feature_start_in_scaff = $START{$feature}; | |
# THROW AN ERROR IF WE DO NOT KNOW THE END OF THIS FEATURE IN THE SCAFFOLD: | |
assert(defined($END{$feature})); # THIS SHOULD NEVER HAPPEN, PROGRAM WILL DIE IF IT DOES. | |
$feature_end_in_scaff = $END{$feature}; | |
$feature_length = $feature_end_in_scaff - $feature_start_in_scaff; | |
# FIND THE START AND END OF THE FEATURE WITH RESPECT TO THE SUBSEQUENCE: | |
$feature_start_in_subsequence = $feature_start_in_scaff - $subsequence_start_in_scaff + 1; | |
# eg. feature_start_in_scaff=20, subsequence_start_in_scaff=15 ---> 20-15+1 = 6 | |
$feature_end_in_subsequence = $feature_start_in_subsequence + $feature_length; | |
# eg. feature_end_in_scaff = 30, feature_length = 10, feature_end_in_subsequence = 6 + 10 = 16 | |
print OUTPUT_GFF "$scaffold:$subsequence_start_in_scaff-$subsequence_end_in_scaff\t$temp[1]\t$temp[2]\t$feature_start_in_subsequence\t$feature_end_in_subsequence\t$temp[5]\t$temp[6]\t$temp[7]\t$temp[8]\n"; | |
# RECORD THE DIFFERENCE IN THE START OF THE FEATURE BETWEEN $input_gff AND $output_gff: | |
$diff_in_start = $feature_start_in_scaff - $feature_start_in_subsequence; | |
assert(!(defined($DIFF_IN_START{$feature}))); # THIS SHOULD NEVER HAPPEN, PROGRAM WILL DIE IF IT DOES. | |
$DIFF_IN_START{$feature} = $diff_in_start; | |
} | |
close(INPUT_GFF2); | |
close(OUTPUT_GFF); | |
return(\%DIFF_IN_START); | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE SYNOPSIS: add_flanking_region_to_gff_features(): ADD flank_size ON EITHER SIDE OF EACH GFF FEATURE IN AN INPUT GFF FILE, TO MAKE A NEW OUTPUT GFF. | |
sub add_flanking_region_to_gff_features | |
{ | |
my $input_gff = $_[0]; # INPUT GFF FILE | |
my $flank_size = $_[1]; # FLANKING REGION SIZE | |
my $scaffold_length_file= $_[2]; # FILE WITH LENGTHS OF SCAFFOLDS | |
my $output_gff = $_[3]; # OUTPUT GFF FILE | |
my $cmd; # COMMAND TO RUN | |
my $systemcall; # RETURN VALUE FROM SYSTEM CALL | |
my $line; # | |
my @temp; # | |
# MAKE AN OUTPUT GFF BY ADDING A FLANKING REGION OF $flank_size bp TO EACH SIDE OF EACH FEATURE IN $input_gff: | |
$cmd = "bedtools slop -i $input_gff -g $scaffold_length_file -b $flank_size > $output_gff"; | |
$systemcall = system "$cmd"; | |
# THROW AN ERROR IF THE RETURN VALUE FROM THE SYSTEM CALL IS NOT 0: | |
throw Error::Simple("ERRORCODE=1: add_flanking_region_to_gff_features: return value $systemcall when tried to run $cmd") if ($systemcall != 0); # TESTED FOR THIS | |
system "sleep 1"; # SLEEP FOR 1 SECOND TO MAKE SURE THE OUTPUT IS COMPLETED | |
return(1); | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE SYNOPSIS: get_gff_lines_for_feature_types(): EXTRACT THE GFF LINES FOR PARTICULAR FEATURE TYPES FROM A GFF FILE, AND PUT IN A NEW GFF FILE. THIS DOES NOT INCLUDE FASTA SEQUENCE IN THE OUTPUT GFF. | |
sub get_gff_lines_for_feature_types | |
{ | |
my $input_gff = $_[0]; # INPUT GFF FILE | |
my $feature_types = $_[1]; # ARRAY OF FEATURE TYPES TO TAKE | |
my $output_gff = $_[2]; # OUTPUT GFF FILE | |
my $new_gff; # NEW GFF FILE | |
my $line; # | |
my @temp; # | |
my %TAKE = (); # HASH OF FEATURE TYPES TO TAKE | |
my $i; # | |
my $feature_type; # A FEATURE TYPE | |
my $end_of_gff = 0; # SAYS WHETHER WE HAVE REACHED THE END OF THE INPUT GFF FILE | |
# RECORD THE FEATURE TYPES TO TAKE IN A HASH: | |
for ($i = 0; $i < @$feature_types; $i++) | |
{ | |
$feature_type = $feature_types->[$i]; | |
$TAKE{$feature_type} = 1; | |
} | |
# OPEN THE OUTPUT GFF: | |
open(OUTPUT_GFF,">$output_gff") || die "ERROR: get_gff_lines_for_feature_types: cannot open output_gff $output_gff\n"; | |
# READ THROUGH THE INPUT GFF: | |
open(INPUT_GFF,"$input_gff") || die "ERROR: get_gff_lines_for_feature_types: cannot open input_gff $input_gff\n"; | |
while(<INPUT_GFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
if ($line eq '##FASTA') { $end_of_gff = 1;} | |
if (substr($line,0,1) ne '#' && $end_of_gff == 0) | |
{ | |
@temp = split(/\t+/,$line); | |
# THROW AN EXCEPTION IF THERE ARE NOT 9 COLUMNS IN THE GFF LINE: | |
throw Error::Simple("ERRORCODE=1: get_gff_lines_for_feature_types: do not have 9 columns in $input_gff: line $line") if ($#temp != 8); # TESTED FOR | |
$feature_type = $temp[2]; | |
if ($TAKE{$feature_type}) { print OUTPUT_GFF "$line\n";} | |
} | |
} | |
close(INPUT_GFF); | |
close(OUTPUT_GFF); | |
return(1); | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE SYNOPSIS: read_gene_ids(): READ IN A FILE OF NEW VERSUS OLD GENE IDs, AND RETURN A HASH: | |
sub read_gene_ids | |
{ | |
my $new_gene_ids = $_[0]; # FILE WITH NEW VERSUS OLD GENE IDS | |
my %OLD2NEW = (); # HASH TABLE TO STORE NEW GENE IDS | |
my $line; # | |
my @temp; # | |
my $new; # NEW GENE ID | |
my $old; # OLD GENE ID | |
# READ IN THE FILE OF OLD VERSUS NEW GENE IDS: | |
# eg. | |
# SRAE_0036900 (old) = SRAE_0000000100 (new) | |
# SRAE_0037000 (old) = SRAE_0000000200 (new) | |
open(GENE_IDS,"$new_gene_ids") || die "ERROR: read_gene_ids: cannot open $new_gene_ids\n"; | |
while(<GENE_IDS>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\s+/,$line); | |
$new = $temp[3]; | |
$old = $temp[0]; | |
# THROW AN EXCEPTION IF WE ALREADY HAVE A NEW GENE ID. FOR $old: | |
throw Error::Simple("ERRORCODE=1: read_gene_ids: already have new gene id. for old gene id. $old") if (defined($OLD2NEW{$old})); # TESTED FOR | |
$OLD2NEW{$old} = $new; | |
} | |
close(GENE_IDS); | |
return(\%OLD2NEW); | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE SYNOPSIS: replace_gene_ids_in_gff(): READ IN AN OLD GFF FILE, AND REPLACE THE GENE NAMES, WRITE OUT A NEW GFF FILE: | |
sub replace_gene_ids_in_gff | |
{ | |
my $input_gff = $_[0]; # INPUT GFF FILE NAME | |
my $output_gff = $_[1]; # OUTPUT GFF FILE NAME | |
my $OLD2NEW = $_[2]; # HASH TABLE OF NEW GENE IDS. | |
my $line; # | |
my @temp; # | |
my @temp2; # | |
my $feature; # GFF FEATURE | |
my $gene; # GENE IN GFF LINE | |
my $new_gene; # NEW GENE NAME | |
my $name; # LAST COLUMN OF GFF LINE | |
my $transcript; # TRANSCRIPT FOR GENE | |
my $cds; # CDS FOR GENE | |
# OPEN THE OUTPUT GFF FILE: | |
open(OUTPUT_GFF,">$output_gff") || die "ERROR: replace_gene_ids_in_gff: cannot open $output_gff\n"; | |
# READ IN THE INPUT GFF FILE: | |
open(INPUT_GFF,"$input_gff") || die "ERROR: replace_gene_ids_in_gff: cannot open $input_gff\n"; | |
while(<INPUT_GFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\t+/,$line); | |
$feature = $temp[2]; | |
if ($feature eq 'gene') | |
{ | |
$gene = $temp[8]; # eg. ID=SRAE_2424300 | |
@temp2 = split(/ID=/,$gene); | |
$gene = $temp2[1]; # eg. SRAE_2424300 | |
# THROW AN EXCEPTION IF WE DO NOT HAVE THE NEW GENE ID. FOR $gene: | |
throw Error::Simple("ERRORCODE=2: replace_gene_ids_in_gff: do not know new gene id. for $gene") if !(defined($OLD2NEW->{$gene})); # TESTED FOR | |
$new_gene = $OLD2NEW->{$gene}; | |
print OUTPUT_GFF "$temp[0]\t$temp[1]\t$temp[2]\t$temp[3]\t$temp[4]\t$temp[5]\t$temp[6]\t$temp[7]\tID=$new_gene\n"; | |
} | |
elsif ($feature eq 'mRNA') | |
{ | |
$name = $temp[8]; # eg. ID=SRAE_2424300.t1:mRNA;Parent=SRAE_2424300 | |
@temp2 = split(/ID=/,$name); | |
$transcript = $temp2[1]; # eg. SRAE_2424300.t1:mRNA;Parent=SRAE_2424300 | |
@temp2 = split(/\;/,$transcript); | |
$transcript = $temp2[0]; # eg. SRAE_2424300.t1:mRNA | |
@temp2 = split(/\./,$transcript); | |
$gene = $temp2[0]; # eg. SRAE_2424300 | |
$transcript = $temp2[1]; # .t1:mRNA | |
# THROW AN EXCEPTION IF WE DO NOT HAVE THE NEW GENE ID. FOR $gene: | |
throw Error::Simple("ERRORCODE=2: replace_gene_ids_in_gff: do not know new gene id. for $gene") if !(defined($OLD2NEW->{$gene})); # TESTED FOR | |
$new_gene = $OLD2NEW->{$gene}; | |
print OUTPUT_GFF "$temp[0]\t$temp[1]\t$temp[2]\t$temp[3]\t$temp[4]\t$temp[5]\t$temp[6]\t$temp[7]\tID=$new_gene.$transcript;Parent=$new_gene\n"; | |
} | |
elsif ($feature eq 'CDS') | |
{ | |
$name = $temp[8]; # ID=SRAE_2424300.t1:exon:1;Parent=SRAE_2424300.t1:mRNA | |
@temp2 = split(/ID=/,$name); | |
$cds = $temp2[1]; # SRAE_2424300.t1:exon:1;Parent=SRAE_2424300.t1:mRNA | |
@temp2 = split(/\;/,$cds); | |
$cds = $temp2[0]; # eg. SRAE_X125400.t1:exon:1 | |
@temp2 = split(/\./,$cds); | |
$gene = $temp2[0]; # eg. SRAE_X125400 | |
$cds = $temp2[1]; # eg. .t1:exon:1 | |
@temp2 = split(/Parent=/,$name); | |
$transcript = $temp2[1]; # eg. SRAE_2424300.t1:mRNA | |
@temp2 = split(/\./,$transcript); | |
$gene = $temp2[0]; # eg. SRAE_2424300 | |
$transcript = $temp2[1]; # eg. t1:mRNA | |
# THROW AN EXCEPTION IF WE DO NOT HAVE THE NEW GENE ID. FOR $gene: | |
throw Error::Simple("ERRORCODE=2: replace_gene_ids_in_gff: do not know new gene id. for $gene") if !(defined($OLD2NEW->{$gene})); # TESTED FOR | |
$new_gene = $OLD2NEW->{$gene}; | |
print OUTPUT_GFF "$temp[0]\t$temp[1]\t$temp[2]\t$temp[3]\t$temp[4]\t$temp[5]\t$temp[6]\t$temp[7]\tID=$new_gene.$cds;Parent=$new_gene.$transcript\n"; | |
} | |
else | |
{ | |
# THROW AN EXCEPTION IF THE FEATURE IS NOT OF TYPE 'gene'/'mRNA'/'CDS': | |
throw Error::Simple("ERRORCODE=3: replace_gene_ids_in_gff: feature $feature"); # TESTED FOR | |
} | |
} | |
close(INPUT_GFF); | |
close(OUTPUT_GFF); | |
return(1); | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE SYNOPSIS: add_gene_features_to_gff_for_subsequences(): GIVEN A GFF OF GENES, AND A SECOND GFF OF THE GENES WITH RESPECT TO SUBSEQUENCES OF THE SCAFFOLDS, AND A HASH SAYING HOW MUCH THE START OF THE GENES DIFFER BETWEEN THE FIRST AND SECOND GFFs, MAKE AN OUTPUT GFF WITH ALL THE GENES' FEATURES (eg. EXONS, etc.) WITH RESPECT TO THE SUBSEQUENCES OF SCAFFOLDS: | |
sub add_gene_features_to_gff_for_subsequences | |
{ | |
my $input_gff = $_[0]; # INPUT GFF FILE OF GENE FEATURES IN SCAFFOLDS | |
my $gene_gff = $_[1]; # SECOND GFF FILE, OF THE GENES WITH RESPECT TO SUBSEQUENCES OF THE SCAFFOLDS | |
my $DIFF_IN_START = $_[2]; # HASH TABLE SAYING HOW MUCH THE START OF THE GENES DIFFER BETWEEN THE FIRST AND SECOND GFFs | |
my $output_gff = $_[3]; # OUTPUT GFF FILE | |
my $TRANSCRIPT2GENE = $_[4]; # HASH TABLE OF GENE NAMES FOR TRANSCRIPT NAMES | |
my $line; # | |
my @temp; # | |
my $feature_type; # FEATURE TYPE | |
my $feature_name; # FEATURE NAME | |
my $feature_start; # FEATURE START POSITION | |
my $feature_end; # FEATURE END POSITION | |
my $gene; # GENE NAME | |
my $diff_in_start; # DIFFERENCE IN START OF A GENE BETWEEN THE FIRST AND SECOND GFFs | |
my $scaffold; # SCAFFOLD NAME | |
my %SCAFFOLD = (); # HASH TABLE TO RECORD SCAFFOLD NAME FOR A GENE | |
my $feature_length; # LENGTH OF A FEATURE | |
# OPEN THE OUTPUT GFF FILE: | |
open(OUTPUT_GFF,">$output_gff") || die "ERROR: add_gene_features_to_gff_for_subsequences: cannot open output_gff $output_gff\n"; | |
# PUT ALL THE LINES FROM $gene_gff IN $output_gff: | |
open(GENE_GFF,"$gene_gff") || die "ERROR: add_gene_features_to_gff_for_subsequences: cannot open $gene_gff\n"; | |
while(<GENE_GFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\t+/,$line); | |
$scaffold = $temp[0]; | |
$gene = $temp[8]; | |
# THROW AN EXCEPTION IF WE ALREADY HAVE STORED A SCAFFOLD NAME FOR THIS GENE: | |
throw Error::Simple("ERRORCODE=3: add_gene_features_to_gff_for_subsequences: already stored a scaffold name $SCAFFOLD{$gene} for gene $gene, scaffold $scaffold") if (defined($SCAFFOLD{$gene})); # TESTED FOR | |
$SCAFFOLD{$gene} = $scaffold; | |
print OUTPUT_GFF "$line\n"; | |
} | |
close(GENE_GFF); | |
# OPEN THE INPUT GFF FILE: | |
open(INPUT_GFF,"$input_gff") || die "ERROR: add_gene_features_to_gff_for_subsequences: cannot open $input_gff\n"; | |
while(<INPUT_GFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\s+/,$line); | |
# eg. | |
# SSTP.contig.00005.803316 source CDS 8340 9119 . + . T1.SSTP.contig.00005.803316_1_1;Parent=T1.SSTP.contig.00005.803316_1_mRNA | |
# SSTP.contig.00005.803316 source gene 8171 9119 . + . T1.SSTP.contig.00005.803316_1 | |
# SSTP.contig.00005.803316 source mRNA 8171 9119 . + . T1.SSTP.contig.00005.803316_1_mRNA;Parent=T1.SSTP.contig.00005.803316_1 | |
$scaffold = $temp[0]; | |
$feature_type = $temp[2]; | |
$feature_start = $temp[3]; | |
$feature_end = $temp[4]; | |
$feature_length = $feature_end - $feature_start + 1; | |
$feature_name = $temp[8]; | |
if ($feature_type eq 'CDS' || $feature_type eq 'mRNA' || $feature_type eq 'gene') | |
{ | |
if ($feature_type eq 'CDS' || $feature_type eq 'mRNA') | |
{ | |
# GET THE GENE NAME: | |
$gene = HelminthGenomeAnalysis::AvrilGffUtils::get_gene_name($feature_type,$feature_name,$TRANSCRIPT2GENE,$scaffold); | |
# THROW AN EXCEPTION IF WE DO NOT KNOW THE DIFFERENCE IN START FOR THIS GENE: | |
throw Error::Simple("ERRORCODE=2: add_gene_features_to_gff_for_subsequences: do not know diff_in_start for $gene") if !(defined($DIFF_IN_START->{$gene})); # TESTED FOR | |
$diff_in_start = $DIFF_IN_START->{$gene}; | |
$feature_start = $feature_start - $diff_in_start; | |
$feature_end = $feature_start + $feature_length - 1; | |
# GET THE SCAFFOLD NAME: | |
throw Error::Simple("ERRORCODE=4: add_gene_features_to_gff_for_subsequences: do not know scaffold for gene $gene") if !(defined($SCAFFOLD{$gene})); # TESTED FOR | |
$scaffold = $SCAFFOLD{$gene}; | |
print OUTPUT_GFF "$scaffold\t$temp[1]\t$temp[2]\t$feature_start\t$feature_end\t$temp[5]\t$temp[6]\t$temp[7]\t$temp[8]\n"; | |
} | |
} | |
else | |
{ | |
# THROW AN EXCEPTION IF THE FEATURE IS NOT OF TYPE 'gene'/'mRNA'/'CDS': | |
throw Error::Simple("ERRORCODE=1: add_gene_features_to_gff_for_subsequences: feature_type $feature_type"); # TESTED FOR | |
} | |
} | |
close(INPUT_GFF); | |
close(OUTPUT_GFF); | |
return(1); | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE SYNOPSIS get_gene_name(): GIVEN A FEATURE NAME IN A GFF FILE, RETURN THE GENE NAME: | |
sub get_gene_name | |
{ | |
my $feature_type = $_[0]; # FEATURE TYPE | |
my $feature_name = $_[1]; # FEATURE NAME | |
my $TRANSCRIPT2GENE = $_[2]; # HASH TABLE OF TRANSCRIPT NAMES TO GENE NAMES | |
my $scaffold = $_[3]; # THE SCAFFOLD NAME | |
my @temp; # | |
my $transcript; # TRANSCRIPT NAME | |
my $gene; # GENE NAME | |
if ($feature_type eq 'CDS') | |
{ | |
# eg. T1.SSTP.contig.00005.803316_1_1;Parent=T1.SSTP.contig.00005.803316_1_mRNA | |
# eg. ID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1 | |
if ($feature_name =~ /ID/) | |
{ | |
@temp = split(/ID=/,$feature_name); | |
$feature_name = $temp[1]; | |
} | |
@temp = split(/Parent=/,$feature_name); | |
$transcript = $temp[1]; # eg. T1.SSTP.contig.00005.803316_1_mRNA | |
@temp = split(/\;/,$transcript); | |
$transcript = $temp[0]; | |
$transcript = $scaffold."=".$transcript; | |
# THROW AN EXCEPTION IF THE GENE NAME IS NOT KNOWN FOR THIS TRANSCRIPT: | |
throw Error::Simple("ERRORCODE=1: get_gene_name: gene name is not known for transcript $transcript") if !(defined($TRANSCRIPT2GENE->{$transcript})); # TESTED FOR | |
$gene = $TRANSCRIPT2GENE->{$transcript}; | |
} | |
elsif ($feature_type eq 'exon') | |
{ | |
# eg. ID=CGOC_0001473701-mRNA-1:exon:1;Parent=CGOC_0001473701-mRNA-1 | |
if ($feature_name =~ /ID/) | |
{ | |
@temp = split(/ID=/,$feature_name); | |
$feature_name = $temp[1]; | |
} | |
@temp = split(/Parent=/,$feature_name); | |
$transcript = $temp[1]; # eg. T1.SSTP.contig.00005.803316_1_mRNA | |
@temp = split(/\;/,$transcript); | |
$transcript = $temp[0]; | |
$transcript = $scaffold."=".$transcript; | |
# THROW AN EXCEPTION IF THE GENE NAME IS NOT KNOWN FOR THIS TRANSCRIPT: | |
throw Error::Simple("ERRORCODE=2: get_gene_name: gene name is not known for transcript $transcript") if !(defined($TRANSCRIPT2GENE->{$transcript})); # TESTED FOR | |
$gene = $TRANSCRIPT2GENE->{$transcript}; | |
} | |
elsif ($feature_type eq 'mRNA') | |
{ | |
# eg. T1.SSTP.contig.00005.803316_1_mRNA;Parent=T1.SSTP.contig.00005.803316_1 | |
# eg. ID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392 | |
if ($feature_name =~ /ID/) | |
{ | |
@temp = split(/ID=/,$feature_name); | |
$feature_name = $temp[1]; | |
} | |
@temp = split(/Parent=/,$feature_name); | |
$gene = $temp[1]; # eg. augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392 | |
@temp = split(/\;/,$gene); | |
$gene = $temp[0]; # eg. augustus_masked-scaff_000010-processed-gene-0.3 | |
} | |
elsif ($feature_type eq 'three_prime_UTR' || $feature_type eq 'five_prime_UTR') | |
{ | |
# eg. ID=CGOC_0000014201-mRNA-1:five_prime_utr;Parent=CGOC_0000014201-mRNA-1 | |
if ($feature_name =~ /ID/) | |
{ | |
@temp = split(/ID=/,$feature_name); | |
$feature_name = $temp[1]; | |
} | |
@temp = split(/Parent=/,$feature_name); | |
$transcript = $temp[1]; # eg. CGOC_0000014201-mRNA-1 | |
@temp = split(/\;/,$transcript); | |
$transcript = $temp[0]; | |
$transcript = $scaffold."=".$transcript; | |
# THROW AN EXCEPTION IF THE GENE NAME IS NOT KNOWN FOR THIS TRANSCRIPT: | |
throw Error::Simple("ERRORCODE=3: get_gene_name: gene name is not known for transcript $transcript") if !(defined($TRANSCRIPT2GENE->{$transcript})); | |
$gene = $TRANSCRIPT2GENE->{$transcript}; | |
} | |
elsif ($feature_type eq 'gene') | |
{ | |
# eg. ID=CGOC_0001473701;Name=CGOC_0001473701 | |
if ($feature_name =~ /ID/) | |
{ | |
@temp = split(/ID=/,$feature_name); | |
$feature_name = $temp[1]; | |
} | |
@temp = split(/\;/,$feature_name); | |
$gene = $temp[0]; # eg. CGOC_0001473701 | |
} | |
else | |
{ | |
throw Error::Simple("ERRORCODE=2: get_gene_name: feature_type $feature_type"); # TESTED FOR | |
} | |
return($gene); | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE SYNOPSIS read_genenames_for_transcripts(): READS IN THE GENE NAME FOR EACH TRANSCRIPT IN AN INPUT GFF FILE, AND RETURNS IT IN A HASH TABLE (WITH KEYS AS 'scaffold=transcript'): | |
sub read_genenames_for_transcripts | |
{ | |
my $gff = $_[0]; # INPUT GFF FILE | |
my %TRANSCRIPT2GENE = (); # HASH TABLE OF GENE NAMES FOR TRANSCRIPT NAMES | |
my $line; # | |
my @temp; # | |
my $end_of_gff = 0; # SAYS WHETHER WE HAVE REACHED THE END OF THE GFF FILE | |
my $scaffold; # NAME OF THE SCAFFOLD | |
my $gene; # GENE NAME | |
my $transcript; # TRANSCRIPT NAME | |
open(GFF,"$gff") || die "ERROR: read_genenames_for_transcripts: cannot open gff $gff\n"; | |
while(<GFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
if ($line eq '##FASTA') { $end_of_gff = 1;} | |
if (substr($line,0,1) ne '#' && $end_of_gff == 0) | |
{ | |
@temp = split(/\t+/,$line); | |
# THROW AN EXCEPTION IF THE GFF LINE DOES NOT HAVE 9 COLUMNS: | |
throw Error::Simple("ERRORCODE=1 read_genenames_for_transcripts: line $line of gff $gff does not have 9 columns") if ($#temp != 8); # TESTED FOR | |
# IF THIS IS A LINE WITH A TRANSCRIPT ON IT: | |
if ($temp[2] eq 'mRNA') | |
{ | |
# eg. scaff_000010 maker mRNA 88793 89828 . + . ID=maker-scaff_000010-augustus-gene-0.21-mRNA-1;Parent=maker-scaff_000010-augustus-gene-0.21;... | |
# eg. SSTP.contig.00005.803316 source mRNA 8171 9119 . + . T1.SSTP.contig.00005.803316_1_mRNA;Parent=T1.SSTP.contig.00005.803316_1 | |
$scaffold = $temp[0]; | |
$gene = $temp[8]; | |
# GET THE NAME OF THE TRANSCRIPT: | |
$transcript = $temp[8]; | |
if ($transcript =~ /ID=/) | |
{ | |
@temp = split(/ID=/,$transcript); | |
$transcript = $temp[1]; # eg. maker-scaff_000010-augustus-gene-0.21-mRNA-1;... | |
} | |
@temp = split(/\;/,$transcript); | |
$transcript = $temp[0]; # eg. maker-scaff_000010-augustus-gene-0.21-mRNA-1 | |
$transcript = $scaffold."=".$transcript; | |
# GET THE NAME OF THE GENE: | |
@temp = split(/Parent=/,$gene); | |
$gene = $temp[1]; # eg. maker-scaff_000010-augustus-gene-0.21;... | |
@temp = split(/\;/,$gene); | |
$gene = $temp[0]; # eg. maker-scaff_000010-augustus-gene-0.21 | |
# RECORD THE GENE NAME FOR THE mRNA: | |
throw Error::Simple("ERRORCODE=2 read_genenames_for_transcripts: already have gene name for transcript $transcript") if (defined($TRANSCRIPT2GENE{$transcript})); # TESTED FOR | |
$TRANSCRIPT2GENE{$transcript} = $gene; | |
} | |
} | |
} | |
return(\%TRANSCRIPT2GENE); | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE SYNOPSIS: make_gff_for_features_in_sequence(): GIVEN AN INPUT GFF OF FEATURES (eg. GENES) IN SUBSEQUENCES OF A SCAFFOLD, AND THE START POSITION OF THE SUBSEQUENCES IN THE SCAFFOLDS, MAKES AN OUTPUT GFF OF FEATURES (eg. GENES) IN THE SEQUENCES. | |
sub make_gff_for_features_in_sequences | |
{ | |
my $input_gff = $_[0]; # INPUT GFF OF FEATURES IN SUBSEQUENCES OF SCAFFOLDS | |
my $input_gff2 = $_[1]; # INPUT GFF OF SUBSEQUENCES IN SCAFFOLDS | |
my $output_gff = $_[2]; # OUTPUT GFF OF FEATURES IN SCAFFOLD SEQUENCES | |
my $line; # | |
my @temp; # | |
my $feature_start_in_subsequence;# START OF THE FEATURE IN THE SUBSEQUENCE OF THE SCAFFOLD | |
my $feature_end_in_subsequence; # END OF THE FEATURE IN THE SUBSEQUENCE OF THE SCAFFOLD | |
my $feature; # FEATURE NAME | |
my $subsequence_start_in_scaff; # START OF A SUBSEQUENCE IN A SCAFFOLD | |
my $subsequence; # NAME OF A SUBSEQUECE | |
my %START = (); # START POSITIONS OF SUBSEQUENCES IN SCAFFOLDS | |
my $scaffold; # SCAFFOLD NAME | |
my $feature_start; # FEATURE START IN SCAFFOLD | |
my $feature_end; # FEATURE END IN SCAFFOLD | |
my @temp2; # | |
# OPEN THE OUTPUT GFF FILE: | |
open(OUTPUT_GFF,">$output_gff") || die "ERROR: make_gff_for_features_in_sequences: cannot open $output_gff\n"; | |
# READ IN THE GFF FILE OF SUBSEQUENCES IN SCAFFOLDS: | |
open(INPUT_GFF2,"$input_gff2") || die "ERROR: make_gff_for_features_in_sequences: cannot open $input_gff2\n"; | |
while(<INPUT_GFF2>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\t+/,$line); | |
$subsequence_start_in_scaff = $temp[3]; | |
$subsequence = $temp[8]; # eg. scaffold1=10=20 | |
# THROW AN ERROR IF WE HAVE ALREADY STORED THE START POSITION OF $subsequence: | |
throw Error::Simple("ERRORCODE=1: make_gff_for_features_in_sequences: already stored start for $subsequence") if (defined($START{$subsequence})); # TESTED FOR | |
# THROW AN ERROR IF WE ALREADY HAVE STORED THE END POSITION OF $subsequence: | |
$START{$subsequence} = $subsequence_start_in_scaff; | |
} | |
close(INPUT_GFF2); | |
# READ IN THE GFF FILE OF FEATURES IN SUBSEQUENCES OF SCAFFOLDS: | |
open(INPUT_GFF, "$input_gff") || die "ERROR: make_gff_for_features_in_sequences: cannot open $input_gff\n"; | |
while(<INPUT_GFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\t+/,$line); | |
$subsequence = $temp[0]; # eg. scaffold1=10=20 | |
$feature_start_in_subsequence = $temp[3]; | |
$feature_end_in_subsequence = $temp[4]; | |
$feature = $temp[8]; | |
assert(defined($START{$subsequence})); | |
$subsequence_start_in_scaff = $START{$subsequence}; | |
# FIND THE START AND END OF THE FEATURE WITH RESPECT TO THE SCAFFOLD: | |
$feature_start = $feature_start_in_subsequence + $subsequence_start_in_scaff - 1; | |
$feature_end = $feature_end_in_subsequence + $subsequence_start_in_scaff - 1; | |
# eg. feature_start_in_subsequence = 2, subsequence_start_in_scaff = 10 ---> 10+2-1 = 11 | |
@temp2 = split(/=/,$subsequence); | |
$scaffold = $temp2[0]; # eg. scaffold1 | |
print OUTPUT_GFF "$scaffold\t$temp[1]\t$temp[2]\t$feature_start\t$feature_end\t$temp[5]\t$temp[6]\t$temp[7]\t$temp[8]\n"; | |
} | |
close(INPUT_GFF); | |
close(OUTPUT_GFF); | |
return(1); | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE SYNOPSIS: sort_gff: SORTS A GFF FILE BY CHROMOSOME, THEN BY THE LAST COLUMN (eg. GENE NAME), THEN BY START POINT (4th COLUMN): | |
sub sort_gff | |
{ | |
my $gff = $_[0]; # INPUT GFF FILE | |
my $sorted_gff = $_[1]; # NAME OF SORTED GFF FILE | |
my $gff_type = $_[2]; # GFF TYPE: CAN BE 'exon' OR 'CDS' | |
my $cmd; # COMMAND TO RUN | |
my $systemcall; # RETURN VALUE FROM SYSTEM CALL TO RUN $cmd | |
my $temp_gff; # TEMPORARY GFF FILE | |
my $temp_gff2; # TEMPORARY GFF FILE | |
my $line; # | |
my @temp; # | |
# THROW AN ERROR IF $gff_type IS NOT 'exon' OR 'CDS': | |
throw Error::Simple("ERRORCODE=1: sort_gff: gff_type is $gff_type (should be exon/CDS") if ($gff_type ne 'exon' && $gff_type ne 'CDS'); # TESTED FOR | |
if ($gff_type eq 'CDS') | |
{ | |
$cmd = "sort $gff -k1,1 -k9,9 -k4,4n > $sorted_gff"; | |
$systemcall = system "$cmd"; | |
# THROW AN ERROR IF THE RETURN VALUE FROM THE SYSTEM CALL IS NOT 0: | |
throw Error::Simple("ERRORCODE=2: sort_gff: return value $systemcall when tried to run $cmd") if ($systemcall != 0); | |
system "sleep 1"; # SLEEP FOR 1 SECOND TO MAKE SURE THE OUTPUT IS COMPLETED | |
} | |
elsif ($gff_type eq 'exon') | |
{ | |
# eg. ID=CGOC_0000249501-mRNA-1:exon:1;Parent=CGOC_0000249501-mRNA-1 IN LAST COLUMN. | |
# BREAK LAST COLUMN INTO TWO COLUMNS: | |
$temp_gff = HelminthGenomeAnalysis::AvrilFileUtils::make_filename('.'); | |
$cmd = "cat $gff | tr '\;' '\t' > $temp_gff"; | |
$systemcall = system "$cmd"; | |
# THROW AN ERROR IF THE RETURN VALUE FROM THE SYSTEM CALL IS NOT 0: | |
throw Error::Simple("ERRORCODE=3: sort_gff: return value $systemcall when tried to run $cmd") if ($systemcall != 0); | |
system "sleep 1"; # SLEEP FOR 1 SECOND TO MAKE SURE THE OUTPUT IS COMPLETED | |
# SORT $temp_gff BY CHROMOSOME, THEN GENE (TRANSCRIPT) NAME, THEN START POINT: | |
$temp_gff2 = HelminthGenomeAnalysis::AvrilFileUtils::make_filename('.'); | |
$cmd = "sort $temp_gff -k1,1 -k10,10 -k4,4n > $temp_gff2"; | |
$systemcall = system "$cmd"; | |
# THROW AN ERROR IF THE RETURN VALUE FROM THE SYSTEM CALL IS NOT 0: | |
throw Error::Simple("ERRORCODE=4: sort_gff: return value $systemcall when tried to run $cmd") if ($systemcall != 0); | |
system "sleep 1"; # SLEEP FOR 1 SECOND TO MAKE SURE THE OUTPUT IS COMPLETED | |
# STICK THE LAST TWO COLUMNS OF $temp_gff2 TOGETHER IN $output: | |
open(OUTPUT,">$sorted_gff") || die "ERROR: sort_gff: cannot open $sorted_gff\n"; | |
open(TEMP_GFF2,"$temp_gff2") || die "ERROR: sort_gff: cannot open $temp_gff2\n"; | |
while(<TEMP_GFF2>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\t+/,$line); | |
print OUTPUT "$temp[0]\t$temp[1]\t$temp[2]\t$temp[3]\t$temp[4]\t$temp[5]\t$temp[6]\t$temp[7]\t$temp[8]\;$temp[9]\n"; | |
} | |
close(TEMP_GFF2); | |
close(OUTPUT); | |
# DELETE TEMPORARY FILES: | |
system "rm -f $temp_gff"; | |
system "rm -f $temp_gff2"; | |
} | |
return(1); | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE SYNOPSIS: merge_overlapping_cds: MERGE OVERLAPPING CDS (EXON) FEATURES, OR CDS FEATURES THAT HAVE A 0-bp INTRON BETWEEN THEM: | |
sub merge_overlapping_cds | |
{ | |
my $sorted_cds_gff = $_[0]; # SORTED CDS GFF FILE | |
my $sorted_exon_gff = $_[1]; # SORTED EXON GFF FILE | |
my $input_gff = $_[2]; # THE INPUT GFF FILE | |
my $output_gff = $_[3]; # THE OUTPUT GFF FILE | |
my $outputdir = $_[4]; # DIRECTORY TO WRITE OUTPUT FILES IN | |
my $scaffold_length_file= $_[5]; # FILE WITH LENGTHS OF SCAFFOLDS | |
my $line; # | |
my $line2; # GFF LINE FOR CDS 2 | |
my @temp; # | |
my @temp2; # | |
my $scaffold; # SCAFFOLD NAME | |
my $scaffold2; # SCAFFOLD 2 | |
my $cds; # CDS NAME | |
my $cds2; # CDS 2 NAME | |
my $cds_start; # CDS START | |
my $cds_start2; # CDS 2 START | |
my $cds_end; # CDS END | |
my $cds_end2; # CDS 2 END | |
my $transcript; # TRANSCRIPT NAME | |
my $transcript2; # TRANSCRIPT 2 NAME | |
my $end_of_gff = 0; # SAYS WHETHER WE'VE REACHED THE END OF THE GFF FILE | |
my $feature_type; # FEATURE TYPE IN GFF FILE | |
my $feature_name; # FEATURE NAME IN GFF FILE | |
my %PRINTED = (); # HASH TABLE TO RECORD WHICH LINES WERE PRINTED TO THE OUTPUT GFF ALREADY | |
my $exon; # EXON NAME | |
my $exon2; # EXON 2 NAME | |
my $exon_start; # EXON START | |
my $exon_start2; # EXON 2 START | |
my $exon_end; # EXON END | |
my $exon_end2; # EXON 2 END | |
my $cmd; # COMMAND TO RUN | |
my $overlaps; # FILE WITH OVERLAPS | |
my $systemcall; # RETURN VALUE FROM SYSTEM CALL | |
my %SEEN = (); # SAYS WHETHER WE HAVE ALREADY SEEN A PARTICULAR PAIR OF EXONS/CDSs | |
my $sorted_cds_gff2; # COPY OF $sorted_cds_gff, WITH EACH CDS EXTENDED BY 1 bp IN EACH DIRECTION. | |
my $sorted_exon_gff2; # COPY OF $sorted_exon_gff, WITH EACH EXON EXTENDED BY 1 IN EACH DIRECTION. | |
my $returnvalue; # RETURN VALUE FROM A FUNCTION | |
my %REPLACE = (); # HASH TABLE TO RECORD WHICH NUMBER TO REPLACE A COORDINATE IN A GENE WITH. | |
my $feature_start; # START OF FEATURE | |
my $feature_end; # END OF FEATURE | |
# MAKE A COPY OF $sorted_cds_gff THAT HAS EACH CDS EXTENDED BY 1 bp IN EACH DIRECTION: | |
$sorted_cds_gff2 = HelminthGenomeAnalysis::AvrilFileUtils::make_filename($outputdir); | |
$returnvalue = HelminthGenomeAnalysis::AvrilGffUtils::add_flanking_region_to_gff_features($sorted_cds_gff,1,$scaffold_length_file,$sorted_cds_gff2); | |
# MAKE A COPY OF $sorted_exon_gff THAT HAS EACH EXON EXTENDED BY 1 bp IN EACH DIRECTION: | |
$sorted_exon_gff2 = HelminthGenomeAnalysis::AvrilFileUtils::make_filename($outputdir); | |
$returnvalue = HelminthGenomeAnalysis::AvrilGffUtils::add_flanking_region_to_gff_features($sorted_exon_gff,1,$scaffold_length_file,$sorted_exon_gff2); | |
# FIND CASES OF OVERLAPPING CDS IN THE $sorted_cds_gff FILE: | |
$overlaps = HelminthGenomeAnalysis::AvrilFileUtils::make_filename($outputdir); | |
$cmd = "bedtools intersect -loj -a $sorted_cds_gff2 -b $sorted_cds_gff2 > $overlaps"; | |
$systemcall = system "$cmd"; | |
# THROW AN ERROR IF THE RETURN VALUE FROM THE SYSTEM CALL IS NOT 0: | |
throw Error::Simple("ERRORCODE=2: merge_overlapping_cds: return value $systemcall when tried to run $cmd") if ($systemcall != 0); # TESTED FOR THIS | |
system "sleep 1"; # SLEEP FOR 1 SECOND TO MAKE SURE THE OUTPUT IS COMPLETED | |
open(FILE,"$overlaps") || die "ERROR: merge_overlapping_cds: cannot open $overlaps\n"; | |
while(<FILE>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\t+/,$line); | |
$scaffold = $temp[0]; | |
$cds_start = $temp[3] + 1; # ADJUSTED TO BE LIKE IN $sorted_cds_gff | |
$cds_end = $temp[4] - 1; # ADJUSTED TO BE LIKE IN $sorted_cds_gff | |
$cds = $temp[8]; | |
$cds = $scaffold."_".$cds_start."_".$cds_end."_".$cds; | |
$scaffold2 = $temp[9]; | |
$cds_start2 = $temp[12] + 1; # ADJUSTED TO BE LIKE IN $sorted_cds_gff | |
$cds_end2 = $temp[13] - 1; # ADJUSTED TO BE LIKE IN $sorted_cds_gff | |
$cds2 = $temp[17]; | |
$cds2 = $scaffold2."_".$cds_start2."_".$cds_end2."_".$cds2; | |
# DEFINE GFF LINE FOR CDS 1: | |
$line = $temp[0]."\t".$temp[1]."\t".$temp[2]."\t".$cds_start."\t".$cds_end."\t".$temp[5]."\t".$temp[6]."\t".$temp[7]."\t".$temp[8]; | |
# DEFINE GFF LINE FOR CDS 2: | |
$line2 = $temp[9]."\t".$temp[10]."\t".$temp[11]."\t".$cds_start2."\t".$cds_end2."\t".$temp[14]."\t".$temp[15]."\t".$temp[16]."\t".$temp[17]; | |
# GET THE TRANSCRIPT NAME: | |
@temp2 = split(/Parent=/,$cds); | |
$transcript = $temp2[1]; # eg. CGOC_0000249501-mRNA-1 | |
@temp2 = split(/\;/,$transcript); | |
$transcript = $temp2[0]; | |
# GET THE TRANSCRIPT 2 NAME: | |
@temp2 = split(/Parent=/,$cds2); | |
$transcript2 = $temp2[1]; | |
@temp2 = split(/\;/,$transcript); | |
$transcript = $temp2[0]; | |
# CHECK IF THE TWO TRANSCRIPTS ARE THE SAME: | |
if ($transcript eq $transcript2 && $scaffold eq $scaffold2 && $cds ne $cds2) | |
{ | |
if ($cds_start2 <= $cds_start) | |
{ | |
if (!(defined($REPLACE{$transcript."_CDSstart_".$cds_start}))) | |
{ | |
$REPLACE{$transcript."_CDSstart_".$cds_start} = $cds_start2; | |
} | |
else | |
{ | |
if ($cds_start2 < $REPLACE{$transcript."_CDSstart_".$cds_start}) | |
{ | |
$REPLACE{$transcript."_CDSstart_".$cds_start} = $cds_start2; | |
} | |
} | |
} | |
if ($cds_end2 >= $cds_end) | |
{ | |
if (!(defined($REPLACE{$transcript."_CDSend_".$cds_end}))) | |
{ | |
$REPLACE{$transcript."_CDSend_".$cds_end} = $cds_end2; | |
} | |
else | |
{ | |
if ($cds_end2 > $REPLACE{$transcript."_CDSend_".$cds_end}) | |
{ | |
$REPLACE{$transcript."_CDSend_".$cds_end} = $cds_end2; | |
} | |
} | |
} | |
} | |
} | |
close(FILE); | |
system "rm -f $overlaps"; | |
# FIND CASES OF OVERLAPPING EXONS IN THE $sorted_exon_gff FILE: | |
$overlaps = HelminthGenomeAnalysis::AvrilFileUtils::make_filename($outputdir); | |
$cmd = "bedtools intersect -loj -a $sorted_exon_gff2 -b $sorted_exon_gff2 > $overlaps"; | |
$systemcall = system "$cmd"; | |
# THROW AN ERROR IF THE RETURN VALUE FROM THE SYSTEM CALL IS NOT 0: | |
throw Error::Simple("ERRORCODE=2: merge_overlapping_cds: return value $systemcall when tried to run $cmd") if ($systemcall != 0); # TESTED FOR THIS | |
system "sleep 1"; # SLEEP FOR 1 SECOND TO MAKE SURE THE OUTPUT IS COMPLETED | |
open(FILE,"$overlaps") || die "ERROR: merge_overlapping_exon: cannot open $overlaps\n"; | |
while(<FILE>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\t+/,$line); | |
$scaffold = $temp[0]; | |
$exon_start = $temp[3] + 1; # ADJUSTED TO BE LIKE IN $sorted_exon_gff | |
$exon_end = $temp[4] - 1; # ADJUSTED TO BE LIKE IN $sorted_exon_gff | |
$exon = $temp[8]; | |
$scaffold2 = $temp[9]; | |
$exon_start2 = $temp[12] + 1; # ADJUSTED TO BE LIKE IN $sorted_exon_gff | |
$exon_end2 = $temp[13] - 1; # ADJUSTED TO BE LIKE IN $sorted_exon_gff | |
$exon2 = $temp[17]; | |
# DEFINE GFF LINE FOR EXON 1: | |
$line = $temp[0]."\t".$temp[1]."\t".$temp[2]."\t".$exon_start."\t".$exon_end."\t".$temp[5]."\t".$temp[6]."\t".$temp[7]."\t".$temp[8]; | |
# DEFINE GFF LINE FOR EXON 2: | |
$line2 = $temp[9]."\t".$temp[10]."\t".$temp[11]."\t".$exon_start2."\t".$exon_end2."\t".$temp[14]."\t".$temp[15]."\t".$temp[16]."\t".$temp[17]; | |
# GET THE TRANSCRIPT NAME: | |
@temp2 = split(/Parent=/,$exon); | |
$transcript = $temp2[1]; # eg. CGOC_0000249501-mRNA-1 | |
@temp2 = split(/\;/,$transcript); | |
$transcript = $temp2[0]; | |
# GET THE TRANSCRIPT 2 NAME: | |
@temp2 = split(/Parent=/,$exon2); | |
$transcript2 = $temp2[1]; | |
@temp2 = split(/\;/,$transcript); | |
$transcript = $temp2[0]; | |
# CHECK IF THE TWO TRANSCRIPTS ARE THE SAME: | |
if ($transcript eq $transcript2 && $scaffold eq $scaffold2 && $exon ne $exon2) | |
{ | |
if ($exon_start2 <= $exon_start) | |
{ | |
if (!(defined($REPLACE{$transcript."_exonstart_".$exon_start}))) | |
{ | |
$REPLACE{$transcript."_exonstart_".$exon_start} = $exon_start2; | |
} | |
else | |
{ | |
if ($exon_start2 < $REPLACE{$transcript."_exonstart_".$exon_start}) | |
{ | |
$REPLACE{$transcript."_exonstart_".$exon_start} = $exon_start2; | |
} | |
} | |
} | |
if ($exon_end2 >= $exon_end) | |
{ | |
if (!(defined($REPLACE{$transcript."_exonend_".$exon_end}))) | |
{ | |
$REPLACE{$transcript."_exonend_".$exon_end} = $exon_end2; | |
} | |
else | |
{ | |
if ($exon_end2 > $REPLACE{$transcript."_exonend_".$exon_end}) | |
{ | |
$REPLACE{$transcript."_exonend_".$exon_end} = $exon_end2; | |
} | |
} | |
} | |
} | |
} | |
close(FILE); | |
system "rm -f $overlaps"; | |
# CLOSE THE OUTPUT GFF FILE: | |
open(OUTPUTGFF,">$output_gff") || die "ERROR: merge_overlapping_cds: cannot open $output_gff\n"; | |
# READ IN THE INPUT GFF AND REPLACE CDS LINES: | |
open(INPUTGFF,"$input_gff") || die "ERROR: merge_overlapping_cds: cannot open $input_gff\n"; | |
while(<INPUTGFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
if ($line eq '##FASTA') { $end_of_gff = 1;} | |
if (substr($line,0,1) ne '#' && $end_of_gff == 0) | |
{ | |
@temp = split(/\t+/,$line); | |
$feature_type = $temp[2]; | |
$feature_start = $temp[3]; | |
$feature_end = $temp[4]; | |
$feature_name = $temp[8]; | |
# THROW AN EXCEPTION IF THERE ARE NOT 9 COLUMNS IN THE GFF LINE: | |
throw Error::Simple("ERRORCODE=1: merge_overlapping_cds: do not have 9 columns in $input_gff: line $line") if ($#temp != 8); # TESTED FOR | |
if ($feature_type eq 'CDS' || $feature_type eq 'exon') | |
{ | |
# GET THE TRANSCRIPT NAME: | |
@temp2 = split(/Parent=/,$feature_name); | |
$transcript = $temp2[1]; # eg. CGOC_0000249501-mRNA-1 | |
@temp2 = split(/\;/,$transcript); | |
$transcript = $temp2[0]; | |
# SEE IF WE WANT TO REPLACE THE START OR END POINT: | |
if (defined($REPLACE{$transcript."_".$feature_type."start_".$feature_start})) { $feature_start = $REPLACE{$transcript."_".$feature_type."start_".$feature_start};} | |
if (defined($REPLACE{$transcript."_".$feature_type."end_".$feature_end})) { $feature_end = $REPLACE{$transcript."_".$feature_type."end_".$feature_end}; } | |
# PRINT OUT THE LINE: | |
$line = $temp[0]."\t".$temp[1]."\t".$temp[2]."\t".$feature_start."\t".$feature_end."\t".$temp[5]."\t".$temp[6]."\t".$temp[7]."\t".$temp[8]; | |
if (!($PRINTED{$transcript."_".$feature_type."_".$feature_start."_".$feature_end})) | |
{ | |
# PRINT OUT THE CDS LINE: | |
print OUTPUTGFF "$line\n"; | |
$PRINTED{$transcript."_".$feature_type."_".$feature_start."_".$feature_end} = 1; | |
} | |
} | |
else | |
{ | |
print OUTPUTGFF "$line\n"; | |
} | |
} | |
else | |
{ | |
print OUTPUTGFF "$line\n"; | |
} | |
} | |
close(INPUTGFF); | |
close(OUTPUTGFF); | |
# DELETE TEMPORARY FILES: | |
system "rm -f $sorted_cds_gff2"; | |
system "rm -f $sorted_exon_gff2"; | |
return(1); | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE SYNOPSIS: sort_gff_lines_for_genes(): ORDERS THE GFF LINES FOR EACH GENE IN ORDER 'gene', 'mRNA', 'exon', 'CDS': | |
sub sort_gff_lines_for_genes | |
{ | |
my $input_gff = $_[0]; # INPUT GFF FILE | |
my $output_gff = $_[1]; # NAME OF THE OUTPUT GFF FILE | |
my $line; # | |
my @temp; # | |
my $end_of_gff = 0; # SAYS WHETHER WE HAVE REACHED THE END OF THE GFF FILE | |
my $feature_type; # TYPE OF FEATURE | |
my $feature_name; # FEATURE NAME | |
my $scaffold; # SCAFFOLD NAME | |
my $TRANSCRIPT2GENE; # HASH TABLE OF GENE NAMES FOR TRANSCRIPT NAMES | |
my %CDSLINES = (); # HASH TABLE TO RECORD CDS LINES FOR GENES | |
my %MRNALINES = (); # HASH TABLE TO RECORD mRNA LINES FOR GENES | |
my %EXONLINES = (); # HASH TABLE TO RECORD exon LINES FOR GENES | |
my %THREEPRIMEUTRLINES = (); # HASH TABLE TO RECORD three_prime_UTR LINES FOR GENES | |
my %FIVEPRIMEUTRLINES = (); # HASH TABLE TO RECORD five_prime_UTR LINES FOR GENES | |
my $gene; # GENE NAME | |
# READ IN THE GENE NAMES FOR TRANSCRIPT NAMES: | |
$TRANSCRIPT2GENE = HelminthGenomeAnalysis::AvrilGffUtils::read_genenames_for_transcripts($input_gff); | |
# READ IN THE INPUT GFF FILE, AND RECORD THE LINES FOR EACH GENE: | |
open(INPUT_GFF,"$input_gff") || die "ERROR: sort_gff_lines_for_genes: cannot open $input_gff\n"; | |
while(<INPUT_GFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\t+/,$line); | |
if ($line eq '##FASTA') { $end_of_gff = 1;} | |
if (substr($line,0,1) ne '#' && $end_of_gff == 0) | |
{ | |
$scaffold = $temp[0]; | |
$feature_type = $temp[2]; | |
if ($feature_type eq 'CDS' || $feature_type eq 'mRNA' || $feature_type eq 'exon' || $feature_type eq 'three_prime_UTR' || $feature_type eq 'five_prime_UTR') | |
{ | |
$feature_name = $temp[8]; | |
# GET THE GENE NAME: | |
$gene = HelminthGenomeAnalysis::AvrilGffUtils::get_gene_name($feature_type,$feature_name,$TRANSCRIPT2GENE,$scaffold); | |
# RECORD THE CDS/mRNA/exon LINES FOR $gene: | |
if ($feature_type eq 'CDS') | |
{ | |
if (!(defined($CDSLINES{$gene}))) { $CDSLINES{$gene} = $line;} | |
else { $CDSLINES{$gene} = $CDSLINES{$gene}."\n".$line; } | |
} | |
elsif ($feature_type eq 'exon') | |
{ | |
if (!(defined($EXONLINES{$gene}))) { $EXONLINES{$gene} = $line;} | |
else { $EXONLINES{$gene} = $EXONLINES{$gene}."\n".$line; } | |
} | |
elsif ($feature_type eq 'mRNA') | |
{ | |
if (!(defined($MRNALINES{$gene}))) { $MRNALINES{$gene} = $line;} | |
else { $MRNALINES{$gene} = $MRNALINES{$gene}."\n".$line; } | |
} | |
elsif ($feature_type eq 'three_prime_UTR') | |
{ | |
if (!(defined($THREEPRIMEUTRLINES{$gene}))) { $THREEPRIMEUTRLINES{$gene} = $line;} | |
else { $THREEPRIMEUTRLINES{$gene} = $THREEPRIMEUTRLINES{$gene}."\n".$line; } | |
} | |
elsif ($feature_type eq 'five_prime_UTR') | |
{ | |
if (!(defined($FIVEPRIMEUTRLINES{$gene}))) { $FIVEPRIMEUTRLINES{$gene} = $line;} | |
else { $FIVEPRIMEUTRLINES{$gene} = $FIVEPRIMEUTRLINES{$gene}."\n".$line; } | |
} | |
} | |
} | |
} | |
close(INPUT_GFF); | |
# OPEN THE OUTPUT GFF FILE: | |
open(OUTPUT_GFF,">$output_gff") || die "ERROR: sort_gff_lines_for_genes: cannot open $output_gff\n"; | |
# NOW WRITE OUT TO THE OUTPUT GFF FILE: | |
$end_of_gff = 0; | |
open(INPUT_GFF,"$input_gff") || die "ERROR: sort_gff_lines_for_genes: cannot open $input_gff\n"; | |
while(<INPUT_GFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\t+/,$line); | |
if ($line eq '##FASTA') { $end_of_gff = 1;} | |
if (substr($line,0,1) ne '#' && $end_of_gff == 0) | |
{ | |
$scaffold = $temp[0]; | |
$feature_type = $temp[2]; | |
if ($feature_type eq 'gene') | |
{ | |
$feature_name = $temp[8]; | |
# GET THE GENE NAME: | |
$gene = HelminthGenomeAnalysis::AvrilGffUtils::get_gene_name($feature_type,$feature_name,$TRANSCRIPT2GENE,$scaffold); | |
# PRINT THE LINE FOR THE GENE: | |
print OUTPUT_GFF "$line\n"; | |
# THERE SHOULD BE mRNA LINES DEFINED FOR $gene, AS THEY WERE USED IN read_genenames_for_transcripts(): | |
assert(defined($MRNALINES{$gene})); | |
print OUTPUT_GFF "$MRNALINES{$gene}\n"; | |
# THROW AN ERROR IF exon LINES ARE NOT DEFINED FOR $gene: | |
throw Error::Simple("ERRORCODE=2: sort_gff_lines_for_genes: no exon lines recorded for gene $gene") if (!(defined($EXONLINES{$gene}))); # TESTED FOR | |
print OUTPUT_GFF "$EXONLINES{$gene}\n"; | |
# CHECK IF THERE ARE three_prime_UTR LINES FOR $gene: | |
if (defined($THREEPRIMEUTRLINES{$gene})) { print OUTPUT_GFF "$THREEPRIMEUTRLINES{$gene}\n"; } | |
# CHECK IF THERE ARE five_prime_UTR LINES FOR $gene: | |
if (defined($FIVEPRIMEUTRLINES{$gene})) { print OUTPUT_GFF "$FIVEPRIMEUTRLINES{$gene}\n"; } | |
# THROW AN ERROR IF CDS LINES ARE NOT DEFINED FOR $gene: | |
throw Error::Simple("ERRORCODE=3: sort_gff_lines_for_genes: no CDS lines recorded for gene $gene") if (!(defined($CDSLINES{$gene}))); # TESTED FOR | |
print OUTPUT_GFF "$CDSLINES{$gene}\n"; | |
} | |
else | |
{ | |
# IGNORE 'mRNA'/'exon'/'CDS' LINES: | |
if ($feature_type ne 'mRNA' && $feature_type ne 'CDS' && $feature_type ne 'exon' && $feature_type ne 'three_prime_UTR' && $feature_type ne 'five_prime_UTR') | |
{ | |
print OUTPUT_GFF "$line\n"; | |
} | |
} | |
} | |
else | |
{ | |
print OUTPUT_GFF "$line\n"; | |
} | |
} | |
close(INPUT_GFF); | |
return(1); | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE SYNOPSIS: count_features_in_region: SUBROUTINE TO COUNT THE NUMBER OF FEATURES IN A GFF FILE BETWEEN $the_start AND $the_end ON SCAFFOLD $the_scaffold: | |
sub count_features_in_region | |
{ | |
my $input_gff = $_[0]; # INPUT GFF FILE | |
my $the_scaffold = $_[1]; # THE SCAFFOLD OF INTEREST | |
my $the_start = $_[2]; # THE START OF THE REGION OF INTEREST | |
my $the_end = $_[3]; # THE END OF THE REGION OF INTEREST | |
my $line; # | |
my @temp; # | |
my $num_features = 0; # NUMBER OF FEATURES BETWEEN $the_start AND $the_end | |
my $scaffold; # SCAFFOLD | |
my $start; # FEATURE START | |
my $end; # FEATURE END | |
open(INPUT_GFF,"$input_gff") || die "ERROR: count_features_in_region: cannot open $input_gff\n"; | |
while(<INPUT_GFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\t+/,$line); | |
$scaffold = $temp[0]; | |
$start = $temp[3]; | |
$end = $temp[4]; | |
if ($scaffold eq $the_scaffold && $start >= $the_start && $end >= $the_start && $start <= $the_end && $end <= $the_end) | |
{ | |
$num_features++; | |
} | |
} | |
close(INPUT_GFF); | |
return($num_features); | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE SYNOPSIS: convert_exonerate_gff_to_standard_gff: SUBROUTINE TO CONVERT EXONERATE GFF OUTPUT TO A MORE STANDARD GFF FORMAT. | |
sub convert_exonerate_gff_to_standard_gff | |
{ | |
my $input_gff = $_[0]; # INPUT GFF FILE | |
my $output_gff = $_[1]; # OUTPUT GFF FILE | |
my $line; # | |
my @temp; # | |
my $feature_type; # FEATURE TYPE | |
my $scaffold; # SCAFFOLD START | |
my $feature_start; # FEATURE START | |
my $feature_end; # FEATURE END | |
my $strand; # FEATURE STRAND | |
my $gene_num = 0; # GENE NUMBER IN THE EXONERATE FILE | |
my $gene; # GENE NAME | |
my $exon_num = 0; # EXON NUMBER IN A GENE | |
# OPEN THE OUTPUT GFF: | |
open(OUTPUT_GFF,">$output_gff") || die "ERROR: convert_exonerate_gff_to_standard_gff: cannot open $output_gff\n"; | |
# READ IN THE EXONERATE GFF: | |
open(INPUT_GFF,"$input_gff") || die "ERROR: convert_exonerate_gff_to_standard_gff: cannot open $input_gff\n"; | |
while(<INPUT_GFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\t+/,$line); | |
# THROW AN EXCEPTION IF THERE ARE NOT 9 COLUMNS IN THE GFF LINE: | |
throw Error::Simple("ERRORCODE=1: convert_exonerate_gff_to_standard_gff: do not have 9 columns in $input_gff: line $line") if ($#temp != 8); # TESTED FOR | |
$scaffold = $temp[0]; | |
$feature_type = $temp[2]; | |
$feature_start = $temp[3]; | |
$feature_end = $temp[4]; | |
$strand = $temp[6]; | |
if ($feature_type eq 'gene') | |
{ | |
# eg. scaffold_000039 exonerate:protein2genome:local gene 252337 253077 685 + . gene_id 0 ; sequence SRAE_2000311000.t1:mRNA ; gene_orientation . | |
$gene_num++; | |
$exon_num = 0; | |
$gene = "gene".$gene_num; | |
print OUTPUT_GFF "$temp[0]\t$temp[1]\t$temp[2]\t$temp[3]\t$temp[4]\t$temp[5]\t$temp[6]\t$temp[7]\tID=$gene;Parent=$gene\n"; | |
} | |
elsif ($feature_type eq 'cds') | |
{ | |
# scaffold_000039 exonerate:protein2genome:local cds 252337 253077 . + . cds | |
print OUTPUT_GFF "$temp[0]\t$temp[1]\tCDS\t$temp[3]\t$temp[4]\t$temp[5]\t$temp[6]\t$temp[7]\tID=$gene:cds;Parent=$gene\n"; | |
} | |
elsif ($feature_type eq 'exon') | |
{ | |
# scaffold_000039 exonerate:protein2genome:local exon 252337 253077 . + . insertions 0 ; deletions 2 | |
$exon_num++; | |
print OUTPUT_GFF "$temp[0]\t$temp[1]\t$temp[2]\t$temp[3]\t$temp[4]\t$temp[5]\t$temp[6]\t$temp[7]\tID=$gene:exon:$exon_num;Parent=$gene\n"; | |
} | |
elsif ($feature_type eq 'similarity') | |
{ | |
# scaffold_000039 exonerate:protein2genome:local similarity 252337 253077 685 + . alignment_id 0 ; Query SRAE_2000311000.t1:mRNA ; Align 25175 81 564 ; Align 25739 271 177 | |
print OUTPUT_GFF "$line\n"; | |
} | |
elsif ($feature_type eq 'splice5') | |
{ | |
# scaffold_000043 exonerate:protein2genome:local splice5 45991 45992 . + . intron_id 1 ; splice_site "GT" | |
print OUTPUT_GFF "$temp[0]\t$temp[1]\t$temp[2]\t$temp[3]\t$temp[4]\t$temp[5]\t$temp[6]\t$temp[7]\tID=$gene:splice5;Parent=$gene\n"; | |
} | |
elsif ($feature_type eq 'splice3') | |
{ | |
# scaffold_000043 exonerate:protein2genome:local splice3 46138 46139 . + . intron_id 0 ; splice_site "AT" | |
print OUTPUT_GFF "$temp[0]\t$temp[1]\t$temp[2]\t$temp[3]\t$temp[4]\t$temp[5]\t$temp[6]\t$temp[7]\tID=$gene:splice3;Parent=$gene\n"; | |
} | |
elsif ($feature_type eq 'intron') | |
{ | |
# scaffold_000043 exonerate:protein2genome:local intron 45991 46139 . + . intron_id 1 | |
print OUTPUT_GFF "$temp[0]\t$temp[1]\t$temp[2]\t$temp[3]\t$temp[4]\t$temp[5]\t$temp[6]\t$temp[7]\tID=$gene:intron;Parent=$gene\n"; | |
} | |
else | |
{ | |
throw Error::Simple("ERRORCODE=1: convert_exonerate_gff_to_standard_gff: unknown feature type $feature_type"); # TESTED FOR | |
} | |
} | |
close(INPUT_GFF); | |
close(OUTPUT_GFF); | |
return(1); | |
} | |
#------------------------------------------------------------------# | |
1; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment