Created
August 27, 2013 14:12
-
-
Save avrilcoghlan/6354055 to your computer and use it in GitHub Desktop.
Perl script that removes genes that are only supported by 'model_gff' features from a maker gff file.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
=head1 NAME | |
remove_modelgff_genes_from_gff.pl | |
=head1 SYNOPSIS | |
remove_modelgff_genes_from_gff.pl input_gff output_gff outputdir | |
where input_gff is the input gff file, | |
output_gff is the output gff file, | |
outputdir is the output directory for writing output files. | |
=head1 DESCRIPTION | |
This script takes an input gff file (<input_gff>), and removes genes that | |
are only supported by 'model_gff' entries (from maker), and writes the output | |
gff file (<output_gff>) in directory <outputdir>. | |
=head1 VERSION | |
Perl script last edited 1-May-2013. | |
=head1 CONTACT | |
[email protected] (Avril Coghlan) | |
=cut | |
# | |
# Perl script remove_modelgff_genes_from_gff.pl | |
# Written by Avril Coghlan ([email protected]) | |
# 1-May-13. | |
# Last edited 1-May-2013. | |
# SCRIPT SYNOPSIS: remove_modelgff_genes_from_gff.pl: removes genes that are only supported by 'model_gff' features from a maker gff file. | |
# | |
# NOTE: ELEANOR SAID THAT WE WANT TO KEEP THE GENE MODEL IF IT HAS ANY ONE OR MORE OF THE FOLLOWING AS SUPPORT: | |
# (a) Genemark, (b) Augustus, (c) SNAP, (d) pred_gff | |
# HOW TO RECOGNISE THESE: | |
# (i) IF THERE IS genemark/snap/pred_gff IN THE GENE NAME, KEEP THE GENE | |
# (ii) IF THERE IS blastx/exonerate IN THE GENE NAME, AND THE GENE OVERLAPS AN augustus* | |
# OR pred_gff:embl OR pred_gff:protein_coding FEATURE, KEEP THE GENE | |
# (iii) IF THERE IS augustus/blastx/exonerate IN THE GENE NAME, AND THE GENE DOESN'T OVERLAP AN | |
# augustus* OR pred_gff:embl OR pred_gff:protein_coding FEATURE, DISCARD THE GENE | |
# | |
# THIS IS WHAT THE SCRIPT DOES: | |
# 1) IT CHECKS IF A GENE OVERLAPS A pred_gff:embl OR pred_gff:protein_coding OR augustus* (augustus OR augustus_masked) FEATURE | |
# 2) IF IT DOESN'T, IT CHECKS WHETHER THE GENE NAME CONTAINS snap/genemark/pred_gff | |
# - IF THE GENE NAME DOES CONTAIN snap/genemark/pred_gff, KEEP THE GENE | |
# - IF THE GENE NAME DOESN'T CONTAIN snap/genemark/pred_gff, CHECK IF THE GENE NAME CONTAINS blastx/exonerate/augustus | |
# 3) IF THE GENE NAME DOESN'T CONTAIN blastx/exonerate/augustus, PRINT AN ERROR MESSAGE AND DIE (AS WE THINK THIS SHOULDN'T HAPPEN) | |
# 4) IF THE GENE NAME DOES CONTAIN blastx/exonerate/augustus, DELETE THE GENE FROM THE OUTPUT GFF | |
# | |
# NOTE: pred_gff COMES FROM genblastg OR RATT. | |
# NOTE: IT'S NECESSARY TO RUN rename_genes_in_maker_gff.pl AFTER RUNNING THIS SCRIPT. | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
use strict; | |
use warnings; | |
my $num_args = $#ARGV + 1; | |
if ($num_args != 3) | |
{ | |
print "Usage of remove_modelgff_genes_from_gff.pl\n\n"; | |
print "perl remove_modelgff_genes_from_gff.pl <input_gff> <output_gff> <outputdir>\n"; | |
print "where <input_gff> is the input gff file,\n"; | |
print " <output_gff> is the output gff file,\n"; | |
print " <outputdir> is the output directory for writing output files.\n"; | |
print "For example, >perl remove_modelgff_genes_from_gff.pl round3.v2.gff new_round3.v2.gff\n"; | |
print "/lustre/scratch108/parasites/alc/\n"; | |
print "NOTE: Do *NOT* run rename_genes_in_maker_gff.pl before running this script!\n"; | |
exit; | |
} | |
# FIND THE PATH TO THE INPUT GFF FILE: | |
my $input_gff = $ARGV[0]; | |
# FIND THE PATH TO THE OUTPUT GFF FILE: | |
my $output_gff = $ARGV[1]; | |
# FIND THE DIRECTORY TO USE FOR OUTPUT FILES: | |
my $outputdir = $ARGV[2]; | |
#------------------------------------------------------------------# | |
# TEST SUBROUTINES: | |
my $PRINT_TEST_DATA = 0; # SAYS WHETHER TO PRINT DATA USED DURING TESTING. | |
&test_print_error; | |
&test_make_gene_gff($outputdir); | |
&test_make_features_gff($outputdir); | |
&test_find_genes_to_discard($outputdir); | |
&test_read_genes_for_transcripts($outputdir); | |
&test_write_output_gff($outputdir); | |
&test_check_whether_to_discard; | |
&test_run_main_program($outputdir); | |
print STDERR "Finished tests, running main code now...\n"; | |
#------------------------------------------------------------------# | |
# RUN THE MAIN PART OF THE CODE: | |
&run_main_program($outputdir,$input_gff,$output_gff); | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# RUN THE MAIN PART OF THE CODE: | |
sub run_main_program | |
{ | |
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES IN. | |
my $input_gff = $_[1]; # THE INPUT GFF FILE | |
my $output_gff = $_[2]; # THE OUTPUT GFF FILE | |
my $errorcode; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg; # RETURNED AS 'none' IF THERE IS NO ERROR | |
my $gene_gff; # GFF FILE FOR GENES | |
my $features_gff; # GFF FILE WITH NON-GENE FEATURES | |
my $DISCARD; # HASH TABLE OF GENES TO DISCARD | |
my $GENE; # HASH TABLE OF GENE NAMES FOR TRANSCRIPTS | |
# READ IN THE GENE NAME FOR EACH TRANSCRIPT IN THE INPUT GFF FILE: | |
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# MAKE A GFF FILE WITH JUST THE GENES: | |
($gene_gff,$errorcode,$errormsg) = &make_gene_gff($input_gff,$outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# MAKE A GFF FILE WITH NON-GENE FEATURES: (SPECIFICALLY pred_gff:embl, pred_gff:protein_coding & augustus*) | |
($features_gff,$errorcode,$errormsg) = &make_features_gff($input_gff,$outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# FIND GENES THAT DO NOT OVERLAP ANY OF THE NON-GENE FEATURES: | |
($DISCARD,$errorcode,$errormsg) = &find_genes_to_discard($outputdir,$gene_gff,$features_gff,0); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# READ IN THE INPUT GFF FILE, AND MAKE THE OUTPUT GFF FILE WITH GENES THAT ARE FROM snap/pred_gff/genemark, OR SUPPORTED BY augustus*/pred_gff: | |
($errorcode,$errormsg) = &write_output_gff($outputdir,$input_gff,$output_gff,$DISCARD,$GENE); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# DELETE TEMPORARY FILES: | |
system "rm -f $gene_gff"; | |
system "rm -f $features_gff"; | |
} | |
#------------------------------------------------------------------# | |
# TEST &run_main_program | |
sub test_run_main_program | |
{ | |
my $outputdir = $_[0]; # DIRECTORY TO WRITE OUTPUT FILES INTO | |
my $input_gff; # INPUT GFF FILE | |
my $output_gff; # OUTPUT GFF FILE | |
my $errorcode; # RETURNED FROM A FUNCTION AS 0 IF THERE IS NO ERROR | |
my $errormsg; # RETURNED FROM A FUNCTION AS 'none' IF THERE IS NO ERROR | |
my $expected_output_gff; # FILE CONTAINING EXPECTED CONTENTS OF $output_gff | |
my $differences; # DIFFERENCES BETWEEN $output_gff AND $expected_output_gff | |
my $length_differences; # LENGTH OF $differences | |
my $line; # | |
# EXAMPLE WHERE WE SHOULD DELETE A GENE, ALSO WHERE START POSITION IS 0 IN MAKER FILE: | |
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(INPUT_GFF,">$input_gff") || die "ERROR: test_run_main_program: cannot open $input_gff\n"; | |
print INPUT_GFF "BPAG.contig.04971.2025 maker gene 0 1613 . - . ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0;Name=maker-BPAG.contig.04971.2025-augustus-gene-0.0\n"; | |
print INPUT_GFF "BPAG.contig.04971.2025 maker mRNA 0 1613 . - . ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1;Parent=maker-BPAG.contig.04971.2025-augustus-gene-0.0;Name=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1;_AED=0.30;_eAED=0.30;_QI=0|0|0|1|0|0|5|1|157\n"; | |
print INPUT_GFF "BPAG.contig.04971.2025 maker exon 0 120 . - . ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1:exon:137;Parent=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1\n"; | |
print INPUT_GFF "BPAG.contig.04971.2025 maker exon 348 480 . - . ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1:exon:136;Parent=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1\n"; | |
print INPUT_GFF "BPAG.contig.04971.2025 maker exon 608 653 . - . ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1:exon:135;Parent=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1\n"; | |
print INPUT_GFF "BPAG.contig.04971.2025 maker exon 778 855 . - . ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1:exon:134;Parent=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1\n"; | |
print INPUT_GFF "BPAG.contig.04971.2025 maker exon 1516 1613 . - . ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1:exon:133;Parent=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1\n"; | |
print INPUT_GFF "BPAG.contig.04971.2025 maker three_prime_UTR 0 1 . - . ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1:three_prime_utr;Parent=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1\n"; | |
print INPUT_GFF "BPAG.contig.04971.2025 maker CDS 1516 1613 . - 2 ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1:cds;Parent=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1\n"; | |
print INPUT_GFF "BPAG.contig.04971.2025 maker CDS 778 855 . - 2 ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1:cds;Parent=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1\n"; | |
print INPUT_GFF "BPAG.contig.04971.2025 maker CDS 608 653 . - 0 ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1:cds;Parent=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1\n"; | |
print INPUT_GFF "BPAG.contig.04971.2025 maker CDS 348 480 . - 1 ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1:cds;Parent=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1\n"; | |
print INPUT_GFF "BPAG.contig.04971.2025 maker CDS 2 120 . - 0 ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1:cds;Parent=maker-BPAG.contig.04971.2025-augustus-gene-0.0-mRNA-1\n"; | |
close(INPUT_GFF); | |
($output_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
&run_main_program($outputdir,$input_gff,$output_gff); | |
($expected_output_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(EXPECTED,">$expected_output_gff") || die "ERROR: test_run_main_program: cannot open $expected_output_gff\n"; | |
close(EXPECTED); | |
$differences = ""; | |
open(TEMP,"diff $output_gff $expected_output_gff |"); | |
while(<TEMP>) | |
{ | |
$line = $_; | |
$differences = $differences.$line; | |
} | |
close(TEMP); | |
$length_differences = length($differences); | |
if ($length_differences != 0) { print STDERR "ERROR: test_run_main_program: failed test1 (output_gff $output_gff expected_output_gff $expected_output_gff)\n"; exit;} | |
system "rm -f $output_gff"; | |
system "rm -f $expected_output_gff"; | |
system "rm -f $input_gff"; | |
# EXAMPLE WHERE WE SHOULD NOT DELETE A GENE, ALSO WHERE START POSITION IS 0 OR -1 IN MAKER FILE: | |
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(INPUT_GFF,">$input_gff") || die "ERROR: test_run_main_program: cannot open $input_gff\n"; | |
print INPUT_GFF "BPAG.contig.07548.1438 maker gene -1 1267 . - . ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0;Name=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0\n"; | |
print INPUT_GFF "BPAG.contig.07548.1438 maker mRNA -1 1267 . - . ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0;Name=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1;_AED=1.00;_eAED=1.00;_QI=0|0|0|0|1|1|4|0|197\n"; | |
print INPUT_GFF "BPAG.contig.07548.1438 maker exon -1 170 . - . ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:exon:117;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n"; | |
print INPUT_GFF "BPAG.contig.07548.1438 maker exon 441 605 . - . ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:exon:116;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n"; | |
print INPUT_GFF "BPAG.contig.07548.1438 maker exon 951 1167 . - . ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:exon:115;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n"; | |
print INPUT_GFF "BPAG.contig.07548.1438 maker exon 1226 1267 . - . ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:exon:114;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n"; | |
print INPUT_GFF "BPAG.contig.07548.1438 maker three_prime_UTR -1 0 . - . ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:three_prime_utr;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n"; | |
print INPUT_GFF "BPAG.contig.07548.1438 maker CDS 1226 1267 . - 0 ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:cds;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n"; | |
print INPUT_GFF "BPAG.contig.07548.1438 maker CDS 951 1167 . - 1 ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:cds;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n"; | |
print INPUT_GFF "BPAG.contig.07548.1438 maker CDS 441 605 . - 1 ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:cds;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n"; | |
print INPUT_GFF "BPAG.contig.07548.1438 maker CDS 1 170 . - 0 ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:cds;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n"; | |
print INPUT_GFF "BPAG.contig.07548.1438 augustus_masked match 19 1267 0.51 - . ID=BPAG.contig.07548.1438:hit:573:3.5.0.0;Name=augustus_masked-BPAG.contig.07548.1438-abinit-gene-0.0-mRNA-1\n"; | |
print INPUT_GFF "BPAG.contig.07548.1438 augustus_masked match_part 1226 1267 0.51 - . ID=BPAG.contig.07548.1438:hsp:749:3.5.0.0;Parent=BPAG.contig.07548.1438:hit:573:3.5.0.0;Target=augustus_masked-BPAG.contig.07548.1438-abinit-gene-0.0-mRNA-1 535 576 +;Gap=M42\n"; | |
print INPUT_GFF "BPAG.contig.07548.1438 augustus_masked match_part 951 1167 0.51 - . ID=BPAG.contig.07548.1438:hsp:750:3.5.0.0;Parent=BPAG.contig.07548.1438:hit:573:3.5.0.0;Target=augustus_masked-BPAG.contig.07548.1438-abinit-gene-0.0-mRNA-1 318 534 +;Gap=M217\n"; | |
print INPUT_GFF "BPAG.contig.07548.1438 augustus_masked match_part 441 605 0.51 - . ID=BPAG.contig.07548.1438:hsp:751:3.5.0.0;Parent=BPAG.contig.07548.1438:hit:573:3.5.0.0;Target=augustus_masked-BPAG.contig.07548.1438-abinit-gene-0.0-mRNA-1 153 317 +;Gap=M165\n"; | |
print INPUT_GFF "BPAG.contig.07548.1438 augustus_masked match_part 19 170 0.51 - . ID=BPAG.contig.07548.1438:hsp:752:3.5.0.0;Parent=BPAG.contig.07548.1438:hit:573:3.5.0.0;Target=augustus_masked-BPAG.contig.07548.1438-abinit-gene-0.0-mRNA-1 1 152 +;Gap=M152\n"; | |
close(INPUT_GFF); | |
($output_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
&run_main_program($outputdir,$input_gff,$output_gff); | |
($expected_output_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(EXPECTED,">$expected_output_gff") || die "ERROR: test_run_main_program: cannot open $expected_output_gff\n"; | |
print EXPECTED "BPAG.contig.07548.1438 maker gene 1 1267 . - . ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0;Name=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0\n"; | |
print EXPECTED "BPAG.contig.07548.1438 maker mRNA 1 1267 . - . ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0;Name=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1;_AED=1.00;_eAED=1.00;_QI=0|0|0|0|1|1|4|0|197\n"; | |
print EXPECTED "BPAG.contig.07548.1438 maker exon 1 170 . - . ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:exon:117;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n"; | |
print EXPECTED "BPAG.contig.07548.1438 maker exon 441 605 . - . ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:exon:116;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n"; | |
print EXPECTED "BPAG.contig.07548.1438 maker exon 951 1167 . - . ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:exon:115;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n"; | |
print EXPECTED "BPAG.contig.07548.1438 maker exon 1226 1267 . - . ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:exon:114;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n"; | |
print EXPECTED "BPAG.contig.07548.1438 maker three_prime_UTR 1 0 . - . ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:three_prime_utr;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n"; | |
print EXPECTED "BPAG.contig.07548.1438 maker CDS 1226 1267 . - 0 ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:cds;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n"; | |
print EXPECTED "BPAG.contig.07548.1438 maker CDS 951 1167 . - 1 ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:cds;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n"; | |
print EXPECTED "BPAG.contig.07548.1438 maker CDS 441 605 . - 1 ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:cds;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n"; | |
print EXPECTED "BPAG.contig.07548.1438 maker CDS 1 170 . - 0 ID=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1:cds;Parent=augustus_masked-BPAG.contig.07548.1438-processed-gene-0.0-mRNA-1\n"; | |
print EXPECTED "BPAG.contig.07548.1438 augustus_masked match 19 1267 0.51 - . ID=BPAG.contig.07548.1438:hit:573:3.5.0.0;Name=augustus_masked-BPAG.contig.07548.1438-abinit-gene-0.0-mRNA-1\n"; | |
print EXPECTED "BPAG.contig.07548.1438 augustus_masked match_part 1226 1267 0.51 - . ID=BPAG.contig.07548.1438:hsp:749:3.5.0.0;Parent=BPAG.contig.07548.1438:hit:573:3.5.0.0;Target=augustus_masked-BPAG.contig.07548.1438-abinit-gene-0.0-mRNA-1 535 576 +;Gap=M42\n"; | |
print EXPECTED "BPAG.contig.07548.1438 augustus_masked match_part 951 1167 0.51 - . ID=BPAG.contig.07548.1438:hsp:750:3.5.0.0;Parent=BPAG.contig.07548.1438:hit:573:3.5.0.0;Target=augustus_masked-BPAG.contig.07548.1438-abinit-gene-0.0-mRNA-1 318 534 +;Gap=M217\n"; | |
print EXPECTED "BPAG.contig.07548.1438 augustus_masked match_part 441 605 0.51 - . ID=BPAG.contig.07548.1438:hsp:751:3.5.0.0;Parent=BPAG.contig.07548.1438:hit:573:3.5.0.0;Target=augustus_masked-BPAG.contig.07548.1438-abinit-gene-0.0-mRNA-1 153 317 +;Gap=M165\n"; | |
print EXPECTED "BPAG.contig.07548.1438 augustus_masked match_part 19 170 0.51 - . ID=BPAG.contig.07548.1438:hsp:752:3.5.0.0;Parent=BPAG.contig.07548.1438:hit:573:3.5.0.0;Target=augustus_masked-BPAG.contig.07548.1438-abinit-gene-0.0-mRNA-1 1 152 +;Gap=M152\n"; | |
close(EXPECTED); | |
$differences = ""; | |
open(TEMP,"diff $output_gff $expected_output_gff |"); | |
while(<TEMP>) | |
{ | |
$line = $_; | |
$differences = $differences.$line; | |
} | |
close(TEMP); | |
$length_differences = length($differences); | |
if ($length_differences != 0) { print STDERR "ERROR: test_run_main_program: failed test2 (output_gff $output_gff expected_output_gff $expected_output_gff)\n"; exit;} | |
system "rm -f $output_gff"; | |
system "rm -f $expected_output_gff"; | |
system "rm -f $input_gff"; | |
} | |
#------------------------------------------------------------------# | |
# TEST &find_genes_to_discard | |
sub test_find_genes_to_discard | |
{ | |
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES IN | |
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR | |
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR | |
my $gff; # INPUT GFF FILE | |
my $gene_gff; # GFF FILE WITH JUST THE GENES | |
my $features_gff; # GFF FILE WITH JUST THE FEATURES | |
my $DISCARD; # HASH TABLE OF GENES TO DISCARD # | |
# EXAMPLE WITH TWO GENES TO KEEP, THAT HAVE 'snap' IN THEIR NAMES: | |
($gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(GFF,">$gff") || die "ERROR: test_make_gff_for_scaffold: cannot open $gff\n"; | |
print GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=snap-scaff_000010-processed-gene-0.3;Name=snap-scaff_000010-processed-gene-0.3\n"; | |
print GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=snap-scaff_000010-processed-gene-0.3-mRNA-1;Parent=snap-scaff_000010-processed-gene-0.3;Name=snap-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n"; | |
print GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=snap-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=snap-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=snap-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=snap-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=snap-scaff_000010-processed-gene-0.4;Name=snap-scaff_000010-processed-gene-0.4\n"; | |
print GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=snap-scaff_000010-processed-gene-0.4-mRNA-1;Parent=snap-scaff_000010-processed-gene-0.4;Name=snap-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n"; | |
print GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=snap-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=snap-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=snap-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=snap-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n"; | |
print GFF "scaff_000010\tprotein2genome\tprotein_match\t339754\t340801\t478\t+\t.\tID=scaff_000010:hit:2470:2.10.0.3;Name=tr|G0MIG4|G0MIG4_CAEBE\n"; | |
print GFF "scaff_000001\taugustus_masked\tmatch\t3010038\t3012310\t0.64\t+\t.\tID=scaff_000001:hit:48921:3.5.0.30;Name=augustus_masked-scaff_000001-abinit-gene-30.3-mRNA-1\n"; | |
print GFF "scaff_000001\taugustus_masked\tmatch_part\t3010038\t3010098\t0.64\t+\t.\tID=scaff_000001:hsp:87184:3.5.0.30;Parent=scaff_000001:hit:48921:3.5.0.30;Target=augustus_masked-scaff_000001-abinit-gene-30.3-mRNA-1 1 61 +;Gap=M61\n"; | |
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:embl\tmatch\t1590\t2575\t1000\t-\t.\tID=SSTP.scaffold.00001.2080514:hit:21596:3.5.0.0;Name=maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.39-mRNA-1;score=1000\n"; | |
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:embl\tmatch_part\t2498\t2575\t1000\t-\t.\tID=SSTP.scaffold.00001.2080514:hsp:38715:3.5.0.0;Parent=SSTP.scaffold.00001.2080514:hit:21596:3.5.0.0;Target=maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.39-mRNA-1 859 936 +;Gap=M78\n"; | |
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:protein_coding\tmatch\t1590\t2575\t.\t-\t.\tID=SSTP.scaffold.00001.2080514:hit:21635:3.5.0.0;Name=maker-SSTP.scaffold.00001.2080514-pred_gff_protein_coding-gene-0.80-mRNA-1\n"; | |
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:protein_coding\tmatch_part\t2494\t2575\t.\t-\t.\tID=SSTP.scaffold.00001.2080514:hsp:38793:3.5.0.0;Parent=SSTP.scaffold.00001.2080514:hit:21635:3.5.0.0;Target=maker-SSTP.scaffold.00001.2080514-pred_gff_protein_coding-gene-0.80-mRNA-1 1 82 +;Gap=M82\n"; | |
print GFF "##FASTA\n"; | |
print GFF ">seq1\n"; | |
print GFF "ACAACACAGATAGATATATAGAGCA\n"; | |
close(GFF); | |
# MAKE A GFF FILE WITH JUST THE GENES: | |
($gene_gff,$errorcode,$errormsg) = &make_gene_gff($gff,$outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# MAKE A GFF FILE WITH NON-GENE FEATURES: (SPECIFICALLY pred_gff:embl, pred_gff:protein_coding & augustus*) | |
($features_gff,$errorcode,$errormsg) = &make_features_gff($gff,$outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# FIND GENES THAT DO NOT OVERLAP ANY OF THE NON-GENE FEATURES: | |
($DISCARD,$errorcode,$errormsg) = &find_genes_to_discard($outputdir,$gene_gff,$features_gff,1); | |
if ($errorcode != 0) { print STDERR "ERROR: test_find_genes_to_discard: failed test1 (errorcode $errorcode errormsg $errormsg)\n"; exit;} | |
if (keys %{$DISCARD} > 0) { print STDERR "ERROR: test_find_genes_to_discard: failed test1b\n"; exit;} | |
system "rm -f $gff"; | |
system "rm -f $gene_gff"; | |
system "rm -f $features_gff"; | |
# EXAMPLE WITH TWO GENES TO DISCARD, THAT DO NOT HAVE snap/genemark/pred_gff IN THEIR NAMES AND DO NOT OVERLAP ANY pred_gff/augustus* FEATURE: | |
($gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(GFF,">$gff") || die "ERROR: test_make_gff_for_scaffold: cannot open $gff\n"; | |
print GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=exonerate-scaff_000010-processed-gene-0.3;Name=exonerate-scaff_000010-processed-gene-0.3\n"; | |
print GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=exonerate-scaff_000010-processed-gene-0.3-mRNA-1;Parent=exonerate-scaff_000010-processed-gene-0.3;Name=exonerate-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n"; | |
print GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=exonerate-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=exonerate-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=exonerate-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=exonerate-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=exonerate-scaff_000010-processed-gene-0.4;Name=exonerate-scaff_000010-processed-gene-0.4\n"; | |
print GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=exonerate-scaff_000010-processed-gene-0.4-mRNA-1;Parent=exonerate-scaff_000010-processed-gene-0.4;Name=exonerate-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n"; | |
print GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=exonerate-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=exonerate-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=exonerate-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=exonerate-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=exonerate-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=exonerate-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n"; | |
print GFF "scaff_000010\tprotein2genome\tprotein_match\t339754\t340801\t478\t+\t.\tID=scaff_000010:hit:2470:2.10.0.3;Name=tr|G0MIG4|G0MIG4_CAEBE\n"; | |
print GFF "##FASTA\n"; | |
print GFF ">seq1\n"; | |
print GFF "ACAACACAGATAGATATATAGAGCA\n"; | |
close(GFF); | |
# MAKE A GFF FILE WITH JUST THE GENES: | |
($gene_gff,$errorcode,$errormsg) = &make_gene_gff($gff,$outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# MAKE A GFF FILE WITH NON-GENE FEATURES: (SPECIFICALLY pred_gff:embl, pred_gff:protein_coding & augustus*) | |
($features_gff,$errorcode,$errormsg) = &make_features_gff($gff,$outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# FIND GENES THAT DO NOT OVERLAP ANY OF THE NON-GENE FEATURES: | |
($DISCARD,$errorcode,$errormsg) = &find_genes_to_discard($outputdir,$gene_gff,$features_gff,1); | |
if ($errorcode != 0) { print STDERR "ERROR: test_find_genes_to_discard: failed test2 (errorcode $errorcode errormsg $errormsg)\n"; exit;} | |
if (!($DISCARD->{"exonerate-scaff_000010-processed-gene-0.3"})) { print STDERR "ERROR: test_find_genes_to_discard: failed test2b\n"; exit;} | |
if (!($DISCARD->{"exonerate-scaff_000010-processed-gene-0.4"})) { print STDERR "ERROR: test_find_genes_to_discard: failed test2c\n"; exit;} | |
system "rm -f $gff"; | |
system "rm -f $gene_gff"; | |
system "rm -f $features_gff"; | |
# TEST ERRORCODE=100: (GENE TO DISCARD DOES NOT HAVE exonerate/blastx IN ITS NAME): | |
($gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(GFF,">$gff") || die "ERROR: test_make_gff_for_scaffold: cannot open $gff\n"; | |
print GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=exonerate-scaff_000010-processed-gene-0.3;Name=exonerate-scaff_000010-processed-gene-0.3\n"; | |
print GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=exonerate-scaff_000010-processed-gene-0.3-mRNA-1;Parent=exonerate-scaff_000010-processed-gene-0.3;Name=exonerate-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n"; | |
print GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=exonerate-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=exonerate-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=exonerate-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=exonerate-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=auugustus_masked-scaff_000010-processed-gene-0.4;Name=auugustus_masked-scaff_000010-processed-gene-0.4\n"; | |
print GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=auugustus_masked-scaff_000010-processed-gene-0.4;Name=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n"; | |
print GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n"; | |
print GFF "scaff_000010\tprotein2genome\tprotein_match\t339754\t340801\t478\t+\t.\tID=scaff_000010:hit:2470:2.10.0.3;Name=tr|G0MIG4|G0MIG4_CAEBE\n"; | |
print GFF "##FASTA\n"; | |
print GFF ">seq1\n"; | |
print GFF "ACAACACAGATAGATATATAGAGCA\n"; | |
close(GFF); | |
# MAKE A GFF FILE WITH JUST THE GENES: | |
($gene_gff,$errorcode,$errormsg) = &make_gene_gff($gff,$outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# MAKE A GFF FILE WITH NON-GENE FEATURES: (SPECIFICALLY pred_gff:embl, pred_gff:protein_coding & augustus*) | |
($features_gff,$errorcode,$errormsg) = &make_features_gff($gff,$outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# FIND GENES THAT DO NOT OVERLAP ANY OF THE NON-GENE FEATURES: | |
($DISCARD,$errorcode,$errormsg) = &find_genes_to_discard($outputdir,$gene_gff,$features_gff,1); | |
if ($errorcode != 100) { print STDERR "ERROR: test_find_genes_to_discard: failed test2d (errorcode $errorcode errormsg $errormsg)\n"; exit;} | |
system "rm -f $gff"; | |
system "rm -f $gene_gff"; | |
system "rm -f $features_gff"; | |
# EXAMPLE WITH TWO GENES TO KEEP, THAT DO NOT HAVE snap/genemark/pred_gff IN THEIR NAMES BUT OVERLAP ANY pred_gff/augustus* FEATURE: | |
($gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(GFF,">$gff") || die "ERROR: test_make_gff_for_scaffold: cannot open $gff\n"; | |
print GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=auugustus_masked-scaff_000010-processed-gene-0.3;Name=auugustus_masked-scaff_000010-processed-gene-0.3\n"; | |
print GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=auugustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=auugustus_masked-scaff_000010-processed-gene-0.3;Name=auugustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n"; | |
print GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=auugustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=auugustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=auugustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=auugustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=auugustus_masked-scaff_000010-processed-gene-0.4;Name=auugustus_masked-scaff_000010-processed-gene-0.4\n"; | |
print GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=auugustus_masked-scaff_000010-processed-gene-0.4;Name=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n"; | |
print GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=auugustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n"; | |
print GFF "scaff_000010\tprotein2genome\tprotein_match\t339754\t340801\t478\t+\t.\tID=scaff_000010:hit:2470:2.10.0.3;Name=tr|G0MIG4|G0MIG4_CAEBE\n"; | |
print GFF "scaff_000010\taugustus_masked\tmatch\t20477\t21300\t0.64\t+\t.\tID=scaff_000001:hit:48921:3.5.0.30;Name=augustus_masked-scaff_000001-abinit-gene-30.3-mRNA-1\n"; | |
print GFF "scaff_000010\taugustus_masked\tmatch\t29010\t29020\t0.64\t+\t.\tID=scaff_000001:hit:48921:3.5.0.30;Name=augustus_masked-scaff_000001-abinit-gene-30.3-mRNA-1\n"; | |
print GFF "##FASTA\n"; | |
print GFF ">seq1\n"; | |
print GFF "ACAACACAGATAGATATATAGAGCA\n"; | |
close(GFF); | |
# MAKE A GFF FILE WITH JUST THE GENES: | |
($gene_gff,$errorcode,$errormsg) = &make_gene_gff($gff,$outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# MAKE A GFF FILE WITH NON-GENE FEATURES: (SPECIFICALLY pred_gff:embl, pred_gff:protein_coding & augustus*) | |
($features_gff,$errorcode,$errormsg) = &make_features_gff($gff,$outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# FIND GENES THAT DO NOT OVERLAP ANY OF THE NON-GENE FEATURES: | |
($DISCARD,$errorcode,$errormsg) = &find_genes_to_discard($outputdir,$gene_gff,$features_gff,1); | |
if ($errorcode != 0) { print STDERR "ERROR: test_find_genes_to_discard: failed test3 (errorcode $errorcode errormsg $errormsg)\n"; exit;} | |
if ($DISCARD->{"auugustus_masked-scaff_000010-processed-gene-0.3"}) { print STDERR "ERROR: test_find_genes_to_discard: failed test3b\n"; exit;} | |
if ($DISCARD->{"auugustus_masked-scaff_000010-processed-gene-0.4"}) { print STDERR "ERROR: test_find_genes_to_discard: failed test3c\n"; exit;} | |
system "rm -f $gff"; | |
system "rm -f $gene_gff"; | |
system "rm -f $features_gff"; | |
# TEST ERRORCODE=101 (BEDTOOLS COULD NOT RUN PROPERLY): | |
($gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(GFF,">$gff") || die "ERROR: test_make_gff_for_scaffold: cannot open $gff\n"; | |
print GFF "BPAG.contig.04966.2027 maker gene 73 1002 . - . ID=maker-BPAG.contig.04966.2027-snap-gene-0.6;Name=maker-BPAG.contig.04966.2027-snap-gene-0.6\n"; | |
print GFF "BPAG.contig.04967.2027 maker gene 1129 1876 . - . ID=genemark-BPAG.contig.04967.2027-processed-gene-0.0;Name=genemark-BPAG.contig.04967.2027-processed-gene-0.0\n"; | |
print GFF "BPAG.contig.04971.2025 maker gene 1675 1797 . + . ID=maker-BPAG.contig.04971.2025-exonerate_protein2genome-gene-0.2;Name=maker-BPAG.contig.04971.2025-exonerate_protein2genome-gene-0.2\n"; | |
print GFF "BPAG.contig.04971.2025 maker gene 0 1613 . - . ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0;Name=maker-BPAG.contig.04971.2025-augustus-gene-0.0\n"; | |
print GFF "##FASTA\n"; | |
print GFF ">seq1\n"; | |
print GFF "ACAACACAGATAGATATATAGAGCA\n"; | |
close(GFF); | |
# RUN &find_genes_to_discard: | |
($DISCARD,$errorcode,$errormsg) = &find_genes_to_discard($outputdir,$gff,$gff,1); | |
if ($errorcode != 101) { print STDERR "ERROR: test_find_genes_to_discard: failed test4 (errorcode $errorcode errormsg $errormsg)\n"; exit;} | |
system "rm -f $gff"; | |
system "rm -f $gene_gff"; | |
system "rm -f $features_gff"; | |
} | |
#------------------------------------------------------------------# | |
# FIND GENES THAT DO NOT OVERLAP ANY OF THE NON-GENE FEATURES: | |
sub find_genes_to_discard | |
{ | |
my $outputdir = $_[0]; ## DIRECTORY TO PUT OUTPUT FILES IN | |
my $gene_gff = $_[1]; ## GENE GFF FILE | |
my $features_gff = $_[2]; ## FEATURES GFF FILE | |
my $testing = $_[3]; ## SAYS WHETHER THIS IS CALLED FROM A TESTING SUBROUTINE | |
my $cmd; ## COMMAND TO RUN | |
my $output; ## BEDTOOLS OUTPUT FILE | |
my $line; ## | |
my @temp; ## | |
my $name; ## | |
my %DISCARD = (); ## HASH TABLE OF GENES TO DISCARD | |
my $errorcode = 0; ## RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";## RETURNED AS 'none' IF THERE IS NO ERROR | |
my %KEEP = (); ## HASH TABLE OF GENES TO KEEP | |
my $features_gff_len = 0; ## NUMBER OF LINES IN THE $features_gff FILE | |
my $scaffold; ## SCAFFOLD THAT A GENE LIES ON | |
my $gene_start; ## START OF A GENE | |
my $gene_end; ## END OF A GENE | |
my $gene_name; ## GENE NAME | |
my $systemcall; ## RETURN VALUE FROM SYSTEM CALL | |
# CHECK IF THE $features_gff FILE HAS SOME LINES IN IT: | |
open(TEMP,"wc -l $features_gff |"); | |
while(<TEMP>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\s+/,$line); | |
$features_gff_len = $temp[0]; | |
} | |
close(TEMP); | |
if ($features_gff_len > 0) | |
{ | |
# RUN BEDTOOLS TO FIND OVERLAPS BETWEEN THE GENE GFF AND FEATURES GFF: | |
($output,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# NOTE: I SEEM TO NEED -loj IN BEDTOOLS, OR ELSE THE COORDINATES ARE WITH RESPECT TO THE OVERLAP REGION, NOT THE GENE REGION: | |
if ($testing == 0) { $cmd = "bedtools intersect -loj -a $gene_gff -b $features_gff > $output";} | |
else { $cmd = "bedtools intersect -loj -a $gene_gff -b $features_gff >& $output";} | |
$systemcall = system "$cmd"; | |
if ($systemcall != 0) | |
{ | |
system "rm -f $output"; | |
$errormsg = "ERROR: find_genes_to_discard: return value $systemcall when tried to run: $cmd\n"; | |
$errorcode = 101; # ERRORCODE=101 (TESTED FOR) | |
return(\%DISCARD,$errorcode,$errormsg); | |
} | |
system "sleep 1"; # SLEEP FOR 1 SECOND TO MAKE SURE THE OUTPUT IS COMPLETED | |
# READ IN THE BEDTOOLS OUTPUT FILE: | |
open(OUTPUT,"$output") || die "ERROR: find_genes_to_discard: cannot open $output\n"; | |
while(<OUTPUT>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\t+/,$line); | |
# GET THE NAME OF THE GENE: | |
$scaffold = $temp[0]; | |
$gene_start = $temp[3]; | |
$gene_end = $temp[4]; | |
$name = $temp[8]; # ID=maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.45;Name=maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.45;score=1000 | |
if ($temp[9] ne '.') # THIS IS NOT A PROPER OVERLAP | |
{ | |
@temp = split(/ID=/,$name); | |
$name = $temp[1]; # maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.45;Name=maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.45;score=1000 | |
@temp = split(/\;/,$name); | |
$name = $temp[0]; # maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.45 | |
# THE GENE OVERLAPS AN augustus* OR pred_gff FEATURE, SO WE WANT TO KEEP IT: | |
$gene_name = $scaffold."=".$gene_start."=".$gene_end."=".$name; | |
$KEEP{$gene_name} = 1; | |
} | |
} | |
close(OUTPUT); | |
system "rm -f $output"; | |
} | |
# NOW READ IN THE INPUT GFF FILE OF GENES, AND FIGURE OUT WHICH GENES WE WANT TO DISCARD: | |
open(GENE_GFF,"$gene_gff") || die "ERROR: find_genes_to_discard: cannot open $gene_gff\n"; | |
while(<GENE_GFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\t+/,$line); | |
$scaffold = $temp[0]; | |
$gene_start = $temp[3]; | |
$gene_end = $temp[4]; | |
$name = $temp[8]; # ID=augustus_masked-scaff_000001-processed-gene-0.142;Name=augustus_masked-scaff_000001-processed-gene-0.142 | |
@temp = split(/ID=/,$name); | |
$name = $temp[1]; # augustus_masked-scaff_000001-processed-gene-0.142;Name=augustus_masked-scaff_000001-processed-gene-0.142 | |
@temp = split(/\;/,$name); | |
$name = $temp[0]; # augustus_masked-scaff_000001-processed-gene-0.142; | |
$gene_name = $scaffold."=".$gene_start."=".$gene_end."=".$name; | |
# IF THE GENE DOES NOT OVERLAP AN augustus*/pred_gff FEATURE, AND DOES NOT HAVE | |
# snap/genemark/pred_gff IN ITS NAME, THEN WE WANT TO DISCARD IT: | |
if (!($KEEP{$gene_name})) | |
{ | |
# eg. maker-scaff_000010-snap-gene-0.227,snap_masked-New_000054-processed-gene-0.2-mRNA-1 | |
# eg. genemark-scaff_000010-processed-gene-1.31, genemark-New_000047-processed-gene-0.2-mRNA-1 | |
# eg. maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.64, maker-scaff_000023-pred_gff_protein_coding-gene-0.25-mRNA-1 | |
if (!($name =~ /snap/ || $name =~ /genemark/ || $name =~ /pred_gff/ || $name =~ /est_gff_source/)) | |
# est_gff_source MEANS THE GENE IS SUPPORTED BY RNA-SEQ DATA. WE WANT TO KEEP IT EVEN IF IT IS NOT SUPPORTED BY PROTEIN EVIDENCE. | |
{ | |
if ($testing == 0) { print STDERR "Discarding gene $name (scaffold $scaffold $gene_start-$gene_end) ...\n";} | |
$DISCARD{$gene_name} = 1; | |
$DISCARD{$name} = 1; | |
if (!($name =~ /exonerate/ || $name =~ /BLASTX/ || $name =~ /blastx/ || $name =~ /augustus/ || $name =~ /protein2genome/ || $name =~ /est2genome/ || $name =~ /blastn/)) | |
{ | |
$errormsg = "ERROR: find_genes_to_discard: gene name $name does not contain exonerate or BLASTX or blastx or blastn or or augustus or protein2genome or est2genome\n"; | |
$errorcode = 100; # ERRORCODE=100 (TESTED FOR) | |
return(\%DISCARD,$errorcode,$errormsg); | |
} | |
} | |
} | |
} | |
close(GENE_GFF); | |
return(\%DISCARD,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &make_features_gff | |
sub test_make_features_gff | |
{ | |
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES INTO | |
my $gff; # GFF FILE | |
my $features_gff; # FEATURES GFF FILE | |
my $expected_features_gff; # FILE WITH EXPECTED CONTENTS OF $features_gff | |
my $differences; # DIFFERENES BETWEEN $features_gff AND $expected_features_gff | |
my $length_differences; # LENGTH OF $differences | |
my $line; # | |
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR | |
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR | |
($gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(GFF,">$gff") || die "ERROR: test_make_gff_for_scaffold: cannot open $gff\n"; | |
print GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n"; | |
print GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n"; | |
print GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n"; | |
print GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n"; | |
print GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n"; | |
print GFF "scaff_000010\tprotein2genome\tprotein_match\t339754\t340801\t478\t+\t.\tID=scaff_000010:hit:2470:2.10.0.3;Name=tr|G0MIG4|G0MIG4_CAEBE\n"; | |
print GFF "scaff_000001\taugustus_masked\tmatch\t3010038\t3012310\t0.64\t+\t.\tID=scaff_000001:hit:48921:3.5.0.30;Name=augustus_masked-scaff_000001-abinit-gene-30.3-mRNA-1\n"; | |
print GFF "scaff_000001\taugustus_masked\tmatch_part\t3010038\t3010098\t0.64\t+\t.\tID=scaff_000001:hsp:87184:3.5.0.30;Parent=scaff_000001:hit:48921:3.5.0.30;Target=augustus_masked-scaff_000001-abinit-gene-30.3-mRNA-1 1 61 +;Gap=M61\n"; | |
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:embl\tmatch\t1590\t2575\t1000\t-\t.\tID=SSTP.scaffold.00001.2080514:hit:21596:3.5.0.0;Name=maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.39-mRNA-1;score=1000\n"; | |
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:embl\tmatch_part\t2498\t2575\t1000\t-\t.\tID=SSTP.scaffold.00001.2080514:hsp:38715:3.5.0.0;Parent=SSTP.scaffold.00001.2080514:hit:21596:3.5.0.0;Target=maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.39-mRNA-1 859 936 +;Gap=M78\n"; | |
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:protein_coding\tmatch\t1590\t2575\t.\t-\t.\tID=SSTP.scaffold.00001.2080514:hit:21635:3.5.0.0;Name=maker-SSTP.scaffold.00001.2080514-pred_gff_protein_coding-gene-0.80-mRNA-1\n"; | |
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:protein_coding\tmatch_part\t2494\t2575\t.\t-\t.\tID=SSTP.scaffold.00001.2080514:hsp:38793:3.5.0.0;Parent=SSTP.scaffold.00001.2080514:hit:21635:3.5.0.0;Target=maker-SSTP.scaffold.00001.2080514-pred_gff_protein_coding-gene-0.80-mRNA-1 1 82 +;Gap=M82\n"; | |
print GFF "##FASTA\n"; | |
print GFF ">seq1\n"; | |
print GFF "ACAACACAGATAGATATATAGAGCA\n"; | |
close(GFF); | |
($features_gff,$errorcode,$errormsg) = &make_features_gff($gff,$outputdir); | |
if ($errorcode != 0) { print STDERR "ERROR: test_make_features_gff: failed test1\n"; exit;} | |
($expected_features_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(EXPECTED,">$expected_features_gff") || die "ERROR: test_make_features_gff: cannot open $expected_features_gff\n"; | |
print EXPECTED "scaff_000001\taugustus_masked\tmatch\t3010038\t3012310\t0.64\t+\t.\tID=scaff_000001:hit:48921:3.5.0.30;Name=augustus_masked-scaff_000001-abinit-gene-30.3-mRNA-1\n"; | |
print EXPECTED "scaff_000001\taugustus_masked\tmatch_part\t3010038\t3010098\t0.64\t+\t.\tID=scaff_000001:hsp:87184:3.5.0.30;Parent=scaff_000001:hit:48921:3.5.0.30;Target=augustus_masked-scaff_000001-abinit-gene-30.3-mRNA-1 1 61 +;Gap=M61\n"; | |
print EXPECTED "SSTP.scaffold.00001.2080514\tpred_gff:embl\tmatch\t1590\t2575\t1000\t-\t.\tID=SSTP.scaffold.00001.2080514:hit:21596:3.5.0.0;Name=maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.39-mRNA-1;score=1000\n"; | |
print EXPECTED "SSTP.scaffold.00001.2080514\tpred_gff:embl\tmatch_part\t2498\t2575\t1000\t-\t.\tID=SSTP.scaffold.00001.2080514:hsp:38715:3.5.0.0;Parent=SSTP.scaffold.00001.2080514:hit:21596:3.5.0.0;Target=maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.39-mRNA-1 859 936 +;Gap=M78\n"; | |
print EXPECTED "SSTP.scaffold.00001.2080514\tpred_gff:protein_coding\tmatch\t1590\t2575\t.\t-\t.\tID=SSTP.scaffold.00001.2080514:hit:21635:3.5.0.0;Name=maker-SSTP.scaffold.00001.2080514-pred_gff_protein_coding-gene-0.80-mRNA-1\n"; | |
print EXPECTED "SSTP.scaffold.00001.2080514\tpred_gff:protein_coding\tmatch_part\t2494\t2575\t.\t-\t.\tID=SSTP.scaffold.00001.2080514:hsp:38793:3.5.0.0;Parent=SSTP.scaffold.00001.2080514:hit:21635:3.5.0.0;Target=maker-SSTP.scaffold.00001.2080514-pred_gff_protein_coding-gene-0.80-mRNA-1 1 82 +;Gap=M82\n"; | |
close(EXPECTED); | |
$differences = ""; | |
open(TEMP,"diff $features_gff $expected_features_gff |"); | |
while(<TEMP>) | |
{ | |
$line = $_; | |
$differences = $differences.$line; | |
} | |
close(TEMP); | |
$length_differences = length($differences); | |
if ($length_differences != 0) { print STDERR "ERROR: test_make_features_gff: failed test1 (features_gff $features_gff expected_features_gff $expected_features_gff)\n"; exit;} | |
system "rm -f $features_gff"; | |
system "rm -f $expected_features_gff"; | |
system "rm -f $gff"; | |
# TEST ERRORCODE=16 ('match' FEATURE OF AN USUSUAL TYPE): | |
($gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(GFF,">$gff") || die "ERROR: test_make_gff_for_scaffold: cannot open $gff\n"; | |
print GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n"; | |
print GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n"; | |
print GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n"; | |
print GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n"; | |
print GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print GFF "scaff_000010\tmodel_gff:maker2\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n"; | |
print GFF "scaff_000010\tprotein2genome\tprotein_match\t339754\t340801\t478\t+\t.\tID=scaff_000010:hit:2470:2.10.0.3;Name=tr|G0MIG4|G0MIG4_CAEBE\n"; | |
print GFF "scaff_000001\taugustus_masked\tmatch\t3010038\t3012310\t0.64\t+\t.\tID=scaff_000001:hit:48921:3.5.0.30;Name=augustus_masked-scaff_000001-abinit-gene-30.3-mRNA-1\n"; | |
print GFF "scaff_000001\taugustus_masked\tmatch_part\t3010038\t3010098\t0.64\t+\t.\tID=scaff_000001:hsp:87184:3.5.0.30;Parent=scaff_000001:hit:48921:3.5.0.30;Target=augustus_masked-scaff_000001-abinit-gene-30.3-mRNA-1 1 61 +;Gap=M61\n"; | |
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:embl\tmatch\t1590\t2575\t1000\t-\t.\tID=SSTP.scaffold.00001.2080514:hit:21596:3.5.0.0;Name=maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.39-mRNA-1;score=1000\n"; | |
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:embl\tmatch_part\t2498\t2575\t1000\t-\t.\tID=SSTP.scaffold.00001.2080514:hsp:38715:3.5.0.0;Parent=SSTP.scaffold.00001.2080514:hit:21596:3.5.0.0;Target=maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.39-mRNA-1 859 936 +;Gap=M78\n"; | |
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:protein_coding\tmatch\t1590\t2575\t.\t-\t.\tID=SSTP.scaffold.00001.2080514:hit:21635:3.5.0.0;Name=maker-SSTP.scaffold.00001.2080514-pred_gff_protein_coding-gene-0.80-mRNA-1\n"; | |
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:protein_coding\tmatch_part\t2494\t2575\t.\t-\t.\tID=SSTP.scaffold.00001.2080514:hsp:38793:3.5.0.0;Parent=SSTP.scaffold.00001.2080514:hit:21635:3.5.0.0;Target=maker-SSTP.scaffold.00001.2080514-pred_gff_protein_coding-gene-0.80-mRNA-1 1 82 +;Gap=M82\n"; | |
print GFF "##FASTA\n"; | |
print GFF ">seq1\n"; | |
print GFF "ACAACACAGATAGATATATAGAGCA\n"; | |
close(GFF); | |
($features_gff,$errorcode,$errormsg) = &make_features_gff($gff,$outputdir); | |
if ($errorcode != 16) { print STDERR "ERROR: test_make_features_gff: failed test1\n"; exit;} | |
system "rm -f $features_gff"; | |
system "rm -f $gff"; | |
# TEST ERRORCODE=17 (UNUSUAL FEATURE TYPE IN GFF): | |
($gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(GFF,">$gff") || die "ERROR: test_make_gff_for_scaffold: cannot open $gff\n"; | |
print GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n"; | |
print GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n"; | |
print GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print GFF "scaff_000010\tmaker\tCDS3\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n"; | |
print GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n"; | |
print GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n"; | |
print GFF "scaff_000010\tprotein2genome\tprotein_match\t339754\t340801\t478\t+\t.\tID=scaff_000010:hit:2470:2.10.0.3;Name=tr|G0MIG4|G0MIG4_CAEBE\n"; | |
print GFF "scaff_000001\taugustus_masked\tmatch\t3010038\t3012310\t0.64\t+\t.\tID=scaff_000001:hit:48921:3.5.0.30;Name=augustus_masked-scaff_000001-abinit-gene-30.3-mRNA-1\n"; | |
print GFF "scaff_000001\taugustus_masked\tmatch_part\t3010038\t3010098\t0.64\t+\t.\tID=scaff_000001:hsp:87184:3.5.0.30;Parent=scaff_000001:hit:48921:3.5.0.30;Target=augustus_masked-scaff_000001-abinit-gene-30.3-mRNA-1 1 61 +;Gap=M61\n"; | |
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:embl\tmatch\t1590\t2575\t1000\t-\t.\tID=SSTP.scaffold.00001.2080514:hit:21596:3.5.0.0;Name=maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.39-mRNA-1;score=1000\n"; | |
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:embl\tmatch_part\t2498\t2575\t1000\t-\t.\tID=SSTP.scaffold.00001.2080514:hsp:38715:3.5.0.0;Parent=SSTP.scaffold.00001.2080514:hit:21596:3.5.0.0;Target=maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.39-mRNA-1 859 936 +;Gap=M78\n"; | |
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:protein_coding\tmatch\t1590\t2575\t.\t-\t.\tID=SSTP.scaffold.00001.2080514:hit:21635:3.5.0.0;Name=maker-SSTP.scaffold.00001.2080514-pred_gff_protein_coding-gene-0.80-mRNA-1\n"; | |
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:protein_coding\tmatch_part\t2494\t2575\t.\t-\t.\tID=SSTP.scaffold.00001.2080514:hsp:38793:3.5.0.0;Parent=SSTP.scaffold.00001.2080514:hit:21635:3.5.0.0;Target=maker-SSTP.scaffold.00001.2080514-pred_gff_protein_coding-gene-0.80-mRNA-1 1 82 +;Gap=M82\n"; | |
print GFF "##FASTA\n"; | |
print GFF ">seq1\n"; | |
print GFF "ACAACACAGATAGATATATAGAGCA\n"; | |
close(GFF); | |
($features_gff,$errorcode,$errormsg) = &make_features_gff($gff,$outputdir); | |
if ($errorcode != 17) { print STDERR "ERROR: test_make_features_gff: failed test3\n"; exit;} | |
system "rm -f $features_gff"; | |
system "rm -f $gff"; | |
# TEST ERRORCODE=18 (NOT 9 COLUMNS IN GFF FILE): | |
($gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(GFF,">$gff") || die "ERROR: test_make_gff_for_scaffold: cannot open $gff\n"; | |
print GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n"; | |
print GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n"; | |
print GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n"; | |
print GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n"; | |
print GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n"; | |
print GFF "scaff_000010\tprotein2genome\tprotein_match\t339754\t340801\t478\t+\t.\tID=scaff_000010:hit:2470:2.10.0.3;Name=tr|G0MIG4|G0MIG4_CAEBE\n"; | |
print GFF "scaff_000001\taugustus_masked\tmatch\t3010038\t3012310\t0.64\t+\t.\tID=scaff_000001:hit:48921:3.5.0.30;Name=augustus_masked-scaff_000001-abinit-gene-30.3-mRNA-1\n"; | |
print GFF "scaff_000001\taugustus_masked\tmatch_part\t3010038\t3010098\t0.64\t+\t.\tID=scaff_000001:hsp:87184:3.5.0.30;Parent=scaff_000001:hit:48921:3.5.0.30;Target=augustus_masked-scaff_000001-abinit-gene-30.3-mRNA-1 1 61 +;Gap=M61\n"; | |
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:embl\tmatch\t1590\t2575\t1000\t-\t.\tID=SSTP.scaffold.00001.2080514:hit:21596:3.5.0.0;Name=maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.39-mRNA-1;score=1000\n"; | |
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:embl\tmatch_part\t2498\t2575\t1000\t-\t.\tID=SSTP.scaffold.00001.2080514:hsp:38715:3.5.0.0;Parent=SSTP.scaffold.00001.2080514:hit:21596:3.5.0.0;Target=maker-SSTP.scaffold.00001.2080514-pred_gff_embl-gene-0.39-mRNA-1 859 936 +;Gap=M78\n"; | |
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:protein_coding\tmatch\t1590\t2575\t.\t-\t.\tID=SSTP.scaffold.00001.2080514:hit:21635:3.5.0.0;Name=maker-SSTP.scaffold.00001.2080514-pred_gff_protein_coding-gene-0.80-mRNA-1\n"; | |
print GFF "SSTP.scaffold.00001.2080514\tpred_gff:protein_coding\tmatch_part\t2494\t2575\t.\t-\t.\tID=SSTP.scaffold.00001.2080514:hsp:38793:3.5.0.0;Parent=SSTP.scaffold.00001.2080514:hit:21635:3.5.0.0;Target=maker-SSTP.scaffold.00001.2080514-pred_gff_protein_coding-gene-0.80-mRNA-1 1 82 +;Gap=M82\n"; | |
print GFF "##FASTA\n"; | |
print GFF ">seq1\n"; | |
print GFF "ACAACACAGATAGATATATAGAGCA\n"; | |
close(GFF); | |
($features_gff,$errorcode,$errormsg) = &make_features_gff($gff,$outputdir); | |
if ($errorcode != 18) { print STDERR "ERROR: test_make_features_gff: failed test4\n"; exit;} | |
system "rm -f $features_gff"; | |
system "rm -f $gff"; | |
} | |
#------------------------------------------------------------------# | |
# MAKE A GFF FILE WITH NON-GENE FEATURES: (SPECIFICALLY pred_gff:embl, pred_gff:protein_coding & augustus_masked) | |
sub make_features_gff | |
{ | |
my $gff = $_[0]; # GFF FILE FOR ALL SCAFFOLDS | |
my $outputdir = $_[1]; # DIRECTORY TO PUT THE OUTPUT FILES INTO | |
my $features_gff; # GFF FILE FOR NON-GENE FEATURES | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
my $line; # | |
my @temp; # | |
my $end_of_gff = 0; # SAYS WHETHER WE HAVE REACHED THE END OF THE GFF FILE | |
my $name; # NAME OF THE FEATURE | |
($features_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(FEATURES_GFF,">$features_gff") || die "ERROR: make_features_gff: cannot open $features_gff\n"; | |
open(GFF,"$gff") || die "ERROR: make_features_gff: cannot open $gff\n"; | |
while(<GFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
if ($line eq '##FASTA') { $end_of_gff = 1;} | |
if (substr($line,0,1) ne '#' && $end_of_gff == 0) | |
{ | |
@temp = split(/\t+/,$line); | |
# ELEANOR SAID THAT WE WANT TO KEEP THE GENE MODEL IF IT HAS ANY ONE OR MORE OF THE FOLLOWING AS SUPPORT: | |
# (a) Genemark, (b) SNAP, (c) pred_gff | |
# HOW TO RECOGNISE THESE: | |
# (i) IF THERE IS genemark/snap/pred_gff IN THE GENE NAME, KEEP THE GENE | |
# (ii) IF THERE IS blastx/exonerate IN THE GENE NAME, AND THE GENE OVERLAPS AN augustus* | |
# OR pred_gff:embl OR pred_gff:protein_coding FEATURE, KEEP THE GENE | |
# (iii) IF THERE IS blastx/exonerate/augustus IN THE GENE NAME, AND THE GENE DOESN'T OVERLAP AN | |
# augustus* OR pred_gff:embl OR pred_gff:protein_coding FEATURE, DISCARD THE GENE | |
if ($temp[2] eq 'match' || $temp[2] eq 'match_part') | |
{ | |
# pred_gff CAN BE EITHER FROM genblastg OR RATT, ELEANOR SAID. | |
if ($temp[1] eq 'pred_gff:embl' || $temp[1] eq 'pred_gff:protein_coding' || $temp[1] =~ /augustus/) | |
{ | |
print FEATURES_GFF "$line\n"; | |
} | |
else | |
{ | |
if ($temp[1] ne 'blastn' && $temp[1] ne 'blastx' && $temp[1] ne 'est2genome' && $temp[1] ne 'model_gff:maker' && | |
$temp[1] ne 'repeatmasker' && $temp[1] ne 'repeatrunner' && $temp[1] ne 'protein2genome' && $temp[1] ne 'genemark' && $temp[1] ne 'snap_masked' && | |
$temp[1] ne 'est_gff:source') | |
{ | |
$errormsg = "ERROR: make_features_gff: match/match_part feature source $temp[1] on line $line\n"; | |
$errorcode = 16; # ERRORCODE=16 (TESTED FOR) | |
system "rm -f $features_gff"; | |
return($features_gff,$errorcode,$errormsg); | |
} | |
} | |
} | |
else | |
{ | |
if ($temp[2] ne 'contig' && $temp[2] ne 'exon' && $temp[2] ne 'CDS' && $temp[2] ne 'mRNA' && $temp[2] ne 'gene' && | |
$temp[2] ne 'five_prime_UTR' && $temp[2] ne 'three_prime_UTR' && $temp[2] ne 'expressed_sequence_match' && $temp[2] ne 'protein_match') | |
{ | |
$errormsg = "ERROR: make_features_gff: feature $temp[2] on line $line\n"; | |
$errorcode = 17; # ERRORCODE=17 (TESTED FOR) | |
system "rm -f $features_gff"; | |
return($features_gff,$errorcode,$errormsg); | |
} | |
} | |
if ($#temp != 8) | |
{ | |
$errormsg = "ERROR: make_features_gff: do not have 9 columns in $gff\n"; | |
$errorcode = 18; # ERRORCODE=18 (TESTED FOR) | |
system "rm -f $features_gff"; | |
return($features_gff,$errorcode,$errormsg); | |
} | |
} | |
} | |
close(GFF); | |
close(FEATURES_GFF); | |
return($features_gff,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &make_gene_gff | |
sub test_make_gene_gff | |
{ | |
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES INTO | |
my $gff; # GFF FILE | |
my $gene_gff; # GENE GFF FILE | |
my $expected_gene_gff; # FILE WITH EXPECTED CONTENTS OF $gene_gff | |
my $differences; # DIFFERENES BETWEEN $gene_gff AND $expected_gene_gff | |
my $length_differences; # LENGTH OF $differences | |
my $line; # | |
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR | |
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR | |
($gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(GFF,">$gff") || die "ERROR: test_make_gff_for_scaffold: cannot open $gff\n"; | |
print GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n"; | |
print GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n"; | |
print GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n"; | |
print GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n"; | |
print GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n"; | |
print GFF "##FASTA\n"; | |
print GFF ">seq1\n"; | |
print GFF "ACAACACAGATAGATATATAGAGCA\n"; | |
close(GFF); | |
($gene_gff,$errorcode,$errormsg) = &make_gene_gff($gff,$outputdir); | |
if ($errorcode != 0) { print STDERR "ERROR: test_make_gene_gff: failed test1\n"; exit;} | |
($expected_gene_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(EXPECTED,">$expected_gene_gff") || die "ERROR: test_make_gene_gff: cannot open $expected_gene_gff\n"; | |
print EXPECTED "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n"; | |
print EXPECTED "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n"; | |
close(EXPECTED); | |
$differences = ""; | |
open(TEMP,"diff $gene_gff $expected_gene_gff |"); | |
while(<TEMP>) | |
{ | |
$line = $_; | |
$differences = $differences.$line; | |
} | |
close(TEMP); | |
$length_differences = length($differences); | |
if ($length_differences != 0) { print STDERR "ERROR: test_make_gene_gff: failed test1 (gene_gff $gene_gff expected_gene_gff $expected_gene_gff)\n"; exit;} | |
system "rm -f $gene_gff"; | |
system "rm -f $expected_gene_gff"; | |
system "rm -f $gff"; | |
# TEST ERRORCODE=15 (DO NOT HAVE 9 COLUMNS IN THE GFF FILE): | |
($gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(GFF,">$gff") || die "ERROR: test_make_gff_for_scaffold: cannot open $gff\n"; | |
print GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n"; | |
print GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n"; | |
print GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n"; | |
print GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n"; | |
print GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n"; | |
print GFF "##FASTA\n"; | |
print GFF ">seq1\n"; | |
print GFF "ACAACACAGATAGATATATAGAGCA\n"; | |
close(GFF); | |
($gene_gff,$errorcode,$errormsg) = &make_gene_gff($gff,$outputdir); | |
if ($errorcode != 15) { print STDERR "ERROR: test_make_gene_gff: failed test2\n"; exit;} | |
system "rm -f $gene_gff"; | |
system "rm -f $gff"; | |
# ADD EXAMPLE WITH MAKER GFF WHERE THE START POINT IS 0: | |
($gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(GFF,">$gff") || die "ERROR: test_make_gff_for_scaffold: cannot open $gff\n"; | |
print GFF "BPAG.contig.04967.2027 maker gene 1129 1876 . - . ID=genemark-BPAG.contig.04967.2027-processed-gene-0.0;Name=genemark-BPAG.contig.04967.2027-processed-gene-0.0\n"; | |
print GFF "BPAG.contig.04971.2025 maker gene 1675 1797 . + . ID=maker-BPAG.contig.04971.2025-exonerate_protein2genome-gene-0.2;Name=maker-BPAG.contig.04971.2025-exonerate_protein2genome-gene-0.2\n"; | |
print GFF "BPAG.contig.04971.2025 maker gene 0 1613 . - . ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0;Name=maker-BPAG.contig.04971.2025-augustus-gene-0.0\n"; | |
print GFF "##FASTA\n"; | |
print GFF ">seq1\n"; | |
print GFF "ACAACACAGATAGATATATAGAGCA\n"; | |
close(GFF); | |
($gene_gff,$errorcode,$errormsg) = &make_gene_gff($gff,$outputdir); | |
if ($errorcode != 0) { print STDERR "ERROR: test_make_gene_gff: failed test1\n"; exit;} | |
($expected_gene_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(EXPECTED,">$expected_gene_gff") || die "ERROR: test_make_gene_gff: cannot open $expected_gene_gff\n"; | |
print EXPECTED "BPAG.contig.04967.2027 maker gene 1129 1876 . - . ID=genemark-BPAG.contig.04967.2027-processed-gene-0.0;Name=genemark-BPAG.contig.04967.2027-processed-gene-0.0\n"; | |
print EXPECTED "BPAG.contig.04971.2025 maker gene 1675 1797 . + . ID=maker-BPAG.contig.04971.2025-exonerate_protein2genome-gene-0.2;Name=maker-BPAG.contig.04971.2025-exonerate_protein2genome-gene-0.2\n"; | |
print EXPECTED "BPAG.contig.04971.2025 maker gene 1 1613 . - . ID=maker-BPAG.contig.04971.2025-augustus-gene-0.0;Name=maker-BPAG.contig.04971.2025-augustus-gene-0.0\n"; | |
close(EXPECTED); | |
$differences = ""; | |
open(TEMP,"diff $gene_gff $expected_gene_gff |"); | |
while(<TEMP>) | |
{ | |
$line = $_; | |
$differences = $differences.$line; | |
} | |
close(TEMP); | |
$length_differences = length($differences); | |
if ($length_differences != 0) { print STDERR "ERROR: test_make_gene_gff: failed test3 (gene_gff $gene_gff expected_gene_gff $expected_gene_gff)\n"; exit;} | |
system "rm -f $gene_gff"; | |
system "rm -f $expected_gene_gff"; | |
system "rm -f $gff"; | |
} | |
#------------------------------------------------------------------# | |
# MAKE A GFF FILE WITH JUST THE GENES: | |
sub make_gene_gff | |
{ | |
my $gff = $_[0]; # GFF FILE FOR ALL SCAFFOLDS | |
my $outputdir = $_[1]; # DIRECTORY TO PUT THE OUTPUT FILES INTO | |
my $gene_gff; # GFF FILE FOR GENES | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
my $line; # | |
my @temp; # | |
my $end_of_gff = 0; # SAYS WHETHER WE HAVE REACHED THE END OF THE GFF FILE | |
my $start; # START OF THE FEATURE IN THE GFF | |
($gene_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(GENE_GFF,">$gene_gff") || die "ERROR: make_gene_gff: cannot open $gene_gff\n"; | |
open(GFF,"$gff") || die "ERROR: make_gene_gff: cannot open $gff\n"; | |
while(<GFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
if ($line eq '##FASTA') { $end_of_gff = 1;} | |
if (substr($line,0,1) ne '#' && $end_of_gff == 0) | |
{ | |
@temp = split(/\t+/,$line); | |
if ($temp[2] eq 'gene') | |
{ | |
$start = $temp[3]; | |
if ($start <= 0) { $start = 1;} # THIS SOMETIMES HAPPENS WITH MAKER GFF FILES. | |
print GENE_GFF "$temp[0]\t$temp[1]\t$temp[2]\t$start\t$temp[4]\t$temp[5]\t$temp[6]\t$temp[7]\t$temp[8]\n"; | |
} | |
if ($#temp != 8) | |
{ | |
$errormsg = "ERROR: make_gene_gff: do not have 9 columns in $gff\n"; | |
$errorcode = 15; # ERRORCODE=15 (TESTED FOR) | |
system "rm -f $gene_gff"; | |
return($gene_gff,$errorcode,$errormsg); | |
} | |
} | |
} | |
close(GFF); | |
close(GENE_GFF); | |
return($gene_gff,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &write_output_gff | |
sub test_write_output_gff | |
{ | |
my $outputdir = $_[0]; ## DIRECTORY TO WRITE OUTPUT FILES IN | |
my $input_gff; ## INPUT GFF FILE | |
my $errorcode; ## RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR | |
my $errormsg; ## RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR | |
my %DISCARD = (); ## HASH TABLE OF mRNAs/GENES TO DISCARD | |
my $output_gff; ## OUTPUT GFF FILE | |
my @temp; ## | |
my $expected_output_gff; ## FILE WITH EXPECTED CONTENTS OF $output_gff | |
my $differences; ## DIFFERENCES BETWEEN $output_gff AND $expected_output_gff | |
my $length_differences; ## LENGTH OF $differences | |
my $line; ## | |
my $GENE; ## HASH TABLE WITH GENE NAMES FOR TRANSCRIPTS | |
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(INPUT_GFF,">$input_gff") || die "ERROR: test_write_output_gff: cannot open $input_gff\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n"; | |
print INPUT_GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n"; | |
print INPUT_GFF "scaff_000010\tmaker\texon\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n"; | |
print INPUT_GFF "##FASTA\n"; | |
print INPUT_GFF ">seq1\n"; | |
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n"; | |
close(INPUT_GFF); | |
# READ IN THE GENE NAME FOR EACH TRANSCRIPT IN THE INPUT GFF FILE: | |
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
$DISCARD{"augustus_masked-scaff_000010-processed-gene-0.4"} = 1; | |
$DISCARD{"scaff_000010=29000=29497=augustus_masked-scaff_000010-processed-gene-0.4"} = 1; | |
($output_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
@temp = split(/\//,$output_gff); | |
$output_gff = $temp[$#temp]; | |
($errorcode,$errormsg) = &write_output_gff($outputdir,$input_gff,$output_gff,\%DISCARD,$GENE); | |
if ($errorcode != 0) { print STDERR "ERROR: test_write_output_gff: failed test1\n"; exit;} | |
($expected_output_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(EXPECTED,">$expected_output_gff") || die "ERROR: test_write_output_gff: cannot open $expected_output_gff\n"; | |
print EXPECTED "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n"; | |
print EXPECTED "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n"; | |
print EXPECTED "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print EXPECTED "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print EXPECTED "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print EXPECTED "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n"; | |
print EXPECTED "##FASTA\n"; | |
print EXPECTED ">seq1\n"; | |
print EXPECTED "ACAACACAGATAGATATATAGAGCA\n"; | |
close(EXPECTED); | |
$output_gff = $outputdir."/".$output_gff; | |
$differences = ""; | |
open(TEMP,"diff $output_gff $expected_output_gff |"); | |
while(<TEMP>) | |
{ | |
$line = $_; | |
$differences = $differences.$line; | |
} | |
close(TEMP); | |
$length_differences = length($differences); | |
if ($length_differences != 0) { print STDERR "ERROR: test_write_output_gff: failed test1 (output_gff $output_gff expected_output_gff $expected_output_gff)\n"; exit;} | |
system "rm -f $input_gff"; | |
system "rm -f $output_gff"; | |
system "rm -f $expected_output_gff"; | |
# TEST ERRORCODE=1 (DO NOT KNOW GENE NAMES FOR TRANSCRIPTS): | |
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(INPUT_GFF,">$input_gff") || die "ERROR: test_write_output_gff: cannot open $input_gff\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n"; | |
print INPUT_GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n"; | |
print INPUT_GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n"; | |
print INPUT_GFF "##FASTA\n"; | |
print INPUT_GFF ">seq1\n"; | |
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n"; | |
close(INPUT_GFF); | |
%{$GENE} = (); | |
$DISCARD{"augustus_masked-scaff_000010-processed-gene-0.4"} = 1; | |
$DISCARD{"scaff_000010=29000=29497=augustus_masked-scaff_000010-processed-gene-0.4"} = 1; | |
($output_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
@temp = split(/\//,$output_gff); | |
$output_gff = $temp[$#temp]; | |
($errorcode,$errormsg) = &write_output_gff($outputdir,$input_gff,$output_gff,\%DISCARD,$GENE); | |
if ($errorcode != 1) { print STDERR "ERROR: test_write_output_gff: failed test2\n"; exit;} | |
system "rm -f $input_gff"; | |
system "rm -f $output_gff"; | |
# TEST ERRORCODE=14 (STRANGE FEATURE IN GFF FILE): | |
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(INPUT_GFF,">$input_gff") || die "ERROR: test_write_output_gff: cannot open $input_gff\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n"; | |
print INPUT_GFF "scaff_000010\tmaker\texonz\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n"; | |
print INPUT_GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n"; | |
print INPUT_GFF "##FASTA\n"; | |
print INPUT_GFF ">seq1\n"; | |
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n"; | |
close(INPUT_GFF); | |
# READ IN THE GENE NAME FOR EACH TRANSCRIPT IN THE INPUT GFF FILE: | |
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
$DISCARD{"augustus_masked-scaff_000010-processed-gene-0.4"} = 1; | |
$DISCARD{"scaff_000010=29000=29497=augustus_masked-scaff_000010-processed-gene-0.4"} = 1; | |
($output_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
@temp = split(/\//,$output_gff); | |
$output_gff = $temp[$#temp]; | |
($errorcode,$errormsg) = &write_output_gff($outputdir,$input_gff,$output_gff,\%DISCARD,$GENE); | |
if ($errorcode != 14) { print STDERR "ERROR: test_write_output_gff: failed test3\n"; exit;} | |
system "rm -f $input_gff"; | |
system "rm -f $output_gff"; | |
# TEST ERRORCODE=5 (GFF FILE DOES NOT HAVE 9 COLUMNS): | |
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(INPUT_GFF,">$input_gff") || die "ERROR: test_write_output_gff: cannot open $input_gff\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n"; | |
print INPUT_GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tCDS\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n"; | |
print INPUT_GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n"; | |
print INPUT_GFF "##FASTA\n"; | |
print INPUT_GFF ">seq1\n"; | |
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n"; | |
close(INPUT_GFF); | |
$DISCARD{"augustus_masked-scaff_000010-processed-gene-0.4"} = 1; | |
$DISCARD{"scaff_000010=29000=29497=augustus_masked-scaff_000010-processed-gene-0.4"} = 1; | |
($output_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
@temp = split(/\//,$output_gff); | |
$output_gff = $temp[$#temp]; | |
($errorcode,$errormsg) = &write_output_gff($outputdir,$input_gff,$output_gff,\%DISCARD,$GENE); | |
if ($errorcode != 5) { print STDERR "ERROR: test_write_output_gff: failed test4\n"; exit;} | |
system "rm -f $input_gff"; | |
system "rm -f $output_gff"; | |
} | |
#------------------------------------------------------------------# | |
# TEST &check_whether_to_discard | |
sub test_check_whether_to_discard | |
{ | |
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR | |
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR | |
my $discard; # SAYS WHETHER TO DISCARD A PARTICULAR GENE | |
my %DISCARD = (); # HASH TABLE OF GENES TO DISCARD | |
# CHECK A CASE WHERE WE SHOULD DISCARD THE GENE: | |
$DISCARD{"scaff1=100=200=gene1"} = 1; | |
$DISCARD{"gene1"} = 1; | |
($discard,$errorcode,$errormsg) = &check_whether_to_discard("gene1","scaff1","100","200",\%DISCARD); | |
if ($errorcode != 0 || $discard != 1) { print STDERR "ERROR: test_check_whether_to_discard: failed test1 (errorcode $errorcode errormsg $errormsg discard $discard)\n"; exit;} | |
# CHECK A CASE WHERE WE SHOULD DISCARD THE GENE: | |
$DISCARD{"scaff1=100=200=gene1"} = 1; | |
($discard,$errorcode,$errormsg) = &check_whether_to_discard("gene1","scaff1","150","175",\%DISCARD); | |
if ($errorcode != 0 || $discard != 1) { print STDERR "ERROR: test_check_whether_to_discard: failed test2\n"; exit;} | |
# CHECK A CASE WHERE WE SHOULD *NOT* DISCARD THE GENE: | |
$DISCARD{"scaff1=100=200=gene1"} = 1; | |
($discard,$errorcode,$errormsg) = &check_whether_to_discard("gene1","scaff1","150","250",\%DISCARD); | |
if ($errorcode != 0 || $discard != 0) { print STDERR "ERROR: test_check_whether_to_discard: failed test3\n"; exit;} | |
# CHECK A CASE WHERE WE SHOULD *NOT* DISCARD THE GENE: | |
$DISCARD{"scaff1=100=200=gene1"} = 1; | |
($discard,$errorcode,$errormsg) = &check_whether_to_discard("gene1","scaff2","100","200",\%DISCARD); | |
if ($errorcode != 0 || $discard != 0) { print STDERR "ERROR: test_check_whether_to_discard: failed test4\n"; exit;} | |
# CHECK A CASE WHERE WE SHOULD *NOT* DISCARD THE GENE: | |
$DISCARD{"scaff1=100=200=gene1"} = 1; | |
($discard,$errorcode,$errormsg) = &check_whether_to_discard("gene2","scaff1","150","175",\%DISCARD); | |
if ($errorcode != 0 || $discard != 0) { print STDERR "ERROR: test_check_whether_to_discard: failed test5\n"; exit;} | |
} | |
#------------------------------------------------------------------# | |
# CHECK WHETHER TO DISCARD A FEATURE | |
sub check_whether_to_discard | |
{ | |
my $gene = $_[0]; # GENE NAME | |
my $scaffold = $_[1]; # SCAFFOLD | |
my $start = $_[2]; # FEATURE START | |
my $end = $_[3]; # FEATURE END | |
my $DISCARD = $_[4]; # HASH TABLE OF GENES TO DISCARD | |
my $discard = 0; # ASSUME WE SHOULD NOT DISCARD THE FEATURE | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
my $gene2; # GENE WE WANT TO DISCARD | |
my @temp; # | |
my $scaffold2; # SCAFFOLD FOR $gene2 | |
my $start2; # START FOR $gene2 | |
my $end2; # END FOR $gene2 | |
if ($DISCARD->{$gene}) # WE DO WANT TO DISCARD A GENE OF THIS NAME: | |
{ | |
# CHECK THAT THE FEATURE OVERLAPS A GENE WE WANT TO DISCARD OF THIS NAME: | |
foreach $gene2 (keys %{$DISCARD}) | |
{ | |
@temp = split(/=/,$gene2); | |
if ($#temp == 3) | |
{ | |
$scaffold2 = $temp[0]; | |
$start2 = $temp[1]; | |
$end2 = $temp[2]; | |
$gene2 = $temp[3]; | |
if ($scaffold eq $scaffold2 && ($start >= $start2 && $end <= $end2) && $gene eq $gene2) | |
{ | |
$discard = 1; | |
return($discard,$errorcode,$errormsg); | |
} | |
} | |
} | |
} | |
else # WE DO NOT WANT TO DISCARD A GENE OF THIS NAME: | |
{ | |
$discard = 0; | |
} | |
return($discard,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# READ IN THE INPUT GFF FILE, AND MAKE THE OUTPUT GFF FILE WITH GENES THAT ARE FROM snap/pred_gff/genemark, OR SUPPORTED BY augustus*/pred_gff: | |
sub write_output_gff | |
{ | |
my $outputdir = $_[0]; # DIRECTORY TO WRITE OUTPUT FILES IN | |
my $input_gff = $_[1]; # INPUT GFF FILE | |
my $output_gff = $_[2]; # OUTPUT GFF FILE | |
my $DISCARD = $_[3]; # HASH TABLE OF GENES TO DISCARD | |
my $GENE = $_[4]; # HASH TABLE OF GENE NAMES FOR TRANSCRIPTS | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
my $end_of_gff = 0; # SAYS WHETHER WE HAVE REACHED THE END OF THE INPUT GFF | |
my $line; # | |
my @temp; # | |
my $name; # GENE NAME | |
my $scaffold; # NAME OF SCAFFOLD | |
my $start; # START OF A FEATURE | |
my $end; # END OF A FEATURE | |
my $discard; # SAYS WHETHER TO DISCARD A FEATURE OR NOT | |
# OPEN THE OUTPUT GFF FILE: | |
$output_gff = $outputdir."/".$output_gff; | |
open(OUTPUT,">$output_gff") || die "ERROR: write_output_gff: cannot open $output_gff\n"; | |
# READ IN THE INPUT GFF FILE: | |
open(INPUT_GFF,"$input_gff") || die "ERROR: write_output_gff: cannot open $input_gff\n"; | |
while(<INPUT_GFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
if ($line eq '##FASTA') { $end_of_gff = 1;} | |
if (substr($line,0,1) ne '#' && $end_of_gff == 0) | |
{ | |
@temp = split(/\t+/,$line); | |
if ($#temp == 8) | |
{ | |
$scaffold = $temp[0]; | |
$start = $temp[3]; | |
$end = $temp[4]; | |
if ($start <= 0) { $start = 1;} # THIS SOMETIMES HAPPENS WITH MAKER GFF FILES. | |
if ($temp[2] eq 'contig') | |
{ | |
print OUTPUT "$line\n"; | |
} | |
elsif ($temp[2] eq 'gene') | |
{ | |
$name = $temp[8]; # eg. ID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3 | |
@temp = split(/ID=/,$name); | |
$name = $temp[1]; # eg. augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3 | |
@temp = split(/\;/,$name); | |
$name = $temp[0]; # eg. augustus_masked-scaff_000010-processed-gene-0.3 | |
# CHECK IF THIS OVERLAPS A GENE OF THE SAME NAME THAT WE WANT TO DELETE: | |
($discard,$errorcode,$errormsg) = &check_whether_to_discard($name,$scaffold,$start,$end,$DISCARD); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
@temp = split(/\t+/,$line); | |
if ($discard == 0) { print OUTPUT "$temp[0]\t$temp[1]\t$temp[2]\t$start\t$temp[4]\t$temp[5]\t$temp[6]\t$temp[7]\t$temp[8]\n";} | |
} | |
elsif ($temp[2] eq 'mRNA') | |
{ | |
$name = $temp[8]; # eg. ID=augustus_masked-scaff_000010-processed-gene-0.5-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.5;N... | |
@temp = split(/ID=/,$name); | |
$name = $temp[1]; # eg. augustus_masked-scaff_000010-processed-gene-0.5-mRNA-1;Parent=augustus_... | |
@temp = split(/\;/,$name); | |
$name = $temp[0]; # eg. augustus_masked-scaff_000010-processed-gene-0.5-mRNA-1 : NAME OF THE mRNA | |
$name = $scaffold."=".$name; | |
if (!($GENE->{$name})) | |
{ | |
$errormsg = "ERROR: write_output_gff: do not know gene name for transcript $name (line $line, mRNA)\n"; | |
$errorcode = 1; # ERRORCODE=1 (TESTED FOR) | |
return($errorcode,$errormsg); | |
} | |
$name = $GENE->{$name}; | |
# CHECK IF THIS OVERLAPS A GENE OF THE SAME NAME THAT WE WANT TO DELETE: | |
($discard,$errorcode,$errormsg) = &check_whether_to_discard($name,$scaffold,$start,$end,$DISCARD); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
@temp = split(/\t+/,$line); | |
if ($discard == 0) { print OUTPUT "$temp[0]\t$temp[1]\t$temp[2]\t$start\t$temp[4]\t$temp[5]\t$temp[6]\t$temp[7]\t$temp[8]\n"; } | |
} | |
elsif ($temp[2] eq 'exon' || $temp[2] eq 'CDS' || $temp[2] eq 'five_prime_UTR' || $temp[2] eq 'three_prime_UTR') | |
{ | |
$name = $temp[8]; # eg. ID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1 | |
@temp = split(/Parent=/,$name); | |
$name = $temp[1]; # eg. augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1 : NAME OF THE mRNA | |
@temp = split(/\;/,$name); | |
$name = $temp[0]; | |
@temp = split(/\,/,$name); # IT SOMETIMES HAPPENS THAT MAKER GIVES THE TRANSCRIPT NAME TWICE | |
$name = $temp[0]; | |
$name = $scaffold."=".$name; | |
if (!($GENE->{$name})) | |
{ | |
$errormsg = "ERROR: write_output_gff: do not know gene name for transcript $name (line $line, exon/CDS/UTR)\n"; | |
$errorcode = 1; # ERRORCODE=1 (TESTED FOR) | |
return($errorcode,$errormsg); | |
} | |
$name = $GENE->{$name}; | |
# CHECK IF THIS OVERLAPS A GENE OF THE SAME NAME THAT WE WANT TO DELETE: | |
($discard,$errorcode,$errormsg) = &check_whether_to_discard($name,$scaffold,$start,$end,$DISCARD); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
@temp = split(/\t+/,$line); | |
if ($discard == 0) { print OUTPUT "$temp[0]\t$temp[1]\t$temp[2]\t$start\t$temp[4]\t$temp[5]\t$temp[6]\t$temp[7]\t$temp[8]\n";} | |
} | |
elsif ($temp[2] eq 'match' || $temp[2] eq 'match_part' || $temp[2] eq 'protein_match' || $temp[2] eq 'expressed_sequence_match') | |
# NOTE: WE CAN HAVE GFF LINES WITH: | |
# blastx IN COLUMN 2 AND protein_match/match_part IN COLUMN 3 -> ELEANOR SAYS THESE ARE DIRECT BLAST HITS. | |
# protein2genome IN COLUMN 2 AND match/match_part IN COLUMN 3 -> ELEANOR SAYS THESE ARE FROM EXONERATE ALIGNING | |
# THE PROTEIN TO THE GENOME. | |
{ | |
print OUTPUT "$line\n"; | |
} | |
else | |
{ | |
$errormsg = "ERROR: write_output_gff: feature $temp[2] in gff line $line\n"; | |
$errorcode = 14; # ERRORCODE=14 (TESTED FOR) | |
return($errorcode,$errormsg); | |
} | |
} | |
else | |
{ | |
$errormsg = "ERROR: write_output_gff: line does not have 9 columns: $line\n"; | |
$errorcode = 5; # ERRORCODE=5 (TESTED FOR) | |
return($errorcode,$errormsg); | |
} | |
} | |
else | |
{ | |
print OUTPUT "$line\n"; | |
} | |
} | |
close(INPUT_GFF); | |
close(OUTPUT); | |
return($errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &read_genes_for_transcripts | |
sub test_read_genes_for_transcripts | |
{ | |
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES IN | |
my $GENE; # HASH TABLE OF GENE NAMES FOR TRANSCRIPTS | |
my $input_gff; # INPUT GFF FILE | |
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE IS NO ERROR | |
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE IS NO ERROR | |
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(INPUT_GFF,">$input_gff") || die "ERROR: test_read_genes_for_transcripts: cannot open $input_gff\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n"; | |
print INPUT_GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n"; | |
print INPUT_GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n"; | |
print INPUT_GFF "##FASTA\n"; | |
print INPUT_GFF ">seq1\n"; | |
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n"; | |
close(INPUT_GFF); | |
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff); | |
if ($errorcode != 0) { print STDERR "ERROR: test_read_genes_for_transcripst: failed test1\n"; exit;} | |
if ($GENE->{"scaff_000010=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1"} ne "augustus_masked-scaff_000010-processed-gene-0.3") { print STDERR "ERROR: test_read_genes_for_transcripts: failed test1b\n"; exit;} | |
if ($GENE->{"scaff_000010=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1"} ne "augustus_masked-scaff_000010-processed-gene-0.4") { print STDERR "ERROR: test_read_genes_for_transcripts: failed test1c\n"; exit;} | |
system "rm -f $input_gff"; | |
# TEST ERRORCODE=9 (mRNA APPEARS TWICE IN GFF): | |
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(INPUT_GFF,">$input_gff") || die "ERROR: test_read_genes_for_transcripts: cannot open $input_gff\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n"; | |
print INPUT_GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n"; | |
print INPUT_GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t29497\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n"; | |
print INPUT_GFF "##FASTA\n"; | |
print INPUT_GFF ">seq1\n"; | |
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n"; | |
close(INPUT_GFF); | |
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff); | |
if ($errorcode != 9) { print STDERR "ERROR: test_read_genes_for_transcripts: failed test2\n"; exit;} | |
system "rm -f $input_gff"; | |
# TEST ERRORCODE=3 (LINE DOES NOT HAVE 9 COLUMNS): | |
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(INPUT_GFF,">$input_gff") || die "ERROR: test_read_genes_for_transcripts: cannot open $input_gff\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tgene\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.3;Name=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1;_AED=0.00;_eAED=0.00;_QI=0|-1|0|1|-1|1|1|0|392\n"; | |
print INPUT_GFF "scaff_000010\tmaker\texon\t20476\t21654\t.\t+\t.\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:exon:0;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tCDS\t20476\t21654\t.\t+\t0\tID=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.3-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tgene\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tmRNA\t29000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;Parent=augustus_masked-scaff_000010-processed-gene-0.4;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1;_AED=0.01;_eAED=0.01;_QI=0|-1|0|1|-1|1|1|0|165\n"; | |
print INPUT_GFF "scaff_000010\tmaker\texon\t9000\t29497\t.\t-\t.\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:exon:25;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmaker\tCDS\t29000\t.\t-\t0\tID=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1:cds;Parent=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch\t20476\t21654\t.\t+\t.\tID=scaff_000010:hit:4104:3.5.0.0;Name=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1\n"; | |
print INPUT_GFF "scaff_000010\tmodel_gff:maker\tmatch_part\t20476\t21654\t.\t+\t.\tID=scaff_000010:hsp:7664:3.5.0.0;Parent=scaff_000010:hit:4104:3.5.0.0;Target=augustus_masked-scaff_000010-processed-gene-0.4-mRNA-1 1 1179 +;Gap=M1179\n"; | |
print INPUT_GFF "##FASTA\n"; | |
print INPUT_GFF ">seq1\n"; | |
print INPUT_GFF "ACAACACAGATAGATATATAGAGCA\n"; | |
close(INPUT_GFF); | |
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff); | |
if ($errorcode != 3) { print STDERR "ERROR: test_read_genes_for_transcripts: failed test3\n"; exit;} | |
system "rm -f $input_gff"; | |
} | |
#------------------------------------------------------------------# | |
# READ IN THE GENE NAME FOR EACH TRANSCRIPT IN THE INPUT GFF FILE: | |
sub read_genes_for_transcripts | |
{ | |
my $input_gff = $_[0]; # INPUT GFF FILE | |
my $errorcode = 0; # THIS IS RETURNED AS 0 IF NO ERROR OCCURRED. | |
my $errormsg = "none";# THIS IS RETURNED AS 'none' IF NO ERROR OCCURRED. | |
my %GENE = (); # HASH TABLE OF GENE NAMES FOR TRANSCRIPT NAMES. | |
my $line; # | |
my @temp; # | |
my $end_of_gff = 0; # SAYS WHETHER THE END OF THE GFF FILE HAS BEEN REACHED | |
my $name; # NAME OF mRNA | |
my $gene; # NAME OF GENE | |
my $scaffold; # NAME OF SCAFFOLD | |
open(INPUT_GFF,"$input_gff") || die "ERROR: read_genes_for_transcripts: cannot open $input_gff\n"; | |
while(<INPUT_GFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
if ($line eq '##FASTA') { $end_of_gff = 1;} | |
if (substr($line,0,1) ne '#' && $end_of_gff == 0) | |
{ | |
@temp = split(/\t+/,$line); | |
if ($#temp == 8) | |
{ | |
if ($temp[2] eq 'mRNA') | |
{ | |
# eg. scaff_000010 maker mRNA 88793 89828 . + . ID=maker-scaff_000010-augustus-gene-0.21-mRNA-1;Parent=maker-scaff_000010-augustus-gene-0.21;... | |
$scaffold = $temp[0]; | |
$name = $temp[8]; | |
$gene = $temp[8]; | |
@temp = split(/ID=/,$name); | |
$name = $temp[1]; # eg. maker-scaff_000010-augustus-gene-0.21-mRNA-1;... | |
@temp = split(/\;/,$name); | |
$name = $temp[0]; # eg. maker-scaff_000010-augustus-gene-0.21-mRNA-1 | |
# GET THE NAME OF THE GENE: | |
@temp = split(/Parent=/,$gene); | |
$gene = $temp[1]; # eg. maker-scaff_000010-augustus-gene-0.21;... | |
@temp = split(/\;/,$gene); | |
$gene = $temp[0]; # eg. maker-scaff_000010-augustus-gene-0.21 | |
$name = $scaffold."=".$name; | |
# RECORD THE GENE NAME FOR THE mRNA: | |
if ($GENE{$name}) | |
{ | |
# IT IS POSSIBLE FOR A MAKER GFF TO HAVE >=2 GENES OF THE SAME NAME ON THE SAME SCAFFOLD: | |
if ($GENE{$name} ne $gene) | |
{ | |
$errormsg= "ERROR: read_genes_for_transcripts: already have gene name $GENE{$name} for $name (not $gene)\n"; | |
$errorcode= 9; # ERRORCODE=9 (TESTED FOR) | |
return(\%GENE,$errorcode,$errormsg); | |
} | |
} | |
$GENE{$name} = $gene; | |
} | |
} | |
else | |
{ | |
$errormsg = "ERROR: read_genes_for_transcripts: line does not have 9 columns: $line\n"; | |
$errorcode = 3; # ERRORCODE=3 (TESTED FOR) | |
return(\%GENE,$errorcode,$errormsg); | |
} | |
} | |
} | |
close(INPUT_GFF); | |
return(\%GENE,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE TO MAKE A FILE NAME FOR A TEMPORARY FILE: | |
sub make_filename | |
{ | |
my $outputdir = $_[0]; # OUTPUT DIRECTORY TO WRITE TEMPORARY FILE NAME TO | |
my $found_name = 0; # SAYS WHETHER WE HAVE FOUND A FILE NAME YET | |
my $filename = "none";# NEW FILE NAME TO USE | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
my $poss_filename; # POSSIBLE FILE NAME TO USE | |
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAME | |
while($found_name == 0) | |
{ | |
$random_number = rand(); | |
$poss_filename = $outputdir."/tmp".$random_number; | |
if (!(-e $poss_filename)) | |
{ | |
$filename = $poss_filename; | |
$found_name = 1; | |
} | |
} | |
if ($found_name == 0 || $filename eq 'none') | |
{ | |
$errormsg = "ERROR: make_filename: found_name $found_name filename $filename\n"; | |
$errorcode = 6; # ERRORCODE=6 | |
return($filename,$errorcode,$errormsg); | |
}# | |
return($filename,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &print_error | |
sub test_print_error | |
{ | |
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE WAS NO ERROR | |
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE WAS NO ERROR | |
($errormsg,$errorcode) = &print_error(45,45,1); | |
if ($errorcode != 12) { print STDERR "ERROR: test_print_error: failed test1\n"; exit;} | |
($errormsg,$errorcode) = &print_error('My error message','My error message',1); | |
if ($errorcode != 11) { print STDERR "ERROR: test_print_error: failed test2\n"; exit;} | |
($errormsg,$errorcode) = &print_error('none',45,1); | |
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test3\n"; exit;} | |
($errormsg,$errorcode) = &print_error('My error message', 0, 1); | |
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test4\n"; exit;} | |
} | |
#------------------------------------------------------------------# | |
# PRINT OUT AN ERROR MESSAGE AND EXIT. | |
sub print_error | |
{ | |
my $errormsg = $_[0]; # THIS SHOULD BE NOT 'none' IF AN ERROR OCCURRED. | |
my $errorcode = $_[1]; # THIS SHOULD NOT BE 0 IF AN ERROR OCCURRED. | |
my $called_from_test = $_[2]; # SAYS WHETHER THIS WAS CALLED FROM test_print_error OR NOT | |
if ($errorcode =~ /[A-Z]/ || $errorcode =~ /[a-z]/) | |
{ | |
if ($called_from_test == 1) | |
{ | |
$errorcode = 11; $errormsg = "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; # ERRORCODE=11 | |
return($errormsg,$errorcode); | |
} | |
else | |
{ | |
print STDERR "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; | |
exit; | |
} | |
} | |
if (!($errormsg =~ /[A-Z]/ || $errormsg =~ /[a-z]/)) | |
{ | |
if ($called_from_test == 1) | |
{ | |
$errorcode = 12; $errormsg = "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; # ERRORCODE=12 | |
return($errormsg,$errorcode); | |
} | |
else | |
{ | |
print STDERR "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; | |
exit; | |
} | |
} | |
if ($errormsg eq 'none' || $errorcode == 0) | |
{ | |
if ($called_from_test == 1) | |
{ | |
$errorcode = 13; $errormsg = "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; # ERRORCODE=13 | |
return($errormsg,$errorcode); | |
} | |
else | |
{ | |
print STDERR "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; | |
exit; | |
} | |
} | |
else | |
{ | |
print STDERR "$errormsg"; | |
exit; | |
} | |
return($errormsg,$errorcode); | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment