Last active
December 10, 2015 19:39
-
-
Save avrilcoghlan/4483186 to your computer and use it in GitHub Desktop.
Perl script to convert a gff file from the Chado database to a gtf format file for the RNA-SeqQC software
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/perl | |
=head1 NAME | |
convert_chado_gff_to_gtf.pl | |
=head1 SYNOPSIS | |
convert_chado_gff_to_gtf.pl input_gff output_gtf outputdir | |
where input_gff is the input embl file, | |
output_gtf is the output fasta file, | |
outputdir is the output directory for writing output files. | |
=head1 DESCRIPTION | |
This script takes an input gff file from chado (<input_gff>), and converts it to | |
gtf format, and writes the output file (<output_fasta>) in directory | |
<outputdir>. | |
Note that this script adds ';' at the end of the scaffold names in the output | |
gtf file. | |
=head1 VERSION | |
Perl script last edited 4-Jan-2013. | |
=head1 CONTACT | |
[email protected] (Avril Coghlan) | |
=cut | |
# | |
# Perl script convert_chado_gff_to_gtf.pl | |
# Written by Avril Coghlan ([email protected]) | |
# 4-Jan-13. | |
# Last edited 4-Jan-2013. | |
# SCRIPT SYNOPSIS: converts an input gff file from chado into gtf format | |
# | |
#------------------------------------------------------------------# | |
# CHECK IF THERE ARE THE CORRECT NUMBER OF COMMAND-LINE ARGUMENTS: | |
use strict; | |
use warnings; | |
my $num_args = $#ARGV + 1; | |
if ($num_args != 3) | |
{ | |
print "Usage of convert_chado_gff_to_gtf_fasta.pl\n\n"; | |
print "perl convert_chado_gff_to_gtf.pl <input_gff> <output_gtf> <outputdir>\n"; | |
print "where <input_gff> is the input embl file,\n"; | |
print " <output_gtf> is the output fasta file,\n"; | |
print " <outputdir> is the output directory for writing output files\n"; | |
print "For example, >perl convert_chado_gff_to_gtf.pl Pk_strainH_chr01.gff Pk_strainH_chr01.gtf\n"; | |
print "/lustre/scratch108/parasites/alc/RNA-SeQC/Pknowlesi\n"; | |
exit; | |
} | |
# FIND THE PATH TO THE INPUT GFF FILE: | |
my $input_gff = $ARGV[0]; | |
# FIND THE PATH TO THE OUTPUT GTF FILE: | |
my $output_gtf = $ARGV[1]; | |
# FIND THE DIRECTORY TO USE FOR OUTPUT FILES: | |
my $outputdir = $ARGV[2]; | |
#------------------------------------------------------------------# | |
# TEST SUBROUTINES: | |
my $PRINT_TEST_DATA = 0; # SAYS WHETHER TO PRINT DATA USED DURING TESTING. | |
&test_read_RNAs($outputdir); | |
&test_read_genes_for_transcripts($outputdir); | |
# xxxyyyzzz avril &test_convert_gff_to_gtf($outputdir); | |
&test_print_error; | |
#------------------------------------------------------------------# | |
# RUN THE MAIN PART OF THE CODE: | |
&run_main_program($outputdir,$input_gff,$output_gtf); | |
print STDERR "FINISHED.\n"; | |
#------------------------------------------------------------------# | |
# RUN THE MAIN PART OF THE CODE: | |
sub run_main_program | |
{ | |
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES IN. | |
my $input_gff = $_[1]; # THE INPUT GFF FILE | |
my $output_gtf = $_[2]; # THE OUTPUT GTF FILE | |
my $errorcode; # RETURNED AS 0 IF THERE IS NO ERROR. | |
my $errormsg; # RETURNED AS 'none' IF THERE IS NO ERROR. | |
my $GENE; # HASH TABLE TO STORE THE GENE NAMES FOR TRANSCRIPTS | |
my $RNA; # HASH TABLE TO STORE NAMES OF RNA GENES | |
my $RRNA; # HASH TABLE TO STORE NAMES OF rRNA GENES | |
# READ IN THE INPUT GFF FILE, TO FIND THE GENE NAMES FOR TRANSCRIPTS: | |
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# READ IN THE INPUT GFF FILE, TO FIND RNA GENES: | |
($RNA,$RRNA,$errorcode,$errormsg) = &read_RNAs($input_gff); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# READ IN THE INPUT GFF FILE, AND MAKE THE OUTPUT GTF FILE: | |
($errorcode,$errormsg) = &convert_gff_to_gtf($outputdir,$input_gff,$output_gtf,$GENE,$RNA,$RRNA); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
} | |
#------------------------------------------------------------------# | |
# TEST &read_RNAs | |
sub test_read_RNAs | |
{ | |
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES INTO | |
my $input_gff; # INPUT GFF FILE | |
my $errorcode; # RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR | |
my $errormsg; # RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR | |
my $RRNA; # HASH TABLE TO STORE rRNA GENES IN | |
my $RNA; # HASH TABLE TO STORE RNA GENES IN | |
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(INPUT_GFF,">$input_gff") || die "ERROR: test_read_RNAs: cannot open $input_gff\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado gene 108 1421 . + . ID=PKH_010010;isObsolete=false;isFminPartial;feature_id=222341;timelastmodified=10.12.2012+02:43:44+GMT\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado CDS 108 758 . + 0 ID=PKH_010010.1:exon:1;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado CDS 978 1421 . + 0 ID=PKH_010010.1:exon:2;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado mRNA 108 1421 . + . ID=PKH_010010.1;Parent=PKH_010010;isObsolete=false;isFminPartial;feature_id=222342;timelastmodified=10.12.2012+02:43:44+GMT;previous_systematic_id=PK00_0600c\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado gene 795283 795406 . + . ID=PKH_011712;isObsolete=false;feature_id=222622;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado CDS 795283 795406 . + 0 ID=PKH_011712.1:exon:1;Parent=PKH_011712:rRNA;isObsolete=false;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado rRNA 795283 795406 . + . ID=PKH_011712:rRNA;Parent=PKH_011712;isObsolete=false;feature_id=222623;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr08 chado gene 1463759 1463875 . + . ID=PKH_083075;isObsolete=false;feature_id=25886604;timelastmodified=17.07.2012+04:19:05+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr08 chado CDS 1463759 1463875 . + 0 ID=PKH_083075.1:exon:1;Parent=PKH_083075.1;isObsolete=false;timelastmodified=17.07.2012+04:19:05+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr08 chado snoRNA 1463759 1463875 . + . ID=PKH_083075.1;Parent=PKH_083075;Dbxref=Rfam:RF00532;timelastmodified=17.07.2012+04:21:38+BST;feature_id=25886605;isObsolete=false;history=term%3Dstructural%3Bdate%3D20120717%3Bqualifier%3Dmethod_new%3DncRNA+added+based+on+homology%3BcuratorName%3Ducb\@sanger.ac.uk;product=term%3Dsmall+nucleolar+RNA+Me18S-Um1356%3B\n"; | |
close(INPUT_GFF); | |
($RNA,$RRNA,$errorcode,$errormsg) = &read_RNAs($input_gff); | |
if ($errorcode != 0) { print STDERR "ERROR: test_read_RNAs: failed test1a\n"; exit;} | |
if (!($RRNA->{"PKH_011712:rRNA"}) || !($RRNA->{"PKH_011712"})) { print STDERR "ERROR: test_read_RNAs: failed test1b\n"; exit;} | |
if (!($RNA->{"PKH_083075.1"}) || !($RNA->{"PKH_083075"})) { print STDERR "ERROR: test_read_RNAs: failed test1c\n"; exit;} | |
if ($RNA->{"PKH_010010.1"} || $RNA->{"PKH_010010"}) { print STDERR "ERROR: test_read_RNAs: failed test1d\n"; exit;} | |
if ($RRNA->{"PKH_010010.1"} || $RRNA->{"PKH_010010"}) { print STDERR "ERROR: test_read_RNAs: failed test1e\n"; exit;} | |
if ($RRNA->{"PKH_083075.1"} || $RRNA->{"PKH_083075"}) { print STDERR "ERROR: test_read_RNAs: failed test1f\n"; exit;} | |
system "rm -f $input_gff"; | |
# TEST FOR ERRORCODE=8 (THE GFF FILE DOES NOT HAVE 8 COLUMNS): | |
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(INPUT_GFF,">$input_gff") || die "ERROR: test_read_RNAs: cannot open $input_gff\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado gene 108 . + . ID=PKH_010010;isObsolete=false;isFminPartial;feature_id=222341;timelastmodified=10.12.2012+02:43:44+GMT\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado CDS 108 758 . + 0 ID=PKH_010010.1:exon:1;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado CDS 978 1421 . + 0 ID=PKH_010010.1:exon:2;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado mRNA 108 1421 . + . ID=PKH_010010.1;Parent=PKH_010010;isObsolete=false;isFminPartial;feature_id=222342;timelastmodified=10.12.2012+02:43:44+GMT;previous_systematic_id=PK00_0600c\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado gene 795283 795406 . + . ID=PKH_011712;isObsolete=false;feature_id=222622;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado CDS 795283 795406 . + 0 ID=PKH_011712.1:exon:1;Parent=PKH_011712:rRNA;isObsolete=false;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado rRNA 795283 795406 . + . ID=PKH_011712:rRNA;Parent=PKH_011712;isObsolete=false;feature_id=222623;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr08 chado gene 1463759 1463875 . + . ID=PKH_083075;isObsolete=false;feature_id=25886604;timelastmodified=17.07.2012+04:19:05+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr08 chado CDS 1463759 1463875 . + 0 ID=PKH_083075.1:exon:1;Parent=PKH_083075.1;isObsolete=false;timelastmodified=17.07.2012+04:19:05+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr08 chado snoRNA 1463759 1463875 . + . ID=PKH_083075.1;Parent=PKH_083075;Dbxref=Rfam:RF00532;timelastmodified=17.07.2012+04:21:38+BST;feature_id=25886605;isObsolete=false;history=term%3Dstructural%3Bdate%3D20120717%3Bqualifier%3Dmethod_new%3DncRNA+added+based+on+homology%3BcuratorName%3Ducb\@sanger.ac.uk;product=term%3Dsmall+nucleolar+RNA+Me18S-Um1356%3B\n"; | |
close(INPUT_GFF); | |
($RNA,$RRNA,$errorcode,$errormsg) = &read_RNAs($input_gff); | |
if ($errorcode != 8) { print STDERR "ERROR: test_read_RNAs: failed test2\n"; exit;} | |
system "rm -f $input_gff"; | |
# TEST FOR ERRORCODE=9 (UNKNOWN FEATURE TYPE IN GFF FILE): | |
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(INPUT_GFF,">$input_gff") || die "ERROR: test_read_RNAs: cannot open $input_gff\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado gene 108 1421 . + . ID=PKH_010010;isObsolete=false;isFminPartial;feature_id=222341;timelastmodified=10.12.2012+02:43:44+GMT\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado CDS 108 758 . + 0 ID=PKH_010010.1:exon:1;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado CDS 978 1421 . + 0 ID=PKH_010010.1:exon:2;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado xRNA 108 1421 . + . ID=PKH_010010.1;Parent=PKH_010010;isObsolete=false;isFminPartial;feature_id=222342;timelastmodified=10.12.2012+02:43:44+GMT;previous_systematic_id=PK00_0600c\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado gene 795283 795406 . + . ID=PKH_011712;isObsolete=false;feature_id=222622;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado CDS 795283 795406 . + 0 ID=PKH_011712.1:exon:1;Parent=PKH_011712:rRNA;isObsolete=false;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado rRNA 795283 795406 . + . ID=PKH_011712:rRNA;Parent=PKH_011712;isObsolete=false;feature_id=222623;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr08 chado gene 1463759 1463875 . + . ID=PKH_083075;isObsolete=false;feature_id=25886604;timelastmodified=17.07.2012+04:19:05+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr08 chado CDS 1463759 1463875 . + 0 ID=PKH_083075.1:exon:1;Parent=PKH_083075.1;isObsolete=false;timelastmodified=17.07.2012+04:19:05+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr08 chado snoRNA 1463759 1463875 . + . ID=PKH_083075.1;Parent=PKH_083075;Dbxref=Rfam:RF00532;timelastmodified=17.07.2012+04:21:38+BST;feature_id=25886605;isObsolete=false;history=term%3Dstructural%3Bdate%3D20120717%3Bqualifier%3Dmethod_new%3DncRNA+added+based+on+homology%3BcuratorName%3Ducb\@sanger.ac.uk;product=term%3Dsmall+nucleolar+RNA+Me18S-Um1356%3B\n"; | |
close(INPUT_GFF); | |
($RNA,$RRNA,$errorcode,$errormsg) = &read_RNAs($input_gff); | |
if ($errorcode != 9) { print STDERR "ERROR: test_read_RNAs: failed test3\n"; exit;} | |
system "rm -f $input_gff"; | |
} | |
#------------------------------------------------------------------# | |
# READ IN THE INPUT GFF FILE, TO FIND RNA GENES: | |
sub read_RNAs | |
{ | |
my $input_gff = $_[0]; # INPUT GFF FILE | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
my $line; # | |
my @temp; # | |
my $start_of_seq = 0; # SAYS WHETHER WE HAVE FOUND THE START OF THE SEQUENCE | |
my $feature; # FEATURE TYPE | |
my $name; # FEATURE NAME | |
my %RNA = (); # HASH TABLE TO STORE THE NAMES OF RNA GENES | |
my %RRNA = (); # HASH TABLE TO STORE THE NAMES OF rRNA GENES | |
my $gene; # GENE NAME | |
my $transcript; # TRANSCRIPT NAME | |
# READ IN THE INPUT GFF FILE: | |
open(INPUT_GFF,"$input_gff") || die "ERROR: read_RNAs: cannot open input_gff $input_gff\n"; | |
while(<INPUT_GFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\t+/,$line); | |
if (substr($line,0,1) ne '#' && $start_of_seq == 0) # IGNORE COMMENT LINES | |
{ | |
if ($#temp != 8) # THE GFF FILE SHOULD HAVE 8 COLUMNS | |
{ | |
$errormsg = "ERROR: read_RNAs: input_gff has line that does not have 8 columns: $line\n"; | |
$errorcode = 8; # ERRORCODE=8 (TESTED FOR) | |
return(\%RNA,\%RRNA,$errorcode,$errormsg); | |
} | |
$feature = $temp[2]; # eg. gene, CDS, mRNA | |
$name = $temp[8]; # FEATURE NAME | |
# CHECK THAT THE FEATURES ARE OF THE TYPE THAT WE EXPECT: | |
if ($feature ne 'gene' && $feature ne 'CDS' && $feature ne 'mRNA' && $feature ne 'polypeptide' && $feature ne 'gap' && | |
$feature ne 'tRNA' && $feature ne 'polypeptide_motif' && $feature ne 'pseudogenic_transcript' && $feature ne 'pseudogene' && | |
$feature ne 'pseudogenic_exon' && $feature ne 'rRNA' && $feature ne 'repeat_region' && $feature ne 'snoRNA' && $feature ne 'snRNA') | |
{ | |
$errormsg = "ERROR: read_RNAs: input_gff has feature $feature on line $line\n"; | |
$errorcode = 9; # ERRORCODE=9 (TESTED FOR) | |
return(\%RNA,\%RRNA,$errorcode,$errormsg); | |
} | |
if ($feature eq 'rRNA') # A rRNA GENE | |
{ | |
# eg. ID=PKH_010104.1;Parent=PKH_010104; | |
# GET THE TRANSCRIPT NAME: | |
@temp = split(/ID=/,$name); | |
$transcript = $temp[1]; | |
@temp = split(/\;/,$transcript); | |
$transcript = $temp[0]; | |
# GET THE GENE NAME: | |
@temp = split(/Parent=/,$name); | |
$gene = $temp[1]; | |
@temp = split(/\;/,$gene); | |
$gene = $temp[0]; | |
$RRNA{$gene} = 1; # RECORD THAT THIS IS A rRNA GENE | |
$RRNA{$transcript} = 1; # RECORD THAT THIS IS A rRNA TRANSCRIPT | |
} | |
elsif ($feature eq 'tRNA' || $feature eq 'snoRNA' || $feature eq 'snRNA') # A tRNA OR snoRNA OR OTHER RNA GENE | |
{ | |
# eg. ID=PKH_010104.1;Parent=PKH_010104; | |
# GET THE TRANSCRIPT NAME: | |
@temp = split(/ID=/,$name); | |
$transcript = $temp[1]; | |
@temp = split(/\;/,$transcript); | |
$transcript = $temp[0]; | |
# GET THE GENE NAME: | |
@temp = split(/Parent=/,$name); | |
$gene = $temp[1]; | |
@temp = split(/\;/,$gene); | |
$gene = $temp[0]; | |
$RNA{$gene} = 1; # RECORD THAT THIS IS A RNA GENE | |
$RNA{$transcript} = 1; # RECORD THAT THIS IS A RNA TRANSCRIPT | |
} | |
} | |
elsif (substr($line,0,7) eq '##FASTA') | |
{ | |
$start_of_seq = 1; # THERE IS SEQUENCE AFTER THIS IN THE GTF FILE | |
} | |
} | |
close(INPUT_GFF); | |
return(\%RNA,\%RRNA,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &read_genes_for_transcripts | |
sub test_read_genes_for_transcripts | |
{ | |
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES INTO | |
my $errorcode; # RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR | |
my $errormsg; # RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR | |
my $input_gff; # INPUT GFF FILE | |
my $GENE; # HASH TABLE TO STORE THE GENE NAME FOR TRANSCRIPTS | |
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(INPUT_GFF,">$input_gff") || die "ERROR: test_read_genes_for_transcripts: cannot open $input_gff\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado gene 108 1421 . + . ID=PKH_010010;isObsolete=false;isFminPartial;feature_id=222341;timelastmodified=10.12.2012+02:43:44+GMT\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado CDS 108 758 . + 0 ID=PKH_010010.1:exon:1;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado CDS 978 1421 . + 0 ID=PKH_010010.1:exon:2;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado mRNA 108 1421 . + . ID=PKH_010010.1;Parent=PKH_010010;isObsolete=false;isFminPartial;feature_id=222342;timelastmodified=10.12.2012+02:43:44+GMT;previous_systematic_id=PK00_0600c\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado gene 795283 795406 . + . ID=PKH_011712;isObsolete=false;feature_id=222622;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado CDS 795283 795406 . + 0 ID=PKH_011712.1:exon:1;Parent=PKH_011712:rRNA;isObsolete=false;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado rRNA 795283 795406 . + . ID=PKH_011712:rRNA;Parent=PKH_011712;isObsolete=false;feature_id=222623;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr08 chado gene 1463759 1463875 . + . ID=PKH_083075;isObsolete=false;feature_id=25886604;timelastmodified=17.07.2012+04:19:05+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr08 chado CDS 1463759 1463875 . + 0 ID=PKH_083075.1:exon:1;Parent=PKH_083075.1;isObsolete=false;timelastmodified=17.07.2012+04:19:05+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr08 chado snoRNA 1463759 1463875 . + . ID=PKH_083075.1;Parent=PKH_083075;Dbxref=Rfam:RF00532;timelastmodified=17.07.2012+04:21:38+BST;feature_id=25886605;isObsolete=false;history=term%3Dstructural%3Bdate%3D20120717%3Bqualifier%3Dmethod_new%3DncRNA+added+based+on+homology%3BcuratorName%3Ducb\@sanger.ac.uk;product=term%3Dsmall+nucleolar+RNA+Me18S-Um1356%3B\n"; | |
close(INPUT_GFF); | |
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff); | |
if ($errorcode != 0) { print STDERR "ERROR: test_read_genes_for_transcripts: failed test1a\n"; exit;} | |
if ($GENE->{"PKH_011712:rRNA"} ne 'PKH_011712') { print STDERR "ERROR: test_read_genes_for_transcripts: failed test1b\n"; exit;} | |
if ($GENE->{"PKH_010010.1"} ne "PKH_010010") { print STDERR "ERROR: test_read_genes_for_transcripts: failed test1c\n"; exit;} | |
system "rm -f $input_gff"; | |
# TEST FOR ERRORCODE=1 (GFF FILE DOES NOT HAVE 8 COLUMNS): | |
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(INPUT_GFF,">$input_gff") || die "ERROR: test_read_genes_for_transcripts: cannot open $input_gff\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado gene 108 1421 . + . ID=PKH_010010;isObsolete=false;isFminPartial;feature_id=222341;timelastmodified=10.12.2012+02:43:44+GMT\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado CDS 108 758 . + 0 ID=PKH_010010.1:exon:1;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado CDS 978 1421 . + 0 ID=PKH_010010.1:exon:2;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado mRNA 1421 . + . ID=PKH_010010.1;Parent=PKH_010010;isObsolete=false;isFminPartial;feature_id=222342;timelastmodified=10.12.2012+02:43:44+GMT;previous_systematic_id=PK00_0600c\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado gene 795283 795406 . + . ID=PKH_011712;isObsolete=false;feature_id=222622;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado CDS 795283 795406 . + 0 ID=PKH_011712.1:exon:1;Parent=PKH_011712:rRNA;isObsolete=false;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado rRNA 795283 795406 . + . ID=PKH_011712:rRNA;Parent=PKH_011712;isObsolete=false;feature_id=222623;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr08 chado gene 1463759 1463875 . + . ID=PKH_083075;isObsolete=false;feature_id=25886604;timelastmodified=17.07.2012+04:19:05+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr08 chado CDS 1463759 1463875 . + 0 ID=PKH_083075.1:exon:1;Parent=PKH_083075.1;isObsolete=false;timelastmodified=17.07.2012+04:19:05+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr08 chado snoRNA 1463759 1463875 . + . ID=PKH_083075.1;Parent=PKH_083075;Dbxref=Rfam:RF00532;timelastmodified=17.07.2012+04:21:38+BST;feature_id=25886605;isObsolete=false;history=term%3Dstructural%3Bdate%3D20120717%3Bqualifier%3Dmethod_new%3DncRNA+added+based+on+homology%3BcuratorName%3Ducb\@sanger.ac.uk;product=term%3Dsmall+nucleolar+RNA+Me18S-Um1356%3B\n"; | |
close(INPUT_GFF); | |
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff); | |
if ($errorcode != 1) { print STDERR "ERROR: test_read_genes_for_transcripts: failed test2\n"; exit;} | |
system "rm -f $input_gff"; | |
# TEST FOR ERRORCODE=4 (UNKNOWN FEATURE TYPE IN GFF FILE): | |
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(INPUT_GFF,">$input_gff") || die "ERROR: test_read_genes_for_transcripts: cannot open $input_gff\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado gene 108 1421 . + . ID=PKH_010010;isObsolete=false;isFminPartial;feature_id=222341;timelastmodified=10.12.2012+02:43:44+GMT\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado CDS 108 758 . + 0 ID=PKH_010010.1:exon:1;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado CDS 978 1421 . + 0 ID=PKH_010010.1:exon:2;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado mRNA 108 1421 . + . ID=PKH_010010.1;Parent=PKH_010010;isObsolete=false;isFminPartial;feature_id=222342;timelastmodified=10.12.2012+02:43:44+GMT;previous_systematic_id=PK00_0600c\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado gene 795283 795406 . + . ID=PKH_011712;isObsolete=false;feature_id=222622;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado CDS 795283 795406 . + 0 ID=PKH_011712.1:exon:1;Parent=PKH_011712:rRNA;isObsolete=false;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado xRNA 795283 795406 . + . ID=PKH_011712:rRNA;Parent=PKH_011712;isObsolete=false;feature_id=222623;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr08 chado gene 1463759 1463875 . + . ID=PKH_083075;isObsolete=false;feature_id=25886604;timelastmodified=17.07.2012+04:19:05+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr08 chado CDS 1463759 1463875 . + 0 ID=PKH_083075.1:exon:1;Parent=PKH_083075.1;isObsolete=false;timelastmodified=17.07.2012+04:19:05+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr08 chado snoRNA 1463759 1463875 . + . ID=PKH_083075.1;Parent=PKH_083075;Dbxref=Rfam:RF00532;timelastmodified=17.07.2012+04:21:38+BST;feature_id=25886605;isObsolete=false;history=term%3Dstructural%3Bdate%3D20120717%3Bqualifier%3Dmethod_new%3DncRNA+added+based+on+homology%3BcuratorName%3Ducb\@sanger.ac.uk;product=term%3Dsmall+nucleolar+RNA+Me18S-Um1356%3B\n"; | |
close(INPUT_GFF); | |
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff); | |
if ($errorcode != 4) { print STDERR "ERROR: test_read_genes_for_transcripts: failed test3\n"; exit;} | |
system "rm -f $input_gff"; | |
# TEST FOR ERRORCODE=5 (TWO DIFFERENT TRANSCRIPT NAMES APPEAR FOR A GENE): | |
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(INPUT_GFF,">$input_gff") || die "ERROR: test_read_genes_for_transcripts: cannot open $input_gff\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado gene 108 1421 . + . ID=PKH_010010;isObsolete=false;isFminPartial;feature_id=222341;timelastmodified=10.12.2012+02:43:44+GMT\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado CDS 108 758 . + 0 ID=PKH_010010.1:exon:1;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado CDS 978 1421 . + 0 ID=PKH_010010.1:exon:2;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado mRNA 108 1421 . + . ID=PKH_010010.1;Parent=PKH_010010;isObsolete=false;isFminPartial;feature_id=222342;timelastmodified=10.12.2012+02:43:44+GMT;previous_systematic_id=PK00_0600c\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado mRNA 108 1421 . + . ID=PKH_010010.1;Parent=PKH_010011;isObsolete=false;isFminPartial;feature_id=222342;timelastmodified=10.12.2012+02:43:44+GMT;previous_systematic_id=PK00_0600c\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado gene 795283 795406 . + . ID=PKH_011712;isObsolete=false;feature_id=222622;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado CDS 795283 795406 . + 0 ID=PKH_011712.1:exon:1;Parent=PKH_011712:rRNA;isObsolete=false;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado rRNA 795283 795406 . + . ID=PKH_011712:rRNA;Parent=PKH_011712;isObsolete=false;feature_id=222623;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr08 chado gene 1463759 1463875 . + . ID=PKH_083075;isObsolete=false;feature_id=25886604;timelastmodified=17.07.2012+04:19:05+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr08 chado CDS 1463759 1463875 . + 0 ID=PKH_083075.1:exon:1;Parent=PKH_083075.1;isObsolete=false;timelastmodified=17.07.2012+04:19:05+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr08 chado snoRNA 1463759 1463875 . + . ID=PKH_083075.1;Parent=PKH_083075;Dbxref=Rfam:RF00532;timelastmodified=17.07.2012+04:21:38+BST;feature_id=25886605;isObsolete=false;history=term%3Dstructural%3Bdate%3D20120717%3Bqualifier%3Dmethod_new%3DncRNA+added+based+on+homology%3BcuratorName%3Ducb\@sanger.ac.uk;product=term%3Dsmall+nucleolar+RNA+Me18S-Um1356%3B\n"; | |
close(INPUT_GFF); | |
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff); | |
if ($errorcode != 5) { print STDERR "ERROR: test_read_genes_for_transcripts: failed test4\n"; exit;} | |
system "rm -f $input_gff"; | |
} | |
#------------------------------------------------------------------# | |
# READ IN THE INPUT GFF FILE, TO FIND THE GENE NAMES FOR TRANSCRIPTS: | |
sub read_genes_for_transcripts | |
{ | |
my $input_gff = $_[0]; # INPUT GFF FILE | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
my %GENE = (); # HASH TABLE TO STORE GENE NAMES FOR TRANSCRIPTS | |
my $start_of_seq = 0; # SAYS WHETHER WE HAVE FOUND THE START OF THE SEQUENCE | |
my $line; # | |
my @temp; # | |
my $feature; # FEATURE TYPE | |
my $name; # FEATURE NAME | |
my $gene; # GENE NAME | |
my $transcript; # TRANSCRIPT NAME | |
# READ IN THE INPUT GFF FILE: | |
open(INPUT_GFF,"$input_gff") || die "ERROR: read_genes_for_transcripts: cannot open input_gff $input_gff\n"; | |
while(<INPUT_GFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\t+/,$line); | |
if (substr($line,0,1) ne '#' && $start_of_seq == 0) # IGNORE COMMENT LINES | |
{ | |
if ($#temp != 8) # THE GFF FILE SHOULD HAVE 8 COLUMNS | |
{ | |
$errormsg = "ERROR: read_genes_for_transcripts: input_gff has line that does not have 8 columns: $line\n"; | |
$errorcode = 1; # ERRORCODE=1 (TESTED FOR) | |
return(\%GENE,$errorcode,$errormsg); | |
} | |
$feature = $temp[2]; # eg. gene, CDS, mRNA | |
$name = $temp[8]; # FEATURE NAME | |
# CHECK THAT THE FEATURES ARE OF THE TYPE THAT WE EXPECT: | |
if ($feature ne 'gene' && $feature ne 'CDS' && $feature ne 'mRNA' && $feature ne 'polypeptide' && $feature ne 'gap' && | |
$feature ne 'tRNA' && $feature ne 'polypeptide_motif' && $feature ne 'pseudogenic_transcript' && $feature ne 'pseudogene' && | |
$feature ne 'pseudogenic_exon' && $feature ne 'rRNA' && $feature ne 'repeat_region' && $feature ne 'snoRNA' && $feature ne 'snRNA') | |
{ | |
$errormsg = "ERROR: read_genes_for_transcripts: input_gff has feature $feature on line $line\n"; | |
$errorcode = 4; # ERRORCODE=4 (TESTED FOR) | |
return(\%GENE,$errorcode,$errormsg); | |
} | |
if ($feature eq 'mRNA' || $feature eq 'rRNA') # A mRNA OR rRNA TRANSCRIPT | |
{ | |
# eg. ID=PKH_010010.1;Parent=PKH_010010;isObsolete=false;isFminPartial;feature_id=222342;timelastmodified=10.12.2012+02:43:44+GMT;previous_systematic_id=PK00_0600c | |
# GET THE TRANSCRIPT NAME: | |
@temp = split(/ID=/,$name); | |
$transcript = $temp[1]; | |
@temp = split(/\;/,$transcript); | |
$transcript = $temp[0]; | |
# GET THE GENE NAME: | |
@temp = split(/Parent=/,$name); | |
$gene = $temp[1]; | |
@temp = split(/\;/,$gene); | |
$gene = $temp[0]; | |
if ($GENE{$transcript}) | |
{ | |
$errormsg = "ERROR: read_genes_for_transcripts: already have gene name for transcript $transcript\n"; | |
$errorcode = 5; # ERRORCODE=5 (TESTED FOR) | |
return(\%GENE,$errorcode,$errormsg); | |
} | |
$GENE{$transcript} = $gene; | |
} | |
} | |
elsif (substr($line,0,7) eq '##FASTA') | |
{ | |
$start_of_seq = 1; # THERE IS SEQUENCE AFTER THIS IN THE GTF FILE | |
} | |
} | |
close(INPUT_GFF); | |
return(\%GENE,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &convert_gff_to_gtf | |
sub test_convert_gff_to_gtf | |
{ | |
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES INTO | |
my $input_gff; # INPUT GFF FILE | |
my $output_gtf; # OUTPUT GTF FILE | |
my $errorcode; # RETURNED AS 0 BY A FUNCTION IF THERE IS NO ERROR | |
my $errormsg; # RETURNED AS 'none' BY A FUNCTION IF THERE IS NO ERROR | |
my $RNA; # HASH TABLE TO STORE RNA GENE NAMES IN | |
my $RRNA; # HASH TABLE TO STORE rRNA GENE NAMES IN | |
my $expected_output_gtf; # FILE WITH THE EXPECTED CONTENTS OF $output_gtf | |
my $differences; # DIFFERENCES BETWEEN $output_gtf AND $expected_output_gtf | |
my $length_differences; # LENGTH OF $differences | |
my $GENE; # HASH TABLE OF THE NAMES OF GENES FOR TRANSCRIPTS | |
my $line; # | |
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(INPUT_GFF,">$input_gff") || die "ERROR: test_convert_gff_to_gtf: cannot open $input_gff\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado gene 108 1421 . + . ID=PKH_010010;isObsolete=false;isFminPartial;feature_id=222341;timelastmodified=10.12.2012+02:43:44+GMT\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado CDS 108 758 . + 0 ID=PKH_010010.1:exon:1;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado CDS 978 1421 . + 0 ID=PKH_010010.1:exon:2;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado mRNA 108 1421 . + . ID=PKH_010010.1;Parent=PKH_010010;isObsolete=false;isFminPartial;feature_id=222342;timelastmodified=10.12.2012+02:43:44+GMT;previous_systematic_id=PK00_0600c\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado gene 795283 795406 . + . ID=PKH_011712;isObsolete=false;feature_id=222622;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado CDS 795283 795406 . + 0 ID=PKH_011712.1:exon:1;Parent=PKH_011712:rRNA;isObsolete=false;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado rRNA 795283 795406 . + . ID=PKH_011712:rRNA;Parent=PKH_011712;isObsolete=false;feature_id=222623;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr08 chado gene 1463759 1463875 . + . ID=PKH_083075;isObsolete=false;feature_id=25886604;timelastmodified=17.07.2012+04:19:05+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr08 chado CDS 1463759 1463875 . + 0 ID=PKH_083075.1:exon:1;Parent=PKH_083075.1;isObsolete=false;timelastmodified=17.07.2012+04:19:05+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr08 chado snoRNA 1463759 1463875 . + . ID=PKH_083075.1;Parent=PKH_083075;Dbxref=Rfam:RF00532;timelastmodified=17.07.2012+04:21:38+BST;feature_id=25886605;isObsolete=false;history=term%3Dstructural%3Bdate%3D20120717%3Bqualifier%3Dmethod_new%3DncRNA+added+based+on+homology%3BcuratorName%3Ducb\@sanger.ac.uk;product=term%3Dsmall+nucleolar+RNA+Me18S-Um1356%3B\n"; | |
close(INPUT_GFF); | |
# READ IN THE INPUT GFF FILE, TO FIND THE GENE NAMES FOR TRANSCRIPTS: | |
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# READ IN THE INPUT GFF FILE, TO FIND RNA GENES: | |
($RNA,$RRNA,$errorcode,$errormsg) = &read_RNAs($input_gff); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# CALL &convert_gff_to_gtf: | |
($output_gtf,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
($errorcode,$errormsg) = &convert_gff_to_gtf($outputdir,$input_gff,$output_gtf,$GENE,$RNA,$RRNA); | |
if ($errorcode != 0) { print STDERR "ERROR: test_convert_gff_to_gtf: failed test1 (errorcode $errorcode errormsg $errormsg)\n"; exit;} | |
# OPEN A FILE WITH THE EXPECTED CONTENTS OF $output_gtf: | |
($expected_output_gtf,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(EXPECTED,">$expected_output_gtf") || die "ERROR: test_convert_gff_to_gtf: cannot open $expected_output_gtf\n"; | |
print EXPECTED "Pk_strainH_chr01; chado gene 108 1421 . + . gene_id \"PKH_010010\"; transcript_id \"PKH_010010\"; transcript_type \"protein_coding\";\n"; | |
print EXPECTED "Pk_strainH_chr01; chado exon 108 758 . + 0 gene_id \"PKH_010010\"; transcript_id \"PKH_010010.1\"; transcript_type \"protein_coding\";\n"; | |
print EXPECTED "Pk_strainH_chr01; chado exon 978 1421 . + 0 gene_id \"PKH_010010\"; transcript_id \"PKH_010010.1\"; transcript_type \"protein_coding\";\n"; | |
print EXPECTED "Pk_strainH_chr01; chado mRNA 108 1421 . + . gene_id \"PKH_010010\"; transcript_id \"PKH_010010.1\"; transcript_type \"protein_coding\";\n"; | |
print EXPECTED "Pk_strainH_chr01; chado gene 795283 795406 . + . gene_id \"PKH_011712\"; transcript_id \"PKH_011712\"; transcript_type \"rRNA\";\n"; | |
print EXPECTED "Pk_strainH_chr01; chado exon 795283 795406 . + 0 gene_id \"PKH_011712\"; transcript_id \"PKH_011712:rRNA\"; transcript_type \"rRNA\";\n"; | |
print EXPECTED "Pk_strainH_chr01; chado rRNA 795283 795406 . + . gene_id \"PKH_011712\"; transcript_id \"PKH_011712:rRNA\"; transcript_type \"rRNA\";\n"; | |
close(EXPECTED); | |
$differences = ""; | |
open(TEMP,"diff $output_gtf $expected_output_gtf |"); | |
while(<TEMP>) | |
{ | |
$line = $_; | |
$differences = $differences.$line; | |
} | |
close(TEMP); | |
$length_differences = length($differences); | |
if ($length_differences != 0) { print STDERR "ERROR: test_convert_gff_to_gtf: failed test1 (output_gtf $output_gtf expected_output_gtf $expected_output_gtf)\n"; exit;} | |
system "rm -f $output_gtf"; | |
system "rm -f $expected_output_gtf"; | |
system "rm -f $input_gff"; | |
# TEST FOR ERRORCODE=7 (GFF DOES NOT HAVE 8 COLUMNS): | |
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(INPUT_GFF,">$input_gff") || die "ERROR: test_convert_gff_to_gtf: cannot open $input_gff\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado gene 108 1421 . + . ID=PKH_010010;isObsolete=false;isFminPartial;feature_id=222341;timelastmodified=10.12.2012+02:43:44+GMT\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado CDS 108 758 . + 0 ID=PKH_010010.1:exon:1;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado CDS 978 1421 . + 0 ID=PKH_010010.1:exon:2;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado mRNA 108 1421 . + . ID=PKH_010010.1;Parent=PKH_010010;isObsolete=false;isFminPartial;feature_id=222342;timelastmodified=10.12.2012+02:43:44+GMT;previous_systematic_id=PK00_0600c\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado gene 795406 . + . ID=PKH_011712;isObsolete=false;feature_id=222622;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado CDS 795283 795406 . + 0 ID=PKH_011712.1:exon:1;Parent=PKH_011712:rRNA;isObsolete=false;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado rRNA 795406 . + . ID=PKH_011712:rRNA;Parent=PKH_011712;isObsolete=false;feature_id=222623;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr08 chado gene 1463759 1463875 . + . ID=PKH_083075;isObsolete=false;feature_id=25886604;timelastmodified=17.07.2012+04:19:05+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr08 chado CDS 1463759 1463875 . + 0 ID=PKH_083075.1:exon:1;Parent=PKH_083075.1;isObsolete=false;timelastmodified=17.07.2012+04:19:05+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr08 chado snoRNA 1463759 1463875 . + . ID=PKH_083075.1;Parent=PKH_083075;Dbxref=Rfam:RF00532;timelastmodified=17.07.2012+04:21:38+BST;feature_id=25886605;isObsolete=false;history=term%3Dstructural%3Bdate%3D20120717%3Bqualifier%3Dmethod_new%3DncRNA+added+based+on+homology%3BcuratorName%3Ducb\@sanger.ac.uk;product=term%3Dsmall+nucleolar+RNA+Me18S-Um1356%3B\n"; | |
close(INPUT_GFF); | |
# READ IN THE INPUT GFF FILE, TO FIND THE GENE NAMES FOR TRANSCRIPTS: | |
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff); | |
# READ IN THE INPUT GFF FILE, TO FIND RNA GENES: | |
($RNA,$RRNA,$errorcode,$errormsg) = &read_RNAs($input_gff); | |
# CALL &convert_gff_to_gtf: | |
($output_gtf,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
($errorcode,$errormsg) = &convert_gff_to_gtf($outputdir,$input_gff,$output_gtf,$GENE,$RNA,$RRNA); | |
if ($errorcode != 7) { print STDERR "ERROR: test_convert_gff_to_gtf: failed test2 (errorcode $errorcode errormsg $errormsg)\n"; exit;} | |
system "rm -f $output_gtf"; | |
system "rm -f $input_gff"; | |
#xxx TEST FOR ERRORCODE=3 (DO NOT KNOW GENE NAME FOR rRNA GENE): | |
# ($input_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
# if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# open(INPUT_GFF,">$input_gff") || die "ERROR: test_convert_gff_to_gtf: cannot open $input_gff\n"; | |
# print INPUT_GFF "Pk_strainH_chr01 chado gene 108 1421 . + . ID=PKH_010010;isObsolete=false;isFminPartial;feature_id=222341;timelastmodified=10.12.2012+02:43:44+GMT\n"; | |
# print INPUT_GFF "Pk_strainH_chr01 chado CDS 108 758 . + 0 ID=PKH_010010.1:exon:1;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n"; | |
# print INPUT_GFF "Pk_strainH_chr01 chado CDS 978 1421 . + 0 ID=PKH_010010.1:exon:2;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n"; | |
# print INPUT_GFF "Pk_strainH_chr01 chado mRNA 108 1421 . + . ID=PKH_010010.1;Parent=PKH_010010;isObsolete=false;isFminPartial;feature_id=222342;timelastmodified=10.12.2012+02:43:44+GMT;previous_systematic_id=PK00_0600c\n"; | |
# print INPUT_GFF "Pk_strainH_chr01 chado gene 795283 795406 . + . ID=PKH_011712;isObsolete=false;feature_id=222622;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
# print INPUT_GFF "Pk_strainH_chr01 chado CDS 795283 795406 . + 0 ID=PKH_011712.1:exon:1;Parent=PKH_011712:rRNA;isObsolete=false;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
# print INPUT_GFF "Pk_strainH_chr01 chado rRNA 795283 795406 . + . ID=PKH_011712:rRNA;Parent=PKH_011712;isObsolete=false;feature_id=222623;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
# print INPUT_GFF "Pk_strainH_chr08 chado gene 1463759 1463875 . + . ID=PKH_083075;isObsolete=false;feature_id=25886604;timelastmodified=17.07.2012+04:19:05+BST\n"; | |
# print INPUT_GFF "Pk_strainH_chr08 chado CDS 1463759 1463875 . + 0 ID=PKH_083075.1:exon:1;Parent=PKH_083075.1;isObsolete=false;timelastmodified=17.07.2012+04:19:05+BST\n"; | |
# print INPUT_GFF "Pk_strainH_chr08 chado snoRNA 1463759 1463875 . + . ID=PKH_083075.1;Parent=PKH_083075;Dbxref=Rfam:RF00532;timelastmodified=17.07.2012+04:21:38+BST;feature_id=25886605;isObsolete=false;history=term%3Dstructural%3Bdate%3D20120717%3Bqualifier%3Dmethod_new%3DncRNA+added+based+on+homology%3BcuratorName%3Ducb\@sanger.ac.uk;product=term%3Dsmall+nucleolar+RNA+Me18S-Um1356%3B\n"; | |
# close(INPUT_GFF); | |
# READ IN THE INPUT GFF FILE, TO FIND THE GENE NAMES FOR TRANSCRIPTS: | |
# ($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff); | |
# if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# delete($GENE->{"PKH_011712:rRNA"}); | |
# READ IN THE INPUT GFF FILE, TO FIND RNA GENES: | |
# ($RNA,$RRNA,$errorcode,$errormsg) = &read_RNAs($input_gff); | |
# if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# CALL &convert_gff_to_gtf: | |
# ($output_gtf,$errorcode,$errormsg) = &make_filename($outputdir); | |
# if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# ($errorcode,$errormsg) = &convert_gff_to_gtf($outputdir,$input_gff,$output_gtf,$GENE,$RNA,$RRNA); | |
# if ($errorcode != 3) { print STDERR "ERROR: test_convert_gff_to_gtf: failed test3 (errorcode $errorcode errormsg $errormsg)\n"; exit;} | |
# system "rm -f $output_gtf"; | |
# system "rm -f $input_gff"; | |
# TEST FOR ERRORCODE=2 (UNKNOWN TYPE OF FEATURE IN GFF FILE): | |
($input_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
open(INPUT_GFF,">$input_gff") || die "ERROR: test_convert_gff_to_gtf: cannot open $input_gff\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado gene 108 1421 . + . ID=PKH_010010;isObsolete=false;isFminPartial;feature_id=222341;timelastmodified=10.12.2012+02:43:44+GMT\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado CDS 108 758 . + 0 ID=PKH_010010.1:exon:1;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado CDS 978 1421 . + 0 ID=PKH_010010.1:exon:2;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado mRNA 108 1421 . + . ID=PKH_010010.1;Parent=PKH_010010;isObsolete=false;isFminPartial;feature_id=222342;timelastmodified=10.12.2012+02:43:44+GMT;previous_systematic_id=PK00_0600c\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado gene 795283 795406 . + . ID=PKH_011712;isObsolete=false;feature_id=222622;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado CDS 795283 795406 . + 0 ID=PKH_011712.1:exon:1;Parent=PKH_011712:rRNA;isObsolete=false;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr01 chado rRNA 795283 795406 . + . ID=PKH_011712:rRNA;Parent=PKH_011712;isObsolete=false;feature_id=222623;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr08 chado gene 1463759 1463875 . + . ID=PKH_083075;isObsolete=false;feature_id=25886604;timelastmodified=17.07.2012+04:19:05+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr08 chado CCDS 1463759 1463875 . + 0 ID=PKH_083075.1:exon:1;Parent=PKH_083075.1;isObsolete=false;timelastmodified=17.07.2012+04:19:05+BST\n"; | |
print INPUT_GFF "Pk_strainH_chr08 chado snoRNA 1463759 1463875 . + . ID=PKH_083075.1;Parent=PKH_083075;Dbxref=Rfam:RF00532;timelastmodified=17.07.2012+04:21:38+BST;feature_id=25886605;isObsolete=false;history=term%3Dstructural%3Bdate%3D20120717%3Bqualifier%3Dmethod_new%3DncRNA+added+based+on+homology%3BcuratorName%3Ducb\@sanger.ac.uk;product=term%3Dsmall+nucleolar+RNA+Me18S-Um1356%3B\n"; | |
close(INPUT_GFF); | |
# READ IN THE INPUT GFF FILE, TO FIND THE GENE NAMES FOR TRANSCRIPTS: | |
($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff); | |
# READ IN THE INPUT GFF FILE, TO FIND RNA GENES: | |
($RNA,$RRNA,$errorcode,$errormsg) = &read_RNAs($input_gff); | |
# CALL &convert_gff_to_gtf: | |
($output_gtf,$errorcode,$errormsg) = &make_filename($outputdir); | |
if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
($errorcode,$errormsg) = &convert_gff_to_gtf($outputdir,$input_gff,$output_gtf,$GENE,$RNA,$RRNA); | |
if ($errorcode != 2) { print STDERR "ERROR: test_convert_gff_to_gtf: failed test4 (errorcode $errorcode errormsg $errormsg)\n"; exit;} | |
system "rm -f $output_gtf"; | |
system "rm -f $input_gff"; | |
#xxx TEST FOR ERRORCODE=10 (DO NOT KNOW GENE NAME FOR PROTEIN-CODING GENE): | |
# ($input_gff,$errorcode,$errormsg) = &make_filename($outputdir); | |
# if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# open(INPUT_GFF,">$input_gff") || die "ERROR: test_convert_gff_to_gtf: cannot open $input_gff\n"; | |
# print INPUT_GFF "Pk_strainH_chr01 chado gene 108 1421 . + . ID=PKH_010010;isObsolete=false;isFminPartial;feature_id=222341;timelastmodified=10.12.2012+02:43:44+GMT\n"; | |
# print INPUT_GFF "Pk_strainH_chr01 chado CDS 108 758 . + 0 ID=PKH_010010.1:exon:1;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n"; | |
# print INPUT_GFF "Pk_strainH_chr01 chado CDS 978 1421 . + 0 ID=PKH_010010.1:exon:2;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7\n"; | |
# print INPUT_GFF "Pk_strainH_chr01 chado mRNA 108 1421 . + . ID=PKH_010010.1;Parent=PKH_010010;isObsolete=false;isFminPartial;feature_id=222342;timelastmodified=10.12.2012+02:43:44+GMT;previous_systematic_id=PK00_0600c\n"; | |
# print INPUT_GFF "Pk_strainH_chr01 chado gene 795283 795406 . + . ID=PKH_011712;isObsolete=false;feature_id=222622;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
# print INPUT_GFF "Pk_strainH_chr01 chado CDS 795283 795406 . + 0 ID=PKH_011712.1:exon:1;Parent=PKH_011712:rRNA;isObsolete=false;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
# print INPUT_GFF "Pk_strainH_chr01 chado rRNA 795283 795406 . + . ID=PKH_011712:rRNA;Parent=PKH_011712;isObsolete=false;feature_id=222623;timelastmodified=21.10.2007+01:51:37+BST\n"; | |
# print INPUT_GFF "Pk_strainH_chr08 chado gene 1463759 1463875 . + . ID=PKH_083075;isObsolete=false;feature_id=25886604;timelastmodified=17.07.2012+04:19:05+BST\n"; | |
# print INPUT_GFF "Pk_strainH_chr08 chado CDS 1463759 1463875 . + 0 ID=PKH_083075.1:exon:1;Parent=PKH_083075.1;isObsolete=false;timelastmodified=17.07.2012+04:19:05+BST\n"; | |
# print INPUT_GFF "Pk_strainH_chr08 chado snoRNA 1463759 1463875 . + . ID=PKH_083075.1;Parent=PKH_083075;Dbxref=Rfam:RF00532;timelastmodified=17.07.2012+04:21:38+BST;feature_id=25886605;isObsolete=false;history=term%3Dstructural%3Bdate%3D20120717%3Bqualifier%3Dmethod_new%3DncRNA+added+based+on+homology%3BcuratorName%3Ducb\@sanger.ac.uk;product=term%3Dsmall+nucleolar+RNA+Me18S-Um1356%3B\n"; | |
# close(INPUT_GFF); | |
# READ IN THE INPUT GFF FILE, TO FIND THE GENE NAMES FOR TRANSCRIPTS: | |
# ($GENE,$errorcode,$errormsg) = &read_genes_for_transcripts($input_gff); | |
# delete($GENE->{"PKH_010010.1"}); | |
# READ IN THE INPUT GFF FILE, TO FIND RNA GENES: | |
# ($RNA,$RRNA,$errorcode,$errormsg) = &read_RNAs($input_gff); | |
# CALL &convert_gff_to_gtf: | |
# ($output_gtf,$errorcode,$errormsg) = &make_filename($outputdir); | |
# if ($errorcode != 0) { ($errorcode,$errormsg) = &print_error($errormsg,$errorcode,0); } | |
# ($errorcode,$errormsg) = &convert_gff_to_gtf($outputdir,$input_gff,$output_gtf,$GENE,$RNA,$RRNA); | |
# if ($errorcode != 10) { print STDERR "ERROR: test_convert_gff_to_gtf: failed test5 (errorcode $errorcode errormsg $errormsg)\n"; exit;} | |
# system "rm -f $output_gtf"; | |
# system "rm -f $input_gff"; | |
} | |
#------------------------------------------------------------------# | |
# READ IN THE INPUT GFF FILE, AND MAKE THE OUTPUT GTF FILE: | |
sub convert_gff_to_gtf | |
{ | |
my $outputdir = $_[0]; # DIRECTORY TO PUT OUTPUT FILES IN | |
my $input_gff = $_[1]; # INPUT GFF FILE | |
my $output_gtf = $_[2]; # OUTPUT GTF FILE | |
my $GENE = $_[3]; # HASH TABLE TO STORE THE GENE NAMES FOR TRANSCRIPTS | |
my $RNA = $_[4]; # HASH TABLE TO STORE RNA GENES | |
my $RRNA = $_[5]; # HASH TABLE TO STORE rRNA GENES | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
my $line; # | |
my @temp; # | |
my $scaffold; # NAME OF THE SCAFFOLD | |
my $method; # METHOD USED TO FIND THE FEATURE (chado HERE) | |
my $feature; # FEATURE TYPE | |
my $start; # START POSITION OF FEATURE | |
my $end; # END POSITION OF FEATURE | |
my $score; # SCORE OF FEATURE | |
my $strand; # STRAND OF FEATURE | |
my $frame; # FRAME OF FEATURE | |
my $name; # FEATURE NAME | |
my $start_of_seq = 0; # SAYS WHETHER WE HAVE FOUND THE START OF THE SEQUENCE | |
my $new_name; # NEW VERSION OF THE FEATURE NAME | |
my $gene; # GENE NAME | |
my $transcript; # TRANSCRIPT NAME | |
my $cds; # CDS (EXON) NAME | |
my $i; # | |
# OPEN AN OUTPUT GTF FILE: | |
open(OUTPUT_GTF,">$output_gtf") || die "ERROR: convert_gff_to_gtf: cannot open output_gtf $output_gtf\n"; | |
# READ IN THE INPUT GFF FILE: | |
open(INPUT_GFF,"$input_gff") || die "ERROR: convert_gff_to_gtf: cannot open input_gff $input_gff\n"; | |
while(<INPUT_GFF>) | |
{ | |
$line = $_; | |
chomp $line; | |
@temp = split(/\t+/,$line); | |
if (substr($line,0,1) ne '#' && $start_of_seq == 0) # IGNORE COMMENT LINES | |
{ | |
if ($#temp != 8) # THE GFF FILE SHOULD HAVE 8 COLUMNS | |
{ | |
$errormsg = "ERROR: convert_gff_to_gtf: input_gff has line that does not have 8 columns: $line\n"; | |
$errorcode = 7; # ERRORCODE=7 (TESTED FOR) | |
return($errorcode,$errormsg); | |
} | |
$scaffold = $temp[0]; # eg. Pk_strainH_chr01 | |
$method = $temp[1]; # eg. chado | |
$feature = $temp[2]; # eg. gene, CDS, mRNA | |
$start = $temp[3]; # eg. 108 | |
$end = $temp[4]; # eg. 1421 | |
$score = $temp[5]; | |
$strand = $temp[6]; | |
$frame = $temp[7]; | |
$name = $temp[8]; # FEATURE NAME | |
# WE NEED TO ADD ';' TO THE END OF THE SCAFFOLD NAMES: | |
# $scaffold = $scaffold.";"; # xxxyyyzzz avril | |
# CHECK THAT THE FEATURES ARE OF THE TYPE THAT WE EXPECT: | |
if ($feature ne 'gene' && $feature ne 'CDS' && $feature ne 'mRNA' && $feature ne 'polypeptide' && $feature ne 'gap' && | |
$feature ne 'tRNA' && $feature ne 'polypeptide_motif' && $feature ne 'pseudogenic_transcript' && $feature ne 'pseudogene' && | |
$feature ne 'pseudogenic_exon' && $feature ne 'rRNA' && $feature ne 'repeat_region' && $feature ne 'snoRNA' && $feature ne 'snRNA') | |
{ | |
$errormsg = "ERROR: convert_gff_to_gtf: input_gff has feature $feature on line $line\n"; | |
$errorcode = 2; # ERRORCODE=2 (TESTED FOR) | |
return($errorcode,$errormsg); | |
} | |
# WE ARE ONLY INTERESTED IN 'exon', 'transcript', 'CDS', 'UTR', 'stop_codon', 'start_codon', 'gene', 'Selenocysteine' FEATURES: | |
if (($feature eq 'gene' || $feature eq 'CDS' || $feature eq 'mRNA' || $feature eq 'rRNA') && ($name =~ /ID=/)) # xxx | |
{ | |
if ($feature eq 'gene') # eg. ID=PKH_010010;isObsolete=false;isFminPartial;feature_id=222341;timelastmodified=10.12.2012+02:43:44+GMT | |
{ | |
# GET THE GENE NAME: | |
@temp = split(/ID=/,$name); | |
$gene = $temp[1]; # eg. PKH_010010;... | |
@temp = split(/\;/,$gene); | |
$gene = $temp[0]; # eg. PKH_010010 | |
if ($RRNA->{$gene}) # IF IT IS A rRNA GENE | |
{ | |
$new_name = "gene_id \"$gene\"; transcript_id \"$gene\"; transcript_type \"rRNA\";"; | |
print OUTPUT_GTF "$scaffold\t$method\t$feature\t$start\t$end\t$score\t$strand\t$frame\t$new_name\n"; | |
} | |
elsif (!($RNA->{$gene})) # IF IT IS NOT AN RNA GENE: | |
{ | |
$new_name = "gene_id \"$gene\"; transcript_id \"$gene\"; transcript_type \"protein_coding\";"; | |
print OUTPUT_GTF "$scaffold\t$method\t$feature\t$start\t$end\t$score\t$strand\t$frame\t$new_name\n"; | |
} | |
} | |
elsif ($feature eq 'CDS') # eg. ID=PKH_010010.1:exon:1;Parent=PKH_010010.1;isFminPartial;isObsolete=false;timelastmodified=10.12.2012+02:43:44+GMT;colour=7 | |
{ | |
# GET THE CDS NAME: | |
@temp = split(/ID=/,$name); | |
$cds = $temp[1]; # eg. PKH_010010.1:exon:1;... | |
@temp = split(/\;/,$cds); | |
$cds = $temp[0]; # eg. PKH_010010.1:exon:1 | |
# GET THE TRANSCRIPT NAME: | |
@temp = split(/Parent=/,$name); | |
$transcript = $temp[1]; | |
@temp = split(/\;/,$transcript); | |
$transcript = $temp[0]; | |
# xxxif (!($GENE->{$transcript})) | |
# { | |
# $errormsg= "ERROR: convert_gff_to_gtf: do not know gene for rRNA transcript $transcript\n"; | |
# $errorcode= 3; # ERRORCODE=3 (TESTED FOR) | |
# return($errorcode,$errormsg); | |
# } | |
# print "xxx transcript $transcript cds $cds xxx\n"; | |
@temp = split(/\,/,$transcript); | |
for ($i = 0; $i <= $#temp; $i++) # xxx WORMBASE HAS JUST ONE LINE FOR A CDS SHARED BY TWO TRANSCRIPTS, eg. Transcript:AC3.5.1 | |
{ | |
$transcript = $temp[$i]; | |
if ($GENE->{$transcript}) # WE SHOULD KNOW THE GENE FOR rRNA OR mRNA TRANSCRIPTS | |
{ | |
# xxx if (!($GENE->{$transcript})) | |
# { | |
# $errormsg= "ERROR: convert_gff_to_gtf: do not know gene for transcript $transcript\n"; | |
# $errorcode= 3; # ERRORCODE=3 (TESTED FOR) xxx | |
# return($errorcode,$errormsg); | |
# } | |
$gene = $GENE->{$transcript}; | |
if ($RRNA->{$gene}) # IF IT IS A rRNA GENE | |
{ | |
# xxxif (!($GENE->{$transcript})) | |
# { | |
# $errormsg= "ERROR: convert_gff_to_gtf: do not know gene for rRNA transcript $transcript\n"; | |
# $errorcode= 3; # ERRORCODE=3 (TESTED FOR) | |
# return($errorcode,$errormsg); | |
# } | |
# $gene = $GENE->{$transcript}; | |
$feature = "exon"; # NOTE: IN THE GENECODE GTF, 'CDS' IS THE CDS, AND 'exon' COULD INCLUDE NON-CODING (UTR) SEQUENCE | |
$new_name= "gene_id \"$gene\"; transcript_id \"$transcript\"; transcript_type \"rRNA\";"; | |
print OUTPUT_GTF "$scaffold\t$method\t$feature\t$start\t$end\t$score\t$strand\t$frame\t$new_name\n"; | |
} | |
elsif (!($RNA->{$transcript})) # IF IT IS NOT A RNA GENE: | |
{ | |
# if (!($GENE->{$transcript})) | |
# { | |
# $errormsg= "ERROR: convert_gff_to_gtf: do not know gene for transcript $transcript cds $cds name $name (line $line)\n"; | |
# $errorcode= 10; # ERRORCODE=10 (TESTED FOR) | |
# return($errorcode,$errormsg); | |
# } | |
$feature = "exon"; # NOTE: IN THE GENECODE GTF, 'CDS' IS THE CDS, AND 'exon' COULD INCLUDE NON-CODING (UTR) SEQUENCE | |
# $gene = $GENE->{$transcript}; | |
$new_name= "gene_id \"$gene\"; transcript_id \"$transcript\"; transcript_type \"protein_coding\";"; | |
print OUTPUT_GTF "$scaffold\t$method\t$feature\t$start\t$end\t$score\t$strand\t$frame\t$new_name\n"; | |
} | |
} | |
} | |
} | |
elsif ($feature eq 'mRNA' || $feature eq 'rRNA') # eg. ID=PKH_010010.1;Parent=PKH_010010;isObsolete=false;isFminPartial;feature_id=222342;timelastmodified=10.12.2012+02:43:44+GMT;previous_systematic_id=PK00_0600c | |
{ | |
# GET THE TRANSCRIPT NAME: | |
@temp = split(/ID=/,$name); | |
$transcript = $temp[1]; | |
@temp = split(/\;/,$transcript); | |
$transcript = $temp[0]; | |
# GET THE GENE NAME: | |
@temp = split(/Parent=/,$name); | |
$gene = $temp[1]; | |
@temp = split(/\;/,$gene); | |
$gene = $temp[0]; | |
if ($RRNA->{$gene} || $RRNA->{$transcript}) # IF IT IS AN RRNA GENE: | |
{ | |
$new_name= "gene_id \"$gene\"; transcript_id \"$transcript\"; transcript_type \"rRNA\";"; | |
print OUTPUT_GTF "$scaffold\t$method\t$feature\t$start\t$end\t$score\t$strand\t$frame\t$new_name\n"; | |
} | |
elsif (!($RNA->{$gene}) && !($RNA->{$transcript})) # IF IT IS NOT A RNA GENE: | |
{ | |
$new_name= "gene_id \"$gene\"; transcript_id \"$transcript\"; transcript_type \"protein_coding\";"; | |
print OUTPUT_GTF "$scaffold\t$method\t$feature\t$start\t$end\t$score\t$strand\t$frame\t$new_name\n"; | |
} | |
} | |
} | |
} | |
elsif (substr($line,0,7) eq '##FASTA') | |
{ | |
$start_of_seq = 1; # THERE IS SEQUENCE AFTER THIS IN THE GTF FILE | |
} | |
} | |
close(INPUT_GFF); | |
close(OUTPUT_GTF); | |
return($errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# SUBROUTINE TO MAKE A FILE NAME FOR A TEMPORARY FILE: | |
sub make_filename | |
{ | |
my $outputdir = $_[0]; # OUTPUT DIRECTORY TO WRITE TEMPORARY FILE NAME TO | |
my $found_name = 0; # SAYS WHETHER WE HAVE FOUND A FILE NAME YET | |
my $filename = "none";# NEW FILE NAME TO USE | |
my $errorcode = 0; # RETURNED AS 0 IF THERE IS NO ERROR | |
my $errormsg = "none";# RETURNED AS 'none' IF THERE IS NO ERROR | |
my $poss_filename; # POSSIBLE FILE NAME TO USE | |
my $random_number; # RANDOM NUMBER TO USE IN TEMPORARY FILE NAME | |
while($found_name == 0) | |
{ | |
$random_number = rand(); | |
$poss_filename = $outputdir."/tmp".$random_number; | |
if (!(-e $poss_filename)) | |
{ | |
$filename = $poss_filename; | |
$found_name = 1; | |
} | |
} | |
if ($found_name == 0 || $filename eq 'none') | |
{ | |
$errormsg = "ERROR: make_filename: found_name $found_name filename $filename\n"; | |
$errorcode = 6; # ERRORCODE=6 | |
return($filename,$errorcode,$errormsg); | |
} | |
return($filename,$errorcode,$errormsg); | |
} | |
#------------------------------------------------------------------# | |
# TEST &print_error | |
sub test_print_error | |
{ | |
my $errormsg; # RETURNED AS 'none' FROM A FUNCTION IF THERE WAS NO ERROR | |
my $errorcode; # RETURNED AS 0 FROM A FUNCTION IF THERE WAS NO ERROR | |
($errormsg,$errorcode) = &print_error(45,45,1); | |
if ($errorcode != 12) { print STDERR "ERROR: test_print_error: failed test1\n"; exit;} | |
($errormsg,$errorcode) = &print_error('My error message','My error message',1); | |
if ($errorcode != 11) { print STDERR "ERROR: test_print_error: failed test2\n"; exit;} | |
($errormsg,$errorcode) = &print_error('none',45,1); | |
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test3\n"; exit;} | |
($errormsg,$errorcode) = &print_error('My error message', 0, 1); | |
if ($errorcode != 13) { print STDERR "ERROR: test_print_error: failed test4\n"; exit;} | |
} | |
#------------------------------------------------------------------# | |
# PRINT OUT AN ERROR MESSAGE AND EXIT. | |
sub print_error | |
{ | |
my $errormsg = $_[0]; # THIS SHOULD BE NOT 'none' IF AN ERROR OCCURRED. | |
my $errorcode = $_[1]; # THIS SHOULD NOT BE 0 IF AN ERROR OCCURRED. | |
my $called_from_test = $_[2]; # SAYS WHETHER THIS WAS CALLED FROM test_print_error OR NOT | |
if ($errorcode =~ /[A-Z]/ || $errorcode =~ /[a-z]/) | |
{ | |
if ($called_from_test == 1) | |
{ | |
$errorcode = 11; $errormsg = "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; # ERRORCODE=11 | |
return($errormsg,$errorcode); | |
} | |
else | |
{ | |
print STDERR "ERROR: print_error: the errorcode is $errorcode, should be a number.\n"; | |
exit; | |
} | |
} | |
if (!($errormsg =~ /[A-Z]/ || $errormsg =~ /[a-z]/)) | |
{ | |
if ($called_from_test == 1) | |
{ | |
$errorcode = 12; $errormsg = "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; # ERRORCODE=12 | |
return($errormsg,$errorcode); | |
} | |
else | |
{ | |
print STDERR "ERROR: print_error: the errormessage $errormsg does not seem to contain text.\n"; | |
exit; | |
} | |
} | |
if ($errormsg eq 'none' || $errorcode == 0) | |
{ | |
if ($called_from_test == 1) | |
{ | |
$errorcode = 13; $errormsg = "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; # ERRORCODE=13 | |
return($errormsg,$errorcode); | |
} | |
else | |
{ | |
print STDERR "ERROR: print_error: errormsg $errormsg, errorcode $errorcode.\n"; | |
exit; | |
} | |
} | |
else | |
{ | |
print STDERR "$errormsg"; | |
exit; | |
} | |
return($errormsg,$errorcode); | |
} | |
#------------------------------------------------------------------# | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment