Last active
August 29, 2015 14:04
-
-
Save davfre/621e042dd6b2a1262924 to your computer and use it in GitHub Desktop.
Convert JGI gtf-like annotation file to GTF2.2 format
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w | |
use strict; | |
use warnings; | |
=head1 DESCRIPTION

Author: [email protected]

This script converts a JGI gtf-like file:
scaffold_1 JGI exon 46078 46218 . + . name "gw.1.597.1"; transcriptId 65742 | |
scaffold_1 JGI CDS 46078 46218 . + 0 name "gw.1.597.1"; proteinId 65742; exonNumber 1 | |
scaffold_1 JGI exon 46093 46218 . + . name "gw.1.23.1"; transcriptId 6905 | |
scaffold_1 JGI CDS 46093 46218 . + 0 name "gw.1.23.1"; proteinId 6905; exonNumber 1 | |
scaffold_1 JGI exon 48713 48808 . - . name "gw.1.11.1"; transcriptId 1718 | |
scaffold_1 JGI CDS 48713 48808 . - 0 name "gw.1.11.1"; proteinId 1718; exonNumber 6 | |
scaffold_1 JGI exon 51025 51074 . - . name "gw.1.11.1"; transcriptId 1718 | |
scaffold_1 JGI CDS 51025 51074 . - 0 name "gw.1.11.1"; proteinId 1718; exonNumber 5 | |
scaffold_1 JGI exon 51139 51229 . - . name "gw.1.11.1"; transcriptId 1718 | |
scaffold_1 JGI CDS 51139 51229 . - 2 name "gw.1.11.1"; proteinId 1718; exonNumber 4 | |
scaffold_1 JGI exon 51307 51381 . - . name "gw.1.11.1"; transcriptId 1718 | |
scaffold_1 JGI CDS 51307 51381 . - 0 name "gw.1.11.1"; proteinId 1718; exonNumber 3 | |
scaffold_1 JGI exon 52291 52379 . - . name "gw.1.11.1"; transcriptId 1718 | |
scaffold_1 JGI CDS 52291 52379 . - 0 name "gw.1.11.1"; proteinId 1718; exonNumber 2 | |
scaffold_1 JGI exon 52754 52955 . - . name "gw.1.11.1"; transcriptId 1718 | |
scaffold_1 JGI CDS 52754 52955 . - 2 name "gw.1.11.1"; proteinId 1718; exonNumber 1 | |
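To GTF2.2 (parseable by cufflinks). The sample below only illustrates the target layout: the script itself writes the numeric transcriptId/proteinId as transcript_id, and adds exon_number/exon_id only to records that carry an exonNumber attribute: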
scaffold_488 JGI exon 2243 2367 . - . gene_id "fgenesh1_pg.scaffold_488000001"; transcript_id "fgenesh1_pg.scaffold_488000001"; exon_number "1"; exon_id "fgenesh1_pg.scaffold_488000001.1"; | |
scaffold_488 JGI CDS 2243 2367 . - 0 gene_id "fgenesh1_pg.scaffold_488000001"; transcript_id "fgenesh1_pg.scaffold_488000001"; exon_number "1"; exon_id "fgenesh1_pg.scaffold_488000001.1"; | |
scaffold_488 JGI exon 2746 3001 . - . gene_id "fgenesh1_pg.scaffold_488000001"; transcript_id "fgenesh1_pg.scaffold_488000001"; exon_number "2"; exon_id "fgenesh1_pg.scaffold_488000001.2"; | |
scaffold_488 JGI CDS 2746 3001 . - 1 gene_id "fgenesh1_pg.scaffold_488000001"; transcript_id "fgenesh1_pg.scaffold_488000001"; exon_number "2"; exon_id "fgenesh1_pg.scaffold_488000001.2"; | |
scaffold_488 JGI exon 3652 3853 . - . gene_id "fgenesh1_pg.scaffold_488000001"; transcript_id "fgenesh1_pg.scaffold_488000001"; exon_number "3"; exon_id "fgenesh1_pg.scaffold_488000001.3"; | |
scaffold_488 JGI CDS 3652 3853 . - 2 gene_id "fgenesh1_pg.scaffold_488000001"; transcript_id "fgenesh1_pg.scaffold_488000001"; exon_number "3"; exon_id "fgenesh1_pg.scaffold_488000001.3"; | |
scaffold_488 JGI exon 4866 4986 . - . gene_id "fgenesh1_pg.scaffold_488000001"; transcript_id "fgenesh1_pg.scaffold_488000001"; exon_number "4"; exon_id "fgenesh1_pg.scaffold_488000001.4"; | |
scaffold_488 JGI CDS 4866 4986 . - 0 gene_id "fgenesh1_pg.scaffold_488000001"; transcript_id "fgenesh1_pg.scaffold_488000001"; exon_number "4"; exon_id "fgenesh1_pg.scaffold_488000001.4"; | |
scaffold_488 JGI exon 5366 5621 . - . gene_id "fgenesh1_pg.scaffold_488000001"; transcript_id "fgenesh1_pg.scaffold_488000001"; exon_number "5"; exon_id "fgenesh1_pg.scaffold_488000001.5"; | |
scaffold_488 JGI CDS 5366 5621 . - 1 gene_id "fgenesh1_pg.scaffold_488000001"; transcript_id "fgenesh1_pg.scaffold_488000001"; exon_number "5"; exon_id "fgenesh1_pg.scaffold_488000001.5"; | |
=cut | |
# Main loop: read JGI gtf-like records from the files named on the command
# line (or from STDIN when none are given) and write one GTF2.2 record to
# STDOUT for every exon/CDS record that can be converted.  Records that are
# exon/CDS but unusable are reported on STDERR and skipped rather than being
# written with empty gene_id/transcript_id values.
while ( my $line = <> ) {
    my ( $gtf, $problem ) = _jgi_to_gtf_line($line);
    if ( defined $problem ) {
        warn "input line $.: $problem; record skipped\n";
        next;
    }
    print $gtf if defined $gtf;
}

# Convert a single input line.
#
# Parameter: the raw line, with or without its line terminator.
# Returns:
#   ()                 - the line is ignored (blank, comment, or a feature
#                        type other than exon/CDS)
#   ($gtf_line)        - the converted record, newline-terminated
#   (undef, $problem)  - an exon/CDS record that cannot be converted;
#                        $problem is a short human-readable reason
sub _jgi_to_gtf_line {
    my ($line) = @_;

    # Strip the terminator up front so it can never leak into an output
    # column when the input line is short.
    $line =~ s/\r?\n\z//;

    # Blank lines and '#' comment lines carry no feature.
    return if $line =~ /\A\s*(?:#|\z)/;

    my @fields = split /\t/, $line;
    return if @fields < 3;

    my $type = $fields[2];
    return unless $type eq 'CDS' || $type eq 'exon';

    return ( undef, "expected 9 tab-separated columns, found " . scalar @fields )
        if @fields < 9;

    my $attributes = $fields[8];

    my ($gene_id)       = $attributes =~ /(?:name|gene_id)\s+\"([^\"]+)\";/;
    my ($protein_id)    = $attributes =~ /proteinId\s+(\d+)/;
    my ($transcript_id) = $attributes =~ /transcriptId\s+(\d+)/;
    my ($exon_number)   = $attributes =~ /exonNumber\s+(\d+)/;

    # transcriptId takes precedence; proteinId is the fallback (CDS records
    # in the sample input carry proteinId only).
    $transcript_id = $protein_id unless defined $transcript_id;

    return ( undef, "no name/gene_id attribute found" )
        unless defined $gene_id;
    return ( undef, "no transcriptId/proteinId attribute found" )
        unless defined $transcript_id;

    my $gtf = join( "\t", @fields[ 0 .. 7 ] )
        . "\tgene_id \"$gene_id\"; transcript_id \"$transcript_id\";";

    # Test definedness, not truth, so that an exonNumber of 0 is kept.
    if ( defined $exon_number ) {
        $gtf .= " exon_number \"$exon_number\";"
            . " exon_id \"$transcript_id.$exon_number\";";
    }

    return ( $gtf . "\n" );
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment