Created
November 9, 2016 14:12
-
-
Save mmterpstra/c592527d41fb5a956b196384ad4695ec to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use strict; | |
use warnings; | |
my $credits = "Written by Miente Martijn Terpstra from UMCG Department of Genetics subgroup oncogentics last modified [Thu Jul 4 16:35:29 CEST 2013]"; | |
my $exampleGTF = <<"END"; | |
chr1 HAVANA transcript 12010 13670 . + . transcript_id "ENST00000450305.2"; locus_id "chr1:11869-14412W"; gene_id "ENSG00000223972.4"; reads 0.000000; length 632; RPKM 0.000000 | |
chr1 HAVANA transcript 11869 14409 . + . transcript_id "ENST00000456328.2"; locus_id "chr1:11869-14412W"; gene_id "ENSG00000223972.4"; reads 232.000000; length 1657; RPKM 6.344208 | |
chr1 ENSEMBL transcript 11872 14412 . + . transcript_id "ENST00000515242.2"; locus_id "chr1:11869-14412W"; gene_id "ENSG00000223972.4"; reads 0.000000; length 1653; RPKM 0.000000 | |
chr1 ENSEMBL transcript 11874 14409 . + . transcript_id "ENST00000518655.2"; locus_id "chr1:11869-14412W"; gene_id "ENSG00000223972.4"; reads 0.000000; length 1483; RPKM 0.000000 | |
chr1 ENSEMBL transcript 30366 30503 . + . transcript_id "ENST00000408384.1"; locus_id "chr1:29554-31109W"; gene_id "ENSG00000221311.1"; reads 0.000000; length 138; RPKM 0.000000 | |
chr1 HAVANA transcript 30267 31109 . + . transcript_id "ENST00000469289.1"; locus_id "chr1:29554-31109W"; gene_id "ENSG00000243485.1"; reads 0.000000; length 535; RPKM 0.000000 | |
chr1 HAVANA transcript 29554 31097 . + . transcript_id "ENST00000473358.1"; locus_id "chr1:29554-31109W"; gene_id "ENSG00000243485.1"; reads 56.000000; length 712; RPKM 3.563855 | |
chr1 HAVANA transcript 62948 63887 . + . transcript_id "ENST00000492842.1"; locus_id "chr1:62948-63887W"; gene_id "ENSG00000240361.1"; reads 0.000000; length 940; RPKM 0.000000 | |
chr1 HAVANA transcript 69091 70008 . + . transcript_id "ENST00000335137.3"; locus_id "chr1:69091-70008W"; gene_id "ENSG00000186092.4"; reads 36.000000; length 918; RPKM 1.776936 | |
chr1 HAVANA transcript 131104 133923 . + . transcript_id "ENST00000442987.2"; locus_id "chr1:131104-133923W"; gene_id "ENSG00000233750.2"; reads 522.000000; length 2820; RPKM 8.387516 | |
END | |
my $use = <<"END"; | |
simple fluxCapacitor.txt to tab reader | |
use: | |
$0 fluxCapacitor.gff > fluxCapacitor.table | |
this should be flexable and is tested with flux-capacitor version 1.2.2 + the gencode.v13.annotation_corrected_statuses.gff + ucsc hg19 | |
input should look like:' | |
$exampleGTF' | |
$credits | |
END | |
my $script; | |
################################################################################ | |
#no arg cathching | |
if (scalar(@ARGV)==0){ | |
print $use; | |
exit(0); | |
} | |
#file input##################################################### | |
my $file = shift(@ARGV); | |
#my $basename = $file | |
#$basename =~ s/.fluxCapacitor.txt//; | |
#check max amount of fields##################################################### | |
open(my $in,'<', $file)or die "$0: failed to open script:'$file'!!!!\n$!\n"; | |
my $fieldIndex = 0; | |
my %fields; | |
while(<$in>){ | |
my %gtfline = &readFlux($_); | |
for my $key (keys(%gtfline)){ | |
if( not($fields{$key}) ){ | |
$fields{$key} = $fieldIndex; | |
$fieldIndex++; | |
} | |
} | |
} | |
close $in or die "write error cannot close $file\n"; | |
my @sortedkeyFields = sort(keys(%fields)); | |
#reformat to tab################################################################ | |
open(my $in2,'<', $file)or die "$0: failed to open script:'$file'!!!!\n$!\n"; | |
my $first = 0; | |
for my $key (@sortedkeyFields){ | |
if($first == 0){ | |
if($key){ | |
print $key; | |
} | |
}else{ | |
if($key){ | |
print "\t".$key; | |
} | |
} | |
$first++; | |
} | |
print "\n"; | |
while(<$in2>){ | |
my %gtfline = &readFlux($_); | |
$first = 0; | |
for my $key (@sortedkeyFields){ | |
if($first == 0){ | |
if($gtfline{$key}){ | |
print $gtfline{$key}; | |
}else{ | |
print "\t"; | |
} | |
}else{ | |
if($gtfline{$key}){ | |
print "\t".$gtfline{$key}; | |
}else{ | |
print "\t"; | |
} | |
} | |
$first++; | |
} | |
print "\n"; | |
# exit(); | |
} | |
close $in2 or die "write error cannot close $file\n"; | |
################################################################################ | |
sub readFlux { | |
my $inputline = shift @_; | |
chomp $inputline; | |
my @tabdelim; | |
@tabdelim = split("\t",$inputline); | |
my %gtfParsed; | |
$gtfParsed{"CHROM"} = $tabdelim[0]; | |
$gtfParsed{"ANNOTATIONSCOURCE"} = $tabdelim[1]; | |
$gtfParsed{"ANNOTATIONTYPE"} = $tabdelim[2]; | |
$gtfParsed{"START"} = $tabdelim[3]; | |
$gtfParsed{"END"} = $tabdelim[4]; | |
$gtfParsed{"COL6"} = $tabdelim[5]; | |
$gtfParsed{"STRAND"} = $tabdelim[6]; | |
$gtfParsed{"COL8"} = $tabdelim[7]; | |
$gtfParsed{"INFO"} = $tabdelim[8]; | |
my @semicolomndelimINFO; | |
@semicolomndelimINFO = split(';',$gtfParsed{"INFO"}); | |
for my $element (@semicolomndelimINFO){ | |
my @spacedelim = split(' ',$element); | |
if($spacedelim[0] ne "" && $spacedelim[1] ne "" && not($spacedelim[2]) ){ | |
$gtfParsed{$spacedelim[0]} = $spacedelim[1]; | |
}elsif($spacedelim[0] eq "" && $spacedelim[1] ne "" && $spacedelim[2] ne "" ){ | |
$gtfParsed{$spacedelim[1]} = $spacedelim[2]; | |
}elsif($spacedelim[0] eq "" && $spacedelim[1] ne "" && $spacedelim[2] ne "" ){ | |
$gtfParsed{$spacedelim[1]} = $spacedelim[2]; | |
}else{ | |
die "incomplete datapairs in the INFO column!!!!on line: '$.'\non infotag:'".$gtfParsed{"INFO"}."'\n$!"; | |
} | |
} | |
return %gtfParsed; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment