Skip to content

Instantly share code, notes, and snippets.

@mmterpstra
Created November 9, 2016 14:12
Show Gist options
  • Save mmterpstra/c592527d41fb5a956b196384ad4695ec to your computer and use it in GitHub Desktop.
Save mmterpstra/c592527d41fb5a956b196384ad4695ec to your computer and use it in GitHub Desktop.
#!/usr/bin/perl
use strict;
use warnings;
my $credits = "Written by Miente Martijn Terpstra from UMCG Department of Genetics subgroup oncogentics last modified [Thu Jul 4 16:35:29 CEST 2013]";
my $exampleGTF = <<"END";
chr1 HAVANA transcript 12010 13670 . + . transcript_id "ENST00000450305.2"; locus_id "chr1:11869-14412W"; gene_id "ENSG00000223972.4"; reads 0.000000; length 632; RPKM 0.000000
chr1 HAVANA transcript 11869 14409 . + . transcript_id "ENST00000456328.2"; locus_id "chr1:11869-14412W"; gene_id "ENSG00000223972.4"; reads 232.000000; length 1657; RPKM 6.344208
chr1 ENSEMBL transcript 11872 14412 . + . transcript_id "ENST00000515242.2"; locus_id "chr1:11869-14412W"; gene_id "ENSG00000223972.4"; reads 0.000000; length 1653; RPKM 0.000000
chr1 ENSEMBL transcript 11874 14409 . + . transcript_id "ENST00000518655.2"; locus_id "chr1:11869-14412W"; gene_id "ENSG00000223972.4"; reads 0.000000; length 1483; RPKM 0.000000
chr1 ENSEMBL transcript 30366 30503 . + . transcript_id "ENST00000408384.1"; locus_id "chr1:29554-31109W"; gene_id "ENSG00000221311.1"; reads 0.000000; length 138; RPKM 0.000000
chr1 HAVANA transcript 30267 31109 . + . transcript_id "ENST00000469289.1"; locus_id "chr1:29554-31109W"; gene_id "ENSG00000243485.1"; reads 0.000000; length 535; RPKM 0.000000
chr1 HAVANA transcript 29554 31097 . + . transcript_id "ENST00000473358.1"; locus_id "chr1:29554-31109W"; gene_id "ENSG00000243485.1"; reads 56.000000; length 712; RPKM 3.563855
chr1 HAVANA transcript 62948 63887 . + . transcript_id "ENST00000492842.1"; locus_id "chr1:62948-63887W"; gene_id "ENSG00000240361.1"; reads 0.000000; length 940; RPKM 0.000000
chr1 HAVANA transcript 69091 70008 . + . transcript_id "ENST00000335137.3"; locus_id "chr1:69091-70008W"; gene_id "ENSG00000186092.4"; reads 36.000000; length 918; RPKM 1.776936
chr1 HAVANA transcript 131104 133923 . + . transcript_id "ENST00000442987.2"; locus_id "chr1:131104-133923W"; gene_id "ENSG00000233750.2"; reads 522.000000; length 2820; RPKM 8.387516
END
my $use = <<"END";
simple fluxCapacitor.txt to tab reader
use:
$0 fluxCapacitor.gff > fluxCapacitor.table
this should be flexable and is tested with flux-capacitor version 1.2.2 + the gencode.v13.annotation_corrected_statuses.gff + ucsc hg19
input should look like:'
$exampleGTF'
$credits
END
my $script;
################################################################################
#no arg cathching
if (scalar(@ARGV)==0){
print $use;
exit(0);
}
#file input#####################################################
my $file = shift(@ARGV);
#my $basename = $file
#$basename =~ s/.fluxCapacitor.txt//;
#check max amount of fields#####################################################
open(my $in,'<', $file)or die "$0: failed to open script:'$file'!!!!\n$!\n";
my $fieldIndex = 0;
my %fields;
while(<$in>){
my %gtfline = &readFlux($_);
for my $key (keys(%gtfline)){
if( not($fields{$key}) ){
$fields{$key} = $fieldIndex;
$fieldIndex++;
}
}
}
close $in or die "write error cannot close $file\n";
my @sortedkeyFields = sort(keys(%fields));
#reformat to tab################################################################
open(my $in2,'<', $file)or die "$0: failed to open script:'$file'!!!!\n$!\n";
my $first = 0;
for my $key (@sortedkeyFields){
if($first == 0){
if($key){
print $key;
}
}else{
if($key){
print "\t".$key;
}
}
$first++;
}
print "\n";
while(<$in2>){
my %gtfline = &readFlux($_);
$first = 0;
for my $key (@sortedkeyFields){
if($first == 0){
if($gtfline{$key}){
print $gtfline{$key};
}else{
print "\t";
}
}else{
if($gtfline{$key}){
print "\t".$gtfline{$key};
}else{
print "\t";
}
}
$first++;
}
print "\n";
# exit();
}
close $in2 or die "write error cannot close $file\n";
################################################################################
sub readFlux {
my $inputline = shift @_;
chomp $inputline;
my @tabdelim;
@tabdelim = split("\t",$inputline);
my %gtfParsed;
$gtfParsed{"CHROM"} = $tabdelim[0];
$gtfParsed{"ANNOTATIONSCOURCE"} = $tabdelim[1];
$gtfParsed{"ANNOTATIONTYPE"} = $tabdelim[2];
$gtfParsed{"START"} = $tabdelim[3];
$gtfParsed{"END"} = $tabdelim[4];
$gtfParsed{"COL6"} = $tabdelim[5];
$gtfParsed{"STRAND"} = $tabdelim[6];
$gtfParsed{"COL8"} = $tabdelim[7];
$gtfParsed{"INFO"} = $tabdelim[8];
my @semicolomndelimINFO;
@semicolomndelimINFO = split(';',$gtfParsed{"INFO"});
for my $element (@semicolomndelimINFO){
my @spacedelim = split(' ',$element);
if($spacedelim[0] ne "" && $spacedelim[1] ne "" && not($spacedelim[2]) ){
$gtfParsed{$spacedelim[0]} = $spacedelim[1];
}elsif($spacedelim[0] eq "" && $spacedelim[1] ne "" && $spacedelim[2] ne "" ){
$gtfParsed{$spacedelim[1]} = $spacedelim[2];
}elsif($spacedelim[0] eq "" && $spacedelim[1] ne "" && $spacedelim[2] ne "" ){
$gtfParsed{$spacedelim[1]} = $spacedelim[2];
}else{
die "incomplete datapairs in the INFO column!!!!on line: '$.'\non infotag:'".$gtfParsed{"INFO"}."'\n$!";
}
}
return %gtfParsed;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment