mmterpstra · November 9, 2016 14:12
diff --git a/FluxToTab.pl b/FluxToTab.pl
 #!/usr/bin/perl
 use strict;
 use warnings;

 my $credits = "Written by Miente Martijn Terpstra from UMCG Department of Genetics subgroup oncogentics last modified [Thu Jul  4 16:35:29 CEST 2013]";


 my $exampleGTF = <<"END";
 chr1	HAVANA	transcript	12010	13670	.	+	.	transcript_id "ENST00000450305.2"; locus_id "chr1:11869-14412W"; gene_id "ENSG00000223972.4"; reads 0.000000; length 632; RPKM 0.000000
 chr1	HAVANA	transcript	11869	14409	.	+	.	transcript_id "ENST00000456328.2"; locus_id "chr1:11869-14412W"; gene_id "ENSG00000223972.4"; reads 232.000000; length 1657; RPKM 6.344208
 chr1	ENSEMBL	transcript	11872	14412	.	+	.	transcript_id "ENST00000515242.2"; locus_id "chr1:11869-14412W"; gene_id "ENSG00000223972.4"; reads 0.000000; length 1653; RPKM 0.000000
 chr1	ENSEMBL	transcript	11874	14409	.	+	.	transcript_id "ENST00000518655.2"; locus_id "chr1:11869-14412W"; gene_id "ENSG00000223972.4"; reads 0.000000; length 1483; RPKM 0.000000
 chr1	ENSEMBL	transcript	30366	30503	.	+	.	transcript_id "ENST00000408384.1"; locus_id "chr1:29554-31109W"; gene_id "ENSG00000221311.1"; reads 0.000000; length 138; RPKM 0.000000
 chr1	HAVANA	transcript	30267	31109	.	+	.	transcript_id "ENST00000469289.1"; locus_id "chr1:29554-31109W"; gene_id "ENSG00000243485.1"; reads 0.000000; length 535; RPKM 0.000000
 chr1	HAVANA	transcript	29554	31097	.	+	.	transcript_id "ENST00000473358.1"; locus_id "chr1:29554-31109W"; gene_id "ENSG00000243485.1"; reads 56.000000; length 712; RPKM 3.563855
 chr1	HAVANA	transcript	62948	63887	.	+	.	transcript_id "ENST00000492842.1"; locus_id "chr1:62948-63887W"; gene_id "ENSG00000240361.1"; reads 0.000000; length 940; RPKM 0.000000
 chr1	HAVANA	transcript	69091	70008	.	+	.	transcript_id "ENST00000335137.3"; locus_id "chr1:69091-70008W"; gene_id "ENSG00000186092.4"; reads 36.000000; length 918; RPKM 1.776936
 chr1	HAVANA	transcript	131104	133923	.	+	.	transcript_id "ENST00000442987.2"; locus_id "chr1:131104-133923W"; gene_id "ENSG00000233750.2"; reads 522.000000; length 2820; RPKM 8.387516
 END

 my $use = <<"END";
 	simple fluxCapacitor.txt to tab reader
 	use:
 	$0 fluxCapacitor.gff > fluxCapacitor.table
 	this should be flexable and is tested with flux-capacitor version 1.2.2 + the gencode.v13.annotation_corrected_statuses.gff + ucsc hg19
 	input should look like:'
 	$exampleGTF'
 	
 	$credits
 	
 END

 my $script;
 ################################################################################
 #no arg cathching
 if (scalar(@ARGV)==0){
 print $use;
 exit(0);
 }
 #file input#####################################################
 my $file = shift(@ARGV);
 #my $basename = $file
 #$basename =~ s/.fluxCapacitor.txt//;
 #check max amount of fields#####################################################
 open(my $in,'<', $file)or die "$0: failed to open script:'$file'!!!!\n$!\n";
 my $fieldIndex = 0;
 my %fields;
 while(<$in>){
 	my %gtfline = &readFlux($_);
 	for my $key (keys(%gtfline)){
 		if( not($fields{$key}) ){
 		 $fields{$key} = $fieldIndex;
 		 $fieldIndex++;
 		}
 	}	
 }
 close $in or die "write error cannot close $file\n";
 	my @sortedkeyFields = sort(keys(%fields));
 #reformat to tab################################################################



 open(my $in2,'<', $file)or die "$0: failed to open script:'$file'!!!!\n$!\n";
 my $first = 0;

 for my $key (@sortedkeyFields){
 		if($first == 0){
 			if($key){
 				print $key;
 			}
 		}else{
 			if($key){
 				print "\t".$key;
 			}
 		} 
 		$first++;
 }
 print "\n";

 while(<$in2>){
 	my %gtfline = &readFlux($_);
 	$first = 0;

 	for my $key (@sortedkeyFields){
 		if($first == 0){
 			if($gtfline{$key}){
 				print $gtfline{$key};
 			}else{
 				print "\t";
 			}
 		}else{
 			if($gtfline{$key}){
 				print "\t".$gtfline{$key};
 			}else{
 				print "\t";
 			}
 		} 
 		$first++;
 	}
 	print "\n";
 	
 #	exit();
 }
 close $in2 or die "write error cannot close $file\n";

 ################################################################################

 sub readFlux {
 	my $inputline = shift @_;
 	chomp $inputline;
 	
 	my @tabdelim;
 	@tabdelim = split("\t",$inputline);
 	
 	my %gtfParsed;
 	$gtfParsed{"CHROM"} = $tabdelim[0];
 	$gtfParsed{"ANNOTATIONSCOURCE"} = $tabdelim[1];
 	$gtfParsed{"ANNOTATIONTYPE"} = $tabdelim[2];
 	$gtfParsed{"START"} = $tabdelim[3];
 	$gtfParsed{"END"} = $tabdelim[4];
 	$gtfParsed{"COL6"} = $tabdelim[5];
 	$gtfParsed{"STRAND"} = $tabdelim[6];
 	$gtfParsed{"COL8"} = $tabdelim[7];
 	$gtfParsed{"INFO"} = $tabdelim[8];
 	
 	my @semicolomndelimINFO;
 	@semicolomndelimINFO = split(';',$gtfParsed{"INFO"});
 	
 	for my $element (@semicolomndelimINFO){
 		my @spacedelim = split(' ',$element);
 		if($spacedelim[0] ne "" && $spacedelim[1] ne "" && not($spacedelim[2]) ){
 			$gtfParsed{$spacedelim[0]} = $spacedelim[1];
 		}elsif($spacedelim[0] eq "" && $spacedelim[1] ne "" && $spacedelim[2] ne "" ){
 			$gtfParsed{$spacedelim[1]} = $spacedelim[2];
 		}elsif($spacedelim[0] eq "" && $spacedelim[1] ne "" && $spacedelim[2] ne "" ){
 			$gtfParsed{$spacedelim[1]} = $spacedelim[2];
 		}else{
 			die "incomplete datapairs in the INFO column!!!!on line: '$.'\non infotag:'".$gtfParsed{"INFO"}."'\n$!";
 		}	
 	}
 	return %gtfParsed;
 }
	#!/usr/bin/perl
	use strict;
	use warnings;

	my $credits = "Written by Miente Martijn Terpstra from UMCG Department of Genetics subgroup oncogentics last modified [Thu Jul 4 16:35:29 CEST 2013]";


	my $exampleGTF = <<"END";
	chr1 HAVANA transcript 12010 13670 . + . transcript_id "ENST00000450305.2"; locus_id "chr1:11869-14412W"; gene_id "ENSG00000223972.4"; reads 0.000000; length 632; RPKM 0.000000
	chr1 HAVANA transcript 11869 14409 . + . transcript_id "ENST00000456328.2"; locus_id "chr1:11869-14412W"; gene_id "ENSG00000223972.4"; reads 232.000000; length 1657; RPKM 6.344208
	chr1 ENSEMBL transcript 11872 14412 . + . transcript_id "ENST00000515242.2"; locus_id "chr1:11869-14412W"; gene_id "ENSG00000223972.4"; reads 0.000000; length 1653; RPKM 0.000000
	chr1 ENSEMBL transcript 11874 14409 . + . transcript_id "ENST00000518655.2"; locus_id "chr1:11869-14412W"; gene_id "ENSG00000223972.4"; reads 0.000000; length 1483; RPKM 0.000000
	chr1 ENSEMBL transcript 30366 30503 . + . transcript_id "ENST00000408384.1"; locus_id "chr1:29554-31109W"; gene_id "ENSG00000221311.1"; reads 0.000000; length 138; RPKM 0.000000
	chr1 HAVANA transcript 30267 31109 . + . transcript_id "ENST00000469289.1"; locus_id "chr1:29554-31109W"; gene_id "ENSG00000243485.1"; reads 0.000000; length 535; RPKM 0.000000
	chr1 HAVANA transcript 29554 31097 . + . transcript_id "ENST00000473358.1"; locus_id "chr1:29554-31109W"; gene_id "ENSG00000243485.1"; reads 56.000000; length 712; RPKM 3.563855
	chr1 HAVANA transcript 62948 63887 . + . transcript_id "ENST00000492842.1"; locus_id "chr1:62948-63887W"; gene_id "ENSG00000240361.1"; reads 0.000000; length 940; RPKM 0.000000
	chr1 HAVANA transcript 69091 70008 . + . transcript_id "ENST00000335137.3"; locus_id "chr1:69091-70008W"; gene_id "ENSG00000186092.4"; reads 36.000000; length 918; RPKM 1.776936
	chr1 HAVANA transcript 131104 133923 . + . transcript_id "ENST00000442987.2"; locus_id "chr1:131104-133923W"; gene_id "ENSG00000233750.2"; reads 522.000000; length 2820; RPKM 8.387516
	END

	my $use = <<"END";
	simple fluxCapacitor.txt to tab reader
	use:
	$0 fluxCapacitor.gff > fluxCapacitor.table
	this should be flexable and is tested with flux-capacitor version 1.2.2 + the gencode.v13.annotation_corrected_statuses.gff + ucsc hg19
	input should look like:'
	$exampleGTF'

	$credits

	END

	my $script;
	################################################################################
	#no arg cathching
	if (scalar(@ARGV)==0){
	print $use;
	exit(0);
	}
	#file input#####################################################
	my $file = shift(@ARGV);
	#my $basename = $file
	#$basename =~ s/.fluxCapacitor.txt//;
	#check max amount of fields#####################################################
	open(my $in,'<', $file)or die "$0: failed to open script:'$file'!!!!\n$!\n";
	my $fieldIndex = 0;
	my %fields;
	while(<$in>){
	my %gtfline = &readFlux($_);
	for my $key (keys(%gtfline)){
	if( not($fields{$key}) ){
	$fields{$key} = $fieldIndex;
	$fieldIndex++;
	}
	}
	}
	close $in or die "write error cannot close $file\n";
	my @sortedkeyFields = sort(keys(%fields));
	#reformat to tab################################################################



	open(my $in2,'<', $file)or die "$0: failed to open script:'$file'!!!!\n$!\n";
	my $first = 0;

	for my $key (@sortedkeyFields){
	if($first == 0){
	if($key){
	print $key;
	}
	}else{
	if($key){
	print "\t".$key;
	}
	}
	$first++;
	}
	print "\n";

	while(<$in2>){
	my %gtfline = &readFlux($_);
	$first = 0;

	for my $key (@sortedkeyFields){
	if($first == 0){
	if($gtfline{$key}){
	print $gtfline{$key};
	}else{
	print "\t";
	}
	}else{
	if($gtfline{$key}){
	print "\t".$gtfline{$key};
	}else{
	print "\t";
	}
	}
	$first++;
	}
	print "\n";

	# exit();
	}
	close $in2 or die "write error cannot close $file\n";

	################################################################################

	sub readFlux {
	my $inputline = shift @_;
	chomp $inputline;

	my @tabdelim;
	@tabdelim = split("\t",$inputline);

	my %gtfParsed;
	$gtfParsed{"CHROM"} = $tabdelim[0];
	$gtfParsed{"ANNOTATIONSCOURCE"} = $tabdelim[1];
	$gtfParsed{"ANNOTATIONTYPE"} = $tabdelim[2];
	$gtfParsed{"START"} = $tabdelim[3];
	$gtfParsed{"END"} = $tabdelim[4];
	$gtfParsed{"COL6"} = $tabdelim[5];
	$gtfParsed{"STRAND"} = $tabdelim[6];
	$gtfParsed{"COL8"} = $tabdelim[7];
	$gtfParsed{"INFO"} = $tabdelim[8];

	my @semicolomndelimINFO;
	@semicolomndelimINFO = split(';',$gtfParsed{"INFO"});

	for my $element (@semicolomndelimINFO){
	my @spacedelim = split(' ',$element);
	if($spacedelim[0] ne "" && $spacedelim[1] ne "" && not($spacedelim[2]) ){
	$gtfParsed{$spacedelim[0]} = $spacedelim[1];
	}elsif($spacedelim[0] eq "" && $spacedelim[1] ne "" && $spacedelim[2] ne "" ){
	$gtfParsed{$spacedelim[1]} = $spacedelim[2];
	}elsif($spacedelim[0] eq "" && $spacedelim[1] ne "" && $spacedelim[2] ne "" ){
	$gtfParsed{$spacedelim[1]} = $spacedelim[2];
	}else{
	die "incomplete datapairs in the INFO column!!!!on line: '$.'\non infotag:'".$gtfParsed{"INFO"}."'\n$!";
	}
	}
	return %gtfParsed;
	}