mmterpstra · November 24, 2016 15:00
diff --git a/IntersectGFF.pl b/IntersectGFF.pl
 #!/usr/bin/perl
 use strict;
 use warnings;
 use Data::Dumper;

 my $use = <<"END";
 	Simple extractor of the annotations between pre-defined regions in the gff format
 	input
 	perl $0 regions.gff ~/Downloads/Homo_sapiens.GRCh37.75.gtf > ~/regionsIntersectHomo_sapiens.GRCh37.75.gff
 	no validation of input....
 END

 my $TESoffset=1000;

 my $exampleGTF = <<"END";
 #examplefile
 1	HAVANA	transcript	12010	13670	.	+	.	transcript_id "ENST00000450305.2"; locus_id "chr1:11869-14412W"; gene_id "ENSG00000223972.4"; reads 0.000000; length 632; RPKM 0.000000
 1	HAVANA	transcript	11869	14409	.	+	.	transcript_id "ENST00000456328.2"; locus_id "chr1:11869-14412W"; gene_id "ENSG00000223972.4"; reads 232.000000; length 1657; RPKM 6.344208
 1	ENSEMBL	transcript	11872	14412	.	+	.	transcript_id "ENST00000515242.2"; locus_id "chr1:11869-14412W"; gene_id "ENSG00000223972.4"; reads 0.000000; length 1653; RPKM 0.000000
 1	ENSEMBL	transcript	11874	14409	.	+	.	transcript_id "ENST00000518655.2"; locus_id "chr1:11869-14412W"; gene_id "ENSG00000223972.4"; reads 0.000000; length 1483; RPKM 0.000000
 1	ENSEMBL	transcript	30366	30503	.	+	.	transcript_id "ENST00000408384.1"; locus_id "chr1:29554-31109W"; gene_id "ENSG00000221311.1"; reads 0.000000; length 138; RPKM 0.000000
 1	HAVANA	transcript	30267	31109	.	+	.	transcript_id "ENST00000469289.1"; locus_id "chr1:29554-31109W"; gene_id "ENSG00000243485.1"; reads 0.000000; length 535; RPKM 0.000000
 1	HAVANA	transcript	29554	31097	.	+	.	transcript_id "ENST00000473358.1"; locus_id "chr1:29554-31109W"; gene_id "ENSG00000243485.1"; reads 56.000000; length 712; RPKM 3.563855
 1	HAVANA	transcript	62948	63887	.	+	.	transcript_id "ENST00000492842.1"; locus_id "chr1:62948-63887W"; gene_id "ENSG00000240361.1"; reads 0.000000; length 940; RPKM 0.000000
 1	HAVANA	transcript	69091	70008	.	+	.	transcript_id "ENST00000335137.3"; locus_id "chr1:69091-70008W"; gene_id "ENSG00000186092.4"; reads 36.000000; length 918; RPKM 1.776936
 1	HAVANA	transcript	131104	133923	.	+	.	transcript_id "ENST00000442987.2"; locus_id "chr1:131104-133923W"; gene_id "ENSG00000233750.2"; reads 522.000000; length 2820; RPKM 8.387516
 #regions
 1    qpcr  transcript	12010   13670   .	.	.	target "geneENSG00000243485.1"
 END

 my $script;
 ################################################################################
 #no arg cathching
 if (scalar(@ARGV)==0){
 print $use;
 exit(0);
 }
 #file input#####################################################
 my $regionsGFF = shift(@ARGV);
 my $file = shift(@ARGV);
 #my $basename = $file
 #$basename =~ s/.fluxCapacitor.txt//;
 #check max amount of fields#####################################################
 warn " ## info: region info collection.\n";
 open(my $regionsIn,'<', $regionsGFF)or die "$0: failed to open file:'$regionsGFF'!!!!\n$!\n";
 #the chromsome optimised storage
 my $regions;
 while(<$regionsIn>){
 	next if (m/^#/);
        my $region = &readgtf($_);
 	push(@{$regions -> {$region -> {"CHROM"}}},$region);
 	
 }

 #warn Dumper($regions);

 close $regionsIn or die "write error cannot close $regionsGFF\n";

 warn " ## info: gene info collection.\n";

 open(my $genesIn,'<', $file)or die "$0: failed to open file:'$file'!!!!\n$!\n";
 my $transcriptExonLengths;

 while(<$genesIn>){
 	next if (m/^#/);
 	my $gtfline = &readgtf($_);
 	$gtfline -> {'exon_number'}=~s/"//g if(defined($gtfline -> {'exon_number'}) && $gtfline -> {'ANNOTATIONTYPE'} eq 'exon');
 	my $match;%{$match}=();
 	if(matches('gtf'=>$gtfline,'regions'=>$regions,'match'=>$match)){
 		#die "Match ".Dumper($match);
 		print writegtf(%{$match});
 	}
 	#warn Dumper(\%gtfline);
 }
 close $genesIn or die "write error cannot close $file\n";

 sub matches {
 	my $self;
 	%{$self} = @_;
 	my $regions = $self -> {'regions'};
 	my $gtfline = $self -> {'gtf'};
 	my $match   = $self -> {'match'};
 	#warn Dumper(\$regions,\$gtfline, \$match);
 	if(defined($gtfline -> {'CHROM'})&&defined($regions -> {$gtfline -> {'CHROM'}})){
 		for my $region (@{$regions -> {$gtfline -> {'CHROM'}}}){
 			#warn if($region -> {target} =~ /BRAF/);
 			if($region -> {'START'} <= $gtfline -> {'START'} && $region -> {'END'} >= $gtfline -> {'END'}){
 				%{$match} = %{$gtfline};
 				#warn Dumper($match, \$.);
 				return 1;#cannot get better match
 			}elsif($region -> {'START'} <= $gtfline -> {'START'} && $region -> {'END'} >= $gtfline -> {'START'}){
 				my $matchcur;%{$matchcur}=%{$gtfline};
 				#warn Dumper($matchcur, \$.);
                                $matchcur -> {'STARTORIGINAL'} = $matchcur -> {'START'};
                                $matchcur -> {'ENDORIGINAL'} = $matchcur -> {'END'};

 				$matchcur -> {'END'} = $region -> {'END'};
 				if(defined($match ->  {'START'})){
 					%{$match} = getlongestmatch($matchcur,$match); # warn Dumper($match, \$.);
 				}else{
 					%{$match} = %{$matchcur};	
 				}
 			}elsif($region -> {'START'} <= $gtfline -> {'END'} && $region -> {'END'} >= $gtfline -> {'END'}){
 				my $matchcur;%{$matchcur}=%{$gtfline};# warn Dumper($matchcur, \$.);
                                $matchcur -> {'STARTORIGINAL'} = $matchcur -> {'START'};
                                $matchcur -> {'ENDORIGINAL'} = $matchcur -> {'END'};

                                $matchcur -> {'START'} = $region -> {'START'};
                                if(defined($match ->  {'START'})){
                                        %{$match} = getlongestmatch($matchcur,$match);  #warn Dumper($match, \$.);
                                }else{
                                        %{$match} = %{$matchcur};
                                }
                        }elsif($region -> {'START'} <= $gtfline -> {'END'} && $region -> {'END'} >= $gtfline -> {'START'}){
 				my $matchcur;%{$matchcur}=%{$gtfline}; #warn Dumper($matchcur, \$.);
 				$matchcur -> {'STARTORIGINAL'} = $matchcur -> {'START'};
 				$matchcur -> {'ENDORIGINAL'} = $matchcur -> {'END'};

 				$matchcur -> {'START'} = $region -> {'START'};
 				$matchcur -> {'END'} = $region -> {'END'};
 				
 				if(defined($match ->  {'START'})){
                                        %{$match} = getlongestmatch($matchcur,$match);# warn Dumper($match, \$.);
                                }else{
                                        %{$match} = %{$matchcur};# warn Dumper($match, \$.);
                                }
 			}
 		}
 		#warn Dumper($match, \$.);
 		return 1 if(defined($match ->  {'START'}));
 		return undef;

 	}else{
 		return undef;
 	}
 }

 sub getlongestmatch {
 	my $gtf1	= shift	@_;
 	my $gtf2	= shift	@_;
 	warn getgtflength($gtf1).getgtflength($gtf2);
 	if(getgtflength($gtf1) >= getgtflength($gtf2)){
 		return(%{$gtf1});
 	}else{
 	        return(%{$gtf2});
 	}
 }

 sub getgtflength	{
 	my $gtf = shift @_;
 	return ($gtf -> {'END'} - $gtf -> {'START'} + 1);
 }
 sub readgtf {
 	my $inputline = shift @_;
 	return undef if(not(defined($inputline))); 
 	chomp $inputline;
 	
 	my @tabdelim;
 	@tabdelim = split("\t",$inputline);
 	
 	my %gtfParsed;
 	$gtfParsed{"CHROM"} = $tabdelim[0];
 	$gtfParsed{"ANNOTATIONSCOURCE"} = $tabdelim[1];
 	$gtfParsed{"ANNOTATIONTYPE"} = $tabdelim[2];
 	$gtfParsed{"START"} = $tabdelim[3];
 	$gtfParsed{"END"} = $tabdelim[4];
 	$gtfParsed{"COL6"} = $tabdelim[5];
 	$gtfParsed{"STRAND"} = $tabdelim[6];
 	$gtfParsed{"COL8"} = $tabdelim[7];
 	$gtfParsed{"INFO"} = $tabdelim[8];
 	
 	die "error start should always be smaller or eq to end".Dumper(\%gtfParsed)." " if($gtfParsed{"START"} > $gtfParsed{"END"});
 	
 	my @semicolomndelimINFO;
 	@semicolomndelimINFO = split(';',$gtfParsed{"INFO"});
 	
 	for my $element (@semicolomndelimINFO){
 		my @spacedelim = split(/ |=/,$element);
 		if($spacedelim[0] ne "" && $spacedelim[1] ne "" && not($spacedelim[2]) ){
 			$gtfParsed{$spacedelim[0]} = $spacedelim[1];
 		}elsif($spacedelim[0] eq "" && $spacedelim[1] ne "" && $spacedelim[2] ne "" ){
 			$gtfParsed{$spacedelim[1]} = $spacedelim[2];
 		}elsif($spacedelim[0] eq "" && $spacedelim[1] ne "" && $spacedelim[2] ne "" ){
 			$gtfParsed{$spacedelim[1]} = $spacedelim[2];
 		}else{
 			die "incomplete datapairs in the INFO column!!!!on line: '$.'\non infotag:'".$gtfParsed{"INFO"}."'\n$!";
 		}	
 	}
 	return \%gtfParsed;
 }
 sub writegtf {
 	my %gtfParsed = @_;
 	
 	my $info="";
 	
 	for my $key (keys(%gtfParsed)){
 		
 		next if($key eq "CHROM" || $key eq "ANNOTATIONSCOURCE" || $key eq "ANNOTATIONTYPE" || $key eq "START" || $key eq "END" || $key eq "COL6" || $key eq "STRAND" || $key eq "COL8" || $key eq "INFO");
 		if($info ne ""){
 			$info=$info.";";
 		}
 		$info=$info.$key.'='.$gtfParsed{$key};
 	}
 	
 	#warn $info;
 	return join("\t",($gtfParsed{"CHROM"},$gtfParsed{"ANNOTATIONSCOURCE"},$gtfParsed{"ANNOTATIONTYPE"},$gtfParsed{"START"},$gtfParsed{"END"},$gtfParsed{"COL6"},$gtfParsed{"STRAND"},$gtfParsed{"COL8"},$info))."\n";
 }
	#!/usr/bin/perl
	use strict;
	use warnings;
	use Data::Dumper;

	my $use = <<"END";
	Simple extractor of the annotations between pre-defined regions in the gff format
	input
	perl $0 regions.gff ~/Downloads/Homo_sapiens.GRCh37.75.gtf > ~/regionsIntersectHomo_sapiens.GRCh37.75.gff
	no validation of input....
	END

	my $TESoffset=1000;

	my $exampleGTF = <<"END";
	#examplefile
	1 HAVANA transcript 12010 13670 . + . transcript_id "ENST00000450305.2"; locus_id "chr1:11869-14412W"; gene_id "ENSG00000223972.4"; reads 0.000000; length 632; RPKM 0.000000
	1 HAVANA transcript 11869 14409 . + . transcript_id "ENST00000456328.2"; locus_id "chr1:11869-14412W"; gene_id "ENSG00000223972.4"; reads 232.000000; length 1657; RPKM 6.344208
	1 ENSEMBL transcript 11872 14412 . + . transcript_id "ENST00000515242.2"; locus_id "chr1:11869-14412W"; gene_id "ENSG00000223972.4"; reads 0.000000; length 1653; RPKM 0.000000
	1 ENSEMBL transcript 11874 14409 . + . transcript_id "ENST00000518655.2"; locus_id "chr1:11869-14412W"; gene_id "ENSG00000223972.4"; reads 0.000000; length 1483; RPKM 0.000000
	1 ENSEMBL transcript 30366 30503 . + . transcript_id "ENST00000408384.1"; locus_id "chr1:29554-31109W"; gene_id "ENSG00000221311.1"; reads 0.000000; length 138; RPKM 0.000000
	1 HAVANA transcript 30267 31109 . + . transcript_id "ENST00000469289.1"; locus_id "chr1:29554-31109W"; gene_id "ENSG00000243485.1"; reads 0.000000; length 535; RPKM 0.000000
	1 HAVANA transcript 29554 31097 . + . transcript_id "ENST00000473358.1"; locus_id "chr1:29554-31109W"; gene_id "ENSG00000243485.1"; reads 56.000000; length 712; RPKM 3.563855
	1 HAVANA transcript 62948 63887 . + . transcript_id "ENST00000492842.1"; locus_id "chr1:62948-63887W"; gene_id "ENSG00000240361.1"; reads 0.000000; length 940; RPKM 0.000000
	1 HAVANA transcript 69091 70008 . + . transcript_id "ENST00000335137.3"; locus_id "chr1:69091-70008W"; gene_id "ENSG00000186092.4"; reads 36.000000; length 918; RPKM 1.776936
	1 HAVANA transcript 131104 133923 . + . transcript_id "ENST00000442987.2"; locus_id "chr1:131104-133923W"; gene_id "ENSG00000233750.2"; reads 522.000000; length 2820; RPKM 8.387516
	#regions
	1 qpcr transcript 12010 13670 . . . target "geneENSG00000243485.1"
	END

	my $script;
	################################################################################
	#no arg cathching
	if (scalar(@ARGV)==0){
	print $use;
	exit(0);
	}
	#file input#####################################################
	my $regionsGFF = shift(@ARGV);
	my $file = shift(@ARGV);
	#my $basename = $file
	#$basename =~ s/.fluxCapacitor.txt//;
	#check max amount of fields#####################################################
	warn " ## info: region info collection.\n";
	open(my $regionsIn,'<', $regionsGFF)or die "$0: failed to open file:'$regionsGFF'!!!!\n$!\n";
	#the chromsome optimised storage
	my $regions;
	while(<$regionsIn>){
	next if (m/^#/);
	my $region = &readgtf($_);
	push(@{$regions -> {$region -> {"CHROM"}}},$region);

	}

	#warn Dumper($regions);

	close $regionsIn or die "write error cannot close $regionsGFF\n";

	warn " ## info: gene info collection.\n";

	open(my $genesIn,'<', $file)or die "$0: failed to open file:'$file'!!!!\n$!\n";
	my $transcriptExonLengths;

	while(<$genesIn>){
	next if (m/^#/);
	my $gtfline = &readgtf($_);
	$gtfline -> {'exon_number'}=~s/"//g if(defined($gtfline -> {'exon_number'}) && $gtfline -> {'ANNOTATIONTYPE'} eq 'exon');
	my $match;%{$match}=();
	if(matches('gtf'=>$gtfline,'regions'=>$regions,'match'=>$match)){
	#die "Match ".Dumper($match);
	print writegtf(%{$match});
	}
	#warn Dumper(\%gtfline);
	}
	close $genesIn or die "write error cannot close $file\n";

	sub matches {
	my $self;
	%{$self} = @_;
	my $regions = $self -> {'regions'};
	my $gtfline = $self -> {'gtf'};
	my $match = $self -> {'match'};
	#warn Dumper(\$regions,\$gtfline, \$match);
	if(defined($gtfline -> {'CHROM'})&&defined($regions -> {$gtfline -> {'CHROM'}})){
	for my $region (@{$regions -> {$gtfline -> {'CHROM'}}}){
	#warn if($region -> {target} =~ /BRAF/);
	if($region -> {'START'} <= $gtfline -> {'START'} && $region -> {'END'} >= $gtfline -> {'END'}){
	%{$match} = %{$gtfline};
	#warn Dumper($match, \$.);
	return 1;#cannot get better match
	}elsif($region -> {'START'} <= $gtfline -> {'START'} && $region -> {'END'} >= $gtfline -> {'START'}){
	my $matchcur;%{$matchcur}=%{$gtfline};
	#warn Dumper($matchcur, \$.);
	$matchcur -> {'STARTORIGINAL'} = $matchcur -> {'START'};
	$matchcur -> {'ENDORIGINAL'} = $matchcur -> {'END'};

	$matchcur -> {'END'} = $region -> {'END'};
	if(defined($match -> {'START'})){
	%{$match} = getlongestmatch($matchcur,$match); # warn Dumper($match, \$.);
	}else{
	%{$match} = %{$matchcur};
	}
	}elsif($region -> {'START'} <= $gtfline -> {'END'} && $region -> {'END'} >= $gtfline -> {'END'}){
	my $matchcur;%{$matchcur}=%{$gtfline};# warn Dumper($matchcur, \$.);
	$matchcur -> {'STARTORIGINAL'} = $matchcur -> {'START'};
	$matchcur -> {'ENDORIGINAL'} = $matchcur -> {'END'};

	$matchcur -> {'START'} = $region -> {'START'};
	if(defined($match -> {'START'})){
	%{$match} = getlongestmatch($matchcur,$match); #warn Dumper($match, \$.);
	}else{
	%{$match} = %{$matchcur};
	}
	}elsif($region -> {'START'} <= $gtfline -> {'END'} && $region -> {'END'} >= $gtfline -> {'START'}){
	my $matchcur;%{$matchcur}=%{$gtfline}; #warn Dumper($matchcur, \$.);
	$matchcur -> {'STARTORIGINAL'} = $matchcur -> {'START'};
	$matchcur -> {'ENDORIGINAL'} = $matchcur -> {'END'};

	$matchcur -> {'START'} = $region -> {'START'};
	$matchcur -> {'END'} = $region -> {'END'};

	if(defined($match -> {'START'})){
	%{$match} = getlongestmatch($matchcur,$match);# warn Dumper($match, \$.);
	}else{
	%{$match} = %{$matchcur};# warn Dumper($match, \$.);
	}
	}
	}
	#warn Dumper($match, \$.);
	return 1 if(defined($match -> {'START'}));
	return undef;

	}else{
	return undef;
	}
	}

	sub getlongestmatch {
	my $gtf1 = shift @_;
	my $gtf2 = shift @_;
	warn getgtflength($gtf1).getgtflength($gtf2);
	if(getgtflength($gtf1) >= getgtflength($gtf2)){
	return(%{$gtf1});
	}else{
	return(%{$gtf2});
	}
	}

	sub getgtflength {
	my $gtf = shift @_;
	return ($gtf -> {'END'} - $gtf -> {'START'} + 1);
	}
	sub readgtf {
	my $inputline = shift @_;
	return undef if(not(defined($inputline)));
	chomp $inputline;

	my @tabdelim;
	@tabdelim = split("\t",$inputline);

	my %gtfParsed;
	$gtfParsed{"CHROM"} = $tabdelim[0];
	$gtfParsed{"ANNOTATIONSCOURCE"} = $tabdelim[1];
	$gtfParsed{"ANNOTATIONTYPE"} = $tabdelim[2];
	$gtfParsed{"START"} = $tabdelim[3];
	$gtfParsed{"END"} = $tabdelim[4];
	$gtfParsed{"COL6"} = $tabdelim[5];
	$gtfParsed{"STRAND"} = $tabdelim[6];
	$gtfParsed{"COL8"} = $tabdelim[7];
	$gtfParsed{"INFO"} = $tabdelim[8];

	die "error start should always be smaller or eq to end".Dumper(\%gtfParsed)." " if($gtfParsed{"START"} > $gtfParsed{"END"});

	my @semicolomndelimINFO;
	@semicolomndelimINFO = split(';',$gtfParsed{"INFO"});

	for my $element (@semicolomndelimINFO){
	my @spacedelim = split(/ \|=/,$element);
	if($spacedelim[0] ne "" && $spacedelim[1] ne "" && not($spacedelim[2]) ){
	$gtfParsed{$spacedelim[0]} = $spacedelim[1];
	}elsif($spacedelim[0] eq "" && $spacedelim[1] ne "" && $spacedelim[2] ne "" ){
	$gtfParsed{$spacedelim[1]} = $spacedelim[2];
	}elsif($spacedelim[0] eq "" && $spacedelim[1] ne "" && $spacedelim[2] ne "" ){
	$gtfParsed{$spacedelim[1]} = $spacedelim[2];
	}else{
	die "incomplete datapairs in the INFO column!!!!on line: '$.'\non infotag:'".$gtfParsed{"INFO"}."'\n$!";
	}
	}
	return \%gtfParsed;
	}
	sub writegtf {
	my %gtfParsed = @_;

	my $info="";

	for my $key (keys(%gtfParsed)){

	next if($key eq "CHROM" \|\| $key eq "ANNOTATIONSCOURCE" \|\| $key eq "ANNOTATIONTYPE" \|\| $key eq "START" \|\| $key eq "END" \|\| $key eq "COL6" \|\| $key eq "STRAND" \|\| $key eq "COL8" \|\| $key eq "INFO");
	if($info ne ""){
	$info=$info.";";
	}
	$info=$info.$key.'='.$gtfParsed{$key};
	}

	#warn $info;
	return join("\t",($gtfParsed{"CHROM"},$gtfParsed{"ANNOTATIONSCOURCE"},$gtfParsed{"ANNOTATIONTYPE"},$gtfParsed{"START"},$gtfParsed{"END"},$gtfParsed{"COL6"},$gtfParsed{"STRAND"},$gtfParsed{"COL8"},$info))."\n";
	}