Last active
November 24, 2016 15:00
-
-
Save mmterpstra/4adc4babaf556b2d95d8faf7dfa15bdd to your computer and use it in GitHub Desktop.
intersect gtf/gff files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use strict; | |
use warnings; | |
use Data::Dumper; | |
my $use = <<"END"; | |
Simple extractor of the annotations between pre-defined regions in the gff format | |
input | |
perl $0 regions.gff ~/Downloads/Homo_sapiens.GRCh37.75.gtf > ~/regionsIntersectHomo_sapiens.GRCh37.75.gff | |
no validation of input.... | |
END | |
my $TESoffset=1000; | |
my $exampleGTF = <<"END"; | |
#examplefile | |
1 HAVANA transcript 12010 13670 . + . transcript_id "ENST00000450305.2"; locus_id "chr1:11869-14412W"; gene_id "ENSG00000223972.4"; reads 0.000000; length 632; RPKM 0.000000 | |
1 HAVANA transcript 11869 14409 . + . transcript_id "ENST00000456328.2"; locus_id "chr1:11869-14412W"; gene_id "ENSG00000223972.4"; reads 232.000000; length 1657; RPKM 6.344208 | |
1 ENSEMBL transcript 11872 14412 . + . transcript_id "ENST00000515242.2"; locus_id "chr1:11869-14412W"; gene_id "ENSG00000223972.4"; reads 0.000000; length 1653; RPKM 0.000000 | |
1 ENSEMBL transcript 11874 14409 . + . transcript_id "ENST00000518655.2"; locus_id "chr1:11869-14412W"; gene_id "ENSG00000223972.4"; reads 0.000000; length 1483; RPKM 0.000000 | |
1 ENSEMBL transcript 30366 30503 . + . transcript_id "ENST00000408384.1"; locus_id "chr1:29554-31109W"; gene_id "ENSG00000221311.1"; reads 0.000000; length 138; RPKM 0.000000 | |
1 HAVANA transcript 30267 31109 . + . transcript_id "ENST00000469289.1"; locus_id "chr1:29554-31109W"; gene_id "ENSG00000243485.1"; reads 0.000000; length 535; RPKM 0.000000 | |
1 HAVANA transcript 29554 31097 . + . transcript_id "ENST00000473358.1"; locus_id "chr1:29554-31109W"; gene_id "ENSG00000243485.1"; reads 56.000000; length 712; RPKM 3.563855 | |
1 HAVANA transcript 62948 63887 . + . transcript_id "ENST00000492842.1"; locus_id "chr1:62948-63887W"; gene_id "ENSG00000240361.1"; reads 0.000000; length 940; RPKM 0.000000 | |
1 HAVANA transcript 69091 70008 . + . transcript_id "ENST00000335137.3"; locus_id "chr1:69091-70008W"; gene_id "ENSG00000186092.4"; reads 36.000000; length 918; RPKM 1.776936 | |
1 HAVANA transcript 131104 133923 . + . transcript_id "ENST00000442987.2"; locus_id "chr1:131104-133923W"; gene_id "ENSG00000233750.2"; reads 522.000000; length 2820; RPKM 8.387516 | |
#regions | |
1 qpcr transcript 12010 13670 . . . target "geneENSG00000243485.1" | |
END | |
my $script; | |
################################################################################ | |
#no arg cathching | |
if (scalar(@ARGV)==0){ | |
print $use; | |
exit(0); | |
} | |
#file input##################################################### | |
my $regionsGFF = shift(@ARGV); | |
my $file = shift(@ARGV); | |
#my $basename = $file | |
#$basename =~ s/.fluxCapacitor.txt//; | |
#check max amount of fields##################################################### | |
warn " ## info: region info collection.\n"; | |
open(my $regionsIn,'<', $regionsGFF)or die "$0: failed to open file:'$regionsGFF'!!!!\n$!\n"; | |
#the chromsome optimised storage | |
my $regions; | |
while(<$regionsIn>){ | |
next if (m/^#/); | |
my $region = &readgtf($_); | |
push(@{$regions -> {$region -> {"CHROM"}}},$region); | |
} | |
#warn Dumper($regions); | |
close $regionsIn or die "write error cannot close $regionsGFF\n"; | |
warn " ## info: gene info collection.\n"; | |
open(my $genesIn,'<', $file)or die "$0: failed to open file:'$file'!!!!\n$!\n"; | |
my $transcriptExonLengths; | |
while(<$genesIn>){ | |
next if (m/^#/); | |
my $gtfline = &readgtf($_); | |
$gtfline -> {'exon_number'}=~s/"//g if(defined($gtfline -> {'exon_number'}) && $gtfline -> {'ANNOTATIONTYPE'} eq 'exon'); | |
my $match;%{$match}=(); | |
if(matches('gtf'=>$gtfline,'regions'=>$regions,'match'=>$match)){ | |
#die "Match ".Dumper($match); | |
print writegtf(%{$match}); | |
} | |
#warn Dumper(\%gtfline); | |
} | |
close $genesIn or die "write error cannot close $file\n"; | |
sub matches { | |
my $self; | |
%{$self} = @_; | |
my $regions = $self -> {'regions'}; | |
my $gtfline = $self -> {'gtf'}; | |
my $match = $self -> {'match'}; | |
#warn Dumper(\$regions,\$gtfline, \$match); | |
if(defined($gtfline -> {'CHROM'})&&defined($regions -> {$gtfline -> {'CHROM'}})){ | |
for my $region (@{$regions -> {$gtfline -> {'CHROM'}}}){ | |
#warn if($region -> {target} =~ /BRAF/); | |
if($region -> {'START'} <= $gtfline -> {'START'} && $region -> {'END'} >= $gtfline -> {'END'}){ | |
%{$match} = %{$gtfline}; | |
#warn Dumper($match, \$.); | |
return 1;#cannot get better match | |
}elsif($region -> {'START'} <= $gtfline -> {'START'} && $region -> {'END'} >= $gtfline -> {'START'}){ | |
my $matchcur;%{$matchcur}=%{$gtfline}; | |
#warn Dumper($matchcur, \$.); | |
$matchcur -> {'STARTORIGINAL'} = $matchcur -> {'START'}; | |
$matchcur -> {'ENDORIGINAL'} = $matchcur -> {'END'}; | |
$matchcur -> {'END'} = $region -> {'END'}; | |
if(defined($match -> {'START'})){ | |
%{$match} = getlongestmatch($matchcur,$match); # warn Dumper($match, \$.); | |
}else{ | |
%{$match} = %{$matchcur}; | |
} | |
}elsif($region -> {'START'} <= $gtfline -> {'END'} && $region -> {'END'} >= $gtfline -> {'END'}){ | |
my $matchcur;%{$matchcur}=%{$gtfline};# warn Dumper($matchcur, \$.); | |
$matchcur -> {'STARTORIGINAL'} = $matchcur -> {'START'}; | |
$matchcur -> {'ENDORIGINAL'} = $matchcur -> {'END'}; | |
$matchcur -> {'START'} = $region -> {'START'}; | |
if(defined($match -> {'START'})){ | |
%{$match} = getlongestmatch($matchcur,$match); #warn Dumper($match, \$.); | |
}else{ | |
%{$match} = %{$matchcur}; | |
} | |
}elsif($region -> {'START'} <= $gtfline -> {'END'} && $region -> {'END'} >= $gtfline -> {'START'}){ | |
my $matchcur;%{$matchcur}=%{$gtfline}; #warn Dumper($matchcur, \$.); | |
$matchcur -> {'STARTORIGINAL'} = $matchcur -> {'START'}; | |
$matchcur -> {'ENDORIGINAL'} = $matchcur -> {'END'}; | |
$matchcur -> {'START'} = $region -> {'START'}; | |
$matchcur -> {'END'} = $region -> {'END'}; | |
if(defined($match -> {'START'})){ | |
%{$match} = getlongestmatch($matchcur,$match);# warn Dumper($match, \$.); | |
}else{ | |
%{$match} = %{$matchcur};# warn Dumper($match, \$.); | |
} | |
} | |
} | |
#warn Dumper($match, \$.); | |
return 1 if(defined($match -> {'START'})); | |
return undef; | |
}else{ | |
return undef; | |
} | |
} | |
sub getlongestmatch { | |
my $gtf1 = shift @_; | |
my $gtf2 = shift @_; | |
warn getgtflength($gtf1).getgtflength($gtf2); | |
if(getgtflength($gtf1) >= getgtflength($gtf2)){ | |
return(%{$gtf1}); | |
}else{ | |
return(%{$gtf2}); | |
} | |
} | |
sub getgtflength { | |
my $gtf = shift @_; | |
return ($gtf -> {'END'} - $gtf -> {'START'} + 1); | |
} | |
sub readgtf { | |
my $inputline = shift @_; | |
return undef if(not(defined($inputline))); | |
chomp $inputline; | |
my @tabdelim; | |
@tabdelim = split("\t",$inputline); | |
my %gtfParsed; | |
$gtfParsed{"CHROM"} = $tabdelim[0]; | |
$gtfParsed{"ANNOTATIONSCOURCE"} = $tabdelim[1]; | |
$gtfParsed{"ANNOTATIONTYPE"} = $tabdelim[2]; | |
$gtfParsed{"START"} = $tabdelim[3]; | |
$gtfParsed{"END"} = $tabdelim[4]; | |
$gtfParsed{"COL6"} = $tabdelim[5]; | |
$gtfParsed{"STRAND"} = $tabdelim[6]; | |
$gtfParsed{"COL8"} = $tabdelim[7]; | |
$gtfParsed{"INFO"} = $tabdelim[8]; | |
die "error start should always be smaller or eq to end".Dumper(\%gtfParsed)." " if($gtfParsed{"START"} > $gtfParsed{"END"}); | |
my @semicolomndelimINFO; | |
@semicolomndelimINFO = split(';',$gtfParsed{"INFO"}); | |
for my $element (@semicolomndelimINFO){ | |
my @spacedelim = split(/ |=/,$element); | |
if($spacedelim[0] ne "" && $spacedelim[1] ne "" && not($spacedelim[2]) ){ | |
$gtfParsed{$spacedelim[0]} = $spacedelim[1]; | |
}elsif($spacedelim[0] eq "" && $spacedelim[1] ne "" && $spacedelim[2] ne "" ){ | |
$gtfParsed{$spacedelim[1]} = $spacedelim[2]; | |
}elsif($spacedelim[0] eq "" && $spacedelim[1] ne "" && $spacedelim[2] ne "" ){ | |
$gtfParsed{$spacedelim[1]} = $spacedelim[2]; | |
}else{ | |
die "incomplete datapairs in the INFO column!!!!on line: '$.'\non infotag:'".$gtfParsed{"INFO"}."'\n$!"; | |
} | |
} | |
return \%gtfParsed; | |
} | |
sub writegtf { | |
my %gtfParsed = @_; | |
my $info=""; | |
for my $key (keys(%gtfParsed)){ | |
next if($key eq "CHROM" || $key eq "ANNOTATIONSCOURCE" || $key eq "ANNOTATIONTYPE" || $key eq "START" || $key eq "END" || $key eq "COL6" || $key eq "STRAND" || $key eq "COL8" || $key eq "INFO"); | |
if($info ne ""){ | |
$info=$info.";"; | |
} | |
$info=$info.$key.'='.$gtfParsed{$key}; | |
} | |
#warn $info; | |
return join("\t",($gtfParsed{"CHROM"},$gtfParsed{"ANNOTATIONSCOURCE"},$gtfParsed{"ANNOTATIONTYPE"},$gtfParsed{"START"},$gtfParsed{"END"},$gtfParsed{"COL6"},$gtfParsed{"STRAND"},$gtfParsed{"COL8"},$info))."\n"; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment