Skip to content

Instantly share code, notes, and snippets.

@cielavenir
Forked from zhoujj2013/gtf2refGene.pl
Created May 29, 2021 08:30
Show Gist options
  • Save cielavenir/66985e23a7b23794c6f97d72ba72901a to your computer and use it in GitHub Desktop.
Save cielavenir/66985e23a7b23794c6f97d72ba72901a to your computer and use it in GitHub Desktop.
Convert ensembl gtf file to UCSC refGene.txt file.
#!/usr/bin/perl -w
use strict;
use Data::Dumper;
&usage if @ARGV<1;
#open IN,"" ||die "Can't open the file:$\n";
#open OUT,"" ||die "Can't open the file:$\n";
sub usage {
my $usage = << "USAGE";
Convert ensembl gtf file to UCSC refGene.txt file.
Author: zhoujj2013\@gmail.com
Usage: $0 <ensembl gtf file> > <output file>
Example:perl $0 genes_chr_novo.gtf > genes_chr_novo.refGene.bed
USAGE
print "$usage";
exit(1);
};
my $gtf_f = shift;
my $gtf = read_gtf($gtf_f);
foreach my $transid (keys %{$gtf}){
# deal with exon information
my @exon_sorted = sort {$a->[1] <=> $b->[1]} @{$gtf->{$transid}{'exon'}};
# deal with cds information
if(exists $gtf->{$transid}{'CDS'}){
my @cds_sorted = sort {$a->[1] <=> $b->[1]} @{$gtf->{$transid}{'CDS'}};
}
# deal with stop and start codon informaton
my ($thickStart,$thickEnd) = (0, 0);
if(exists $gtf->{$transid}{'stop_codon'} && exists $gtf->{$transid}{'start_codon'}){
my @arr = ($gtf->{$transid}{'stop_codon'}[1],$gtf->{$transid}{'stop_codon'}->[2],$gtf->{$transid}{'start_codon'}->[1],$gtf->{$transid}{'start_codon'}->[2]);
my @arr_sorted = sort{$a <=> $b} @arr;
$thickStart = $arr_sorted[0];
$thickEnd = $arr_sorted[-1];
}elsif(exists $gtf->{$transid}{'stop_codon'} && !(exists $gtf->{$transid}{'start_codon'})){
$thickStart = $exon_sorted[0][1];
$thickEnd = $gtf->{$transid}{'stop_codon'}[2];
}elsif(!(exists $gtf->{$transid}{'stop_codon'}) && exists $gtf->{$transid}{'start_codon'}){
$thickStart = $gtf->{$transid}{'start_codon'}[1];
$thickEnd = $exon_sorted[-1][2];
}else{
$thickStart = $exon_sorted[-1][2];
$thickEnd = $exon_sorted[-1][2];
}
# deal with coding blocks
my @exon_len;
my @start_arr;
my $s = $exon_sorted[0][1];
foreach my $e (@exon_sorted){
my $start = $e->[1] - $s;
push @start_arr,$start;
my $len = $e->[2] - $e->[1] + 1;
push @exon_len,$len;
}
# print out the result
my $exon_num = scalar(@exon_sorted);
my $exon_len = join ",",@exon_len;
my $start_arr = join ",",@start_arr;
print "$exon_sorted[0][0]\t$exon_sorted[0][1]\t$exon_sorted[-1][2]\t$transid\t0\t$exon_sorted[0][3]\t$thickStart\t$thickEnd\t0\t$exon_num\t$exon_len,\t$start_arr,\n";
}
sub read_gtf{
my ($f) = @_;
my %gtf;
open IN,"$f" || die $!;
while(<IN>){
chomp;
my @t = split /\t/;
my $transid = $1 if($t[8] =~ /transcript_id "([^"]+)";/);
# store information in hash
($t[3], $t[4]) = ($t[4], $t[3]) if($t[3] > $t[4]);
if($t[2] =~ /exon/){
push @{$gtf{$transid}{'exon'}},[$t[0], $t[3], $t[4], $t[6]];
}
if($t[2] =~ /CDS/){
push @{$gtf{$transid}{'CDS'}},[$t[0], $t[3], $t[4], $t[6]];
}
if($t[2] =~ /start_codon/){
$gtf{$transid}{'start_codon'} = [$t[0], $t[3], $t[4], $t[6]];
}
if($t[2] =~ /stop_codon/){
$gtf{$transid}{'stop_codon'} = [$t[0], $t[3], $t[4], $t[6]];
}
}
close IN;
# return hash reference
return \%gtf;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment