Created
October 22, 2020 13:16
-
-
Save telatin/63fd5e8607a066a073ff7242d267d959 to your computer and use it in GitHub Desktop.
If prokka renamed contigs for sumbission ready format, this script can take the original contig names and spit out a GFF using those. BED output also supported.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use 5.012; | |
use warnings; | |
use lib '/qib/platforms/Informatics/GMH/bin/perl5/lib/perl5/'; | |
use Getopt::Long; | |
use FASTX::Reader; | |
my $opt_contigs; | |
my $opt_gff; | |
my $opt_help; | |
my $opt_bed; | |
my $opt_keepfasta; | |
my $opt_key = "ID"; | |
GetOptions( | |
'c|contigs=s' => \$opt_contigs, | |
'g|gff=s' => \$opt_gff, | |
'b|bed=s' => \$opt_bed, | |
'k|keepfasta' => \$opt_keepfasta, | |
'h|help' => \$opt_help, | |
); | |
$opt_help && help(); | |
if (! $opt_contigs or ! $opt_gff) { | |
say STDERR "Missing parameters: -c CONTIGS -g GFF"; | |
exit; | |
} | |
if (! -e "$opt_contigs") { | |
say STDERR "Contigs not found at <$opt_contigs>."; | |
exit 1; | |
} | |
if (! -e "$opt_gff") { | |
say STDERR "GFF not found at <$opt_gff>."; | |
exit 1; | |
} | |
my $BED; | |
if ($opt_bed) { | |
open($BED, '>', "$opt_bed") || die "Unable to write BED to <$opt_bed>.\n"; | |
} | |
my $ctg_count = 0; | |
my @contig_names = (); | |
my %contig_sizes = (); | |
my %contig_rename = (); | |
# Open both files to check immediately for exceptions | |
my $READER = FASTX::Reader->new({ filename => "$opt_contigs" }); | |
open (my $GFF, '<', "$opt_gff") || die "FATAL ERROR:\nUnable to read GFF file at <$opt_gff>.\n"; | |
# Parse FASTA file | |
while (my $s = $READER->getRead() ) { | |
$ctg_count++; | |
print STDERR " * $ctg_count contigs parsed ($s->{name})...\r" if ($ctg_count % 5000 == 0); | |
push(@contig_names, $s->{name}); | |
$contig_sizes{$s->{name}} = length($s->{seq}); | |
} | |
say STDERR " * $ctg_count contigs parsed."; | |
my $gff_line = 0; | |
while (my $line = readline($GFF) ) { | |
chomp($line); | |
$gff_line++; | |
print STDERR " * $gff_line lines parsed (", substr($line, 1, 50),"...)\r" if ($gff_line % 10000 == 0); | |
my $new_name = ''; | |
if ($line =~/##sequence-region\s+(\S+)\s+1\s+(\d+)/) { | |
my $prokka_name = $1; | |
my $assembly_name = shift(@contig_names); | |
if ($contig_sizes{$assembly_name} != $2) { | |
die " FATAL ERROR: Contig $prokka_name is $2 bp long. Matched with contig $assembly_name that is $contig_sizes{$assembly_name} bp long.\n"; | |
exit 1; | |
} | |
$contig_rename{$prokka_name} = $assembly_name; | |
say "##sequence-region $assembly_name 1 $2"; | |
} elsif ($line =~/^#/) { | |
if ($line =~/##FASTA/ and ! $opt_keepfasta) { | |
say STDERR " * Skipping FASTA section..."; | |
exit; | |
} | |
say $line; | |
} elsif ($line =~/^>(.+)/) { | |
say ">", $contig_rename{$1}; | |
} else { | |
my @fields = split /\t/, $line; | |
if ($fields[6]) { | |
$fields[0] = $contig_rename{$fields[0]}; | |
say join("\t", @fields); | |
if ($opt_bed) { | |
my $start = $fields[3] - 1; | |
my $name = ""; | |
if ($fields[-1]=~/$opt_key=([^;]+)/) { | |
$name = $1 | |
} | |
say {$BED} $fields[0], "\t", $start, "\t", $fields[4], "\t", $name; | |
} | |
} else { | |
say $line; | |
} | |
} | |
} | |
sub help { | |
use File::Basename; | |
my $b = basename($0); | |
say STDERR " | |
Rename reference sequences in GFF from Prokka, using | |
the original contig names | |
Usage: | |
$b [-b OUT] -c CONTIGS -g GFF > NEW.gff | |
Parameters: | |
-c, --contigs CONTIGFILE Original contig files (required) | |
-g, --gff PROKKAGFF GFF annotation by Prokka (required) | |
-b, --bed OUTPUT Print a BED output file (optional) | |
-k, --keepfasta Include sequences in GFF output | |
"; | |
exit; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment