Skip to content

Instantly share code, notes, and snippets.

@telatin
Created October 22, 2020 13:16
Show Gist options
  • Save telatin/63fd5e8607a066a073ff7242d267d959 to your computer and use it in GitHub Desktop.
Save telatin/63fd5e8607a066a073ff7242d267d959 to your computer and use it in GitHub Desktop.
If prokka renamed contigs for sumbission ready format, this script can take the original contig names and spit out a GFF using those. BED output also supported.
#!/usr/bin/env perl
use 5.012;
use warnings;
use lib '/qib/platforms/Informatics/GMH/bin/perl5/lib/perl5/';
use Getopt::Long;
use FASTX::Reader;
my $opt_contigs;
my $opt_gff;
my $opt_help;
my $opt_bed;
my $opt_keepfasta;
my $opt_key = "ID";
GetOptions(
'c|contigs=s' => \$opt_contigs,
'g|gff=s' => \$opt_gff,
'b|bed=s' => \$opt_bed,
'k|keepfasta' => \$opt_keepfasta,
'h|help' => \$opt_help,
);
$opt_help && help();
if (! $opt_contigs or ! $opt_gff) {
say STDERR "Missing parameters: -c CONTIGS -g GFF";
exit;
}
if (! -e "$opt_contigs") {
say STDERR "Contigs not found at <$opt_contigs>.";
exit 1;
}
if (! -e "$opt_gff") {
say STDERR "GFF not found at <$opt_gff>.";
exit 1;
}
my $BED;
if ($opt_bed) {
open($BED, '>', "$opt_bed") || die "Unable to write BED to <$opt_bed>.\n";
}
my $ctg_count = 0;
my @contig_names = ();
my %contig_sizes = ();
my %contig_rename = ();
# Open both files to check immediately for exceptions
my $READER = FASTX::Reader->new({ filename => "$opt_contigs" });
open (my $GFF, '<', "$opt_gff") || die "FATAL ERROR:\nUnable to read GFF file at <$opt_gff>.\n";
# Parse FASTA file
while (my $s = $READER->getRead() ) {
$ctg_count++;
print STDERR " * $ctg_count contigs parsed ($s->{name})...\r" if ($ctg_count % 5000 == 0);
push(@contig_names, $s->{name});
$contig_sizes{$s->{name}} = length($s->{seq});
}
say STDERR " * $ctg_count contigs parsed.";
my $gff_line = 0;
while (my $line = readline($GFF) ) {
chomp($line);
$gff_line++;
print STDERR " * $gff_line lines parsed (", substr($line, 1, 50),"...)\r" if ($gff_line % 10000 == 0);
my $new_name = '';
if ($line =~/##sequence-region\s+(\S+)\s+1\s+(\d+)/) {
my $prokka_name = $1;
my $assembly_name = shift(@contig_names);
if ($contig_sizes{$assembly_name} != $2) {
die " FATAL ERROR: Contig $prokka_name is $2 bp long. Matched with contig $assembly_name that is $contig_sizes{$assembly_name} bp long.\n";
exit 1;
}
$contig_rename{$prokka_name} = $assembly_name;
say "##sequence-region $assembly_name 1 $2";
} elsif ($line =~/^#/) {
if ($line =~/##FASTA/ and ! $opt_keepfasta) {
say STDERR " * Skipping FASTA section...";
exit;
}
say $line;
} elsif ($line =~/^>(.+)/) {
say ">", $contig_rename{$1};
} else {
my @fields = split /\t/, $line;
if ($fields[6]) {
$fields[0] = $contig_rename{$fields[0]};
say join("\t", @fields);
if ($opt_bed) {
my $start = $fields[3] - 1;
my $name = "";
if ($fields[-1]=~/$opt_key=([^;]+)/) {
$name = $1
}
say {$BED} $fields[0], "\t", $start, "\t", $fields[4], "\t", $name;
}
} else {
say $line;
}
}
}
sub help {
use File::Basename;
my $b = basename($0);
say STDERR "
Rename reference sequences in GFF from Prokka, using
the original contig names
Usage:
$b [-b OUT] -c CONTIGS -g GFF > NEW.gff
Parameters:
-c, --contigs CONTIGFILE Original contig files (required)
-g, --gff PROKKAGFF GFF annotation by Prokka (required)
-b, --bed OUTPUT Print a BED output file (optional)
-k, --keepfasta Include sequences in GFF output
";
exit;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment