Created
April 27, 2023 10:28
-
-
Save pmenzel/28c880a973340a18bd0d09f3fd029a96 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
# | |
# 2019, Peter Menzel, Labor Berlin | |
# This script converts the Kraken output format into | |
# a 3-column output format with columns: | |
# 1: read name | |
# 2: taxon id | |
# 3: score, which is defined as the fraction of k-mers having that taxon id and its ancestors over all k-mers | |
# | |
# use as input for ktImportTaxonomy | |
# | |
# the program needs to know the k-mer size that the kraken database uses, can be specified with option -k | |
# | |
use strict; | |
use warnings; | |
use Getopt::Std; | |
my $kmerlen = 35; | |
my %options=(); | |
getopts("t:k:", \%options); | |
exists($options{t}) && defined($options{t}) or die "Specify path to taxonomy folder containing nodes.dmp and merged.dmp via option -t\n"; | |
my $taxdir = $options{t}; | |
$taxdir =~ s/nodes\.dmp//; # just in case the path to nodes.dmp is passed | |
$taxdir .= '/' unless $taxdir =~ m,/$,; | |
if(exists($options{k}) && defined($options{k})) { | |
$kmerlen = $options{k}; | |
} | |
my $nodesdmp = $taxdir.'nodes.dmp'; | |
if(!defined $ARGV[0]) { die "Usage: kraken-score.pl -t /path/to/taxonomy /path/to/kraken-output-file.tsv\n"; } | |
my %nodes; | |
open(NODES,$nodesdmp) or die "Cannot open $nodesdmp\n"; | |
while(<NODES>) { | |
my @F = split(/\|/,$_); | |
if(@F > 3) { | |
my $id = -+- $F[0]; | |
my $parentid = -+- $F[1]; | |
$nodes{$id} = $parentid; | |
} | |
} | |
close(NODES); | |
while(<>) { | |
chomp; | |
next unless length; #skip empty lines | |
my @F = split(/\t/,$_); | |
if(@F != 5) { die "Error: Not exactly 5 columns in line $_\n"; }; | |
if($F[0] eq "U") { | |
print $F[1],"\t0\t1\n"; | |
next; | |
} | |
if($F[2]=~/.*\(taxid (\d+)\).*/ or $F[2]=~/(\d+)/) { | |
my $taxid = $1; | |
#get all ancestors for taxon | |
my $lineage; | |
my $node = $taxid; | |
while(defined($nodes{$node})) { | |
$lineage .= " ".$node; | |
my $parent = $nodes{$node}; | |
if($node == $parent) { last; } | |
$node = $parent; | |
} | |
$lineage .= " "; | |
my $sum_kmers_all = 0; | |
while($F[3] =~ /(\d+)/g) { # add the number of k-mers for both reads from column 4, e.g. "250:251" | |
if($1 > $kmerlen) { | |
$sum_kmers_all += ($1 - $kmerlen + 1); | |
} | |
} | |
my $sum_kmers_taxid = 0; | |
while($F[4] =~ /(\d+):(\d+)/g) { # sum up the counts for the lineage tax ids in column 5 | |
my $id = $1; | |
my $count = $2; | |
if($lineage =~ / $id /) { | |
$sum_kmers_taxid += $count; | |
} | |
} | |
if($sum_kmers_all > 0) { | |
printf("%s\t%i\t%.2f\n", $F[1], $taxid, $sum_kmers_taxid / $sum_kmers_all); | |
} | |
else { | |
print $F[1],"\t0\t1\n"; | |
} | |
} | |
else { | |
die "Error: Cannot read taxon id from line $_\n"; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment