pmenzel · April 27, 2023 10:28
diff --git a/kraken-score.pl b/kraken-score.pl
 #!/usr/bin/env perl
 #
 # 2019, Peter Menzel, Labor Berlin
 # This script converts the Kraken output format into
 # a 3-column output format with columns:
 # 1: read name
 # 2: taxon id
 # 3: score, which is defined as the fraction of k-mers having that taxon id and its ancestors over all k-mers
 #
 # use as input for ktImportTaxonomy
 #
 # the program needs to know the k-mer size that the kraken database uses, can be specified with option -k
 #

 use strict;
 use warnings;
 use Getopt::Std;

 my $kmerlen = 35;

 my %options=();
 getopts("t:k:", \%options);
 exists($options{t}) && defined($options{t}) or die "Specify path to taxonomy folder containing nodes.dmp and merged.dmp via option -t\n";
 my $taxdir = $options{t};
 $taxdir =~ s/nodes\.dmp//; # just in case the path to nodes.dmp is passed
 $taxdir .= '/' unless $taxdir =~ m,/$,;

 if(exists($options{k}) && defined($options{k})) {
 	$kmerlen = $options{k};
 }

 my $nodesdmp = $taxdir.'nodes.dmp';
 if(!defined $ARGV[0]) { die "Usage:  kraken-score.pl -t /path/to/taxonomy /path/to/kraken-output-file.tsv\n"; }

 my %nodes;
 open(NODES,$nodesdmp) or die "Cannot open $nodesdmp\n";
 while(<NODES>) {
 	my @F = split(/\|/,$_);
 	if(@F > 3) {
 		my $id = -+- $F[0];
 		my $parentid = -+- $F[1];
 		$nodes{$id} = $parentid;
 	}
 }
 close(NODES);

 while(<>) {
 	chomp;
 	next unless length; #skip empty lines
 	my @F = split(/\t/,$_);
 	if(@F != 5) { die "Error: Not exactly 5 columns in line $_\n"; };
 	if($F[0] eq "U") {
 		print $F[1],"\t0\t1\n";
 		next;
 	}
 	if($F[2]=~/.*\(taxid (\d+)\).*/ or $F[2]=~/(\d+)/) {
 		my $taxid = $1;
 		#get all ancestors for taxon
 		my $lineage;
 		my $node = $taxid;
 		while(defined($nodes{$node})) {
 			$lineage .= " ".$node;
 			my $parent = $nodes{$node};
 			if($node == $parent) { last; }
 			$node = $parent;
 		}
 		$lineage .= " ";

 		my $sum_kmers_all = 0;
 		while($F[3] =~ /(\d+)/g) { # add the number of k-mers for both reads from column 4, e.g. "250:251"
 			if($1 > $kmerlen) {
 				$sum_kmers_all += ($1 - $kmerlen + 1);
 			}
 		}
 		my $sum_kmers_taxid = 0;
 		while($F[4] =~ /(\d+):(\d+)/g) { # sum up the counts for the lineage tax ids in column 5
 			my $id = $1;
 			my $count = $2;
 			if($lineage =~ / $id /) {
 				$sum_kmers_taxid += $count;
 			}
 		}

 		if($sum_kmers_all > 0) {
 			printf("%s\t%i\t%.2f\n", $F[1], $taxid, $sum_kmers_taxid / $sum_kmers_all);
 		}
 		else {
 			print $F[1],"\t0\t1\n";
 		}

 	}
 	else {
 		die "Error: Cannot read taxon id from line $_\n";
 	}
 }
	#!/usr/bin/env perl
	#
	# 2019, Peter Menzel, Labor Berlin
	# This script converts the Kraken output format into
	# a 3-column output format with columns:
	# 1: read name
	# 2: taxon id
	# 3: score, which is defined as the fraction of k-mers having that taxon id and its ancestors over all k-mers
	#
	# use as input for ktImportTaxonomy
	#
	# the program needs to know the k-mer size that the kraken database uses, can be specified with option -k
	#

	use strict;
	use warnings;
	use Getopt::Std;

	my $kmerlen = 35;

	my %options=();
	getopts("t:k:", \%options);
	exists($options{t}) && defined($options{t}) or die "Specify path to taxonomy folder containing nodes.dmp and merged.dmp via option -t\n";
	my $taxdir = $options{t};
	$taxdir =~ s/nodes\.dmp//; # just in case the path to nodes.dmp is passed
	$taxdir .= '/' unless $taxdir =~ m,/$,;

	if(exists($options{k}) && defined($options{k})) {
	$kmerlen = $options{k};
	}

	my $nodesdmp = $taxdir.'nodes.dmp';
	if(!defined $ARGV[0]) { die "Usage: kraken-score.pl -t /path/to/taxonomy /path/to/kraken-output-file.tsv\n"; }

	my %nodes;
	open(NODES,$nodesdmp) or die "Cannot open $nodesdmp\n";
	while(<NODES>) {
	my @F = split(/\\|/,$_);
	if(@F > 3) {
	my $id = -+- $F[0];
	my $parentid = -+- $F[1];
	$nodes{$id} = $parentid;
	}
	}
	close(NODES);

	while(<>) {
	chomp;
	next unless length; #skip empty lines
	my @F = split(/\t/,$_);
	if(@F != 5) { die "Error: Not exactly 5 columns in line $_\n"; };
	if($F[0] eq "U") {
	print $F[1],"\t0\t1\n";
	next;
	}
	if($F[2]=~/.\(taxid (\d+)\)./ or $F[2]=~/(\d+)/) {
	my $taxid = $1;
	#get all ancestors for taxon
	my $lineage;
	my $node = $taxid;
	while(defined($nodes{$node})) {
	$lineage .= " ".$node;
	my $parent = $nodes{$node};
	if($node == $parent) { last; }
	$node = $parent;
	}
	$lineage .= " ";

	my $sum_kmers_all = 0;
	while($F[3] =~ /(\d+)/g) { # add the number of k-mers for both reads from column 4, e.g. "250:251"
	if($1 > $kmerlen) {
	$sum_kmers_all += ($1 - $kmerlen + 1);
	}
	}
	my $sum_kmers_taxid = 0;
	while($F[4] =~ /(\d+):(\d+)/g) { # sum up the counts for the lineage tax ids in column 5
	my $id = $1;
	my $count = $2;
	if($lineage =~ / $id /) {
	$sum_kmers_taxid += $count;
	}
	}

	if($sum_kmers_all > 0) {
	printf("%s\t%i\t%.2f\n", $F[1], $taxid, $sum_kmers_taxid / $sum_kmers_all);
	}
	else {
	print $F[1],"\t0\t1\n";
	}

	}
	else {
	die "Error: Cannot read taxon id from line $_\n";
	}
	}
No results found