Skip to content

Instantly share code, notes, and snippets.

@mariduv
Created July 17, 2025 17:42
Show Gist options
  • Save mariduv/11cc0ba09444bb37beddce4f72786798 to your computer and use it in GitHub Desktop.
Save mariduv/11cc0ba09444bb37beddce4f72786798 to your computer and use it in GitHub Desktop.
Merge SpamAssassin bayes DB backup files
#!/usr/bin/env perl
=head1 NAME
merge-spamassassin-bayes-backup.pl - Merge bayes db backup files
=head1 SYNOPSIS
$ merge-spamassassin-bayes-backup.pl bayes-1.txt bayes-2.txt > bayes-merged.txt
$ sa-learn --restore bayes-merged.txt
=head1 DESCRIPTION
Given a list of C<sa-learn --backup> files, merge them in memory and print the
result. SpamAssassin has no built-in ability to merge two bayes databases
together, and the C<sa-learn --restore> command clobbers existing data on use.
This script merges backup files so the output can be restored into a single
merged database.
Only db_version 3 is supported; a version 2 db or backup can be converted by
restoring it and backing it up again through sa-learn.
=head1 SEE ALSO
Basic format and merge explanation given here:
L<https://web.archive.org/web/20240727114619/https://bz.apache.org/SpamAssassin/show_bug.cgi?id=6997#c1>
Backup/restore behavior reference:
L<Mail::SpamAssassin::BayesStore::BDB>
=cut
use v5.36;
use List::Util qw(max);
# Running totals of the 'num_spam' and 'num_nonspam' values from every loaded
# backup file.
my ($num_spam, $num_nonspam) = (0, 0);
# Merged per-token state, each keyed by token: summed spam count, summed ham
# count, and the largest atime seen for that token.
my (%spam_count, %ham_count, %atime);
# Signature lines seen so far: msgid => the learned-as value read from the backup.
my %sigs;
# Script entry point: load every backup file named on the command line into
# the shared merge state, then print the merged result.
sub main {
    foreach my $backup_file (@ARGV) {
        load_backup($backup_file);
    }
    dump_merged();
}
# Print the merged state to standard output as tab-separated backup lines:
# the three 'v' header lines first, then one 't' line per token (sorted by
# token), then one 's' line per signature (sorted by msgid).
sub dump_merged {
    # Every output record is its fields joined by a single tab.
    my $emit = sub (@fields) { say join "\t", @fields };

    $emit->('v', '3', 'db_version # this must be the first line!!!');
    $emit->('v', $num_spam, 'num_spam');
    $emit->('v', $num_nonspam, 'num_nonspam');

    TOKEN:
    foreach my $token (sort keys %atime) {
        my $spam_hits = $spam_count{$token};
        my $ham_hits  = $ham_count{$token};

        # Tokens whose merged counts are both zero are left out of the output.
        next TOKEN unless $spam_hits != 0 || $ham_hits != 0;

        $emit->('t', $spam_hits, $ham_hits, $atime{$token}, $token);
    }

    $emit->('s', $sigs{$_}, $_) for sort keys %sigs;
}
# Read one backup file and feed every line to its handler.
#
# Each line is split on tabs: the first field is the line type and the
# remaining fields are passed as arguments to the handle_<type> sub in package
# main. Lines whose type has no handler are reported with a warning and
# skipped; completely empty lines are skipped silently.
#
# Dies if $filename cannot be opened, or if closing it reports an error
# (which is how a failed read surfaces).
sub load_backup($filename) {
    # Three-argument open: $filename is always a path to read, never a mode
    # or pipe specification (a name like ">file" must not truncate anything).
    open my $fh, '<', $filename
        or die "Can't open $filename: $!";

    # Read line by line instead of pulling the whole backup into memory.
    while ( my $line = <$fh> ) {
        chomp $line;
        my ($type, @fields) = split /\t/, $line;

        # An empty line yields no fields at all; there is nothing to dispatch.
        next unless defined $type;

        if ( my $handler = main::->can("handle_$type") ) {
            $handler->(@fields);
        }
        else {
            warn "unhandled line type $type: @fields\n";
        }
    }

    close $fh
        or die "Can't close $filename: $!";
    return;
}
# var
# Handle a 'v' line: $val is the value and $key names the variable.
#   db_version...  dies unless the version is 3
#   num_spam       added to the running $num_spam total
#   num_nonspam    added to the running $num_nonspam total
# Any other key is reported with a warning. A false $key is ignored.
sub handle_v($val, $key) {
    return if !$key;

    # Counters that are summed across all loaded backups, by variable name.
    my %counter_for = (
        num_spam    => \$num_spam,
        num_nonspam => \$num_nonspam,
    );

    if ( $key =~ /^db_version/ ) {
        die "db_version $val not supported" if $val != 3;
    }
    elsif ( my $counter = $counter_for{$key} ) {
        ${$counter} += $val;
    }
    else {
        warn "unhandled var line: $key = $val\n";
    }
}
# token
# Handle a 't' line: add the token's spam count ($sc) and ham count ($hc) to
# the merged totals and keep the largest atime seen for it. A false $token is
# ignored.
sub handle_t($sc, $hc, $atime, $token) {
    return if !$token;

    # Negative counts contribute nothing to the merged totals.
    my ($spam_hits, $ham_hits) = map { max(0, $_) } $sc, $hc;
    $spam_count{$token} += $spam_hits;
    $ham_count{$token}  += $ham_hits;

    # A token not seen before starts from an atime of 0.
    my $previous_atime = $atime{$token} // 0;
    $atime{$token} = max($previous_atime, $atime);
}
# signature
# Handle an 's' line: remember how the message $msgid was learned. If the same
# msgid was already recorded with a different $learned_as value, warn and keep
# the value recorded first. A false $msgid is ignored.
sub handle_s($learned_as, $msgid) {
    return if !$msgid;

    my $conflicts = exists $sigs{$msgid} && $sigs{$msgid} ne $learned_as;
    unless ($conflicts) {
        return $sigs{$msgid} = $learned_as;
    }

    warn "encountered signature learned as both spam and ham: $msgid\n";
    return;
}
# Run main() only when this file is executed directly. When it is loaded from
# other code, caller() returns a true value and nothing runs automatically.
main() unless caller;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment