Created
July 17, 2025 17:42
-
-
Save mariduv/11cc0ba09444bb37beddce4f72786798 to your computer and use it in GitHub Desktop.
Merge SpamAssassin bayes DB backup files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
=head1 NAME | |
merge-spamassassin-bayes-backup.pl - Merge bayes db backup files | |
=head1 SYNOPSIS | |
$ merge-spamassassin-bayes-backup.pl bayes-1.txt bayes-2.txt > bayes-merged.txt | |
$ sa-learn --restore bayes-merged.txt | |
=head1 DESCRIPTION | |
Given a list of C<sa-learn --backup> files, merge them in memory and print the | |
result. SpamAssassin has no built-in ability to merge two bayes databases | |
together, and the C<sa-learn --restore> command clobbers existing data on use. | |
This script merges backup files so the output can be restored into a single
merged database.
Only db_version 3 is supported; a version 2 db or backup can be converted by a
restore and re-backup through sa-learn.
=head1 SEE ALSO | |
Basic format and merge explanation given here: | |
L<https://web.archive.org/web/20240727114619/https://bz.apache.org/SpamAssassin/show_bug.cgi?id=6997#c1> | |
Backup/restore behavior reference: | |
L<Mail::SpamAssassin::BayesStore::BDB> | |
=cut | |
use v5.36; | |
use List::Util qw(max); | |
# Message totals, summed over the "v num_spam" / "v num_nonspam" lines of
# every backup file that gets loaded.
my $num_spam    = 0;
my $num_nonspam = 0;

# Per-token state, each keyed by the token string: accumulated spam count,
# accumulated ham count, and the largest atime field seen for the token.
my %spam_count;
my %ham_count;
my %atime;

# Message id => the "learned as" field from its "s" (signature) line.
my %sigs;
# Script entry point: load every backup file named on the command line, in
# order, then print the merged result.
sub main {
    for my $backup_file (@ARGV) {
        load_backup($backup_file);
    }
    dump_merged();
}
# Print the merged database to STDOUT as tab-separated lines: the three "v"
# header lines, then one "t" line per token (sorted by token), then one "s"
# line per signature (sorted by message id).
sub dump_merged {
    my $emit = sub (@fields) { say join "\t", @fields };

    $emit->( 'v', '3', 'db_version # this must be the first line!!!' );
    $emit->( 'v', $num_spam,    'num_spam' );
    $emit->( 'v', $num_nonspam, 'num_nonspam' );

    # %atime has an entry for every token that was loaded, so its keys drive
    # the token listing. Tokens whose counts are both zero are left out.
    TOKEN:
    for my $token ( sort keys %atime ) {
        my $spam = $spam_count{$token};
        my $ham  = $ham_count{$token};
        next TOKEN unless $spam != 0 || $ham != 0;

        $emit->( 't', $spam, $ham, $atime{$token}, $token );
    }

    for my $msgid ( sort keys %sigs ) {
        $emit->( 's', $sigs{$msgid}, $msgid );
    }
}
# Read one backup file and hand each of its lines to the matching handler.
#
# Every line is split on tabs. The first field is the line type and selects
# the sub named handle_<type> in package main, which is called with the
# remaining fields. A line whose type has no such sub is reported with warn
# and otherwise ignored. Lines that yield no fields at all are skipped.
#
# Dies if $filename cannot be opened for reading.
sub load_backup($filename) {
    # Three-argument open: the name is always taken as a literal path to
    # read. The two-argument form would interpret leading/trailing mode or
    # pipe characters in the name and strip surrounding whitespace.
    open my $fh, '<', $filename
        or die "Can't open $filename: $!";

    # Iterate line by line; for (<$fh>) would first read the whole file into
    # a list.
    while ( my $line = <$fh> ) {
        chomp $line;
        my ( $type, @fields ) = split "\t", $line;

        # split returns nothing for a blank line; there is no type to
        # dispatch on, so skip it instead of warning about an undef type.
        next unless defined $type;

        if ( my $handler = main::->can("handle_$type") ) {
            $handler->(@fields);
        }
        else {
            warn "unhandled line type $type: @fields\n";
        }
    }

    close $fh;
    return;
}
# Handler for "v" (variable) lines: fields are the value, then the key.
#
# num_spam and num_nonspam values are added to the running totals. A key
# starting with "db_version" must carry the value 3, otherwise this dies.
# Any other key is reported with warn. A false key is ignored.
sub handle_v($val, $key) {
    return unless $key;

    if ( $key eq 'num_spam' ) {
        $num_spam += $val;
        return;
    }

    if ( $key eq 'num_nonspam' ) {
        $num_nonspam += $val;
        return;
    }

    if ( $key !~ /^db_version/ ) {
        warn "unhandled var line: $key = $val\n";
        return;
    }

    # Only version 3 backups can be merged.
    die "db_version $val not supported" unless $val == 3;
    return;
}
# Handler for "t" (token) lines: fields are spam count, ham count, atime,
# and the token itself.
#
# Counts are added to the token's running totals, with negative counts
# treated as zero. The stored atime is the largest one seen for the token.
# A false token is ignored.
sub handle_t($sc, $hc, $atime, $token) {
    return unless $token;

    my $spam_hits  = max( 0, $sc );
    my $ham_hits   = max( 0, $hc );
    my $prev_atime = $atime{$token} // 0;

    $spam_count{$token} += $spam_hits;
    $ham_count{$token}  += $ham_hits;
    $atime{$token} = max( $prev_atime, $atime );
}
# Handler for "s" (signature) lines: fields are the learned-as flag, then
# the message id.
#
# Records the flag for the message id. If the id is already recorded with a
# different flag, warns and keeps the flag that was recorded first. A false
# message id is ignored.
sub handle_s($learned_as, $msgid) {
    return unless $msgid;

    my $is_conflict = exists $sigs{$msgid} && $sigs{$msgid} ne $learned_as;
    if ($is_conflict) {
        warn "encountered signature learned as both spam and ham: $msgid\n";
        return;
    }

    $sigs{$msgid} = $learned_as;
}
# Run main() only when this file is executed directly; when it is loaded
# from other code, caller is true and nothing runs.
if ( !caller ) {
    main();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment