Created
February 28, 2013 01:24
-
-
Save hoehrmann/5053420 to your computer and use it in GitHub Desktop.
Take mbox, print HTML version of contents with quoted text marked via <i> elements.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!perl -w | |
use Modern::Perl; | |
use Algorithm::Diff::XS; | |
use Statistics::Basic qw/median stddev avg/; | |
use Mail::Mbox::MessageParser; | |
use MIME::Parser; | |
use MIME::Parser::Reader; | |
use Mail::Field; | |
use Encode; | |
use HTML::Entities; | |
use URI::Escape; | |
binmode STDOUT; | |
BEGIN { | |
# avoid annoying warning | |
local $Mail::Mbox::MessageParser::OLDSTDERR = undef; | |
} | |
my %mails; | |
my $mbox_path = $ARGV[0] // die "Usage: $0 example.mbox\n"; | |
open my $mbox, '<', $mbox_path; | |
my $reader = Mail::Mbox::MessageParser->new({ | |
file_handle => $mbox, | |
}); | |
my $mime_parser = MIME::Parser->new; | |
$mime_parser->output_to_core(1); | |
while (my $m = $reader->read_next_email) { | |
my $entity = $mime_parser->parse_data($m); | |
eval { | |
my $text = get_text_plain($entity); | |
$text = decode_utf8($text); | |
$text =~ s/\x0d\x0a/\x0a/sg; | |
my $get_mid = sub { | |
my $s = shift; | |
return $1 if defined $s and $s =~ /(<\S+?>)/; | |
""; | |
}; | |
my $head = $entity->head; | |
my $mid = $get_mid->($head->get('Message-Id')); | |
my @refs = map { $get_mid->($_) } split /\s+/, | |
($head->get('References') // ""); | |
my $irt = $get_mid->($head->get('In-Reply-To')); | |
my $parent = $irt ? $irt : $refs[-1]; | |
# ... | |
$mails{$mid} = { | |
text => $text, | |
entity => $entity, | |
parent => $parent | |
}; | |
} | |
} | |
sub get_text_plain { | |
# Shouldn't there be a module for this? | |
my $original = shift; | |
my $e = $original; | |
my @plain; | |
if ($e->effective_type eq 'text/plain') { | |
push @plain, $e; | |
} elsif ($e->effective_type eq 'multipart/alternative') { | |
push @plain, | |
[ reverse grep { $_->effective_type eq 'text/plain' } | |
$e->parts ]->[0] | |
if $e->parts; | |
} elsif ($e->effective_type eq 'multipart/mixed') { | |
push @plain, | |
grep { $_->effective_type eq 'text/plain' } $e->parts; | |
} elsif ($e->effective_type eq 'multipart/signed') { | |
push @plain, | |
[ reverse grep { $_->effective_type eq 'text/plain' } | |
$e->parts ]->[0] | |
if $e->parts; | |
} else { | |
warn "Don't know how to extract plain text from " | |
. $e->effective_type | |
. " messages"; | |
} | |
return join '', map { | |
my $encoded = $_->bodyhandle->as_string; | |
my $content_type = $_->get('Content-Type'); | |
if (defined $content_type) { | |
my $field = Mail::Field->new('Content-Type', $content_type); | |
my $charset = $field->charset; | |
if (defined $charset) { | |
my $decoded = Encode::decode($charset, $encoded, 1); | |
$encoded = Encode::encode_utf8($decoded); | |
} | |
} | |
$encoded; | |
} @plain; | |
} | |
sub make_word_list { | |
my $text = shift; | |
my @words = $text =~ /(\w+|\W)/gs; | |
my @result; | |
my $line = 1; | |
my $column = 1; | |
for (my $ix = 0; $ix < @words; ++$ix) { | |
push @result, { | |
word => $words[$ix], | |
index => $ix, | |
word_like => scalar($words[$ix] =~ /^\w+$/), | |
line => $line, | |
column => $column++, | |
}; | |
if ($words[$ix] =~ /[\r\n]/) { | |
$line++; | |
$column = 1; | |
} | |
} | |
return @result; | |
} | |
print q{ | |
<style> | |
pre:nth-of-type(odd) { background-color: #eee } | |
i { color: #ccc; font-style: normal } | |
pre { white-space: pre-wrap } | |
</style> | |
}; | |
while (my ($mid, $obj) = each %mails) { | |
my @child_words = make_word_list $obj->{text}; | |
next unless defined $obj->{parent}; | |
my $parent_obj = $mails{ $obj->{parent} }; | |
next unless defined $parent_obj; | |
my @parent_words = make_word_list $parent_obj->{text}; | |
my @child_words_f = | |
grep { $_->{word_like} || $_->{word} !~ /[\s>]/ } @child_words; | |
my @parent_words_f = | |
grep { $_->{word_like} || $_->{word} !~ /[\s>]/ } @parent_words; | |
my @diff = Algorithm::Diff::sdiff \@parent_words_f, | |
\@child_words_f, sub { | |
return $_[0]->{word}; | |
}; | |
$_->{unmodified_in_diff} = 0 for @child_words; | |
for (@diff) { | |
next unless $_->[0] eq 'u'; | |
$child_words[ $_->[2]->{index} ]->{unmodified_in_diff} = 1; | |
} | |
$_->{quoted} = $_->{unmodified_in_diff} // 0 for @child_words; | |
# If most characters on a line are not quoted, then assume that all | |
# the tokens on that line are in fact not quoted. This is to handle | |
# false positives that are generated due to overlap of frequent | |
# words like "the" and "and" in english text. But do not do that in | |
# some edge cases, which are approximated here as not doing it when | |
# the line is less than the average line long. | |
my %quoted_chars; | |
my %unquoted_chars; | |
my %chars_per_line; | |
$quoted_chars{ $_->{line} } += length $_->{word} | |
for grep { $_->{quoted} } @child_words; | |
$unquoted_chars{ $_->{line} } += length $_->{word} | |
for grep { !$_->{quoted} } @child_words; | |
$chars_per_line{ $_->{line} } += length $_->{word} | |
for @child_words; | |
# ... | |
my $median_cpl = 0 + median values %chars_per_line; | |
my $average_cpl = 0 + avg values %chars_per_line; | |
my $stddev_cpl = 0 + stddev values %chars_per_line; | |
for (@child_words) { | |
my $uc = $unquoted_chars{ $_->{line} } // 0; | |
my $qc = $quoted_chars{ $_->{line} } // 0; | |
my $cc = $chars_per_line{ $_->{line} }; | |
next if $qc >= $uc; | |
next if $cc < $average_cpl; | |
$_->{quoted} = 0; | |
} | |
my $mid_s = $1 if $mid =~ /<(.*)>/; | |
printf "<pre><a href='http://mid.gmane.org/%s'>news:%s</a>\n\n", | |
encode_entities(uri_escape($mid_s)), encode_entities($mid_s); | |
for (@child_words) { | |
if ($_->{quoted}) { | |
print "<i>", encode_entities($_->{word}), "</i>"; | |
} else { | |
print encode_entities($_->{word}); | |
} | |
} | |
print "</pre>"; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment