Skip to content

Instantly share code, notes, and snippets.

@hoehrmann
Created April 15, 2012 23:27
Show Gist options
  • Save hoehrmann/2395307 to your computer and use it in GitHub Desktop.
Save hoehrmann/2395307 to your computer and use it in GitHub Desktop.
Merge Internet Archive OCR data with Google Books plain text into JSON
#!perl -w
use strict;
use warnings;
use XML::Parser;
use PerlIO::via::gzip;
use feature 'say';
use Archive::Zip;
use Image::Magick;
use Algorithm::Diff 'sdiff';
use List::Util qw/min max first/;
use YAML::XS;
use JSON;
use URI;
#
# Prints out tab-separated values with the fields: id, Unicode value,
# page number, offset left, top, right, bottom, and the image data as
# data:image/png address.
#
# http://archive.org/details/glossariumderfr01outzgoog
my $prefix = 'glossariumderfr01outzgoog';
my $zip_path = "${prefix}_tif.zip";
my $abbyy_path = "${prefix}_abbyy.gz";
# Generated using sth like https://gist.github.com/2313504
my $google_path = "${prefix}_google.txt";
open my $google_f, '<:utf8', $google_path or die $!;
my $google_txt = do { local $/; <$google_f>; };
my @google_words = split /\s+/, $google_txt;
my $zip = Archive::Zip->new;
$zip->read($zip_path) and die $!;
open my $f, '<:via(gzip)', $abbyy_path or die $!;
sub image_region_to_uri {
# return "";
my ($image, %region) = @_;
my $clone = $image->Clone();
$clone->Extent(%region);
$clone->Strip();
my $png_data = $clone->ImageToBlob(magick => 'png');
my $uri = URI->new('data:image/png,');
$uri->data($png_data);
return "$uri";
}
sub diff_words {
my @words = @_;
my @trial = sdiff \@words, \@google_words;
my $sum_u = grep { $_->[0] eq 'u' } @trial;
my $middle = 0;
my $running = 0;
for (my $running = 0; $running < $sum_u / 2; ++$middle) {
$running++ if $trial[$middle]->[0] eq 'u';
}
my @part =
@google_words[ max($middle - @words, 0)
.. min($#google_words, $middle + @words) ];
@trial = sdiff \@words, \@part;
my $third = int(0.33 * $#trial);
my @lead_third = @trial[ 0 .. $third ];
my @tail_third = @trial[ 2 * $third .. $#trial ];
my $skip = grep { $_->[0] eq '+' } @lead_third;
my $cut = grep { $_->[0] eq '+' } @tail_third;
@part = @part[ $skip .. $#part - $cut ];
@trial = sdiff \@words, \@part;
# find adjacent where IA has <prefix>-, <suffix> and Google
# has <prefix><suffix> as single entry and hyphenate Google.
# If IA has <prefix>- <suffix> and Google has <prefix>-<suffix>
# as single entry, split Google.
my @new = ();
for (my $i = 1; $i <= $#trial; ++$i) {
my ($ia_prefix) = $trial[ $i - 1 ]->[1] =~ /^(.+)-$/;
my ($ia_suffix) = $trial[$i]->[1];
if (defined $ia_prefix
and $trial[ $i - 1 ]->[2] eq "$ia_prefix$ia_suffix") {
push @new, "$ia_prefix-";
push @new, "$ia_suffix";
} elsif (defined $ia_prefix
and $trial[ $i - 1 ]->[2] eq "$ia_prefix-$ia_suffix") {
push @new, "$ia_prefix-";
push @new, "$ia_suffix";
} else {
push @new, $trial[ $i - 1 ]->[2];
push @new, $trial[$i]->[2] if $i == $#trial;
}
}
@trial = sdiff \@words, [ grep { length } @new ];
}
my $page = 0;
my $text = "";
my $cur_img;
my $p = XML::Parser->new;
my $cur_p;
$p->setHandlers(
Start => sub {
my ($xp, $name, %attr) = @_;
$text = "";
if ($name eq 'page') {
$cur_p = { page_number => ++$page, };
my $tiff_path = sprintf "%s_tif/%s_%04u.tif",
$prefix, $prefix, $page;
my $tiff_data = $zip->contents($tiff_path);
$cur_img = Image::Magick->new(magick => 'tif');
$cur_img->Set('monochrome' => 'True');
$cur_img->BlobToImage($tiff_data);
} elsif ($name eq 'par') {
push @{ $cur_p->{pars} }, {};
} elsif ($name eq 'line') {
push @{ $cur_p->{pars}[-1]{lines} },
{ words => [ { chars => [] } ], };
} elsif ($name eq 'charParams') {
if ($attr{wordStart} eq 'true') {
if (@{ $cur_p->{pars}[-1]{lines}[-1]{words}[-1]{chars} }) {
push @{ $cur_p->{pars}[-1]{lines}[-1]{words} },
{ chars => [] };
}
}
push @{ $cur_p->{pars}[-1]{lines}[-1]{words}[-1]{chars} }, {
x => $attr{l},
y => $attr{t},
width => $attr{r} - $attr{l},
height => $attr{b} - $attr{t}
};
}
},
Char => sub {
my ($xp, $data) = @_;
$text .= $data;
},
End => sub {
my ($xp, $name) = @_;
if ($name eq 'charParams' and $text =~ /\s+/) {
# treat white space as word boundary
pop @{ $cur_p->{pars}[-1]{lines}[-1]{words}[-1]{chars} };
push @{ $cur_p->{pars}[-1]{lines}[-1]{words} }, { chars => [] };
} elsif ($name eq 'charParams') {
my $char = $cur_p->{pars}[-1]{lines}[-1]{words}[-1]{chars}[-1];
my $uri = image_region_to_uri(
$cur_img,
x => $char->{x},
y => $char->{y},
width => $char->{width},
height => $char->{height}
);
$char->{scan} = "$uri";
$char->{code} = ord $text;
$char->{char} = $text;
}
if ($name eq 'line') {
my $line = $cur_p->{pars}[-1]{lines}[-1];
$line->{words} =
[ grep { @{ $_->{chars} } } @{ $line->{words} } ];
for my $w (@{ $line->{words} }) {
$w->{word} = join '', map { $_->{char} } @{ $w->{chars} };
$w->{x} = min map { $_->{x} } @{ $w->{chars} };
$w->{y} = min map { $_->{y} } @{ $w->{chars} };
$w->{width} =
max(map { $_->{x} + $_->{width} } @{ $w->{chars} }) - $w->{x};
$w->{height} =
max(map { $_->{y} + $_->{height} } @{ $w->{chars} }) -
$w->{y};
my $uri = image_region_to_uri(
$cur_img,
x => $w->{x},
y => $w->{y},
width => $w->{width},
height => $w->{height},
);
$w->{scan} = "$uri";
}
}
if ($name eq 'page') {
warn "$page\n";
#return unless $page == 40;
my @words = map { @{ $_->{words} // [] } } # @@@
map { @{ $_->{lines} // [] } } # @@@
@{ $cur_p->{pars} };
my @literals = map { $_->{word} // "" } @words;
my @trial = diff_words(@literals);
my $index = 0;
for (@trial) {
if ($_->[0] eq '+' or not length $_->[1]) {
push @{ $words[$index]->{google_words} }, $_->[2];
next;
}
push @{ $words[ $index++ ]->{google_words} }, $_->[2];
}
my $json = JSON->new->ascii(1)->pretty(1)->encode($cur_p);
my $filename =
sprintf("xnew-%s-page%04u.json.gz", $prefix, $page);
open my $f, '>:via(gzip)', $filename or die $!;
print $f $json;
close $f;
# die;
}
}
);
$p->parse($f);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment