Created
April 15, 2012 23:27
-
-
Save hoehrmann/2395307 to your computer and use it in GitHub Desktop.
Merge Internet Archive OCR data with Google Books plain text into JSON
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!perl -w | |
use strict; | |
use warnings; | |
use XML::Parser; | |
use PerlIO::via::gzip; | |
use feature 'say'; | |
use Archive::Zip; | |
use Image::Magick; | |
use Algorithm::Diff 'sdiff'; | |
use List::Util qw/min max first/; | |
use YAML::XS; | |
use JSON; | |
use URI; | |
# | |
# Prints out tab-separated values with the fields: id, Unicode value, | |
# page number, offset left, top, right, bottom, and the image data as | |
# data:image/png address. | |
# | |
# http://archive.org/details/glossariumderfr01outzgoog | |
my $prefix = 'glossariumderfr01outzgoog'; | |
my $zip_path = "${prefix}_tif.zip"; | |
my $abbyy_path = "${prefix}_abbyy.gz"; | |
# Generated using sth like https://gist.github.com/2313504 | |
my $google_path = "${prefix}_google.txt"; | |
open my $google_f, '<:utf8', $google_path or die $!; | |
my $google_txt = do { local $/; <$google_f>; }; | |
my @google_words = split /\s+/, $google_txt; | |
my $zip = Archive::Zip->new; | |
$zip->read($zip_path) and die $!; | |
open my $f, '<:via(gzip)', $abbyy_path or die $!; | |
sub image_region_to_uri { | |
# return ""; | |
my ($image, %region) = @_; | |
my $clone = $image->Clone(); | |
$clone->Extent(%region); | |
$clone->Strip(); | |
my $png_data = $clone->ImageToBlob(magick => 'png'); | |
my $uri = URI->new('data:image/png,'); | |
$uri->data($png_data); | |
return "$uri"; | |
} | |
sub diff_words { | |
my @words = @_; | |
my @trial = sdiff \@words, \@google_words; | |
my $sum_u = grep { $_->[0] eq 'u' } @trial; | |
my $middle = 0; | |
my $running = 0; | |
for (my $running = 0; $running < $sum_u / 2; ++$middle) { | |
$running++ if $trial[$middle]->[0] eq 'u'; | |
} | |
my @part = | |
@google_words[ max($middle - @words, 0) | |
.. min($#google_words, $middle + @words) ]; | |
@trial = sdiff \@words, \@part; | |
my $third = int(0.33 * $#trial); | |
my @lead_third = @trial[ 0 .. $third ]; | |
my @tail_third = @trial[ 2 * $third .. $#trial ]; | |
my $skip = grep { $_->[0] eq '+' } @lead_third; | |
my $cut = grep { $_->[0] eq '+' } @tail_third; | |
@part = @part[ $skip .. $#part - $cut ]; | |
@trial = sdiff \@words, \@part; | |
# find adjacent where IA has <prefix>-, <suffix> and Google | |
# has <prefix><suffix> as single entry and hyphenate Google. | |
# If IA has <prefix>- <suffix> and Google has <prefix>-<suffix> | |
# as single entry, split Google. | |
my @new = (); | |
for (my $i = 1; $i <= $#trial; ++$i) { | |
my ($ia_prefix) = $trial[ $i - 1 ]->[1] =~ /^(.+)-$/; | |
my ($ia_suffix) = $trial[$i]->[1]; | |
if (defined $ia_prefix | |
and $trial[ $i - 1 ]->[2] eq "$ia_prefix$ia_suffix") { | |
push @new, "$ia_prefix-"; | |
push @new, "$ia_suffix"; | |
} elsif (defined $ia_prefix | |
and $trial[ $i - 1 ]->[2] eq "$ia_prefix-$ia_suffix") { | |
push @new, "$ia_prefix-"; | |
push @new, "$ia_suffix"; | |
} else { | |
push @new, $trial[ $i - 1 ]->[2]; | |
push @new, $trial[$i]->[2] if $i == $#trial; | |
} | |
} | |
@trial = sdiff \@words, [ grep { length } @new ]; | |
} | |
my $page = 0; | |
my $text = ""; | |
my $cur_img; | |
my $p = XML::Parser->new; | |
my $cur_p; | |
$p->setHandlers( | |
Start => sub { | |
my ($xp, $name, %attr) = @_; | |
$text = ""; | |
if ($name eq 'page') { | |
$cur_p = { page_number => ++$page, }; | |
my $tiff_path = sprintf "%s_tif/%s_%04u.tif", | |
$prefix, $prefix, $page; | |
my $tiff_data = $zip->contents($tiff_path); | |
$cur_img = Image::Magick->new(magick => 'tif'); | |
$cur_img->Set('monochrome' => 'True'); | |
$cur_img->BlobToImage($tiff_data); | |
} elsif ($name eq 'par') { | |
push @{ $cur_p->{pars} }, {}; | |
} elsif ($name eq 'line') { | |
push @{ $cur_p->{pars}[-1]{lines} }, | |
{ words => [ { chars => [] } ], }; | |
} elsif ($name eq 'charParams') { | |
if ($attr{wordStart} eq 'true') { | |
if (@{ $cur_p->{pars}[-1]{lines}[-1]{words}[-1]{chars} }) { | |
push @{ $cur_p->{pars}[-1]{lines}[-1]{words} }, | |
{ chars => [] }; | |
} | |
} | |
push @{ $cur_p->{pars}[-1]{lines}[-1]{words}[-1]{chars} }, { | |
x => $attr{l}, | |
y => $attr{t}, | |
width => $attr{r} - $attr{l}, | |
height => $attr{b} - $attr{t} | |
}; | |
} | |
}, | |
Char => sub { | |
my ($xp, $data) = @_; | |
$text .= $data; | |
}, | |
End => sub { | |
my ($xp, $name) = @_; | |
if ($name eq 'charParams' and $text =~ /\s+/) { | |
# treat white space as word boundary | |
pop @{ $cur_p->{pars}[-1]{lines}[-1]{words}[-1]{chars} }; | |
push @{ $cur_p->{pars}[-1]{lines}[-1]{words} }, { chars => [] }; | |
} elsif ($name eq 'charParams') { | |
my $char = $cur_p->{pars}[-1]{lines}[-1]{words}[-1]{chars}[-1]; | |
my $uri = image_region_to_uri( | |
$cur_img, | |
x => $char->{x}, | |
y => $char->{y}, | |
width => $char->{width}, | |
height => $char->{height} | |
); | |
$char->{scan} = "$uri"; | |
$char->{code} = ord $text; | |
$char->{char} = $text; | |
} | |
if ($name eq 'line') { | |
my $line = $cur_p->{pars}[-1]{lines}[-1]; | |
$line->{words} = | |
[ grep { @{ $_->{chars} } } @{ $line->{words} } ]; | |
for my $w (@{ $line->{words} }) { | |
$w->{word} = join '', map { $_->{char} } @{ $w->{chars} }; | |
$w->{x} = min map { $_->{x} } @{ $w->{chars} }; | |
$w->{y} = min map { $_->{y} } @{ $w->{chars} }; | |
$w->{width} = | |
max(map { $_->{x} + $_->{width} } @{ $w->{chars} }) - $w->{x}; | |
$w->{height} = | |
max(map { $_->{y} + $_->{height} } @{ $w->{chars} }) - | |
$w->{y}; | |
my $uri = image_region_to_uri( | |
$cur_img, | |
x => $w->{x}, | |
y => $w->{y}, | |
width => $w->{width}, | |
height => $w->{height}, | |
); | |
$w->{scan} = "$uri"; | |
} | |
} | |
if ($name eq 'page') { | |
warn "$page\n"; | |
#return unless $page == 40; | |
my @words = map { @{ $_->{words} // [] } } # @@@ | |
map { @{ $_->{lines} // [] } } # @@@ | |
@{ $cur_p->{pars} }; | |
my @literals = map { $_->{word} // "" } @words; | |
my @trial = diff_words(@literals); | |
my $index = 0; | |
for (@trial) { | |
if ($_->[0] eq '+' or not length $_->[1]) { | |
push @{ $words[$index]->{google_words} }, $_->[2]; | |
next; | |
} | |
push @{ $words[ $index++ ]->{google_words} }, $_->[2]; | |
} | |
my $json = JSON->new->ascii(1)->pretty(1)->encode($cur_p); | |
my $filename = | |
sprintf("xnew-%s-page%04u.json.gz", $prefix, $page); | |
open my $f, '>:via(gzip)', $filename or die $!; | |
print $f $json; | |
close $f; | |
# die; | |
} | |
} | |
); | |
$p->parse($f); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment