# Created February 28, 2020 13:20
use strict;
use warnings;
use utf8;

use File::Slurp;
use List::Util qw(min max);
use XML::LibXML;
use XML::Quote;

binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");
# for output of OCR-D.
# Convert every PAGE XML file named on the command line into a minimal ALTO
# file written next to it ("foo.xml" -> "foo.alto.xml").
#
# For each Page, the regions referenced by RegionRefIndexed elements are
# visited in document order; every TextLine below such a region becomes one
# ALTO <TextLine> holding a single <String> with the line's text and the
# bounding box of the line's Coords.  Regions that are not referenced by a
# RegionRefIndexed are not exported.
#
# Files that cannot be read or parsed are reported on STDERR and skipped.
# Failure to write an output file is fatal.
#
# NOTE(review): the text this was reviewed from had lost all closing braces,
# the body of the eval and the body of the "if(length($outLine))" test.  The
# XML parsing call, the <TextBlock> wrapper and the closing tags below are a
# reconstruction -- please confirm against the intended output.
my $prog = "$0";

FILE:
for my $xml_fn (@ARGV) {
    # Skip our own output so that re-running over "*.xml" is harmless.
    next FILE if $xml_fn =~ /\.alto\.xml\z/;

    # Read and parse inside one eval so that an unreadable or malformed file
    # is reported and skipped instead of aborting the whole run.
    my $dom = eval {
        my $xml_str = read_file($xml_fn);
        XML::LibXML->load_xml(string => $xml_str);
    };
    if (!defined $dom) {
        warn "$prog: $xml_fn: $@\n";
        next FILE;
    }

    # PAGE exists in several schema versions, each with its own namespace URI.
    # Take the URI from the document itself; an empty URI (as registered
    # before) cannot match namespaced elements.
    my $root    = $dom->documentElement;
    my $page_ns = $root->namespaceURI;
    if (!defined $page_ns || $page_ns eq '') {
        warn "$prog: $xml_fn: root element has no namespace, skipping\n";
        next FILE;
    }
    my $xpc = XML::LibXML::XPathContext->new($dom);
    $xpc->registerNs('PAGE', $page_ns);

    # Index all elements carrying an id once, instead of building one XPath
    # expression per reference from an unescaped id string.
    my %nodes_by_id;
    for my $node ($xpc->findnodes('//PAGE:*[@id]', $root)) {
        push @{ $nodes_by_id{ $node->getAttribute('id') } }, $node;
    }

    my $out = '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>';
    $out .= '<!-- generated by ' . $prog . ' -->';
    # NOTE(review): the xmlns value is empty in the reviewed text; it looks
    # like the ALTO namespace URI was lost.  Kept as found -- TODO restore the
    # intended ALTO version.  The ALTO schema also nests Page inside a Layout
    # element, which the visible code never emitted -- confirm with consumers.
    $out .= '<alto xmlns="">';

    my $pageNr = 0;
    for my $Page ($xpc->findnodes('/PAGE:PcGts/PAGE:Page', $root)) {
        my $page_w  = _attr($Page, 'imageWidth');
        my $page_h  = _attr($Page, 'imageHeight');
        my $page_id = 'Page' . ($pageNr + 1);   # first page stays "Page1"
        $out .= qq{<Page ID="$page_id" PHYSICAL_IMG_NR="$pageNr" HEIGHT="$page_h" WIDTH="$page_w"><PrintSpace>};
        $pageNr++;

        # ".//" keeps the search below this Page; a leading "//" would start
        # at the document root regardless of the context node.
        for my $ref ($xpc->findnodes('.//PAGE:RegionRefIndexed', $Page)) {
            my $id = $ref->getAttribute('regionRef');
            next if !defined $id;

            for my $region (@{ $nodes_by_id{$id} || [] }) {
                # Any element type is accepted here (e.g. a table region that
                # contains TextLines), not only TextRegion.
                my $lines = _text_lines_to_alto($xpc, $region, "$prog: $xml_fn");
                next if !length $lines;
                $out .= '<TextBlock ID="' . xml_quote($id) . '">' . $lines . '</TextBlock>';
            }
        }
        $out .= '</PrintSpace></Page>';
    }
    $out .= '</alto>';

    # Never write onto the input: if the name does not end in ".xml" the
    # substitution fails, so append the suffix instead.
    my $ofn = $xml_fn;
    $ofn =~ s/\.xml\z/.alto.xml/ or $ofn = "$xml_fn.alto.xml";

    open my $f, '>:encoding(UTF-8)', $ofn
        or die "$prog: cannot open $ofn for writing: $!\n";
    print {$f} $out
        or die "$prog: cannot write to $ofn: $!\n";
    # Buffered write errors only surface at close.
    close $f
        or die "$prog: cannot close $ofn: $!\n";
}

# Return attribute $name of $node, XML-escaped for use inside a double-quoted
# attribute value; a missing attribute yields the empty string.
sub _attr {
    my ($node, $name) = @_;
    my $value = $node->getAttribute($name);
    return defined $value ? xml_quote($value) : '';
}

# Compute the bounding box over the "points" of all Coords elements below
# $node.  Returns (xmin, ymin, xmax, ymax), or the empty list when no
# well-formed "x,y" pair was found.
sub _bounding_box {
    my ($xpc, $node) = @_;
    my (@xs, @ys);
    for my $coords ($xpc->findnodes('.//PAGE:Coords', $node)) {
        my $points = $coords->getAttribute('points');
        next if !defined $points;
        # split ' ' ignores leading whitespace; split /\s+/ would produce an
        # empty first field that numifies to 0 and corrupts the minimum.
        for my $xy (split ' ', $points) {
            my ($x, $y) = split /,/, $xy, 2;
            next if !defined $y;
            next if $x !~ /\A-?\d+(?:\.\d+)?\z/;
            next if $y !~ /\A-?\d+(?:\.\d+)?\z/;
            push @xs, $x;
            push @ys, $y;
        }
    }
    return if !@xs;
    return (min(@xs), min(@ys), max(@xs), max(@ys));
}

# Render every TextLine below $region as ALTO markup and return the
# concatenated string (empty when the region has no text).  $where prefixes
# diagnostics written to STDERR.
sub _text_lines_to_alto {
    my ($xpc, $region, $where) = @_;
    my $alto = '';
    for my $TextLine ($xpc->findnodes('.//PAGE:TextLine', $region)) {
        my ($xmin, $ymin, $xmax, $ymax) = _bounding_box($xpc, $TextLine);
        if (!defined $xmin) {
            # Keep the text, but do not leak search sentinels into the output.
            warn "$where: TextLine without usable Coords, using an empty box\n";
            ($xmin, $ymin, $xmax, $ymax) = (0, 0, 0, 0);
        }
        my $w = $xmax - $xmin;
        my $h = $ymax - $ymin;

        # Prefer the line's own text.  Searching all descendants first would
        # also return the TextEquiv of nested Word/Glyph elements and emit the
        # same line several times; fall back to them only when the line itself
        # carries no text.
        my @unicode = $xpc->findnodes('./PAGE:TextEquiv/PAGE:Unicode', $TextLine);
        if (!@unicode) {
            @unicode = $xpc->findnodes('.//PAGE:TextEquiv/PAGE:Unicode', $TextLine);
        }

        for my $TextEquiv (@unicode) {
            my $txt = $TextEquiv->textContent;
            $txt =~ s/[\r\n]//g;                   # line breaks are dropped
            $txt =~ s/[\x00-\x08\x0a-\x1f]/ /g;    # other control chars (not TAB) become spaces
            $alto .= qq{<TextLine HEIGHT="$h" WIDTH="$w" VPOS="$ymin" HPOS="$xmin"><String CONTENT="}
                   . xml_quote($txt)
                   . qq{"/></TextLine>};
        }
    }
    return $alto;
}
