Skip to content

Instantly share code, notes, and snippets.

@jbarth-ubhd
Created January 14, 2022 12:46
Show Gist options
  • Save jbarth-ubhd/4826031b9de3b9c394be0da40bee14b6 to your computer and use it in GitHub Desktop.
Save jbarth-ubhd/4826031b9de3b9c394be0da40bee14b6 to your computer and use it in GitHub Desktop.
minimalistic ABBYY XML to PAGE XML
#!/usr/bin/perl
use strict;
use warnings;
use utf8;
use XML::LibXML;
use XML::Quote;
# abbyy2page.pl -- minimal ABBYY FineReader XML to PAGE XML converter.
#
# Usage: abbyy2page.pl ABBYY.xml > PAGE.xml
#
# Every <block> of every <page> becomes a pc:TextRegion, every <line> inside
# it becomes a pc:TextLine whose text is the concatenated text content of
# that line. Regions and lines are axis-aligned rectangles built from the
# l/t/r/b attributes. The result is written to STDOUT as UTF-8.
binmode STDOUT, ":utf8";

# Return attribute $name of $node, or 0 when the attribute is absent.
# (%d already rendered a missing attribute as 0; doing it explicitly keeps
# the output identical without "uninitialized value" warnings.)
sub attr_or_zero {
    my ($node, $name) = @_;
    my $value = $node->getAttribute($name);
    return defined $value ? $value : 0;
}

# Build the PAGE "points" string (clockwise from top-left) for the rectangle
# described by the l/t/r/b attributes of $node.
sub rect_points {
    my ($node) = @_;
    my ($left, $top, $right, $bottom)
        = map { attr_or_zero($node, $_) } qw(l t r b);
    return sprintf "%d,%d %d,%d %d,%d %d,%d",
        $left,  $top,
        $right, $top,
        $right, $bottom,
        $left,  $bottom;
}

die "usage: $0 abbyy.xml\n" unless @ARGV;
my $source = $ARGV[0];

my $dom  = XML::LibXML->load_xml(location => $source);
my $root = $dom->documentElement;

# "--" is not allowed inside an XML comment, so break up any run of hyphens
# in the file name before embedding it.
(my $source_label = $source) =~ s/-(?=-)/- /g;

print qq{<?xml version="1.0" encoding="UTF-8"?>
<!-- NOT FOR ARCHIVAL PURPOSE, converted from $source_label -->
<pc:PcGts xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd" pcGtsId="OCR-D-OCR_00001">\n};

# Created and LastChange share one UTC timestamp taken at conversion time.
my ($sec, $min, $hour, $mday, $mon, $year) = gmtime(time);
my $timestamp = sprintf "%04d-%02d-%02dT%02d:%02d:%02d",
    $year + 1900, $mon + 1, $mday, $hour, $min, $sec;

print "<pc:Metadata>\n";
print "<pc:Creator>abbyy2page.pl</pc:Creator>\n";
print "<pc:Created>$timestamp</pc:Created>\n";
print "<pc:LastChange>$timestamp</pc:LastChange>\n";
print "</pc:Metadata>\n";

# Element tests use local-name() because an unprefixed XPath name only
# matches elements in *no* namespace; ABBYY files normally declare a default
# namespace, and this form matches with or without one.
for my $page ($root->findnodes("*[local-name()='page']")) {
    printf qq{<pc:Page imageWidth="%d" imageHeight="%d" imageFilename="xxx.tif">\n},
        attr_or_zero($page, "width"), attr_or_zero($page, "height");

    my @blocks = $page->findnodes(".//*[local-name()='block']");

    # One reference per region, numbered 1..N to match the region ids below.
    # (The count is scalar @blocks; $#blocks would drop the last region.)
    print qq{<pc:ReadingOrder><pc:OrderedGroup id="orderedGroup">\n};
    for my $index (1 .. scalar @blocks) {
        print qq{<pc:RegionRefIndexed index="$index" regionRef="region$index"/>\n};
    }
    print "</pc:OrderedGroup></pc:ReadingOrder>\n";

    my $region_no = 0;
    for my $block (@blocks) {
        $region_no++;
        my $region_id = "region$region_no";

        print qq{<pc:TextRegion id="$region_id">\n};
        print qq{<pc:Coords points="}, rect_points($block), qq{"/>\n};

        my $line_no = 0;
        for my $line ($block->findnodes(".//*[local-name()='line']")) {
            $line_no++;

            print qq{<pc:TextLine id="${region_id}-line${line_no}">\n};
            print qq{<pc:Coords points="}, rect_points($line), qq{"/>\n};
            print "<pc:TextEquiv><pc:Unicode>",
                xml_quote($line->textContent),
                "</pc:Unicode></pc:TextEquiv>\n";
            print "</pc:TextLine>\n";
        }

        print "</pc:TextRegion>\n";
    }

    print "</pc:Page>\n";
}

print "</pc:PcGts>\n";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment