Created
January 14, 2022 12:46
-
-
Save jbarth-ubhd/4826031b9de3b9c394be0da40bee14b6 to your computer and use it in GitHub Desktop.
minimalistic ABBYY XML to PAGE XML
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl
# Minimalistic converter from ABBYY XML to PAGE XML.
#
# Usage: abbyy2page.pl INPUT.xml > OUTPUT.xml
#
# Loads the XML file named by the first command-line argument and prints a
# PAGE XML document to STDOUT. Every <block> below a <page> becomes a
# TextRegion, and every <line> below a block becomes a TextLine holding the
# line's text content. Coordinates are the rectangles described by the
# l/t/r/b attributes of the source elements.
use strict;
use warnings;
use utf8;
use XML::LibXML;
use XML::Quote;

binmode STDOUT, ":utf8";

# Returns the PAGE "points" string for the rectangle given by the l/t/r/b
# attributes of $node, in the order top-left, top-right, bottom-right,
# bottom-left.
sub _rect_points {
    my ($node) = @_;
    my ($left, $top, $right, $bottom)
        = map { $node->getAttribute($_) } qw(l t r b);
    return sprintf '%d,%d %d,%d %d,%d %d,%d',
        $left, $top, $right, $top, $right, $bottom, $left, $bottom;
}

my $input_file = $ARGV[0];
die "usage: $0 ABBYY.xml > PAGE.xml\n" unless defined $input_file;

my $dom  = XML::LibXML->load_xml(location => $input_file);
my $root = $dom->documentElement;

# "--" must not appear inside an XML comment, so break up any run of hyphens
# in the file name before echoing it into the header comment.
(my $comment_name = $input_file) =~ s/-(?=-)/- /g;

print qq{<?xml version="1.0" encoding="UTF-8"?>\n};
print qq{<!-- NOT FOR ARCHIVAL PURPOSE, converted from $comment_name -->\n};
print qq{<pc:PcGts xmlns:pc="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd" pcGtsId="OCR-D-OCR_00001">\n};

# Created and LastChange both carry the conversion time (UTC).
my ($sec, $min, $hour, $mday, $mon, $year) = gmtime(time);
my $timestamp = sprintf '%04d-%02d-%02dT%02d:%02d:%02d',
    $year + 1900, $mon + 1, $mday, $hour, $min, $sec;

print qq{<pc:Metadata>\n};
print qq{<pc:Creator>abbyy2page.pl</pc:Creator>\n};
print qq{<pc:Created>$timestamp</pc:Created>\n};
print qq{<pc:LastChange>$timestamp</pc:LastChange>\n};
print qq{</pc:Metadata>\n};

# NOTE(review): the XPath expressions use unprefixed names, which only match
# elements that are in no namespace -- confirm the input files do not declare
# a default namespace.
for my $page ($root->findnodes("page")) {
    # imageFilename is a fixed placeholder; the source file name of the image
    # is not taken from the input.
    printf qq{<pc:Page imageWidth="%d" imageHeight="%d" imageFilename="xxx.tif">\n},
        $page->getAttribute("width"), $page->getAttribute("height");

    my @blocks = $page->findnodes(".//block");

    # Regions are numbered 1..N, so the reading order must list all N of
    # them (comparing against $#blocks would drop the last region). An
    # OrderedGroup without any entry is skipped entirely.
    if (@blocks) {
        print qq{<pc:ReadingOrder><pc:OrderedGroup id="orderedGroup">\n};
        for my $region_no (1 .. scalar @blocks) {
            printf qq{<pc:RegionRefIndexed index="%d" regionRef="region%d"/>\n},
                $region_no, $region_no;
        }
        print qq{</pc:OrderedGroup></pc:ReadingOrder>\n};
    }

    my $region_no = 0;
    for my $block (@blocks) {
        $region_no++;
        printf qq{<pc:TextRegion id="region%d">\n}, $region_no;
        printf qq{<pc:Coords points="%s"/>\n}, _rect_points($block);

        my $line_no = 0;
        for my $line ($block->findnodes(".//line")) {
            $line_no++;
            printf qq{<pc:TextLine id="region%d-line%d">\n}, $region_no, $line_no;
            printf qq{<pc:Coords points="%s"/>\n}, _rect_points($line);
            # The text is escaped so markup characters in the recognised
            # text cannot break the output document.
            printf qq{<pc:TextEquiv><pc:Unicode>%s</pc:Unicode></pc:TextEquiv>\n},
                xml_quote($line->textContent);
            print qq{</pc:TextLine>\n};
        }

        print qq{</pc:TextRegion>\n};
    }

    print qq{</pc:Page>\n};
}

print "</pc:PcGts>\n";
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment