Created
February 28, 2020 13:20
-
-
Save jbarth-ubhd/0e867c20008639145386a7978fdb27a4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl
#
# Convert PAGE-XML files (as produced by OCR-D) into minimal ALTO v2 files.
#
# Usage: this-script FILE.xml [FILE.xml ...]
#
# For every argument FILE.xml a sibling FILE.alto.xml is written; arguments
# that already end in ".alto.xml" are skipped.  Arguments without an ".xml"
# suffix get ".alto.xml" appended, so the input file is never overwritten.
#
# Only regions referenced by a RegionRefIndexed element are exported, in the
# document order of those references.  Each exported region becomes a
# <TextBlock>; each line-level TextEquiv/Unicode of a TextLine becomes a
# <TextLine> holding a single <String>.
use strict;
use warnings;
use utf8;
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");
use XML::LibXML;
use File::Slurp;
use XML::Quote;
use List::Util qw(min max);

# QUICK AND DIRTY. NOT FOR PRODUCTION USE.
# for output of OCR-D.

my $prog = $0;

# PAGE namespace used when the document root does not carry a PAGE namespace.
my $DEFAULT_PAGE_NS =
    'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15';

# Any versioned PAGE content namespace; lets other schema versions work too.
my $PAGE_NS_RE = qr{\Ahttp://schema\.primaresearch\.org/PAGE/gts/pagecontent/};

FILE:
for my $xml_fn (@ARGV) {
    # Never treat a previously generated ALTO file as input.
    next FILE if $xml_fn =~ /\.alto\.xml\z/;

    # Reading and parsing share one eval so that an unreadable file is
    # reported and skipped exactly like an unparsable one.
    my $dom = eval {
        my $xml_str = read_file($xml_fn);
        XML::LibXML->load_xml(string => $xml_str);
    };
    if (!$dom) {
        my $err = $@ || 'unknown error';
        warn "$prog: $xml_fn: $err\n";
        next FILE;
    }

    my $root    = $dom->documentElement;
    my $root_ns = $root->namespaceURI;
    my $page_ns = (defined $root_ns && $root_ns =~ $PAGE_NS_RE)
                ? $root_ns
                : $DEFAULT_PAGE_NS;

    my $xpc = XML::LibXML::XPathContext->new($dom);
    $xpc->registerNs('PAGE', $page_ns);

    # "--" is not allowed inside an XML comment; break up any such run.
    (my $generator = $prog) =~ s/-(?=-)/- /g;

    my $out = '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>';
    $out .= '<!-- generated by ' . $generator . ' -->';
    $out .= '<alto xmlns="http://www.loc.gov/standards/alto/ns-v2#">';
    $out .= '<Layout>';

    my $pageNr = 0;
    for my $Page ($xpc->findnodes('/PAGE:PcGts/PAGE:Page', $root)) {
        $pageNr++;
        $out .= page_to_alto($xpc, $Page, $pageNr);
    }
    $out .= '</Layout></alto>';

    # Derive the output name; if there is no ".xml" suffix to replace,
    # append one instead of writing over the input file.
    my $ofn = $xml_fn;
    unless ($ofn =~ s/\.xml\z/.alto.xml/) {
        $ofn .= '.alto.xml';
    }

    open my $f, '>:encoding(UTF-8)', $ofn
        or die "$prog: cannot open $ofn for writing: $!\n";
    print {$f} $out
        or die "$prog: cannot write to $ofn: $!\n";
    # Buffered write errors only surface at close.
    close $f
        or die "$prog: cannot close $ofn: $!\n";
}

# page_to_alto($xpc, $Page, $pageNr)
#
# Returns the ALTO <Page> element (as a string) for one PAGE <Page> node.
#   $xpc    - XPathContext with the 'PAGE' prefix registered
#   $Page   - the PAGE <Page> element node
#   $pageNr - 1-based running number, used for ID and PHYSICAL_IMG_NR
sub page_to_alto {
    my ($xpc, $Page, $pageNr) = @_;

    my $w = xml_quote($Page->getAttribute("imageWidth")  // '');
    my $h = xml_quote($Page->getAttribute("imageHeight") // '');
    my $alto = qq{<Page ID="Page$pageNr" PHYSICAL_IMG_NR="$pageNr" HEIGHT="$h" WIDTH="$w"><PrintSpace>};

    # Index every element of this page by its id once.  This replaces one
    # full-document XPath scan per reference and avoids building XPath
    # expressions out of id strings.
    my %nodes_by_id;
    for my $node ($xpc->findnodes('.//PAGE:*[@id]', $Page)) {
        push @{ $nodes_by_id{ $node->getAttribute("id") } }, $node;
    }

    for my $RegionRefIndexed ($xpc->findnodes('.//PAGE:RegionRefIndexed', $Page)) {
        my $id = $RegionRefIndexed->getAttribute("regionRef");
        next unless defined $id;

        for my $region (@{ $nodes_by_id{$id} || [] }) {
            # Any referenced element is accepted, not only TextRegion
            # (perhaps Table with TextLine?).
            my $outLine = "";
            for my $TextLine ($xpc->findnodes(".//PAGE:TextLine", $region)) {
                $outLine .= text_line_to_alto($xpc, $TextLine);
            }
            # Regions without any text produce no <TextBlock> at all.
            if (length($outLine)) {
                $alto .= '<TextBlock>' . $outLine . '</TextBlock>';
            }
        }
    }

    return $alto . '</PrintSpace></Page>';
}

# text_line_to_alto($xpc, $TextLine)
#
# Returns zero or more ALTO <TextLine> elements (as a string), one per
# line-level TextEquiv/Unicode of the given PAGE <TextLine> node.  Only
# direct TextEquiv children are used, so Word/Glyph level text inside the
# line does not produce additional, duplicated lines.
sub text_line_to_alto {
    my ($xpc, $TextLine) = @_;

    my ($xmin, $ymin, $w, $h) = bounding_box($xpc, $TextLine);

    my $alto = "";
    for my $Unicode ($xpc->findnodes("./PAGE:TextEquiv/PAGE:Unicode", $TextLine)) {
        my $txt = $Unicode->textContent;
        $txt =~ s/[\r\n]//g;                   # drop line breaks entirely
        $txt =~ s/[\x00-\x08\x0a-\x1f]/ /g;    # other controls (except TAB) become a space
        $alto .= qq{<TextLine HEIGHT="$h" WIDTH="$w" VPOS="$ymin" HPOS="$xmin"><String CONTENT="}
               . xml_quote($txt)
               . qq{"/></TextLine>};
    }
    return $alto;
}

# bounding_box($xpc, $node)
#
# Returns ($xmin, $ymin, $width, $height) of the axis-aligned box around the
# polygon(s) in the "points" attribute of the node's own Coords children.
# If the node has no Coords child, Coords of its descendants are used.
# Returns (0, 0, 0, 0) and warns when no usable point is found.
sub bounding_box {
    my ($xpc, $node) = @_;

    my @coord_nodes = $xpc->findnodes("./PAGE:Coords", $node);
    @coord_nodes = $xpc->findnodes(".//PAGE:Coords", $node) unless @coord_nodes;

    my (@xs, @ys);
    for my $coords (@coord_nodes) {
        my $points = $coords->getAttribute("points");
        next unless defined $points;
        # split ' ' ignores leading/trailing whitespace, so no empty pair
        # is ever mistaken for the point 0,0.
        for my $xy (split ' ', $points) {
            next unless $xy =~ /\A(-?\d+(?:\.\d+)?),(-?\d+(?:\.\d+)?)\z/;
            push @xs, $1;
            push @ys, $2;
        }
    }

    unless (@xs) {
        my $id = $node->getAttribute("id") // '?';
        warn "$prog: no usable Coords points for element id=$id\n";
        return (0, 0, 0, 0);
    }

    my $xmin = min(@xs);
    my $ymin = min(@ys);
    return ($xmin, $ymin, max(@xs) - $xmin, max(@ys) - $ymin);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment