Created
June 11, 2024 14:47
-
-
Save bertsky/b82f0400b842761e0e3e92917fd19cfc to your computer and use it in GitHub Desktop.
DFG METS: overwrite PAGE-XML @imageFilename from image fileGrp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import click | |
from ocrd_utils import MIMETYPE_PAGE | |
from ocrd_models.ocrd_mets import OcrdMets | |
from ocrd_models.ocrd_page import to_xml | |
from ocrd_modelfactory import page_from_file | |
@click.command()
@click.option('-m', '--mets-file', default="mets.xml", help="path to METS of workspace")
@click.option('-I', '--input-file-grp', required=True, help="fileGrp to pick image files from")
@click.option('-O', '--output-file-grp', required=True, help="fileGrp to modify PAGE files from")
def cli(mets_file, input_file_grp, output_file_grp):
    """
    Overwrite PAGE-XML `@imageFilename` with the image path from the METS.

    Open the METS file and, for each page that has a PAGE-XML file in the
    output fileGrp, get the local filename of the respective image file in
    the input fileGrp, open the PAGE-XML file and overwrite its
    `@imageFilename` with the local path of the image file in the METS.
    Finally serialise the modified PAGE-XML file in place.

    Pages without a usable image or PAGE-XML local filename are skipped
    with a warning on stderr.
    """
    # os.path.dirname() returns '' for a bare filename (such as the default
    # "mets.xml"), and os.chdir('') raises FileNotFoundError. Only change
    # directory when the path actually has a directory component.
    workspace_dir = os.path.dirname(mets_file)
    if workspace_dir:
        os.chdir(workspace_dir)
    # Local filenames taken from the METS are used as-is below, i.e. they are
    # resolved against the current directory (the directory of the METS).
    mets = OcrdMets(filename=os.path.basename(mets_file))

    # pageId -> local filename; if a fileGrp holds several matching files
    # for the same page, the last one returned wins.
    page2image = {ocrd_file.pageId: ocrd_file.local_filename
                  for ocrd_file in mets.find_all_files(fileGrp=input_file_grp)
                  if ocrd_file.mimetype.startswith('image/')}
    page2PAGE = {ocrd_file.pageId: ocrd_file.local_filename
                 for ocrd_file in mets.find_all_files(fileGrp=output_file_grp)
                 if ocrd_file.mimetype == MIMETYPE_PAGE}

    for page, page_path in page2PAGE.items():
        if not page_path:
            # NOTE(review): presumably a file that is only referenced
            # remotely has no local filename -- confirm against OcrdFile.
            click.echo("skipping page '%s': PAGE file in fileGrp '%s' has no local filename"
                       % (page, output_file_grp), err=True)
            continue
        img = page2image.get(page)
        if not img:
            # Previously this was an unexplained KeyError that aborted the
            # whole run; report it and carry on with the remaining pages.
            click.echo("skipping page '%s': no local image file in fileGrp '%s'"
                       % (page, input_file_grp), err=True)
            continue
        pcgts = page_from_file(page_path)
        pcgts.get_Page().set_imageFilename(img)
        with open(page_path, 'w', encoding='utf-8') as f:
            f.write(to_xml(pcgts))
# Script entry point: run the click command only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    cli()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment