Last active
February 4, 2025 20:03
-
-
Save mjordan/320befe368809e172f142426c50eee64 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Script to generate tiles for an image viewed using the Mirador Viewer. Also takes a screenshot | |
of the page for quick QA of results. Input is a file containing a list of node IDs for nodes with | |
an Islandora Model of Page or Image. | |
This script should be run on a batch of Page or Image nodes prior to running generate_paged_content_iiif_manifests.py | |
since IIIF Presentation manifest generation is much more reliable if the constituent images have already been pre-cached. | |
Note: IIIF Presentation manifests for nodes with a Paged Content model are pre-cached by a different script, | |
generate_paged_content_iiif_manifests.py. | |
Usage: python tile_warmer.py node_ids.txt # Where node_ids.txt is a file containing a list of node IDs, one per line. | |
""" | |
import sys | |
import os | |
from time import sleep | |
import re | |
import logging | |
from pathlib import Path | |
from concurrent.futures import ThreadPoolExecutor | |
import random | |
from selenium import webdriver | |
from selenium.webdriver.chrome.options import Options | |
import cv2 | |
import numpy as np | |
# Path to the input file (one node ID per line) is the first CLI argument.
# Exit with a usage message instead of an IndexError traceback when missing.
if len(sys.argv) < 2:
    sys.exit("Usage: python tile_warmer.py node_ids.txt")
url_input_csv_filename = sys.argv[1].strip()
#################################
### Configuration variables. ####
#################################
# Log file is named after the input file, e.g. node_ids.txt -> node_ids.log,
# and is written to the current working directory.
log_file_path = Path(url_input_csv_filename).stem + ".log"
# screenshots_dir_path must exist; the script does not create it.
screenshots_dir_path = "/tmp/screenshots"
# Base URL of the Islandora site whose nodes will be rendered.
base_url = "https://digital-ps.lib.sfu.ca"
# Set to False to process the node IDs in the order they appear in the input
# file, True to shuffle the list of node IDs before iterating through it.
randomize_input = True
# Maximum number of threads the script is allowed to use. Increasing
# this number will add more load to the server.
max_workers = 3
# Seconds to pause after page load to allow the tiles to be generated. It's
# also at this point in time that the screenshot is taken.
sleep_length = 35
# % of pixels in image within gray range. We start with 75% gray pixels
# and decrease it to 60% for nodes we need to retry to account for partially
# tiled images.
large_gray_area_threshold = 75
large_gray_area_threshold_reduced = 60
# Define the gray range (pure black is 0, pure white is 255). Pixels with
# grayscale values in [lower_gray, upper_gray] count as "gray" (the empty
# viewer background is a light gray near white).
lower_gray = 235
upper_gray = 255
logging.basicConfig(
    filename=log_file_path,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%d-%b-%y %H:%M:%S",
)
# Headless Chrome so the script can run on a server with no display.
chrome_options = Options()
chrome_options.add_argument("--headless=new")
chrome_options.add_argument("--start-maximized")
# We track nodes that didn't get fully tiled the first time around. Second and
# third time around we test the render using the gray threshold defined in
# large_gray_area_threshold_reduced. NOTE(review): this list is shared across
# worker threads without a lock — presumably acceptable at max_workers = 3.
nodes_to_retile = list()
################# | |
### Functions ### | |
################# | |
def mirador_is_empty(screenshot_file_path, url):
    """Attempt to determine if the screenshot contains an empty
    (i.e. all gray) Mirador Viewer by calculating the ratio
    of gray pixels to the total number of pixels in the cropped image.

    Parameters
    ----------
    screenshot_file_path : str
        Path to the PNG screenshot saved by render_node().
    url : str
        Node URL; used to pick the (reduced) gray threshold for retries.

    Returns True if the image did not appear to be fully tiled
    (i.e., the image is mostly gray pixels), False if it did
    appear to be successfully tiled (i.e., the image is not mostly
    gray pixels).
    """
    # Load the image. cv2.imread() returns None (it does not raise) when the
    # file is missing or unreadable, e.g. if render_node() hit an error before
    # saving the screenshot. Treat that as "not tiled" so the node is retried
    # instead of crashing the worker thread.
    image = cv2.imread(screenshot_file_path)
    if image is None:
        logging.error(
            f"Could not read screenshot {screenshot_file_path} for {url}; treating tiling as incomplete."
        )
        return True
    # Convert it to grayscale.
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Get the width of the image.
    width = gray_image.shape[1]
    # Crop the image to a width x width square from the top-left to remove
    # stuff below the Mirador Viewer, making for a more accurate proportion
    # of gray pixels.
    gray_image_cropped = gray_image[0:width, 0:width]
    # Apply thresholding to get the gray areas (threshold value is unused).
    _, gray_mask = cv2.threshold(
        gray_image_cropped, lower_gray, upper_gray, cv2.THRESH_BINARY
    )
    # Get the total number of pixels in the image.
    total_pixels = gray_image_cropped.size
    # Get the number of gray (within the range) pixels in the image.
    gray_pixel_count = np.sum(gray_mask == 255)
    # Get the percentage of gray area in the image.
    gray_area_percentage = (gray_pixel_count / total_pixels) * 100
    # Nodes being retried are judged against the reduced threshold so that
    # partially tiled images can still pass.
    if url in nodes_to_retile:
        gray_threshold = large_gray_area_threshold_reduced
    else:
        gray_threshold = large_gray_area_threshold
    return gray_area_percentage > gray_threshold
def render_node(url, screenshot_file_path):
    """Hits the node with Chrome (via Selenium) to trigger Cantaloupe to
    generate and cache the tiles, then saves a screenshot of the page.

    Parameters
    ----------
    url : str
        Full URL of the node to render.
    screenshot_file_path : str
        Path to write the PNG screenshot to.

    Errors are logged, not raised.
    """
    driver = None
    try:
        driver = webdriver.Chrome(options=chrome_options)
        driver.get(url)
        # Pause to give the IIIF server time to generate the tiles before
        # the screenshot is taken.
        sleep(sleep_length)
        # Resize the window to the full document so the screenshot captures
        # the whole page, not just the initial viewport.
        required_width = driver.execute_script(
            "return document.documentElement.scrollWidth"
        )
        required_height = driver.execute_script(
            "return document.documentElement.scrollHeight"
        )
        driver.set_window_size(required_width, required_height)
        driver.save_screenshot(screenshot_file_path)
    except Exception as e:
        logging.error(
            f"Attempt to generate IIIF tiles for {url} encountered an error: {e}"
        )
    finally:
        # Always quit the driver, even on error, so headless Chrome
        # processes are not leaked across the many nodes processed.
        if driver is not None:
            driver.quit()
def warm_url(node_id):
    """Processes a single node by hitting it with Selenium. If the resulting
    screenshot shows that the tiling was incomplete, retry it (up to three
    renders in total).
    """
    url = f"{base_url}/node/{node_id}"
    print(f"Warming image tiles for {url}.")
    # Derive a filesystem-safe screenshot filename from the URL.
    safe_name = re.sub("[^0-9a-zA-Z]+", "_", url)
    screenshot_file_path = os.path.join(screenshots_dir_path, safe_name + ".png")
    logging.info(f"Warming image tiles for {url}.")
    # Attempt 1.
    render_node(url, screenshot_file_path)
    if mirador_is_empty(screenshot_file_path, url) is not True:
        # Success on the first render. If this node had been flagged for
        # retiling earlier, record the recovery and clear the flag.
        if url in nodes_to_retile:
            logging.info(
                f"Screenshot for rewarmed node {url} shows tiling was complete."
            )
            nodes_to_retile.remove(url)
        return
    logging.warning(
        f"Screenshot for {url} shows tiling was incomplete. Will try a second time."
    )
    if url not in nodes_to_retile:
        nodes_to_retile.append(url)
    # Attempt 2 (judged against the reduced gray threshold).
    render_node(url, screenshot_file_path)
    if mirador_is_empty(screenshot_file_path, url) is not True:
        logging.info(
            f"Screenshot for {url} shows second attempt at tiling was complete."
        )
        return
    logging.warning(
        f"Screenshot for {url} shows second attempt at tiling was incomplete."
    )
    # Attempt 3, the final one.
    render_node(url, screenshot_file_path)
    if mirador_is_empty(screenshot_file_path, url) is True:
        logging.error(
            f"Screenshot for {url} shows third and final attempt at tiling was incomplete."
        )
    else:
        logging.info(
            f"Screenshot for {url} shows third attempt at tiling was complete."
        )
def warm_all_urls(nids):
    """Thread pool manager. Walks through the list of node IDs and assigns
    threads to process up to max_workers nodes at a time.

    Uses submit() rather than map() because map()'s result iterator was never
    consumed, so any exception raised inside warm_url() was silently
    discarded. Here each future is checked and failures are logged.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(warm_url, nid): nid for nid in nids}
        for future, nid in futures.items():
            # Future.exception() blocks until the task finishes.
            exc = future.exception()
            if exc is not None:
                logging.error(
                    f"Unhandled error while warming tiles for node {nid}: {exc}"
                )
########################## | |
### Main script logic. ### | |
########################## | |
if __name__ == "__main__":
    logging.info(
        f"Starting tile warmer job, processing {max_workers} nodes at a time. Chrome will wait {sleep_length} seconds per node before taking a screenshot."
    )
    # Read the list of node IDs, one per line.
    with open(url_input_csv_filename) as input_fh:
        input_lines = input_fh.read().splitlines()
    # Optionally shuffle the processing order.
    if randomize_input:
        input_lines = random.sample(input_lines, len(input_lines))
    # Keep only numeric lines; this drops the 'nid' column header (and any
    # blank lines) from the input file.
    nids_to_warm = [line for line in input_lines if line.isnumeric()]
    warm_all_urls(nids_to_warm)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment