Last active
February 4, 2025 20:03
-
-
Save mjordan/320befe368809e172f142426c50eee64 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Script to generate tiles for an image viewed using the Mirador Viewer. Also takes a screenshot | |
of the page for quick QA of results. Input is a file containing a list of node IDs for nodes with | |
an Islandora Model of Page or Image. | |
This script should be run on a batch of Page or Image nodes prior to running generate_paged_content_iiif_manifests.py | |
since IIIF Presentation manifest generation is much more reliable if the constituent images have already been pre-cached. | |
Note: IIIF Presentation manifests for nodes with a Paged Content model are pre-cached by a different script, | |
generate_paged_content_iiif_manifests.py. | |
Usage: python tile_warmer.py node_ids.txt # Where node_ids.txt is a file containing a list of node IDs, one per line. | |
""" | |
import sys | |
import os | |
from time import sleep | |
import re | |
import logging | |
from pathlib import Path | |
from concurrent.futures import ThreadPoolExecutor | |
import random | |
from selenium import webdriver | |
from selenium.webdriver.chrome.options import Options | |
import cv2 | |
import numpy as np | |
# Path to the input file (one node ID per line) is the first CLI argument.
# Exit with a usage message instead of an IndexError traceback when missing.
if len(sys.argv) < 2:
    sys.exit("Usage: python tile_warmer.py node_ids.txt")
url_input_csv_filename = sys.argv[1].strip()
#################################
### Configuration variables. ####
#################################
# Log file is named after the input file, e.g. node_ids.txt -> node_ids.log,
# and is written to the current working directory.
log_file_path = Path(url_input_csv_filename).stem + ".log"
# screenshots_dir_path must exist; the script does not create it.
screenshots_dir_path = "/tmp/screenshots"
# Base URL of the Islandora site whose nodes will be rendered.
base_url = "https://digital-ps.lib.sfu.ca"
# Set to False to process the node IDs in the order they appear in the input
# file, True to shuffle the list of node IDs before iterating through it.
randomize_input = True
# Maximum number of threads the script is allowed to use. Increasing
# this number will add more load to the server.
max_workers = 3
# Seconds to pause after page load to allow the tiles to be generated. It's
# also at this point in time that the screenshot is taken.
sleep_length = 35
# % of pixels in image within gray range. We start with 75% gray pixels
# and decrease it to 60% for nodes we need to retry to account for partially
# tiled images.
large_gray_area_threshold = 75
large_gray_area_threshold_reduced = 60
# Define the gray range (pure black is 0, pure white is 255). Pixels with
# grayscale values in [lower_gray, upper_gray] count as "gray" (the empty
# viewer background is a light gray near white).
lower_gray = 235
upper_gray = 255
logging.basicConfig(
    filename=log_file_path,
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%d-%b-%y %H:%M:%S",
)
# Headless Chrome so the script can run on a server with no display.
chrome_options = Options()
chrome_options.add_argument("--headless=new")
chrome_options.add_argument("--start-maximized")
# We track nodes that didn't get fully tiled the first time around. Second and
# third time around we test the render using the gray threshold defined in
# large_gray_area_threshold_reduced. NOTE(review): this list is shared across
# worker threads without a lock — presumably acceptable at max_workers = 3.
nodes_to_retile = list()
################# | |
### Functions ### | |
################# | |
def mirador_is_empty(screenshot_file_path, url):
    """Attempt to determine if the screenshot contains an empty
    (i.e. all gray) Mirador Viewer by calculating the ratio
    of gray pixels to the total number of pixels in the cropped image.

    Parameters
    ----------
    screenshot_file_path : str
        Path to the PNG screenshot saved by render_node().
    url : str
        Node URL; used to pick the (reduced) gray threshold for retries.

    Returns True if the image did not appear to be fully tiled
    (i.e., the image is mostly gray pixels), False if it did
    appear to be successfully tiled (i.e., the image is not mostly
    gray pixels).
    """
    # Load the image. cv2.imread() returns None (it does not raise) when the
    # file is missing or unreadable, e.g. if render_node() hit an error before
    # saving the screenshot. Treat that as "not tiled" so the node is retried
    # instead of crashing the worker thread.
    image = cv2.imread(screenshot_file_path)
    if image is None:
        logging.error(
            f"Could not read screenshot {screenshot_file_path} for {url}; treating tiling as incomplete."
        )
        return True
    # Convert it to grayscale.
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Get the width of the image.
    width = gray_image.shape[1]
    # Crop the image to a width x width square from the top-left to remove
    # stuff below the Mirador Viewer, making for a more accurate proportion
    # of gray pixels.
    gray_image_cropped = gray_image[0:width, 0:width]
    # Apply thresholding to get the gray areas (threshold value is unused).
    _, gray_mask = cv2.threshold(
        gray_image_cropped, lower_gray, upper_gray, cv2.THRESH_BINARY
    )
    # Get the total number of pixels in the image.
    total_pixels = gray_image_cropped.size
    # Get the number of gray (within the range) pixels in the image.
    gray_pixel_count = np.sum(gray_mask == 255)
    # Get the percentage of gray area in the image.
    gray_area_percentage = (gray_pixel_count / total_pixels) * 100
    # Nodes being retried are judged against the reduced threshold so that
    # partially tiled images can still pass.
    if url in nodes_to_retile:
        gray_threshold = large_gray_area_threshold_reduced
    else:
        gray_threshold = large_gray_area_threshold
    return gray_area_percentage > gray_threshold
def render_node(url, screenshot_file_path):
    """Hits the node with Chrome (via Selenium) to trigger Cantaloupe to
    generate and cache the tiles, then saves a screenshot of the page.

    Parameters
    ----------
    url : str
        Full URL of the node to render.
    screenshot_file_path : str
        Path to write the PNG screenshot to.

    Errors are logged, not raised.
    """
    driver = None
    try:
        driver = webdriver.Chrome(options=chrome_options)
        driver.get(url)
        # Pause to give the IIIF server time to generate the tiles before
        # the screenshot is taken.
        sleep(sleep_length)
        # Resize the window to the full document so the screenshot captures
        # the whole page, not just the initial viewport.
        required_width = driver.execute_script(
            "return document.documentElement.scrollWidth"
        )
        required_height = driver.execute_script(
            "return document.documentElement.scrollHeight"
        )
        driver.set_window_size(required_width, required_height)
        driver.save_screenshot(screenshot_file_path)
    except Exception as e:
        logging.error(
            f"Attempt to generate IIIF tiles for {url} encountered an error: {e}"
        )
    finally:
        # Always quit the driver, even on error, so headless Chrome
        # processes are not leaked across the many nodes processed.
        if driver is not None:
            driver.quit()
def warm_url(node_id):
    """Processes a single node by hitting it with Selenium. If the resulting
    screenshot shows that the tiling was incomplete, retry it (up to three
    renders in total).
    """
    url = f"{base_url}/node/{node_id}"
    print(f"Warming image tiles for {url}.")
    # Derive a filesystem-safe screenshot filename from the URL.
    safe_name = re.sub("[^0-9a-zA-Z]+", "_", url)
    screenshot_file_path = os.path.join(screenshots_dir_path, safe_name + ".png")
    logging.info(f"Warming image tiles for {url}.")
    # Attempt 1.
    render_node(url, screenshot_file_path)
    if mirador_is_empty(screenshot_file_path, url) is not True:
        # Success on the first render. If this node had been flagged for
        # retiling earlier, record the recovery and clear the flag.
        if url in nodes_to_retile:
            logging.info(
                f"Screenshot for rewarmed node {url} shows tiling was complete."
            )
            nodes_to_retile.remove(url)
        return
    logging.warning(
        f"Screenshot for {url} shows tiling was incomplete. Will try a second time."
    )
    if url not in nodes_to_retile:
        nodes_to_retile.append(url)
    # Attempt 2 (judged against the reduced gray threshold).
    render_node(url, screenshot_file_path)
    if mirador_is_empty(screenshot_file_path, url) is not True:
        logging.info(
            f"Screenshot for {url} shows second attempt at tiling was complete."
        )
        return
    logging.warning(
        f"Screenshot for {url} shows second attempt at tiling was incomplete."
    )
    # Attempt 3, the final one.
    render_node(url, screenshot_file_path)
    if mirador_is_empty(screenshot_file_path, url) is True:
        logging.error(
            f"Screenshot for {url} shows third and final attempt at tiling was incomplete."
        )
    else:
        logging.info(
            f"Screenshot for {url} shows third attempt at tiling was complete."
        )
def warm_all_urls(nids):
    """Thread pool manager. Walks through the list of node IDs and assigns
    threads to process up to max_workers nodes at a time.

    Uses submit() rather than map() because map()'s result iterator was never
    consumed, so any exception raised inside warm_url() was silently
    discarded. Here each future is checked and failures are logged.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(warm_url, nid): nid for nid in nids}
        for future, nid in futures.items():
            # Future.exception() blocks until the task finishes.
            exc = future.exception()
            if exc is not None:
                logging.error(
                    f"Unhandled error while warming tiles for node {nid}: {exc}"
                )
########################## | |
### Main script logic. ### | |
########################## | |
if __name__ == "__main__":
    logging.info(
        f"Starting tile warmer job, processing {max_workers} nodes at a time. Chrome will wait {sleep_length} seconds per node before taking a screenshot."
    )
    # Read the list of node IDs, one per line.
    with open(url_input_csv_filename) as input_fh:
        input_lines = input_fh.read().splitlines()
    # Optionally shuffle the processing order.
    if randomize_input:
        input_lines = random.sample(input_lines, len(input_lines))
    # Keep only numeric lines; this drops the 'nid' column header (and any
    # blank lines) from the input file.
    nids_to_warm = [line for line in input_lines if line.isnumeric()]
    warm_all_urls(nids_to_warm)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment