Last active
December 29, 2024 07:59
-
-
Save danmackinlay/30484659e5077079441dd0badc84b476 to your computer and use it in GitHub Desktop.
A post-render script for Quarto which generates thumbnails for all .qmd files for which it can guess a suitable image.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
r""" | |
Update doc metadata by parsing HTML version for images, generating thumbnails of the first admissible image, and updating the metadata with the thumbnail URL. | |
""" | |
from pathlib import Path | |
import subprocess | |
import argparse | |
import logging | |
from ruamel.yaml import YAML | |
from bs4 import BeautifulSoup | |
import os | |
import re | |
# Module-level logger; handlers and level are attached later by configure_logging().
logger = logging.getLogger(__name__)


def configure_logging(log_level: str):
    """
    Configures the root logging level and message format.

    Args:
        log_level (str): Name of a logging level, matched case-insensitively
            (e.g. "debug"). Names that do not map to an integer level fall
            back to INFO and a warning is logged.
    """
    numeric_level = getattr(logging, log_level.upper(), None)
    level_is_valid = isinstance(numeric_level, int)
    if not level_is_valid:
        numeric_level = logging.INFO
    logging.basicConfig(level=numeric_level, format='%(levelname)s: %(message)s')
    if not level_is_valid:
        # Warn only after basicConfig so the message goes through the
        # configured handler and format rather than the last-resort handler.
        logger.warning(f"Invalid log level: {log_level}. Defaulting to INFO.")
# Bounding box, in pixels, passed to `vips thumbnail` by create_thumbnail().
THUMB_WIDTH = 320
THUMB_HEIGHT = 320
# Shared ruamel.yaml instance used by read() and write(); typ='rt' requests
# the round-trip loader/dumper.
yaml = YAML(typ='rt')
def read(fname):
    """
    Reads a Quarto/Pandoc markdown file and extracts its metadata and content.

    Two header styles are recognised:

    * YAML front matter delimited by a leading `---` line and a closing
      `---` or `...` line. Keys are lower-cased. An empty front matter
      block yields empty metadata rather than a failure.
    * Leading `key: value` lines. Parsing stops at the first line without
      a colon; a blank separator line is consumed, whereas a non-blank line
      is kept as the first line of content.

    Args:
        fname (Path): Path to the `.qmd` file.

    Returns:
        tuple: A dictionary of metadata and a list of content lines.
            `({}, [])` is returned for an empty file or when reading or
            parsing fails (a warning is logged in the latter case).
    """
    metadata = {}
    try:
        with open(fname, 'r', encoding='utf8') as fp:
            lines = fp.readlines()
        if not lines:
            return {}, []
        if lines[0].strip() == '---':  # YAML header
            # Index of the terminator line; if none is found the whole
            # remainder of the file is treated as front matter.
            header_end = len(lines)
            for i, line in enumerate(lines[1:], start=1):
                if line.strip() in ('---', '...'):
                    header_end = i
                    break
            # yaml.load returns None for an empty document.
            parsed = yaml.load("".join(lines[1:header_end])) or {}
            for k, v in parsed.items():
                metadata[k.lower()] = v
            outlines = lines[header_end + 1:]
        else:
            # Handle files without YAML front matter
            body_start = len(lines)
            for i, line in enumerate(lines):
                name, sep, value = line.partition(':')
                if not sep:
                    # Skip a blank separator, but never drop real content.
                    body_start = i if line.strip() else i + 1
                    break
                metadata[name.lower()] = value.strip()
            outlines = lines[body_start:]
        return metadata, outlines
    except Exception as e:
        logging.warning(f"Failed to read {fname}: {str(e)}")
        return {}, []
def write(fname, metadata, content):
    """
    Writes the YAML metadata and content back to a `.qmd` file.

    The metadata is serialised to memory before the target is opened, so a
    serialisation failure cannot leave a truncated file behind. Failures are
    logged as warnings and not raised.

    Args:
        fname (Path): Path to the `.qmd` file.
        metadata (dict): Metadata to write.
        content (str): Markdown content to write.
    """
    import io  # local import: only needed for the in-memory buffer below

    try:
        buffer = io.StringIO()
        yaml.dump(metadata, buffer)
        front_matter = buffer.getvalue()
        # Opening with 'w' truncates, so do it only once the YAML is ready.
        with open(fname, 'w', encoding='utf8') as fp:
            fp.write('---\n')
            fp.write(front_matter)
            fp.write('---\n')
            fp.write(content)
        logging.info(f"Successfully wrote updated metadata to {fname}")
    except Exception as e:
        logging.warning(f"Failed to write {fname}: {str(e)}")
def get_file_mod_time(path):
    """
    Returns the last-modification timestamp of `path`.

    Args:
        path (Path): Path to the file.

    Returns:
        float: Modification time in seconds since the epoch, or 0.0 when the
            file cannot be stat'ed (for example because it does not exist).
    """
    try:
        modified_at = os.path.getmtime(path)
    except OSError:
        # Treat an unreadable/missing file as "infinitely old".
        modified_at = 0.0
    return modified_at
def create_thumbnail(image_path, thumbnail_path):
    """
    Creates a thumbnail of the given image using the `vips` command-line tool.

    The thumbnail's parent directory is created if necessary. Failures are
    logged as warnings and reported through the return value.

    Args:
        image_path (Path): Path to the original image.
        thumbnail_path (Path): Path where the thumbnail will be saved.

    Returns:
        bool: True if thumbnail creation was successful, False otherwise.
    """
    try:
        # Ensure the parent directory exists
        thumbnail_path.parent.mkdir(parents=True, exist_ok=True)
        cmd = [
            "vips",
            "thumbnail",
            str(image_path),
            str(thumbnail_path),
            str(THUMB_WIDTH),
            "--height", str(THUMB_HEIGHT),
            "--size", "down",  # never upscale
            "--crop", "attention",  # Crop to the interesting bit
        ]
        subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        # Decode leniently: a strict decode could raise inside this handler
        # and escape the function.
        stderr_text = (e.stderr or b"").decode(errors="replace").strip()
        logging.warning(f"Failed to create/update thumbnail for {image_path}: {stderr_text}")
        return False
    except Exception as e:
        logging.warning(f"An error occurred while creating thumbnail for {image_path}: {str(e)}")
        return False
    else:
        logging.info(f"Thumbnail created successfully at {thumbnail_path}")
        return True
def extract_thumbnail_url_html(htmlcontent):
    """
    Extracts the first admissible image URL from HTML content.

    An image is admissible when:
      - its `src` is present and is not a remote (`http://`, `https://`,
        protocol-relative `//`) or inline `data:` URL;
      - it has at least one ancestor `div` with a class containing
        'figure' or 'illustration';
      - none of those ancestor divs carries the class 'foreign' or 'sep'.

    Args:
        htmlcontent (str): The HTML content as a string.

    Returns:
        str or None: The image URL if found, else None (also when the
            document has no `<body>`).
    """
    soup = BeautifulSoup(htmlcontent, 'html.parser')
    if not soup.body:
        return None

    # Match full scheme prefixes so a local file such as "http-flow.png" is
    # not mistaken for a remote URL.
    remote_prefixes = ('http://', 'https://', '//', 'data:')

    for img in soup.find_all('img'):
        img_url = img.get('src')
        if not img_url or img_url.lower().startswith(remote_prefixes):
            logging.debug(f"Skipping image with URL: {img_url}")
            continue

        # Find all parent Divs with 'figure' or 'illustration' classes
        parent_divs = img.find_parents('div', class_=lambda x: x and ('figure' in x or 'illustration' in x))
        if not parent_divs:
            logging.debug(f"No parent Div with 'figure' or 'illustration' classes found for image: {img_url}")
            continue

        # Reject the image if any of those Divs is marked 'foreign' or 'sep'
        excluded_classes = next(
            (
                classes
                for classes in (div.get('class', []) for div in parent_divs)
                if 'foreign' in classes or 'sep' in classes
            ),
            None,
        )
        if excluded_classes is not None:
            logging.debug(f"Skipping image '{img_url}' inside Div with classes: {excluded_classes}")
            continue

        logging.debug(f"Selected image for thumbnail: {img_url}")
        return img_url
    return None
def extract_thumbnail_url_md(content_lines):
    """
    Given the lines of a Quarto/Pandoc markdown file, find the FIRST local
    image in a fenced Div block that:
      - has class .figure
      - does NOT have class .foreign

    Fences of three or more colons are recognised, with or without a space
    before the attribute braces (`:::{.figure}` and `::: {.figure}`).
    Nested Divs are not tracked: the first closing fence ends the block.
    An optional quoted link title and surrounding angle brackets are removed
    from the returned path; remote (`http://`, `https://`, `//`) and `data:`
    targets are skipped.

    Args:
        content_lines (list): List of lines from the markdown content.

    Returns:
        str or None: The image path if found, else None.
    """
    # e.g., :::{#fig-pews .figure .illustration .right}  or  ::: {.figure} :::
    div_start_pattern = re.compile(r'^:{3,}\s*\{([^}]*)\}\s*:*\s*$')
    div_end_pattern = re.compile(r'^:{3,}\s*$')
    image_pattern = re.compile(r'!\[.*?\]\(([^)]+)\)')
    # Destination followed by a quoted title: path "Title" / path 'Title'
    title_pattern = re.compile(r'''^(.*?)\s+(?:"[^"]*"|'[^']*')$''')
    remote_prefixes = ('http://', 'https://', '//', 'data:')

    def parse_div_attrs(attrs_str):
        """
        Parses the attributes string of a Div block.

        Args:
            attrs_str (str): The attributes string, e.g., "#id .class1 .class2".

        Returns:
            dict: Dictionary with "id" and "classes" keys.
        """
        id_ = None
        classes = set()
        for piece in attrs_str.split():
            if piece.startswith('#'):
                id_ = piece[1:]
            elif piece.startswith('.'):
                classes.add(piece[1:])
        return {"id": id_, "classes": classes}

    def image_target(raw):
        """Returns the link destination without its title or angle brackets."""
        target = raw.strip()
        titled = title_pattern.match(target)
        if titled:
            target = titled.group(1).strip()
        if target.startswith('<') and target.endswith('>'):
            target = target[1:-1]
        return target

    in_div_block = False
    div_attrs = None
    block_lines = []
    for idx, line in enumerate(content_lines, start=1):
        if not in_div_block:
            # Look for the start of a DIV
            m = div_start_pattern.match(line)
            if m:
                in_div_block = True
                div_attrs = parse_div_attrs(m.group(1))
                logging.debug(f"Entering DIV block at line {idx} with classes: {div_attrs['classes']}")
                block_lines = []
            continue

        if not div_end_pattern.match(line):
            # Accumulate lines within the DIV block
            block_lines.append(line)
            continue

        # End of DIV block
        logging.debug(f"Exiting DIV block at line {idx} with classes: {div_attrs['classes']}")
        if "figure" in div_attrs["classes"] and "foreign" not in div_attrs["classes"]:
            for bl in block_lines:
                for img_match in image_pattern.finditer(bl):
                    img_path = image_target(img_match.group(1))
                    if not img_path or img_path.lower().startswith(remote_prefixes):
                        logging.debug(f"Skipping image with URL: {img_path}")
                        continue
                    logging.debug(f"Selected image for thumbnail from markdown: {img_path}")
                    return img_path  # Return the file path
        # Reset state
        in_div_block = False
        div_attrs = None
        block_lines = []
    return None
def resolve_image_path(img_url, fname, site_root=Path('.')):
    """
    Maps an image URL to a fully resolved filesystem path.

    URLs starting with `/` are interpreted relative to `site_root`; all
    other URLs are interpreted relative to the directory containing `fname`.

    Args:
        img_url (str): The image URL from the markdown.
        fname (Path): Path to the `.qmd` file.
        site_root (Path): The root directory of the site.

    Returns:
        Path: The resolved filesystem path to the image.
    """
    if not img_url.startswith('/'):
        # Document-relative URL
        candidate = fname.parent / img_url
        logging.debug(f"Resolved relative image URL '{img_url}' to filesystem path '{candidate}'")
        return candidate.resolve()

    # Site-absolute URL: drop the leading slash(es) and anchor at the site root
    candidate = site_root / img_url.lstrip('/')
    logging.debug(f"Resolved absolute image URL '{img_url}' to filesystem path '{candidate}'")
    return candidate.resolve()
def resolve_thumbnail_path(thumbnail_url, fname, site_root=Path('.')):
    """
    Maps a thumbnail URL to a fully resolved filesystem path.

    URLs starting with `/` are interpreted relative to `site_root`; all
    other URLs are interpreted relative to the directory containing `fname`.

    Args:
        thumbnail_url (str): The thumbnail URL to be used in metadata.
        fname (Path): Path to the `.qmd` file.
        site_root (Path): The root directory of the site.

    Returns:
        Path: The resolved filesystem path to the thumbnail.
    """
    if not thumbnail_url.startswith('/'):
        # Document-relative URL
        candidate = fname.parent / thumbnail_url
        logging.debug(f"Resolved relative thumbnail URL '{thumbnail_url}' to filesystem path '{candidate}'")
        return candidate.resolve()

    # Site-absolute URL: drop the leading slash(es) and anchor at the site root
    candidate = site_root / thumbnail_url.lstrip('/')
    logging.debug(f"Resolved absolute thumbnail URL '{thumbnail_url}' to filesystem path '{candidate}'")
    return candidate.resolve()
def massage_one_file(fname, site_root=Path('.')):
    """
    Processes a single `.qmd` file: extracts the first admissible image,
    creates a thumbnail, and updates the `image` metadata key.

    The rendered HTML under `<site_root>/_site` is used for image discovery
    when it is newer than the `.qmd`; otherwise the markdown source is used.
    Only the `.qmd` source is rewritten; the rendered HTML is never modified
    (it has no YAML front matter).

    Args:
        fname (Path): Path to the `.qmd` file.
        site_root (Path): The root directory of the site.
    """
    qmdname = fname.with_suffix('.qmd')
    htmlname = site_root / "_site" / fname.with_suffix('.html')
    metadata, qmd_lines = read(qmdname)

    # Determine which file to prefer based on modification times
    use_html = False
    if htmlname.exists():
        if get_file_mod_time(htmlname) > get_file_mod_time(qmdname):
            use_html = True
            logging.debug(f"HTML file {htmlname} is newer than QMD file {qmdname}. Using HTML for image extraction.")
        else:
            logging.debug(f"HTML file {htmlname} is not newer than QMD file {qmdname}. Using QMD for image extraction.")
    else:
        logging.debug(f"HTML file {htmlname} does not exist. Using QMD for image extraction.")

    if use_html:
        try:
            with open(htmlname, 'r', encoding='utf8') as html_fp:
                html_content = html_fp.read()
            img_url = extract_thumbnail_url_html(html_content)
        except Exception as e:
            logging.warning(f"Failed to read HTML file {htmlname}: {str(e)}")
            img_url = extract_thumbnail_url_md(qmd_lines)
    else:
        img_url = extract_thumbnail_url_md(qmd_lines)

    if img_url is None:
        logging.warning(f"Could not find image URL for {qmdname}")
        return

    # Thumbnail URL has the extension changed to .thumbnail.avif. The
    # extension must not span a '/', and an extensionless URL gets the suffix
    # appended so the thumbnail can never overwrite the source image.
    thumbnail_url, replaced = re.subn(r'\.[^./]+$', '.thumbnail.avif', img_url)
    if not replaced:
        thumbnail_url = img_url + '.thumbnail.avif'

    # Resolve filesystem paths
    img_path = resolve_image_path(img_url, fname, site_root)
    thumbnail_path = resolve_thumbnail_path(thumbnail_url, fname, site_root)
    logging.debug(f"Image path: {img_path}")
    logging.debug(f"Thumbnail path: {thumbnail_path}")

    if get_file_mod_time(thumbnail_path) > get_file_mod_time(img_path):
        logging.info(f"Thumbnail for {img_url} is up to date.")
        # The thumbnail may have been produced for another page that shares
        # this image; still record it here, but leave an existing `image`
        # entry untouched.
        if 'image' in metadata:
            return
    elif not create_thumbnail(img_path, thumbnail_path):
        logging.warning(f"Failed to create thumbnail for {img_url} in {qmdname}")
        return

    if metadata.get('image', None) == thumbnail_url:
        return

    if not metadata and not qmd_lines:
        # read() returns an empty result both for an empty file and on
        # failure; rewriting now could replace the source with front matter only.
        logging.warning(f"Not rewriting {qmdname}: no metadata or content could be read from it")
        return

    metadata['image'] = thumbnail_url
    logging.info(f"{qmdname} metadata updated to include thumbnail {thumbnail_url}.")
    write(qmdname, metadata, "".join(qmd_lines))
def process_files(files: list):
    """
    Main function to process multiple `.qmd` files.

    When `files` is empty, every `*.qmd` below the current directory is
    processed in sorted order, skipping any path with a component (directory
    or file name) that starts with `_` or `.` — e.g. `_site`, `_extensions`,
    `.quarto`.

    Args:
        files (list): List of file paths to process.
    """
    file_list = []
    if files:
        for fname in files:
            path = Path(fname)
            if path.is_file():
                file_list.append(path)
            else:
                logger.warning(f"File not found or is not a file: {fname}")
    else:
        ## If file_list is empty, use default glob
        logger.info(
            "No files provided. Using default glob pattern '**/*.qmd' excluding files starting with '_'."
        )
        # Use rglob to search recursively; sort for a reproducible order
        for path in sorted(Path(".").rglob("*.qmd")):
            if any(part.startswith(("_", ".")) for part in path.parts):
                continue
            file_list.append(path)

    if not file_list:
        logger.warning("No .qmd files found to process.")
        return

    for fname in file_list:
        if fname.is_file():
            logging.info(f"Processing file: {fname}")
            massage_one_file(fname)
def parse_arguments():
    """
    Builds the command-line interface and parses the process arguments.

    Returns:
        argparse.Namespace: `files` (list of str, possibly empty) and
            `log_level` (one of the standard level names, default "INFO").
    """
    level_names = ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]

    cli = argparse.ArgumentParser(description="Update image thumbnails in Quarto/Pandoc markdown files.")
    # Zero or more explicit targets
    cli.add_argument(
        "files",
        help="List of files to process. If omitted, all '*.qmd' files (excluding those starting with '_') will be processed.",
        nargs="*",
    )
    # Verbosity selector
    cli.add_argument(
        "--log-level",
        choices=level_names,
        default="INFO",
        type=str,
        help="Set the logging level (default: INFO).",
    )
    parsed = cli.parse_args()
    return parsed
def main():
    """
    Script entry point: parse arguments, configure logging, then process
    the requested files unless the ON_GITHUB environment variable is set to
    a truthy value ('1', 'true' or 'yes', case-insensitive).
    """
    cli_args = parse_arguments()
    configure_logging(cli_args.log_level)

    on_github = os.getenv('ON_GITHUB', '').lower()
    if on_github not in ('1', 'true', 'yes'):
        process_files(files=cli_args.files)
        return

    # Truthy ON_GITHUB: skip all image processing
    logger.info("Running on GitHub Actions. We cannot process images here, so lets assume nothing to do")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment