Skip to content

Instantly share code, notes, and snippets.

@danmackinlay
Last active December 29, 2024 07:59
Show Gist options
  • Save danmackinlay/30484659e5077079441dd0badc84b476 to your computer and use it in GitHub Desktop.
Save danmackinlay/30484659e5077079441dd0badc84b476 to your computer and use it in GitHub Desktop.
A post_render script for Quarto which generates thumbnails for all .qmd files for which it can guess a suitable image.
#!/usr/bin/env python
r"""
Update doc metadata by parsing HTML version for images, generating thumbnails of the first admissible image, and updating the metadata with the thumbnail URL.
"""
from pathlib import Path
import subprocess
import argparse
import logging
from ruamel.yaml import YAML
from bs4 import BeautifulSoup
import os
import re
# Module-level logger. It is created unconfigured here; the root logging
# level and format are only set once configure_logging() is called.
logger = logging.getLogger(__name__)
def configure_logging(log_level: str):
    """
    Set up root logging at the requested verbosity.

    Args:
        log_level (str): Name of a logging level, matched case-insensitively
            against the attributes of the ``logging`` module. A name that
            does not resolve to an integer falls back to INFO with a warning.
    """
    candidate = getattr(logging, log_level.upper(), None)
    if isinstance(candidate, int):
        chosen_level = candidate
    else:
        logger.warning(f"Invalid log level: {log_level}. Defaulting to INFO.")
        chosen_level = logging.INFO
    logging.basicConfig(format='%(levelname)s: %(message)s', level=chosen_level)
# Thumbnail bounding box handed to `vips thumbnail` by create_thumbnail():
# THUMB_WIDTH is the positional width argument, THUMB_HEIGHT goes to `--height`.
THUMB_WIDTH = 320
THUMB_HEIGHT = 320
# Shared ruamel.yaml instance (typ='rt') used by read() to parse front matter
# and by write() to dump it back.
yaml = YAML(typ='rt')
def read(fname):
    """
    Reads a Quarto/Pandoc markdown file and extracts its metadata and content.

    Two layouts are understood:

    * A YAML header delimited by a leading ``---`` line and a closing ``---``
      or ``...`` line. Everything after the closing line is content.
    * No YAML header: leading ``key: value`` lines are taken as metadata.
      A blank line ending that run is treated as a separator and dropped;
      any other line ending it is the first line of content and is kept.

    Metadata keys that are strings are lower-cased.

    Args:
        fname (Path): Path to the `.qmd` file.

    Returns:
        tuple: A dictionary of metadata and a list of content lines. On any
        failure (missing file, invalid YAML, ...) a warning is logged and
        ``({}, [])`` is returned.
    """
    try:
        with open(fname, 'r', encoding='utf8') as fp:
            lines = fp.readlines()
        if not lines:
            return {}, []

        metadata = {}
        if lines[0].strip() == '---':  # YAML header
            # Locate the terminator; without one the whole remainder is
            # treated as header and there is no content.
            header_end = len(lines)
            for i, line in enumerate(lines[1:], start=1):
                if line.strip() in ('---', '...'):
                    header_end = i
                    break
            # An empty header parses to None; treat it as "no metadata"
            # instead of failing the whole read.
            parsed = yaml.load("".join(lines[1:header_end])) or {}
            for k, v in parsed.items():
                metadata[k.lower() if isinstance(k, str) else k] = v
            return metadata, lines[header_end + 1:]

        # Handle files without YAML front matter
        content_start = len(lines)
        for i, line in enumerate(lines):
            name, sep, value = line.partition(':')
            if not sep:
                # Only swallow the terminating line when it is a blank
                # separator; otherwise it is real content and must survive.
                content_start = i if line.strip() else i + 1
                break
            metadata[name.lower()] = value.strip()
        return metadata, lines[content_start:]
    except Exception as e:
        logging.warning(f"Failed to read {fname}: {str(e)}")
        return {}, []
def write(fname, metadata, content):
    """
    Writes the YAML metadata and content back to a `.qmd` file.

    The complete document is serialized in memory before the target file is
    opened, so a failure while dumping the metadata cannot leave the file
    truncated.

    Args:
        fname (Path): Path to the `.qmd` file.
        metadata (dict): Metadata to write between the `---` fences.
        content (str): Markdown content to write after the header.

    Errors are logged as warnings rather than raised.
    """
    import io

    try:
        buffer = io.StringIO()
        buffer.write('---\n')
        yaml.dump(metadata, buffer)
        buffer.write('---\n')
        buffer.write(content)
        # Only touch the file once everything has been rendered successfully.
        with open(fname, 'w', encoding='utf8') as fp:
            fp.write(buffer.getvalue())
        logging.info(f"Successfully wrote updated metadata to {fname}")
    except Exception as e:
        logging.warning(f"Failed to write {fname}: {str(e)}")
def get_file_mod_time(path):
    """
    Return the last-modification time of ``path``.

    Args:
        path (Path): Path to the file.

    Returns:
        float: Modification time in seconds since the epoch, or 0.0 (the
        epoch itself) when the file cannot be stat'ed, e.g. it is missing.
    """
    try:
        stat_result = os.stat(path)
    except OSError:
        # A missing file counts as "infinitely old".
        return 0.0
    return stat_result.st_mtime
def create_thumbnail(image_path, thumbnail_path):
    """
    Creates a thumbnail of the given image by running the `vips thumbnail` CLI.

    The thumbnail's parent directory is created if needed. The image is
    fitted to THUMB_WIDTH x THUMB_HEIGHT, never upscaled, and cropped with
    the "attention" strategy.

    Args:
        image_path (Path or str): Path to the original image.
        thumbnail_path (Path or str): Path where the thumbnail will be saved.

    Returns:
        bool: True if thumbnail creation was successful, False otherwise.
        Failures (vips missing, non-zero exit, filesystem errors) are logged
        as warnings and never raised.
    """
    try:
        # Accept plain strings as well as Path objects.
        thumbnail_path = Path(thumbnail_path)
        # Ensure the parent directory exists
        thumbnail_path.parent.mkdir(parents=True, exist_ok=True)
        cmd = [
            "vips",
            "thumbnail",
            str(image_path),
            str(thumbnail_path),
            str(THUMB_WIDTH),
            "--height", str(THUMB_HEIGHT),
            "--size", "down",  # never upscale
            "--crop", "attention",  # Crop to the interesting bit
        ]
        subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except subprocess.CalledProcessError as e:
        # Decode leniently: a decoding error raised inside this handler would
        # escape as an unhandled exception.
        details = (e.stderr or b"").decode(errors="replace").strip()
        logging.warning(f"Failed to create/update thumbnail for {image_path}: {details}")
        return False
    except Exception as e:
        logging.warning(f"An error occurred while creating thumbnail for {image_path}: {str(e)}")
        return False
    logging.info(f"Thumbnail created successfully at {thumbnail_path}")
    return True
def extract_thumbnail_url_html(htmlcontent):
    """
    Pick the first admissible image URL out of rendered HTML.

    An ``<img>`` is admissible when:
      - it has a ``src`` that does not start with ``http`` or ``data:``;
      - it has at least one ancestor ``<div>`` accepted by the
        'figure'/'illustration' class filter;
      - none of those ancestor divs carries the class 'foreign' or 'sep'.

    Args:
        htmlcontent (str): The HTML content as a string.

    Returns:
        str or None: The image URL if found, else None (also None when the
        document has no body).
    """
    soup = BeautifulSoup(htmlcontent, 'html.parser')
    if not soup.body:
        return None

    def is_figure_class(css_class):
        return css_class and ('figure' in css_class or 'illustration' in css_class)

    for img in soup.find_all('img'):
        img_url = img.get('src')
        if not img_url or img_url.startswith(('http', 'data:')):
            logging.debug(f"Skipping image with URL: {img_url}")
            continue

        figure_divs = img.find_parents('div', class_=is_figure_class)
        if not figure_divs:
            logging.debug(f"No parent Div with 'figure' or 'illustration' classes found for image: {img_url}")
            continue

        # Class list of the first enclosing div that vetoes this image, if any.
        vetoing_classes = next(
            (
                classes
                for classes in (div.get('class', []) for div in figure_divs)
                if 'foreign' in classes or 'sep' in classes
            ),
            None,
        )
        if vetoing_classes is not None:
            logging.debug(f"Skipping image '{img_url}' inside Div with classes: {vetoing_classes}")
            continue

        logging.debug(f"Selected image for thumbnail: {img_url}")
        return img_url
    return None
def extract_thumbnail_url_md(content_lines):
    """
    Given the lines of a Quarto/Pandoc markdown file, find the FIRST usable
    image in a fenced Div block that:
      - has class .figure
      - does NOT have class .foreign

    Fences may be written with or without a space before the attribute
    braces (``:::{.figure}`` or ``::: {.figure}``) and with three or more
    colons. Nested Divs are not tracked: the first closing fence ends the
    current block.

    Within a qualifying block, an image is usable when its target is neither
    remote (``http...``) nor a ``data:`` URI, matching the rule applied to
    rendered HTML. An optional link title (``![alt](path "Title")``) and
    angle brackets around the path are stripped from the returned value.

    Args:
        content_lines (list): List of lines from the markdown content.

    Returns:
        str or None: The image path if found, else None.
    """
    div_start_pattern = re.compile(r'^:{3,}\s*\{([^}]*)\}\s*$')  # e.g., :::{#fig-pews .figure .illustration .right}
    div_end_pattern = re.compile(r'^:{3,}\s*$')
    # Compiled once here instead of on every closing fence.
    image_pattern = re.compile(r'!\[.*?\]\(([^)]+)\)')

    def parse_div_attrs(attrs_str):
        """
        Parses the attributes string of a Div block.

        Args:
            attrs_str (str): The attributes string, e.g., "#id .class1 .class2".

        Returns:
            dict: Dictionary with "id" and "classes" keys.
        """
        id_ = None
        classes = set()
        for piece in attrs_str.split():
            if piece.startswith('#'):
                id_ = piece[1:]
            elif piece.startswith('.'):
                classes.add(piece[1:])
        return {"id": id_, "classes": classes}

    def first_local_image(block_lines):
        """Return the first local image path in the given lines, or None."""
        for bl in block_lines:
            for img_match in image_pattern.finditer(bl):
                target = img_match.group(1).strip()
                if target.startswith('<') and '>' in target:
                    # Pandoc's <path with spaces> form.
                    img_path = target[1:target.index('>')]
                else:
                    # Drop an optional trailing "title".
                    pieces = target.split(None, 1)
                    img_path = pieces[0] if pieces else ''
                if not img_path or img_path.startswith(('http', 'data:')):
                    logging.debug(f"Skipping image with URL: {img_path}")
                    continue
                return img_path
        return None

    in_div_block = False
    div_attrs = None
    block_lines = []
    for idx, line in enumerate(content_lines, start=1):
        if not in_div_block:
            # Look for the start of a DIV
            m = div_start_pattern.match(line)
            if m:
                in_div_block = True
                div_attrs = parse_div_attrs(m.group(1))
                logging.debug(f"Entering DIV block at line {idx} with classes: {div_attrs['classes']}")
                block_lines = []
            continue

        if not div_end_pattern.match(line):
            # Accumulate lines within the DIV block
            block_lines.append(line)
            continue

        # End of DIV block
        logging.debug(f"Exiting DIV block at line {idx} with classes: {div_attrs['classes']}")
        if "figure" in div_attrs["classes"] and "foreign" not in div_attrs["classes"]:
            img_path = first_local_image(block_lines)
            if img_path is not None:
                logging.debug(f"Selected image for thumbnail from markdown: {img_path}")
                return img_path
        # Reset state
        in_div_block = False
        div_attrs = None
        block_lines = []
    return None
def resolve_image_path(img_url, fname, site_root=Path('.')):
    """
    Map an image URL onto a location on disk.

    URLs beginning with '/' are interpreted relative to ``site_root``; all
    other URLs are interpreted relative to the directory containing ``fname``.

    Args:
        img_url (str): The image URL from the markdown.
        fname (Path): Path to the `.qmd` file.
        site_root (Path): The root directory of the site.

    Returns:
        Path: The resolved (absolute) filesystem path to the image.
    """
    if not img_url.startswith('/'):
        # Relative URL: anchor it at the document's own directory.
        located = fname.parent / img_url
        logging.debug(f"Resolved relative image URL '{img_url}' to filesystem path '{located}'")
        return located.resolve()

    # Site-absolute URL: strip the leading slashes and anchor at the site root.
    located = site_root / img_url.lstrip('/')
    logging.debug(f"Resolved absolute image URL '{img_url}' to filesystem path '{located}'")
    return located.resolve()
def resolve_thumbnail_path(thumbnail_url, fname, site_root=Path('.')):
    """
    Map a thumbnail URL onto a location on disk.

    URLs beginning with '/' are interpreted relative to ``site_root``; all
    other URLs are interpreted relative to the directory containing ``fname``.

    Args:
        thumbnail_url (str): The thumbnail URL to be used in metadata.
        fname (Path): Path to the `.qmd` file.
        site_root (Path): The root directory of the site.

    Returns:
        Path: The resolved (absolute) filesystem path to the thumbnail.
    """
    if not thumbnail_url.startswith('/'):
        # Relative URL: anchor it at the document's own directory.
        located = fname.parent / thumbnail_url
        logging.debug(f"Resolved relative thumbnail URL '{thumbnail_url}' to filesystem path '{located}'")
        return located.resolve()

    # Site-absolute URL: strip the leading slashes and anchor at the site root.
    located = site_root / thumbnail_url.lstrip('/')
    logging.debug(f"Resolved absolute thumbnail URL '{thumbnail_url}' to filesystem path '{located}'")
    return located.resolve()
def massage_one_file(fname, site_root=Path('.')):
    """
    Processes a single `.qmd` file: extracts the first admissible image,
    creates a thumbnail, and updates the `image` metadata key.

    The rendered HTML under ``site_root / "_site"`` is used for image
    extraction when it exists and is newer than the `.qmd`; otherwise (or if
    reading the HTML fails) the markdown source is scanned instead.

    Nothing is written when no image is found, when the thumbnail is already
    newer than the image, when thumbnail creation fails, or when the `.qmd`
    could not be read.

    Args:
        fname (Path): Path to the `.qmd` file.
        site_root (Path): The root directory of the site.
    """
    qmdname = fname.with_suffix('.qmd')
    htmlname = site_root / "_site" / fname.with_suffix('.html')
    metadata, qmd_lines = read(qmdname)
    if not metadata and not qmd_lines:
        # read() reports failure (and an empty file) as ({}, []). Writing
        # that back would replace the document with metadata only.
        logging.warning(f"Nothing could be read from {qmdname}; leaving it untouched.")
        return

    # Determine which file to prefer based on modification times
    use_html = False
    if htmlname.exists():
        if get_file_mod_time(htmlname) > get_file_mod_time(qmdname):
            use_html = True
            logging.debug(f"HTML file {htmlname} is newer than QMD file {qmdname}. Using HTML for image extraction.")
        else:
            logging.debug(f"HTML file {htmlname} is not newer than QMD file {qmdname}. Using QMD for image extraction.")
    else:
        logging.debug(f"HTML file {htmlname} does not exist. Using QMD for image extraction.")

    if use_html:
        try:
            with open(htmlname, 'r', encoding='utf8') as html_fp:
                html_content = html_fp.read()
            img_url = extract_thumbnail_url_html(html_content)
        except Exception as e:
            logging.warning(f"Failed to read HTML file {htmlname}: {str(e)}")
            img_url = extract_thumbnail_url_md(qmd_lines)
    else:
        img_url = extract_thumbnail_url_md(qmd_lines)

    if img_url is None:
        logging.warning(f"Could not find image URL for {qmdname}")
        return

    # Thumbnail URL has the extension changed to .thumbnail.avif. The
    # extension must sit in the final path component; a URL without one gets
    # the suffix appended so the thumbnail can never share the image's path.
    thumbnail_url, replaced = re.subn(r'\.[^./]+$', '.thumbnail.avif', img_url)
    if not replaced:
        thumbnail_url = img_url + '.thumbnail.avif'

    # Resolve filesystem paths
    img_path = resolve_image_path(img_url, fname, site_root)
    thumbnail_path = resolve_thumbnail_path(thumbnail_url, fname, site_root)
    logging.debug(f"Image path: {img_path}")
    logging.debug(f"Thumbnail path: {thumbnail_path}")

    # NOTE(review): an up-to-date thumbnail also skips the metadata update
    # below; kept as-is so a manually chosen `image` is not overridden.
    if get_file_mod_time(thumbnail_path) > get_file_mod_time(img_path):
        logging.info(f"Thumbnail for {img_url} is up to date.")
        return

    if not create_thumbnail(img_path, thumbnail_path):
        # Thumbnail creation failed
        logging.warning(f"Failed to create thumbnail for {img_url} in {qmdname}")
        return

    # Successfully created or updated thumbnail
    if metadata.get('image', None) == thumbnail_url:
        return
    metadata['image'] = thumbnail_url
    logging.info(f"{qmdname} metadata updated to include thumbnail {thumbnail_url}.")
    write(qmdname, metadata, "".join(qmd_lines))

    if htmlname.exists():
        # Previously the HTML was rewritten through write(), which prepended
        # YAML front matter to the rendered page. Only the modification-time
        # bump is kept -- presumably so the HTML stays newer than the
        # just-rewritten .qmd on the next run (TODO confirm intent).
        try:
            htmlname.touch()
        except OSError as e:
            logging.warning(f"Failed to update HTML file {htmlname}: {str(e)}")
def process_files(files: list):
    """
    Main function to process multiple `.qmd` files.

    When ``files`` is non-empty, each entry that is an existing file is
    processed and the others are reported with a warning. When it is empty,
    the current directory is searched recursively for `*.qmd` files; any
    path with a component (directory or file name) starting with '_' or '.'
    is skipped, so output, cache, extension and environment directories are
    not modified. Discovered files are processed in sorted order.

    Args:
        files (list): List of file paths to process.
    """
    file_list = []
    if files:
        for fname in files:
            path = Path(fname)
            if path.is_file():
                file_list.append(path)
            else:
                logger.warning(f"File not found or is not a file: {fname}")
    else:
        ## If file_list is empty, use default glob
        logger.info(
            "No files provided. Using default glob pattern '**/*.qmd' excluding files starting with '_'."
        )
        # Use rglob to search recursively. Checking every path component
        # (not just the file name) keeps directories such as _site/,
        # _extensions/ or .quarto/ out of the run.
        file_list = sorted(
            path
            for path in Path(".").rglob("*.qmd")
            if not any(part.startswith(("_", ".")) for part in path.parts)
        )

    if not file_list:
        logger.warning("No .qmd files found to process.")
        return

    for fname in file_list:
        # Re-check: a glob match may be a directory, or a file may have
        # disappeared since it was listed.
        if fname.is_file():
            logging.info(f"Processing file: {fname}")
            massage_one_file(fname)
def parse_arguments():
    """
    Build the command-line interface and parse ``sys.argv``.

    Returns:
        argparse.Namespace: ``files`` (list of positional paths, possibly
        empty) and ``log_level`` (one of the standard level names,
        default "INFO").
    """
    cli = argparse.ArgumentParser(
        description="Update image thumbnails in Quarto/Pandoc markdown files."
    )
    # (flags, keyword options) for every argument, registered in order below.
    argument_specs = (
        (
            ("files",),
            {
                "nargs": "*",
                "help": "List of files to process. If omitted, all '*.qmd' files (excluding those starting with '_') will be processed.",
            },
        ),
        (
            ("--log-level",),
            {
                "type": str,
                "default": "INFO",
                "choices": ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
                "help": "Set the logging level (default: INFO).",
            },
        ),
    )
    for flags, options in argument_specs:
        cli.add_argument(*flags, **options)
    return cli.parse_args()
def main():
    """
    Script entry point: parse arguments, configure logging, process files.

    Processing is skipped entirely when the ON_GITHUB environment variable
    is set to '1', 'true' or 'yes' (case-insensitive).
    """
    args = parse_arguments()
    configure_logging(args.log_level)

    # Check if the environment variable ON_GITHUB is set to a truthy value
    on_github = os.getenv('ON_GITHUB', '').lower() in ('1', 'true', 'yes')
    if not on_github:
        process_files(files=args.files)
        return
    logger.info("Running on GitHub Actions. We cannot process images here, so lets assume nothing to do")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment