Skip to content

Instantly share code, notes, and snippets.

@petergi
Last active October 7, 2025 20:41
Show Gist options
  • Save petergi/b1817f6456907eca78b08d4bb7af1fc6 to your computer and use it in GitHub Desktop.
Save petergi/b1817f6456907eca78b08d4bb7af1fc6 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Rename EPUB files using the title and author stored in their metadata.

`get_epub_metadata` reads the Dublin Core title and creator from an EPUB's OPF package file and returns them as a dictionary. `sanitize_filename` removes characters that are invalid in filenames. `rename_epubs` renames every EPUB file in a directory to "Author - Title.epub" based on the extracted metadata.
"""
import os
import re
import xml.etree.ElementTree as ET
import zipfile
def _first_dc_text(opf_root, tag, default="Unknown"):
    """Return the first non-empty Dublin Core ``tag`` text under ``opf_root``."""
    dc_ns = {"dc": "http://purl.org/dc/elements/1.1/"}
    for element in opf_root.iterfind(f".//dc:{tag}", dc_ns):
        # element.text is None for empty elements such as <dc:title/>;
        # split/join also collapses line breaks from wrapped XML text.
        text = " ".join((element.text or "").split())
        if text:
            return text
    return default


def get_epub_metadata(epub_path):
    """
    Extract the title and author metadata from an EPUB file.

    The EPUB (a ZIP archive) is opened, the OPF (Open Packaging Format) file
    is located through 'META-INF/container.xml', and the OPF is parsed for
    the Dublin Core ``dc:title`` and ``dc:creator`` elements.

    Args:
        epub_path (str): The file path to the EPUB file.

    Returns:
        dict: A dictionary with the keys 'title' and 'author'. Each value is
        the first non-empty matching element's text with whitespace runs
        collapsed to single spaces, or 'Unknown' when no such element exists.

    Raises:
        zipfile.BadZipFile: If the provided file is not a valid ZIP archive.
        ET.ParseError: If the XML files inside the EPUB are malformed.
        KeyError: If 'META-INF/container.xml' or the OPF file is missing from
            the archive, or container.xml declares no rootfile 'full-path'.

    Example:
        >>> get_epub_metadata("book.epub")
        {'title': 'Foundation', 'author': 'Isaac Asimov'}
    """
    container_ns = {"c": "urn:oasis:names:tc:opendocument:xmlns:container"}

    with zipfile.ZipFile(epub_path, "r") as zip_file:
        # ZipFile.read raises KeyError when the named member does not exist.
        container_root = ET.fromstring(zip_file.read("META-INF/container.xml"))

        rootfile = container_root.find(".//c:rootfile", container_ns)
        opf_path = rootfile.get("full-path") if rootfile is not None else None
        if not opf_path:
            # Raise the documented exception instead of an AttributeError
            # from calling .get() on a missing element.
            raise KeyError("no rootfile 'full-path' in META-INF/container.xml")

        opf_root = ET.fromstring(zip_file.read(opf_path))

    return {
        "title": _first_dc_text(opf_root, "title"),
        "author": _first_dc_text(opf_root, "creator"),
    }
def sanitize_filename(text):
    """
    Remove characters from a string that are invalid or unsafe in filenames.

    The following are stripped out:
      - the characters  < > : " / \\ | ? *  (not allowed in Windows filenames
        and problematic on other operating systems);
      - ASCII control characters other than whitespace (NUL, BEL, DEL, ...).

    Any run of whitespace (including tabs and line breaks) is then collapsed
    to a single space, and leading/trailing whitespace is removed.

    Args:
        text (str): The input string to be sanitized for use as a filename.

    Returns:
        str: The sanitized string. It may be empty if every character of the
        input was removed.

    Example:
        >>> sanitize_filename('My:Book/Title?.epub')
        'MyBookTitle.epub'
        >>> sanitize_filename('The\\n  Left Hand\\tof Darkness ')
        'The Left Hand of Darkness'
    """
    # \x09-\x0d (tab, LF, VT, FF, CR) are deliberately left out of the removal
    # class so they become a separating space in the next step instead of
    # gluing neighbouring words together.
    without_forbidden = re.sub(r'[<>:"/\\|?*\x00-\x08\x0e-\x1f\x7f]', "", text)
    return re.sub(r"\s+", " ", without_forbidden).strip()
def rename_epubs(directory="."):
    """
    Rename all EPUB files in a directory based on their internal metadata.

    For each regular file in ``directory`` whose name ends in ``.epub``
    (case-insensitive), this function:
      - Extracts the author and title metadata using `get_epub_metadata()`.
      - Sanitizes both strings with `sanitize_filename()`, falling back to
        "Unknown" when a value is missing or sanitizes to an empty string.
      - Renames the file to the format: "Author - Title.epub".
      - Skips renaming if the filename is already correct.
      - Skips renaming if a different file already has the target name, so
        an existing book is never overwritten.
      - Prints a message for each rename, skip, or error encountered.

    Entries are processed in sorted name order so the outcome is
    deterministic when several files map to the same target name.

    Args:
        directory (str, optional): The path to the directory containing EPUB
            files. Defaults to the current directory (".").

    Returns:
        None

    Example:
        >>> rename_epubs("/path/to/epub/files")
        Renamed: file000001.epub -> Isaac Asimov - Foundation.epub
        Renamed: file000002.epub -> Ursula K Le Guin - The Dispossessed.epub
        Error processing file000003.epub: Not a valid EPUB file

    Notes:
        - Requires the helper functions `get_epub_metadata(filepath)` and
          `sanitize_filename(text)`.
        - Per-file errors (invalid archive, malformed XML, missing OPF,
          filesystem errors) are printed and processing continues.
    """
    for filename in sorted(os.listdir(directory)):
        filepath = os.path.join(directory, filename)

        # Only regular files with an .epub extension are candidates; a
        # directory that happens to be named "*.epub" is ignored.
        if not filename.lower().endswith(".epub") or not os.path.isfile(filepath):
            continue

        try:
            metadata = get_epub_metadata(filepath)
        except (zipfile.BadZipFile, ET.ParseError, KeyError, OSError) as e:
            print(f"Error processing {filename}: {e}")
            continue

        # Guard against missing/None values and against text that consists
        # solely of characters the sanitizer removes.
        author = sanitize_filename(metadata.get("author") or "") or "Unknown"
        title = sanitize_filename(metadata.get("title") or "") or "Unknown"
        new_name = f"{author} - {title}.epub"
        if new_name == filename:
            continue

        new_path = os.path.join(directory, new_name)
        try:
            # os.rename silently replaces an existing file on POSIX. samefile
            # lets a case-only rename through on case-insensitive filesystems.
            if os.path.exists(new_path) and not os.path.samefile(filepath, new_path):
                print(f"Skipped: {filename} -> {new_name} (target already exists)")
                continue
            os.rename(filepath, new_path)
        except OSError as e:
            print(f"Error processing {filename}: {e}")
            continue

        print(f"Renamed: {filename} -> {new_name}")
if __name__ == "__main__":
    import sys

    # Script entry point: an optional first command-line argument selects the
    # directory to process; without it the current directory is used.
    rename_epubs(sys.argv[1] if len(sys.argv) > 1 else ".")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment