Last active
October 7, 2025 20:41
-
-
Save petergi/b1817f6456907eca78b08d4bb7af1fc6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
Extract metadata (title and author) from EPUB files and rename the files based on the extracted information.

`get_epub_metadata(epub_path)` takes the file path to an EPUB file as a string (for example, "book.epub") and returns a dictionary containing the title and author read from that file.
`rename_epubs(directory)` renames the EPUB files in the specified directory to "Author - Title.epub" based on this metadata.
""" | |
import os | |
import re | |
import xml.etree.ElementTree as ET | |
import zipfile | |
def get_epub_metadata(epub_path):
    """
    Extract the title and author metadata from an EPUB file.

    An EPUB is a ZIP archive. 'META-INF/container.xml' names the OPF (Open
    Packaging Format) document, and the OPF document carries the Dublin Core
    `dc:title` and `dc:creator` elements that are read here. Only the first
    occurrence of each element is used.

    Args:
        epub_path (str): The file path to the EPUB file.

    Returns:
        dict: A dictionary with the keys 'title' and 'author'. Runs of
        whitespace in a value (including line breaks from pretty-printed XML)
        are collapsed to single spaces. A field that is missing, empty, or
        whitespace-only is reported as 'Unknown'.

    Raises:
        zipfile.BadZipFile: If the provided file is not a valid ZIP archive.
        ET.ParseError: If the XML files inside the EPUB are malformed.
        KeyError: If 'META-INF/container.xml' or the OPF file is missing from
            the archive, or if container.xml does not declare an OPF path.

    Example:
        >>> get_epub_metadata("book.epub")
        {'title': 'Foundation', 'author': 'Isaac Asimov'}
    """
    container_ns = {"c": "urn:oasis:names:tc:opendocument:xmlns:container"}
    dc_ns = {"dc": "http://purl.org/dc/elements/1.1/"}

    with zipfile.ZipFile(epub_path, "r") as zip_file:
        # container.xml points at the OPF document inside the archive.
        container_root = ET.fromstring(zip_file.read("META-INF/container.xml"))
        rootfile = container_root.find(".//c:rootfile", container_ns)
        opf_path = rootfile.get("full-path") if rootfile is not None else None
        if not opf_path:
            # Raise the documented KeyError rather than an AttributeError from
            # calling .get() on None, so callers can handle it uniformly.
            raise KeyError("No OPF rootfile declared in META-INF/container.xml")

        opf_root = ET.fromstring(zip_file.read(opf_path))

    def _text_or_unknown(element):
        # An element may be absent (None) or present but empty (text is None).
        raw = element.text if element is not None else None
        cleaned = " ".join(raw.split()) if raw else ""
        return cleaned or "Unknown"

    return {
        "title": _text_or_unknown(opf_root.find(".//dc:title", dc_ns)),
        "author": _text_or_unknown(opf_root.find(".//dc:creator", dc_ns)),
    }
def sanitize_filename(text):
    r"""
    Remove characters from a string that are invalid in filenames on most filesystems.

    The following characters, which are not allowed in filenames on Windows and
    are problematic on other operating systems as well, are deleted:

        < > : " / \ | ? *

    ASCII control characters (U+0000 through U+001F, which includes NUL, tab
    and line breaks) are deleted too: Windows rejects them in filenames and an
    embedded NUL is rejected by the operating-system calls on every platform.

    Args:
        text (str): The input string to be sanitized for use as a filename.

    Returns:
        str: The sanitized string with invalid filename characters removed and
        leading/trailing whitespace stripped. The result may be empty.

    Example:
        >>> sanitize_filename('My:Book/Title?.epub')
        'MyBookTitle.epub'
    """
    # The docstring is a raw string so the literal backslash shown above is not
    # parsed as an (invalid) escape sequence.
    return re.sub(r'[<>:"/\\|?*\x00-\x1f]', "", text).strip()
def rename_epubs(directory="."):
    """
    Rename all EPUB files in the specified directory based on their internal metadata.

    For each regular file in the directory whose name ends in `.epub`
    (case-insensitive), this function:
        - Extracts the author and title metadata using `get_epub_metadata()`.
        - Sanitizes the author and title strings with `sanitize_filename()`,
          substituting "Unknown" when a value is missing or sanitizes to nothing.
        - Renames the file to the format: "Author - Title.epub".
        - Skips renaming if the filename is already correct.
        - Skips renaming if a different file already has the target name, so
          that no existing file is overwritten.
        - Prints a message for each rename, skip, or error encountered.

    Args:
        directory (str, optional): The path to the directory containing EPUB files.
            Defaults to the current directory (".")

    Returns:
        None

    Example:
        >>> rename_epubs("/path/to/epub/files")
        Renamed: file000001.epub -> Isaac Asimov - Foundation.epub
        Renamed: file000002.epub -> Ursula K Le Guin - The Dispossessed.epub
        Error processing file000003.epub: Not a valid EPUB file

    Notes:
        - Requires the helper functions `get_epub_metadata(filepath)` and `sanitize_filename(text)`.
        - Invalid archives, malformed XML, missing archive members and
          operating-system errors are printed and do not stop the run.
        - An error while listing `directory` itself is not caught.
    """
    # Sorted so the processing order (and therefore which file wins a name
    # collision) does not depend on the filesystem's listing order.
    for filename in sorted(os.listdir(directory)):
        # Process only files with .epub extension (case-insensitive)
        if not filename.lower().endswith(".epub"):
            continue

        filepath = os.path.join(directory, filename)
        # A directory can also be called "something.epub"; leave it alone.
        if not os.path.isfile(filepath):
            continue

        try:
            metadata = get_epub_metadata(filepath)

            # Guard against None/empty values so the name never degenerates
            # into " - .epub" and sanitize_filename never receives None.
            author = sanitize_filename(metadata["author"] or "") or "Unknown"
            title = sanitize_filename(metadata["title"] or "") or "Unknown"
            new_name = f"{author} - {title}.epub"
            new_path = os.path.join(directory, new_name)

            # Nothing to do when the file already carries the target name.
            if new_name == filename:
                continue

            # os.rename silently replaces an existing file on POSIX, so refuse
            # to clobber. samefile() still permits case-only renames on
            # case-insensitive filesystems, where the "existing" target is the
            # source file itself.
            if os.path.exists(new_path) and not os.path.samefile(filepath, new_path):
                print(f"Skipped: {filename} -> {new_name} (target already exists)")
                continue

            os.rename(filepath, new_path)
            print(f"Renamed: {filename} -> {new_name}")
        except (zipfile.BadZipFile, ET.ParseError, KeyError, OSError) as e:
            # Print error and continue with the next file
            print(f"Error processing {filename}: {e}")
# Script entry point: when executed directly (not imported), rename the EPUB
# files located in the current working directory.
if __name__ == "__main__":
    rename_epubs(directory=".")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment