Skip to content

Instantly share code, notes, and snippets.

@eristoddle
Last active May 28, 2025 01:15
Show Gist options
  • Save eristoddle/5a8e7dd0597d09d00aa5de066788c303 to your computer and use it in GitHub Desktop.
Save eristoddle/5a8e7dd0597d09d00aa5de066788c303 to your computer and use it in GitHub Desktop.
Python exporter for highlights and notes from the macOS Books app. It works standalone, exporting highlights as Markdown files, or inside Obsidian via the Python Scripter plugin.
import os
import glob
import sqlite3
import logging
import sys
import re
from typing import List, Tuple, NamedTuple, Optional
# Glob pattern (with "~" still unexpanded) for the SQLite file that holds the
# ZAEANNOTATION table queried by the export functions below.
ANNOTATION_DB_PATTERN = (
    "~/Library/Containers/com.apple.iBooksX/Data/Documents"
    "/AEAnnotation/AEAnnotation*.sqlite"
)

# Glob pattern for the SQLite file that holds the ZBKLIBRARYASSET table
# (one row per book in the library).
LIBRARY_DB_PATTERN = (
    "~/Library/Containers/com.apple.iBooksX/Data/Documents"
    "/BKLibrary/BKLibrary*.sqlite"
)

# Only errors are logged; progress messages elsewhere in the script use print().
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.ERROR,
)
class BookDetail(NamedTuple):
    """Metadata for one book, rendered into the exported Markdown file.

    ``asset_id`` through ``path`` are read from the ZBKLIBRARYASSET table by
    ``get_book_details``. The remaining fields start out as ``None`` and are
    filled in from the EPUB file (``get_epub_metadata``) through ``_replace``
    in ``create_file``.

    Annotations use ``Optional[...]`` rather than ``X | None`` because
    NamedTuple annotations are evaluated when the class is created, and the
    ``|`` form raises ``TypeError`` before Python 3.10.
    """

    # ZASSETID; matched against ZANNOTATIONASSETID in the annotation database.
    asset_id: str
    # ZSORTTITLE
    title: str
    # ZSORTAUTHOR
    author: Optional[str] = None
    # ZBOOKDESCRIPTION
    description: Optional[str] = None
    # ZEPUBID
    epub_id: Optional[str] = None
    # ZPATH; location of the book on disk, passed to the EPUB reader.
    path: Optional[str] = None
    # Fields below come from the EPUB's Dublin Core metadata when available.
    isbn: Optional[str] = None
    language: Optional[str] = None
    publisher: Optional[str] = None
    publication_date: Optional[str] = None
    # Base64-encoded cover image bytes.
    cover: Optional[str] = None
def sanitize_frontmatter(text: str) -> str:
    """Return *text* as a single line that is safe to place in frontmatter.

    Characters that would interfere with a ``key: value`` line are replaced
    or dropped, line breaks become spaces, and runs of whitespace collapse to
    a single space. Any falsy input yields an empty string; other non-string
    input is converted with ``str()`` first.
    """
    if not text:
        return ""

    # One translation pass is equivalent to replacing each character in turn
    # because none of the replacement strings contains a character that is
    # itself replaced.
    substitutions = str.maketrans(
        {
            ":": " -",
            "[": "(",
            "]": ")",
            "{": "(",
            "}": ")",
            "#": "",
            "|": "-",
            ">": "-",
            "\\": "/",
            "\n": " ",
            "\r": " ",
        }
    )
    translated = str(text).translate(substitutions)

    # split() with no argument discards leading/trailing whitespace and
    # treats any whitespace run as one separator.
    return " ".join(translated.split())
def extract_chapter_from_cfi(cfi: str) -> Optional[str]:
    """Derive a readable chapter label from the first ``[...]`` ID in a CFI.

    Examples:
        'epubcfi(/6/12[chapter_4]!/4/10/1,:0,:4)' -> 'Chapter 4'
        'epubcfi(/6/24[c3.xhtml]!/4/188/2/1,:0,:1)' -> 'Chapter 3'
        'epubcfi(/6/16[AHR5106_US-Trade_BBP-Text-3]!/...)' -> 'Preface (continued)'
        'epubcfi(/6/8[part-two]!/4/2)' -> 'Part Two'

    Args:
        cfi: An EPUB CFI string such as the ones stored in
            ZANNOTATIONLOCATION.

    Returns:
        The label, or ``None`` when *cfi* is empty, is not a string, or has
        no non-empty bracketed ID.
    """
    if not cfi or not isinstance(cfi, str):
        return None

    bracket_match = re.search(r"\[([^\]]+)\]", cfi)
    if not bracket_match:
        return None

    chapter_id = bracket_match.group(1)
    lowered = chapter_id.lower()

    # 'chapter_4' -> 'Chapter 4'. An ID such as 'chapter_one' does not match
    # here and continues to the checks below instead of yielding None.
    numbered = re.search(r"chapter_(\d+)", lowered)
    if numbered:
        return f"Chapter {numbered.group(1)}"

    # 'c3.xhtml' or 'c3' -> 'Chapter 3': a "c" followed only by digits up to
    # the first dot (or the end of the ID).
    short_form = re.match(r"c(\d+)(?:\.|$)", lowered)
    if short_form:
        return f"Chapter {short_form.group(1)}"

    # Keyword checks, in priority order; the first hit wins.
    keyword_labels = (
        (("preface", "foreword"), "Preface"),
        (("introduction", "intro"), "Introduction"),
        (("appendix",), "Appendix"),
        (("title",), "Title Page"),
    )
    for keywords, label in keyword_labels:
        if any(keyword in lowered for keyword in keywords):
            return label

    if "text" in lowered:
        # NOTE(review): these numbered labels look tailored to one specific
        # book's file naming - confirm before relying on them elsewhere.
        text_sections = (
            ("text-2", "Preface"),
            ("text-3", "Preface (continued)"),
            ("text-5", "How to Begin"),
        )
        for marker, label in text_sections:
            if marker in lowered:
                return label
        return "Text Section"

    # Fallback: turn the raw ID into title-cased words.
    return chapter_id.replace("_", " ").replace("-", " ").title()
def get_chapter_from_cfi_original(epub_path: str, location: str) -> Optional[str]:
    """Look up a chapter title in the EPUB's table of contents (fallback).

    The first bracketed ID in *location* (e.g. ``chapter_4`` in
    ``epubcfi(/6/12[chapter_4]!/4/10/1,:0,:4)``) is searched for as a
    substring of each TOC entry's ``href``.

    Args:
        epub_path: Path of the EPUB to open with ``ebooklib``.
        location: CFI string of the annotation.

    Returns:
        The title of the first matching TOC entry, or ``None`` when there is
        no bracketed ID, no entry matches, ``ebooklib`` is not installed, or
        the book cannot be read (an error line is printed in the last two
        cases).
    """
    # Require a non-empty ID: an empty string is a substring of every href
    # and would always "match" the first TOC entry.
    chapter_match = re.search(r"\[([^\]]+)\]", location or "")
    if not chapter_match:
        return None
    chapter_id = chapter_match.group(1)

    def href_matches(entry) -> bool:
        return chapter_id in str(getattr(entry, "href", "") or "")

    def search_toc(items):
        # Per the ebooklib documentation a TOC entry is either a Link or a
        # (Section, [children]) tuple; both expose .href and .title.
        for item in items:
            if isinstance(item, tuple):
                section, children = item
                if href_matches(section) and getattr(section, "title", None):
                    return section.title
                nested = search_toc(children)
                if nested:
                    return nested
            elif href_matches(item) and getattr(item, "title", None):
                return item.title
        return None

    try:
        from ebooklib import epub

        book = epub.read_epub(epub_path)
        return search_toc(book.toc)
    except Exception as e:
        # Best-effort fallback: a missing library or unreadable book must not
        # abort the export.
        print(f"Error getting chapter: {e}")
        return None
def get_chapter_display_name(
    cfi: str, epub_path: str = None, fallback_method: bool = True
) -> Optional[str]:
    """Resolve a chapter label for an annotation's CFI.

    The label derived from the CFI text itself is preferred. When that yields
    nothing, and both *fallback_method* and *epub_path* are truthy, the
    EPUB table-of-contents lookup is used instead.

    Returns:
        The chapter label, or ``None`` when neither approach produces one.
    """
    from_cfi = extract_chapter_from_cfi(cfi)
    if from_cfi:
        return from_cfi

    may_use_toc = fallback_method and epub_path
    if not may_use_toc:
        return None
    return get_chapter_from_cfi_original(epub_path, cfi)
def get_epub_metadata(epub_path: str):
    """Read extra book metadata and the cover image from an EPUB file.

    Args:
        epub_path: Path of the EPUB; a falsy value short-circuits to ``None``.

    Returns:
        A dict with the keys ``isbn``, ``language``, ``publisher``,
        ``publication_date`` and ``cover`` (base64 text of the cover image,
        or ``None``), or ``None`` when *epub_path* is falsy, ``ebooklib`` is
        not installed, or the book cannot be read (an error line is printed
        in the last case).
    """
    if not epub_path:
        return None

    try:
        import base64

        import ebooklib
        from ebooklib import epub
    except ImportError:
        # ebooklib is optional; without it the export simply has less metadata.
        return None

    def looks_like_isbn(value, attrs) -> bool:
        # ebooklib returns metadata as (value, attributes) pairs. Accept an
        # identifier whose text mentions "isbn" (e.g. "urn:isbn:...") or
        # whose attributes declare it as one - presumably via an opf:scheme
        # attribute; verify against real books.
        if isinstance(value, str) and "isbn" in value.lower():
            return True
        return any(
            isinstance(attr, str) and attr.lower() == "isbn"
            for attr in (attrs or {}).values()
        )

    try:
        book = epub.read_epub(epub_path)

        def first_dc(name):
            # First Dublin Core value for *name*, or None when absent.
            return next(
                (entry[0] for entry in book.get_metadata("DC", name)), None
            )

        metadata = {
            "isbn": next(
                (
                    value
                    for value, attrs in book.get_metadata("DC", "identifier")
                    if looks_like_isbn(value, attrs)
                ),
                None,
            ),
            "language": first_dc("language"),
            "publisher": first_dc("publisher"),
            "publication_date": first_dc("date"),
        }

        cover_base64 = None
        for item in book.get_items():
            if item.get_type() == ebooklib.ITEM_COVER:
                cover_base64 = base64.b64encode(item.get_content()).decode("utf-8")
                break
        metadata["cover"] = cover_base64
        return metadata
    except Exception as e:
        # Best-effort: an unreadable book must not abort the whole export.
        print(f"Error reading epub: {e}")
        return None
def get_db_path(pattern: str) -> str:
    """Return the first filesystem path matching *pattern*.

    A leading ``~`` is expanded to the user's home directory before globbing.

    Raises:
        FileNotFoundError: If nothing matches the pattern.
    """
    expanded_pattern = os.path.expanduser(pattern)
    for candidate in glob.glob(expanded_pattern):
        # The first entry of the glob result is used, whatever its order.
        return candidate
    raise FileNotFoundError(f"No database found matching pattern: {pattern}")
def get_book_details() -> List[BookDetail]:
    """Load one ``BookDetail`` per row of the library's ZBKLIBRARYASSET table.

    Only the columns stored in the library database are populated; the
    EPUB-derived fields (``isbn`` ... ``cover``) are left as ``None``.

    Raises:
        FileNotFoundError: If no library database matches LIBRARY_DB_PATTERN.
        sqlite3.Error: If the query fails (logged, then re-raised).
    """
    try:
        # sqlite3's connection context manager only commits or rolls back; it
        # does not close the connection, so close it explicitly.
        conn = sqlite3.connect(get_db_path(LIBRARY_DB_PATTERN))
        try:
            rows = conn.execute(
                """SELECT ZASSETID, ZSORTTITLE, ZSORTAUTHOR, ZBOOKDESCRIPTION, ZEPUBID, ZPATH
                FROM ZBKLIBRARYASSET"""
            ).fetchall()
        finally:
            conn.close()
    except sqlite3.Error as e:
        logging.error(f"Database error: {e}")
        raise

    return [
        BookDetail(
            asset_id=asset_id,
            title=title,
            author=author,
            description=description,
            epub_id=epub_id,
            path=path,
            isbn=None,
            language=None,
            publisher=None,
            publication_date=None,
            cover=None,
        )
        for asset_id, title, author, description, epub_id, path in rows
    ]
def get_books_with_highlights() -> List[str]:
    """Return the asset IDs of library books that have at least one highlight.

    A highlight is an annotation row whose selected text is non-empty. The
    annotation IDs are filtered against the library in Python rather than
    through an ``IN (?, ?, ...)`` clause, so the number of books in the
    library is not bounded by SQLite's limit on bound parameters.

    Raises:
        FileNotFoundError: If a database file cannot be located.
        sqlite3.Error: If a query fails (logged, then re-raised).
    """
    known_ids = {book.asset_id for book in get_book_details()}
    try:
        # sqlite3's connection context manager does not close the connection,
        # so close it explicitly.
        conn = sqlite3.connect(get_db_path(ANNOTATION_DB_PATTERN))
        try:
            # '' is the standard SQL string literal; the double-quoted form
            # only works through a SQLite compatibility quirk.
            rows = conn.execute(
                """SELECT DISTINCT ZANNOTATIONASSETID
                FROM ZAEANNOTATION
                WHERE ZANNOTATIONSELECTEDTEXT != '';"""
            ).fetchall()
        finally:
            conn.close()
    except sqlite3.Error as e:
        logging.error(f"Database error: {e}")
        raise

    return [asset_id for (asset_id,) in rows if asset_id in known_ids]
def parse_cfi_for_sorting(cfi: str) -> Tuple[int, ...]:
    """Turn an EPUB CFI string into a tuple of integers usable as a sort key.

    Every run of digits outside ``[...]`` assertions is collected in order of
    appearance.

    Example:
        'epubcfi(/6/24[c3.xhtml]!/4/188/2/1,:0,:1)'
        -> (6, 24, 4, 188, 2, 1, 0, 1)

    Returns:
        The tuple of numbers, or ``(0,)`` when *cfi* is empty, does not start
        with ``epubcfi(``, or contains no numbers.
    """
    fallback = (0,)
    if not cfi or not cfi.startswith("epubcfi("):
        return fallback

    try:
        # Strip the 'epubcfi(' prefix and the final character (the closing
        # parenthesis of a well-formed CFI).
        inner = cfi[8:-1]
        collected = [
            int(digits)
            # Bracketed assertions are removed per '!'-separated segment so
            # that numbers inside IDs such as [c3.xhtml] are not counted.
            for segment in inner.split("!")
            for digits in re.findall(r"\d+", re.sub(r"\[[^\]]*\]", "", segment))
        ]
    except Exception:
        return fallback

    return tuple(collected) if collected else fallback
def export_annotations(
    asset_id: str,
    book_details: BookDetail,
    file_path: str,
    extra_meta: Optional[dict],
) -> None:
    """Export one book's highlights, ordered by position, to a Markdown file.

    Args:
        asset_id: Asset ID whose annotations are exported.
        book_details: Metadata of that single book (despite the plural name,
            this is one ``BookDetail``; it is handed to ``create_file``).
        file_path: Output directory; falsy means the current directory.
        extra_meta: Optional EPUB-derived fields merged into the metadata.

    Raises:
        FileNotFoundError: If no annotation database can be located.
        sqlite3.Error: If the query fails (logged, then re-raised).
    """
    try:
        # sqlite3's connection context manager does not close the connection,
        # so close it explicitly.
        conn = sqlite3.connect(get_db_path(ANNOTATION_DB_PATTERN))
        try:
            # Fetched unordered; '' is the standard SQL string literal.
            annotations = conn.execute(
                """SELECT ZANNOTATIONSELECTEDTEXT, ZANNOTATIONNOTE, ZANNOTATIONLOCATION,
                ZPLABSOLUTEPHYSICALLOCATION
                FROM ZAEANNOTATION
                WHERE ZANNOTATIONASSETID = ? AND ZANNOTATIONSELECTEDTEXT != '';""",
                (asset_id,),
            ).fetchall()
        finally:
            conn.close()
    except sqlite3.Error as e:
        logging.error(f"Database error: {e}")
        raise

    # Column 2 is ZANNOTATIONLOCATION (the CFI); sort by its numeric path.
    sorted_annotations = sorted(
        annotations, key=lambda row: parse_cfi_for_sorting(row[2])
    )
    create_file(book_details, sorted_annotations, file_path, extra_meta)
def export_annotations_simple_numeric_sort(
    asset_id: str,
    book_details: BookDetail,
    file_path: str,
    extra_meta: Optional[dict],
) -> None:
    """Export one book's highlights, sorted by the first three CFI steps.

    Simpler alternative to ``export_annotations``: only the first three
    ``/number`` steps of each CFI are compared, which works well for books
    with a consistent CFI structure.

    Args:
        asset_id: Asset ID whose annotations are exported.
        book_details: Metadata of that single book (one ``BookDetail``).
        file_path: Output directory; falsy means the current directory.
        extra_meta: Optional EPUB-derived fields merged into the metadata.

    Raises:
        FileNotFoundError: If no annotation database can be located.
        sqlite3.Error: If the query fails (logged, then re-raised).
    """
    try:
        # sqlite3's connection context manager does not close the connection,
        # so close it explicitly.
        conn = sqlite3.connect(get_db_path(ANNOTATION_DB_PATTERN))
        try:
            # create_file unpacks four values per row, so the physical
            # location column must be selected here as well.
            annotations = conn.execute(
                """SELECT ZANNOTATIONSELECTEDTEXT, ZANNOTATIONNOTE, ZANNOTATIONLOCATION,
                ZPLABSOLUTEPHYSICALLOCATION
                FROM ZAEANNOTATION
                WHERE ZANNOTATIONASSETID = ? AND ZANNOTATIONSELECTEDTEXT != '';""",
                (asset_id,),
            ).fetchall()
        finally:
            conn.close()
    except sqlite3.Error as e:
        logging.error(f"Database error: {e}")
        raise

    def simple_cfi_sort_key(annotation):
        cfi = annotation[2] or ""
        # e.g. '/6/24[c3.xhtml]!/4/...' -> [6, 24, 4]
        steps = re.findall(r"/(\d+)", cfi)
        return [int(step) for step in steps[:3]]

    sorted_annotations = sorted(annotations, key=simple_cfi_sort_key)
    create_file(book_details, sorted_annotations, file_path, extra_meta)
def format_citation(book_detail: BookDetail, location_info: str = None) -> str:
    """Build a one-line citation from whichever book fields are present.

    The parts appear in the order author, italicised title, publisher, year
    (first four characters of the publication date) and ``loc. <location>``,
    joined with ", " and terminated by a full stop. Missing parts are
    skipped.
    """
    published = book_detail.publication_date
    candidates = (
        book_detail.author,
        f"*{book_detail.title}*" if book_detail.title else None,
        book_detail.publisher,
        # Slicing already returns the whole string when it is shorter than 4.
        published[:4] if published else None,
        f"loc. {location_info}" if location_info else None,
    )
    present = [part for part in candidates if part]
    return ", ".join(present) + "."
def create_file(
    book_detail: BookDetail,
    annotations: List[Tuple],
    file_path: str,
    extra_meta: dict,
) -> None:
    """Render a book's metadata and highlights to ``<title> - <author>.md``.

    Args:
        book_detail: Metadata of the book.
        annotations: Rows of ``(highlight, note, location[, physical_location])``
            in the order they should appear.
        file_path: Output directory; falsy means the current directory.
        extra_meta: Optional mapping of ``BookDetail`` field names to values
            that override *book_detail* (e.g. EPUB-derived metadata).

    Raises:
        IOError: If the file cannot be written (logged, then re-raised).
    """
    if extra_meta:
        book_detail = book_detail._replace(**extra_meta)
    fields = book_detail._asdict()

    # Collect fragments and join once instead of repeated string +=.
    parts = ["---\n"]

    # Frontmatter: every non-empty field except the (large) cover payload.
    for key, value in fields.items():
        if value and key != "cover":
            parts.append(f"{key}: {sanitize_frontmatter(value)}\n")
    parts.append("---\n\n")

    # Title; omit the " by ..." suffix rather than printing "by None".
    heading = f"# {book_detail.title}"
    if book_detail.author:
        heading += f" by {book_detail.author}"
    parts.append(f"{heading}\n\n")

    # Cover image, embedded as a data URI.
    if book_detail.cover:
        parts.append(
            f'<p align="center"><img src="data:image/jpeg;base64,{book_detail.cover}" width="50%"></p>\n\n'
        )

    # Metadata list; empty fields are skipped so no "None" links are emitted.
    parts.append("## Metadata\n\n")
    for key, value in fields.items():
        if not value or key == "cover":
            continue
        if key == "path":
            parts.append(f"- {key}: [{value}](file://{value})\n")
        elif key == "author":
            # With more than three argv entries, argv[3] is the folder that
            # main() also uses as the output folder; author links are
            # rooted there.
            if len(sys.argv) > 3:
                parts.append(f"- {key}: [[{sys.argv[3]}/Authors/{value}]]\n")
            else:
                parts.append(f"- {key}: [[Authors/{value}]]\n")
        else:
            parts.append(f"- {key}: {value}\n")
    parts.append("- tags: #book/notes\n")

    # Annotations, grouped under a heading whenever the chapter changes.
    parts.append("\n## Annotations\n\n")
    current_chapter = None
    for annotation in annotations:
        # Tolerate rows without the optional physical-location column.
        highlight, note, location, *rest = annotation
        physical_location = rest[0] if rest else None

        chapter = get_chapter_display_name(location, book_detail.path)
        if chapter and chapter != current_chapter:
            current_chapter = chapter
            parts.append(f"### {chapter}\n\n")

        # Quote every line of the highlight so multi-line selections stay
        # inside one blockquote.
        parts.append("\n".join(f"> {line}" for line in highlight.split("\n")))
        parts.append("\n\n")

        loc_info = (
            str(int(physical_location))
            if physical_location and physical_location > 0
            else None
        )
        parts.append(f"*{format_citation(book_detail, loc_info)}*\n\n")

        if note:
            parts.append(f"**Note:** {note}\n\n")
        parts.append("---\n\n")

    # The naming scheme is kept stable so a re-export overwrites the earlier
    # file; only path separators are replaced, since they would otherwise be
    # read as directories.
    file_name = f"{book_detail.title} - {book_detail.author}.md".replace(
        os.sep, "-"
    )
    target = (
        os.path.abspath(os.path.join(file_path, file_name))
        if file_path
        else file_name
    )
    try:
        # Highlights routinely contain non-ASCII text; do not depend on the
        # platform's default encoding.
        with open(target, "w", encoding="utf-8") as mdfile:
            mdfile.write("".join(parts))
    except IOError as e:
        logging.error(f"Error writing to file: {e}")
        raise
def main():
    """Export every book that has highlights to its own Markdown file.

    Command-line arguments:
        argv[1]: Output directory (optional; default is the current directory).
        argv[3]: When present, argv[1] is treated as a vault path and this is
            the folder inside it that receives the files.

    Database errors during start-up stop the run with a message; errors for
    an individual book are printed and the remaining books are still
    processed.
    """
    try:
        argv = sys.argv
        file_path = argv[1] if len(argv) > 1 else None
        if len(argv) > 3:
            vault_path, folder = argv[1], argv[3]
            file_path = os.path.join(vault_path, folder)

        book_details = get_book_details()
        books_with_highlights = get_books_with_highlights()
    except (FileNotFoundError, sqlite3.Error) as e:
        logging.error(f"Error initializing: {e}")
        print("An error occurred accessing the Books database.")
        return

    for asset_id in books_with_highlights:
        try:
            # First library entry with this asset ID, if any.
            matches = (bd for bd in book_details if bd.asset_id == asset_id)
            book_detail = next(matches, None)
            if not book_detail:
                logging.error(f"Book details not found for asset_id: {asset_id}")
                continue

            extra_meta = get_epub_metadata(book_detail.path)
            export_annotations(asset_id, book_detail, file_path, extra_meta)
            print(f"Exported annotations for book: {book_detail.title}")
        except (ValueError, sqlite3.Error, IOError) as e:
            print(f"Error exporting annotations: {e}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment