Created
April 3, 2023 19:22
-
-
Save dado3212/43aed1d0d702d7392fd82c97e3166a81 to your computer and use it in GitHub Desktop.
A script to extract the alt text from images in an ePub
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import io

import ebooklib
from bs4 import BeautifulSoup
from ebooklib import epub
from PIL import Image
# Open the EPUB file
book = epub.read_epub('frugal.epub')

# Prefixes stripped from image references so that the keys of image_alts
# (taken from <img src>) and raw_images (taken from the item name) line up.
IMAGE_SRC_PREFIX = '../Images/'
IMAGE_NAME_PREFIX = 'Images/'

image_alts = {}  # image file name -> alt text (None when <img> has no alt)
raw_images = {}  # image file name -> opened PIL image

# Iterate over each item in the book, extracting the alt and the raw image
# files
for item in book.get_items():
    item_type = item.get_type()
    if item_type == ebooklib.ITEM_DOCUMENT:  # xhtml file
        soup = BeautifulSoup(item.get_content(), 'lxml')
        for image in soup.find_all('img'):
            src = image.get('src')
            if src is None:
                # An <img> without a src cannot be tied to an image file, and
                # slicing None would raise TypeError, so skip it.
                continue
            # Only strip the prefix when it is really there; a blind slice
            # would eat the start of any differently-located path.
            if src.startswith(IMAGE_SRC_PREFIX):
                src = src[len(IMAGE_SRC_PREFIX):]
            image_alts[src] = image.get('alt')
    elif item_type == ebooklib.ITEM_IMAGE:  # image
        name = item.get_name()
        if name.startswith(IMAGE_NAME_PREFIX):
            name = name[len(IMAGE_NAME_PREFIX):]
        raw_images[name] = Image.open(io.BytesIO(item.get_content()))
# For Frugal Only
# # Chapter Titles
# sorted_chapter_titles = {}
# for key in image_alts:
#     if key[:2] == 'ch':
#         number = int(key[2:][:-4])
#         sorted_chapter_titles[number] = image_alts[key]
# sorted_chapter_titles = dict(sorted(sorted_chapter_titles.items()))
# for k in sorted_chapter_titles:
#     print(str(k) + ' - ' + sorted_chapter_titles[k])
#     print()
# print('Epilogue - ' + image_alts['epi.jpg'])
# # FAQs
# print()
# print('FAQs')
# print()
# sorted_faqs = {}
# for key in image_alts:
#     if key[:3] == 'faq':
#         number = int(key[3:][:-4])
#         sorted_faqs[number] = image_alts[key]
# sorted_faqs = dict(sorted(sorted_faqs.items()))
# for k in sorted_faqs:
#     print(str(k) + ' - ' + sorted_faqs[k])
#     print()
# # Margins
# print()
# print('Margins')
# print()
# sorted_margins = {}
# for key in image_alts:
#     if key[:7] == 'margins':
#         number = int(key[7:][:-4])
#         sorted_margins[number] = image_alts[key]
# sorted_margins = dict(sorted(sorted_margins.items()))
# for k in sorted_margins:
#     print(str(k) + ' - ' + sorted_margins[k])
#     print()
# Dump every collected image: its file name on one line, its alt text on the
# next, then a blank line as a separator.
for image_name, alt_text in image_alts.items():
    print(image_name, alt_text, sep='\n', end='\n\n')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment