Created
April 17, 2024 15:23
-
-
Save jlmalone/4f9d7d88a0628256e3e2aee70131a4b7 to your computer and use it in GitHub Desktop.
Extract the text out of an ePub
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# pip install ebooklib beautifulsoup4 python-dotenv
import ebooklib | |
from dotenv import load_dotenv | |
import os | |
from ebooklib import epub | |
from bs4 import BeautifulSoup | |
def extract_text_from_epub(file_path):
    """Return the plain text of every document item in an EPUB file.

    The book at ``file_path`` is opened with ``epub.read_epub``. Each item
    whose type is ``ebooklib.ITEM_DOCUMENT`` is parsed with BeautifulSoup's
    ``html.parser`` and reduced to its text. The per-item texts are joined
    with a blank line between them, in the order the book yields its items.
    """
    # Lazily select only the document items; other item types are skipped.
    document_items = (
        entry
        for entry in epub.read_epub(file_path).get_items()
        if entry.get_type() == ebooklib.ITEM_DOCUMENT
    )
    # Strip the markup from each document, keeping only its text.
    chapters = [
        BeautifulSoup(entry.content, 'html.parser').get_text()
        for entry in document_items
    ]
    return "\n\n".join(chapters)
# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# Specify the path to your EPUB file through the EPUB_PATH variable.
epub_path = os.getenv("EPUB_PATH")
if not epub_path:
    # os.getenv returns None when the variable is unset; stop here with a
    # clear message instead of handing None (or "") to the EPUB reader.
    raise SystemExit(
        "EPUB_PATH is not set; define it in the environment or in a .env file."
    )

text = extract_text_from_epub(epub_path)
print(text)
# TODO decide where text should be saved or whatever
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment