Created
December 5, 2024 10:19
-
-
Save DxPoly/7b7fd7ded0dcfbf65992e1c6ea3fcff5 to your computer and use it in GitHub Desktop.
Pragmatic Bookshelf EPUB file processor
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import zipfile | |
import tempfile | |
import os | |
import shutil | |
from pathlib import Path | |
def process_html_content(content):
    """
    Wrap every <table class="processedcode"> element in <pre> tags.

    A table that already sits directly inside an opening <pre> tag (with at
    most whitespace in between) is left untouched, so running the function
    on its own output does not produce nested <pre> elements.

    Args:
        content: HTML/XHTML markup as a string.

    Returns:
        The markup with each not-yet-wrapped processedcode table enclosed in
        <pre>...</pre>; all other text is returned unchanged.
    """
    # Group 1 (optional): an opening <pre ...> immediately before the table.
    # Group 2: the table itself. The lazy [\s\S]*? stops at the first
    # </table>, so a table nested inside a code table is not supported.
    pattern = (
        r'(<pre\b[^>]*>\s*)?'
        r'(<table class="processedcode"[^>]*>[\s\S]*?</table>)'
    )

    def replacer(match):
        # Already wrapped: return the matched text as-is.
        if match.group(1) is not None:
            return match.group(0)
        return f'<pre>{match.group(2)}</pre>'

    return re.sub(pattern, replacer, content)
def process_epub(epub_path):
    """
    Process all HTML files in the EPUB file.

    Every archive member whose name ends in ".html" or ".xhtml" is decoded
    as UTF-8, passed through process_html_content, and re-encoded; all other
    members are copied byte-for-byte. The result is written next to the
    input as "<stem>_processed<suffix>"; the input file is only read.

    Args:
        epub_path: Path to the source EPUB (a ZIP archive).

    Returns:
        The path of the processed EPUB, as a string.

    Raises:
        FileNotFoundError: If epub_path does not exist.
        zipfile.BadZipFile: If epub_path is not a valid ZIP archive.
        UnicodeDecodeError: If an HTML member is not valid UTF-8.
    """
    source = Path(epub_path)
    # Create output filename
    output_path = str(source.with_stem(source.stem + '_processed'))

    # Build the new archive in a scratch directory and move it into place
    # only once it is complete, so a failure part-way through never leaves a
    # truncated "_processed" file beside the original.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_epub = os.path.join(temp_dir, 'temp.epub')
        with zipfile.ZipFile(epub_path, 'r') as zip_ref, \
                zipfile.ZipFile(temp_epub, 'w') as zip_out:
            # Iterate through all files, preserving the original order
            for item in zip_ref.infolist():
                data = zip_ref.read(item.filename)
                # Process HTML files; everything else is copied directly
                if item.filename.endswith(('.html', '.xhtml')):
                    content = data.decode('utf-8')
                    data = process_html_content(content).encode('utf-8')
                # Passing the source ZipInfo (rather than a bare name) makes
                # writestr reuse that member's compress_type and metadata.
                zip_out.writestr(item, data)
        shutil.move(temp_epub, output_path)

    print(f"Processing complete. Output file: {output_path}")
    return output_path
if __name__ == "__main__":
    import sys

    # Command-line entry point: exactly one argument, the EPUB path.
    if len(sys.argv) == 2:
        epub_path = sys.argv[1]
        process_epub(epub_path)
    else:
        # Wrong number of arguments: show usage and exit with status 1.
        print("Usage: python script.py <epub_file_path>")
        sys.exit(1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The eBook from Pragmatic Bookshelf has an issue in the EPUB file where code may not be wrapped in `<pre>`
or