Skip to content

Instantly share code, notes, and snippets.

@DxPoly
Created December 5, 2024 10:19
Show Gist options
  • Save DxPoly/7b7fd7ded0dcfbf65992e1c6ea3fcff5 to your computer and use it in GitHub Desktop.
Save DxPoly/7b7fd7ded0dcfbf65992e1c6ea3fcff5 to your computer and use it in GitHub Desktop.
Pragmatic Bookshelf EPUB file processor
import re
import zipfile
import tempfile
import os
import shutil
from pathlib import Path
def process_html_content(content):
    """
    Wrap every <table class="processedcode"> element in <pre> tags.

    Args:
        content: The HTML/XHTML document text.

    Returns:
        The document text with each matching table enclosed in
        ``<pre>...</pre>``. Text without such tables is returned unchanged.

    A table that is already directly preceded by ``<pre>`` is left alone, so
    running the function on its own output does not nest ``<pre>`` tags.
    The match is non-greedy and stops at the first ``</table>``, so tables
    nested inside a "processedcode" table are not handled.
    """
    # The negative lookbehind skips tables this function has already wrapped.
    pattern = r'(?<!<pre>)(<table class="processedcode"[^>]*>[\s\S]*?</table>)'
    return re.sub(pattern, r'<pre>\1</pre>', content)
def process_epub(epub_path):
    """
    Process all HTML files in the EPUB file.

    Every archive member whose name ends in ``.html`` or ``.xhtml`` is decoded
    as UTF-8, passed through ``process_html_content`` and written back; all
    other members are copied byte-for-byte. The input file is not modified.

    Args:
        epub_path: Path to the source EPUB file.

    Returns:
        The path of the newly written EPUB as a string: the input path with
        ``_processed`` appended to the file stem.

    Raises:
        UnicodeDecodeError: If an HTML/XHTML member is not valid UTF-8.
    """
    # Create output filename next to the input file
    source = Path(epub_path)
    output_path = str(source.with_stem(source.stem + '_processed'))

    # Read straight from the original archive; no temporary copy is needed.
    with zipfile.ZipFile(epub_path, 'r') as zip_ref, \
            zipfile.ZipFile(output_path, 'w') as zip_out:
        # Iterate in the archive's own order and pass the original ZipInfo so
        # each entry keeps its name and metadata in the output.
        for item in zip_ref.infolist():
            data = zip_ref.read(item.filename)
            if item.filename.endswith(('.html', '.xhtml')):
                # Process HTML files
                content = data.decode('utf-8')
                data = process_html_content(content).encode('utf-8')
            zip_out.writestr(item, data)

    print(f"Processing complete. Output file: {output_path}")
    return output_path
if __name__ == "__main__":
    import sys

    # Exactly one positional argument is expected: the EPUB file to process.
    script_args = sys.argv[1:]
    if len(script_args) != 1:
        print("Usage: python script.py <epub_file_path>")
        sys.exit(1)

    epub_path = script_args[0]
    process_epub(epub_path)
@DxPoly
Copy link
Author

DxPoly commented Dec 5, 2024

The eBooks from Pragmatic Bookshelf have an issue in the EPUB file: code may not be wrapped in `<code>` or `<pre>` tags. As a result, when translation software is used to translate the EPUB file, the code is translated as well, leading to a poor reading experience. The purpose of this script is to wrap specific code snippets in `<pre>` tags, allowing the translation tool to skip these code segments.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment