Last active
August 20, 2024 09:34
-
-
Save florian-obradovic/c23cc6d7702fe29b8e6227b1bd156ed7 to your computer and use it in GitHub Desktop.
This Python script parses through all HTML files and replaces the external image references with embedded base64 encoded strings.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This Python script parses through all HTML files and replaces the external image references with embedded base64 encoded strings.
## create a directory
# mkdir ~/path2convert && cd ~/path2convert
# git init # optional to revert changes
# git add -A && git commit -m "Start" # optional to revert changes
## create a virtual environment
# python3 -m venv ./
# source ./bin/activate
# pip3 install beautifulsoup4 pillow
## Example: python3 embedd-images-into-html-base64.py
import base64
import mimetypes
import os
from urllib.parse import unquote

from bs4 import BeautifulSoup
# Directory containing your HTML files ("~" is expanded to the user's home
# directory); the script walks this directory recursively.
html_files_directory = os.path.expanduser('~/Downloads/paname-convert')
# Function to convert image to base64
def img_to_base64(img_path):
    """Return the contents of the file at *img_path* as a base64 text string.

    The file is read in binary mode, so any image format is handled the same
    way; the base64 bytes are decoded to ``str`` so the result can be placed
    directly inside a ``data:`` URI.

    Args:
        img_path: Path of the file to encode.

    Returns:
        The base64 encoding of the file's bytes (empty string for an empty
        file).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(img_path, 'rb') as img_file:
        return base64.b64encode(img_file.read()).decode('utf-8')
# Helper: decide whether an <img> src points at the network instead of a file.
def _is_remote_src(img_src):
    """Return True when *img_src* is an http(s)/ftp or protocol-relative URL."""
    return img_src.strip().lower().startswith(
        ('http://', 'https://', 'ftp://', '//')
    )


# Helper: pick the MIME type to put into the data URI.
def _guess_image_mime(img_path):
    """Return a MIME type for *img_path* based on its file extension.

    ``mimetypes`` yields registered types such as ``image/jpeg`` for ``.jpg``
    and ``image/svg+xml`` for ``.svg``.  For extensions it does not know, fall
    back to ``image/<extension>``; a path without any extension is labelled
    ``application/octet-stream``.
    """
    mime_type, _encoding = mimetypes.guess_type(img_path)
    if mime_type:
        return mime_type
    extension = os.path.splitext(img_path)[1].lstrip('.').lower()
    return f"image/{extension}" if extension else "application/octet-stream"


# Function to process a single HTML file
def process_html_file(file_path):
    """Inline the local images referenced by one HTML file as base64 data URIs.

    Every ``<img>`` whose ``src`` resolves to a regular file on disk gets its
    ``src`` replaced by ``data:<mime>;base64,<payload>``.  Relative sources
    are resolved against the directory of the HTML file.  Images that are
    already data URIs, have no ``src``, point at a remote URL, or cannot be
    found are reported on stdout and left untouched.  The HTML file is
    rewritten in place (UTF-8) only when at least one image was embedded, so
    untouched files are not re-serialised by the parser.

    Args:
        file_path: Path of the HTML file to process; it is read as UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    base_dir = os.path.dirname(file_path)
    changed = False

    # Find all image tags
    for img in soup.find_all('img'):
        img_src = img.get('src')
        if not img_src or img_src.strip().lower().startswith('data:'):
            print(f"Skipping image {img_src} in {file_path} (already base64 or invalid)")
            continue
        if _is_remote_src(img_src):
            # A URL can never be found on the local disk; say so explicitly
            # instead of reporting a confusing "not found" path.
            print(f"Skipping image {img_src} in {file_path} (remote URL)")
            continue

        # Try the src verbatim first, then without a "?query" / "#fragment"
        # suffix, which is not part of the file name.  Each candidate is
        # URL-decoded so that e.g. "%20" matches a space in the file name.
        bare_src = img_src.split('#', 1)[0].split('?', 1)[0]
        candidates = [
            os.path.join(base_dir, unquote(src))
            for src in dict.fromkeys((img_src, bare_src))
        ]
        # isfile (not exists) so a directory is never handed to open().
        img_path = next((path for path in candidates if os.path.isfile(path)), None)
        if img_path is None:
            print(f"Image not found: {candidates[0]}")
            continue

        mime_type = _guess_image_mime(img_path)
        base64_string = img_to_base64(img_path)
        img['src'] = f"data:{mime_type};base64,{base64_string}"
        changed = True
        print(f"Replaced image {img_src} with base64 in {file_path}")

    # Save the modified HTML back to the file
    if changed:
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(str(soup))
# Function to process all HTML files in a directory
def process_html_files_in_directory(directory):
    """Run ``process_html_file`` on every HTML file below *directory*.

    The directory tree is walked recursively.  Files are selected by
    extension, case-insensitively, so ``.html``, ``.HTML`` and ``.htm`` files
    are all processed.  A directory that does not exist simply yields no
    files.

    Args:
        directory: Root directory to search.
    """
    for root, _dirs, files in os.walk(directory):
        for file in files:
            if file.lower().endswith(('.html', '.htm')):
                process_html_file(os.path.join(root, file))
# Process all HTML files in the specified directory (only when run as a
# script, so importing this module has no side effects).
if __name__ == "__main__":
    # os.walk silently yields nothing for a missing directory, which would
    # otherwise lead to a misleading success message.
    if not os.path.isdir(html_files_directory):
        raise SystemExit(f"Directory not found: {html_files_directory}")
    process_html_files_in_directory(html_files_directory)
    print("All HTML files have been processed and images have been embedded as Base64.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment