Skip to content

Instantly share code, notes, and snippets.

@florian-obradovic
Last active August 20, 2024 09:34
Show Gist options
  • Save florian-obradovic/c23cc6d7702fe29b8e6227b1bd156ed7 to your computer and use it in GitHub Desktop.
Save florian-obradovic/c23cc6d7702fe29b8e6227b1bd156ed7 to your computer and use it in GitHub Desktop.
This Python script parses through all HTML files and replaces the external image references with embedded base64 encoded strings.
# This Python script recursively walks a directory, parses every HTML file in it and replaces references to local image files with embedded base64-encoded data URIs. The HTML files are modified in place.
## create a directory
# mkdir ~/path2convert && cd ~/path2convert
# git init # optional to revert changes
# git add -A && git commit -m "Start" # optional to revert changes
## create a virtual environment
# python3 -m venv ./
# source ./bin/activate
# pip3 install beautifulsoup4  # pillow is not required; this script never imports it
## Example: python3 embedd-images-into-html-base64.py
import base64
import mimetypes
import os
from urllib.parse import unquote

from bs4 import BeautifulSoup
# Directory containing your HTML files.
# It is walked recursively and the HTML files found in it are rewritten in
# place, so point it at a copy (or a git checkout, see the setup notes above)
# rather than at the only originals.
# A leading "~" is expanded to the current user's home directory.
html_files_directory = os.path.expanduser('~/Downloads/paname-convert')
# Function to convert image to base64
def img_to_base64(img_path) -> str:
    """Return the contents of the file at *img_path* encoded as base64 text.

    Args:
        img_path: Path of the file to read. It is opened in binary mode, so
            the bytes are encoded exactly as stored on disk.

    Returns:
        The base64 representation of the file as a ``str`` (an empty string
        for an empty file), ready to be placed after ``base64,`` in a data URI.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(img_path, 'rb') as img_file:
        raw_bytes = img_file.read()
    # b64encode() returns bytes made up of ASCII characters only; decode them
    # so the result can be interpolated into the HTML attribute as text.
    encoded_string = base64.b64encode(raw_bytes).decode('utf-8')
    return encoded_string
# Function to process a single HTML file
def process_html_file(file_path):
    """Embed the locally stored images of one HTML file as base64 data URIs.

    Every ``<img>`` tag whose ``src`` points at an existing file (resolved
    relative to the directory of *file_path*, after URL-decoding) gets its
    ``src`` replaced by a ``data:<mime>;base64,<payload>`` URI. Tags without
    a ``src``, tags that already carry a ``data:`` URI and references that do
    not resolve to a regular file are reported and left untouched.

    Args:
        file_path: Path of the HTML file to process. It is read as UTF-8 and,
            if at least one image was embedded, overwritten in place.

    Raises:
        OSError: If the HTML file or a referenced image cannot be read, or
            the HTML file cannot be written.
        UnicodeDecodeError: If the HTML file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    base_dir = os.path.dirname(file_path)
    modified = False

    # Find all image tags
    for img in soup.find_all('img'):
        img_src = img.get('src')
        # Only process if there is a src and it's not already base64
        if not img_src or img_src.startswith('data:'):
            print(f"Skipping image {img_src} in {file_path} (already base64 or invalid)")
            continue

        decoded_img_src = unquote(img_src)  # Decode the URL-encoded path
        img_path = os.path.join(base_dir, decoded_img_src)

        # isfile() rather than exists(): a src such as "." resolves to a
        # directory, which exists but cannot be opened as an image.
        if not os.path.isfile(img_path):
            print(f"Image not found: {img_path}")
            continue

        # Use the registered media type ("image/jpeg", "image/svg+xml", ...)
        # instead of gluing the raw extension onto "image/", which produced
        # invalid types like "image/jpg" or "image/PNG".
        mime_type, _ = mimetypes.guess_type(img_path)
        if mime_type is None:
            extension = os.path.splitext(img_path)[1].lstrip('.').lower()
            mime_type = f"image/{extension}" if extension else "application/octet-stream"

        base64_string = img_to_base64(img_path)
        img['src'] = f"data:{mime_type};base64,{base64_string}"
        modified = True
        print(f"Replaced image {img_src} with base64 in {file_path}")

    # Save the modified HTML back to the file. Files without any embedded
    # image are left alone so they are not needlessly re-serialized.
    if modified:
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(str(soup))
# Function to process all HTML files in a directory
def process_html_files_in_directory(directory):
    """Run ``process_html_file`` on every HTML file below *directory*.

    The tree is walked recursively. A file counts as HTML when its name ends
    in ``.html`` or ``.htm``, compared case-insensitively so that names such
    as ``INDEX.HTML`` are not silently skipped.

    Args:
        directory: Root directory to search. A path that does not exist
            yields no files and therefore does nothing.
    """
    html_extensions = ('.html', '.htm')
    for root, _dirs, files in os.walk(directory):
        for file_name in files:
            # str.endswith accepts a tuple, so one call covers all extensions.
            if file_name.lower().endswith(html_extensions):
                process_html_file(os.path.join(root, file_name))
# Process all HTML files in the specified directory
# os.walk() silently yields nothing for a missing directory, which would let
# the success message below be printed although no file was touched, so stop
# with an error instead.
if not os.path.isdir(html_files_directory):
    raise SystemExit(f"Directory not found: {html_files_directory}")
process_html_files_in_directory(html_files_directory)
print("All HTML files have been processed and images have been embedded as Base64.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment