Created
February 15, 2025 20:11
-
-
Save havardgulldahl/2a7ef3c440d2f0d934c4139259a21cfa to your computer and use it in GitHub Desktop.
A Python script to translate a PDF inline, where the translated strings are overlaid on top of the original text -- like translate.google.com does.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import fitz # PyMuPDF | |
import asyncio | |
from googletrans import Translator | |
from tqdm.asyncio import tqdm # For async progress reporting | |
__author__ = "[email protected], 2025" | |
def normalize_color(color_int):
    """
    Split a packed 0xRRGGBB integer into an (r, g, b) tuple of floats in 0.0 - 1.0.

    Only the low 24 bits are read; anything above them is ignored.
    """
    # Move each 8-bit channel down to the low byte, mask it off, then scale
    # it by 255 so that 0xFF maps to exactly 1.0.
    return tuple(
        ((color_int >> shift) & 0xFF) / 255.0 for shift in (16, 8, 0)
    )
async def _translate_or_keep(translator, text, target_language):
    """Return *text* translated into *target_language*, or *text* unchanged on failure."""
    try:
        trans = await translator.translate(text, target_language)
        return trans.text
    except Exception as e:
        # Deliberate best effort: one failed request must not abort the whole
        # document, so the error is reported and the original text is kept.
        print(f"Translation error on text: {text}\nError: {e}")
        return text


def _overlay_translation(page, span, translated_text, font_kwargs):
    """Paint a light gray box over *span* on *page* and write *translated_text* on it."""
    light_gray = (0.85, 0.85, 0.85)
    x0, y0, x1, y1 = span["bbox"]
    # Grow the span's box by one unit above and below before painting it.
    top, bottom = y0 - 1, y1 + 1
    page.draw_rect(
        fitz.Rect(x0, top, x1, bottom),
        color=light_gray,  # Border
        fill=light_gray,  # Interior
        overlay=True,
    )
    page.insert_text(
        # Bottom-left corner of the grown box: insert_text expects the
        # bottom-left position of the first character, not the top-left.
        fitz.Point(x0, bottom),
        translated_text,
        # Slightly smaller than the original size, because the replacement
        # font is generally not the font the PDF was typeset in.
        fontsize=span["size"] * 0.9,
        color=normalize_color(span.get("color", 0)),
        **font_kwargs,
    )


async def translate_pdf_async(
    input_pdf_path, output_pdf_path, target_language="es", font_path=None
):
    """
    Asynchronously translate the text of a PDF and overlay it on the original pages.

    Each input page is copied into a new document with ``show_pdf_page``. Every
    non-blank text span is then covered with a light gray rectangle and the
    translated string is written over it in the span's original color. Text is
    not reflowed, so a translation longer than the original can extend past
    the span's box. If a translation request fails, the original text is
    written instead. Progress bars are displayed for pages and spans.

    Args:
        input_pdf_path (str): Path to the input PDF.
        output_pdf_path (str): Path to save the translated PDF.
        target_language (str): Target language code for translation (e.g., "es" for Spanish).
        font_path (str): Path to a font file (TTF/OTF) to use for text insertion.
            If None, PyMuPDF's default built-in font is used.

    Returns:
        None
    """
    # A font loaded from a file has to be registered under a name that is not
    # one of PyMuPDF's reserved built-in font names; "F0" is an arbitrary one.
    font_kwargs = {"fontname": "F0", "fontfile": font_path} if font_path else {}

    # Context managers close both documents and both progress bars even when
    # processing fails part-way through.
    with fitz.open(input_pdf_path) as doc, fitz.open() as translated_doc:
        translator = Translator()

        print("Translating PDF...")
        with tqdm(total=len(doc), desc="Pages", unit="page") as page_progress:
            for page_number, page in enumerate(doc):
                # Copy the original page into the translated document
                new_page = translated_doc.new_page(
                    -1, width=page.rect.width, height=page.rect.height
                )
                new_page.show_pdf_page(new_page.rect, doc, page_number)

                # Only text blocks (type == 0) carry lines and spans
                text_blocks = [
                    block
                    for block in page.get_text("dict")["blocks"]
                    if block["type"] == 0
                ]
                spans_count = sum(
                    len(line["spans"])
                    for block in text_blocks
                    for line in block["lines"]
                )

                with tqdm(
                    total=spans_count,
                    desc=f"Page {page_number + 1}",
                    unit="span",
                    leave=False,
                ) as span_progress:
                    for block in text_blocks:
                        # Iterate over the lines from the bottom up
                        for line in reversed(block["lines"]):
                            for span in line["spans"]:
                                orig_text = span["text"]
                                if orig_text.strip():  # Skip whitespace-only spans
                                    # Show the string currently being translated
                                    span_progress.set_description(
                                        f"Translating: {orig_text[:30]}..."
                                    )
                                    translated_text = await _translate_or_keep(
                                        translator, orig_text, target_language
                                    )
                                    _overlay_translation(
                                        new_page, span, translated_text, font_kwargs
                                    )
                                # Count every span, blank or not, so the bar
                                # reaches the total computed above.
                                span_progress.update(1)

                page_progress.update(1)

        # Save while both documents are still open
        translated_doc.save(output_pdf_path)

    print(f"Translated PDF saved to {output_pdf_path}")
# If the script is run directly | |
if __name__ == "__main__":
    import argparse

    # Values used when the matching command-line option is omitted
    input_pdf = "input.pdf"
    output_pdf = "output_translated.pdf"
    target_language = "en"

    # One (flag, default, help text) entry per option; all are plain strings
    option_specs = (
        ("--input", input_pdf, "Path to the input PDF file."),
        ("--output", output_pdf, "Path to save the translated PDF."),
        ("--lang", target_language, "Target language for translation (e.g., 'es')."),
    )

    cli = argparse.ArgumentParser(
        description="Translate a PDF file while preserving its layout."
    )
    for flag, default, help_text in option_specs:
        cli.add_argument(flag, type=str, default=default, help=help_text)
    options = cli.parse_args()

    # Drive the coroutine to completion on a fresh event loop
    asyncio.run(translate_pdf_async(options.input, options.output, options.lang))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The code makes no attempt to reflow text or to respect other elements of the PDF page.
The output is not super pretty, but this gets the job done.