Skip to content

Instantly share code, notes, and snippets.

@samkahchiin
Last active May 8, 2025 03:35
Show Gist options
  • Save samkahchiin/2968f3060b85eed0fac2374d3e74eb07 to your computer and use it in GitHub Desktop.
Save samkahchiin/2968f3060b85eed0fac2374d3e74eb07 to your computer and use it in GitHub Desktop.
# Extract Myanmar text from a PDF. This script converts Win Innwa text to Unicode using Burglish.
import os
import time
import fitz # PyMuPDF
from docx import Document
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
# Start a Chrome session at module load, using a driver binary resolved by
# webdriver_manager.
# NOTE(review): nothing in the visible code uses this module-level `driver` --
# every helper takes a driver argument and auto_convert_pdf() creates its own
# local instance -- and it is never quit here. Confirm whether it is needed.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
def set_font_selection(driver, panel_type, font_name):
    """Choose a font in the source (input) or target (output) panel.

    Args:
        driver: Selenium WebDriver with the converter page already loaded.
        panel_type: Either "source" or "target"; selects which panel's
            dropdown is opened and searched.
        font_name: Value matched against a list item's ``id``, its
            ``data-detail`` attribute, or its text.

    Raises:
        ValueError: If ``panel_type`` is neither "source" nor "target".
        selenium.common.exceptions.TimeoutException: If the dropdown or the
            font entry does not appear in the DOM within 10 seconds.
    """
    # Fail fast: previously any unknown value opened the *target* dropdown
    # while the XPath below searched under the raw value, ending in a timeout.
    if panel_type not in ("source", "target"):
        raise ValueError(
            f"panel_type must be 'source' or 'target', got {panel_type!r}"
        )

    wait = WebDriverWait(driver, 10)

    # Wait for the dropdown instead of assuming the page has already rendered it.
    dropdown = wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, f"ol#{panel_type} > li > p")
        )
    )
    ActionChains(driver).move_to_element(dropdown).click().perform()
    time.sleep(0.5)  # Allow dropdown animation

    # IMPORTANT: xpath is required to locate the element
    font_xpath = (
        f"//ol[@id='{panel_type}']//li["
        f"@id='{font_name}' or "
        f"@data-detail='{font_name}' or "
        f"text()='{font_name}'"
        "]"
    )
    font_element = wait.until(
        EC.presence_of_element_located((By.XPATH, font_xpath))
    )
    font_element.click()
    time.sleep(0.3)  # Short pause after the selection click
def trigger_conversion(driver):
    """Press the page's convert control, then pause briefly.

    Args:
        driver: Selenium WebDriver with the converter page loaded.
    """
    convert_locator = (By.CSS_SELECTOR, "#targetOption > .icon-convert-normal")
    convert_icon = driver.find_element(*convert_locator)

    # Hover over the icon first, then click it, as one queued action sequence.
    pointer = ActionChains(driver)
    pointer.move_to_element(convert_icon)
    pointer.click()
    pointer.perform()

    # Fixed pause intended to let the page finish converting.
    time.sleep(0.5)
def convert_text(driver, text):
    """Feed ``text`` to the converter page and return the converted output.

    Args:
        driver: Selenium WebDriver with the converter page loaded.
        text: Text to place in the page's source box.

    Returns:
        The ``value`` attribute of the page's target box after conversion.
    """
    source_box = driver.find_element(By.ID, "sourceText")
    source_box.clear()
    # Assign the value through JavaScript rather than typing it key by key.
    set_value_js = "arguments[0].value = arguments[1];"
    driver.execute_script(set_value_js, source_box, text)

    trigger_conversion(driver)

    target_box = driver.find_element(By.ID, "targetText")
    return target_box.get_attribute("value")
def auto_convert_pdf(pdf_path, output_docx):
    """Extract text from a PDF, convert it via Burglish, and save a DOCX.

    Each PDF page's text is pushed through the Burglish web converter
    (WinInnwa -> Myanmar3) in a headless Chrome session, then written to the
    document under a "Page N" heading paragraph, one PDF page per DOCX page.

    Args:
        pdf_path: Path of the PDF to read.
        output_docx: Path of the DOCX file to write. Its parent directory is
            created if it does not exist.
    """
    # 1. Extract text from PDF
    with fitz.open(pdf_path) as pdf:
        pdf_texts = [page.get_text() for page in pdf]

    # Make sure the destination directory exists *before* doing the
    # browser-driven conversion, so a missing folder cannot fail the save
    # after all pages have already been converted.
    output_path = os.path.abspath(output_docx)
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # 2. Configure Selenium
    options = Options()
    for flag in (
        "--headless=new",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-extensions",
        "--blink-settings=imagesEnabled=false",
    ):
        options.add_argument(flag)

    doc = Document()
    driver = webdriver.Chrome(options=options)
    try:
        driver.get("https://scriptive.github.io/burglish/")
        driver.maximize_window()

        # 3. Set conversion options
        set_font_selection(driver, "source", "WinInnwa")  # Input font
        set_font_selection(driver, "target", "Myanmar3")  # Output font

        # 4. Process each page
        last_index = len(pdf_texts) - 1
        for i, text in enumerate(pdf_texts):
            converted_text = convert_text(driver, text)
            doc.add_paragraph(f"Page {i+1}")
            doc.add_paragraph(converted_text)
            # No trailing page break after the final page.
            if i < last_index:
                doc.add_page_break()
    finally:
        # 5. Cleanup: always close the browser, even if a step above raised,
        # so no Chrome process is left behind.
        driver.quit()

    doc.save(output_docx)
    print(f"Converted DOCX saved to {output_path}")
# Usage: run this file as a script to convert the sample PDF. The guard keeps
# the browser-driven conversion from starting when the module is merely imported.
if __name__ == "__main__":
    auto_convert_pdf("../input/body.pdf", "../output/body.docx")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment