Last active
May 8, 2025 03:35
-
-
Save samkahchiin/2968f3060b85eed0fac2374d3e74eb07 to your computer and use it in GitHub Desktop.
Extract Myanmar text from a PDF. This script converts Win Innwa-encoded text to Unicode using Burglish.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import time | |
import fitz # PyMuPDF | |
from docx import Document | |
from selenium import webdriver | |
from selenium.webdriver.chrome.options import Options | |
from selenium.webdriver.chrome.service import Service | |
from selenium.webdriver.common.action_chains import ActionChains | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.support import expected_conditions as EC | |
from selenium.webdriver.support.ui import WebDriverWait | |
from webdriver_manager.chrome import ChromeDriverManager | |
import atexit

# Browser session created at import time, using the ChromeDriver binary that
# webdriver-manager installs.
# NOTE(review): the functions visible in this file take `driver` as a parameter
# and auto_convert_pdf() builds its own instance, so confirm whether this extra
# browser is needed at all.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
# Nothing visible here quits this instance; close it at interpreter exit so the
# Chrome/chromedriver processes are not left running.
atexit.register(driver.quit)
def set_font_selection(driver, panel_type, font_name):
    """Set the font for either the source (input) or target (output) panel.

    Args:
        driver: Selenium WebDriver with the converter page already loaded.
        panel_type: Either "source" or "target"; selects which ``<ol>`` list
            (by element id) is operated on.
        font_name: Font to pick. An ``<li>`` is matched when its ``id``, its
            ``data-detail`` attribute, or its text equals this value.

    Raises:
        ValueError: If ``panel_type`` is neither "source" nor "target".
        selenium.common.exceptions.TimeoutException: If the dropdown or the
            font entry does not become clickable within 10 seconds.
    """
    # Validate up front: the same value is used for both the dropdown selector
    # and the XPath, so an unknown panel could never resolve consistently.
    if panel_type not in ("source", "target"):
        raise ValueError(
            f"panel_type must be 'source' or 'target', got {panel_type!r}"
        )

    wait = WebDriverWait(driver, 10)

    # Click the dropdown (wait for it instead of assuming it is already there)
    dropdown_selector = f"ol#{panel_type} > li > p"
    dropdown = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, dropdown_selector))
    )
    ActionChains(driver).move_to_element(dropdown).click().perform()
    time.sleep(0.5)  # Allow dropdown animation

    # IMPORTANT: xpath is required to locate the element
    # Select the font
    font_xpath = f"""
    //ol[@id='{panel_type}']//li[
        @id='{font_name}' or
        @data-detail='{font_name}' or
        text()='{font_name}'
    ]
    """
    # Wait until the entry can actually be clicked, not merely until it is
    # present in the DOM, so the click does not hit a not-yet-visible element.
    font_element = wait.until(
        EC.element_to_be_clickable((By.XPATH, font_xpath))
    )
    font_element.click()
    time.sleep(0.3)
def trigger_conversion(driver):
    """Click the page's convert control, then pause for half a second.

    Args:
        driver: Selenium WebDriver with the converter page loaded.
    """
    convert_icon = driver.find_element(
        By.CSS_SELECTOR, "#targetOption > .icon-convert-normal"
    )

    # Build the pointer action step by step: hover the control, then click it.
    pointer = ActionChains(driver)
    pointer.move_to_element(convert_icon)
    pointer.click()
    pointer.perform()

    # Fixed pause; presumably long enough for the page to fill the output
    # area -- TODO confirm for large inputs.
    time.sleep(0.5)
def convert_text(driver, text):
    """Run ``text`` through the converter page and return the result.

    Args:
        driver: Selenium WebDriver with the converter page loaded and the
            source/target fonts already selected.
        text: Text to convert.

    Returns:
        The ``value`` of the ``targetText`` element after the conversion has
        been triggered, or ``""`` when ``text`` is empty.
    """
    # Nothing to convert (e.g. a page with no extractable text): skip the
    # browser round-trip entirely.
    if not text:
        return ""

    # Clear and input text
    input_area = driver.find_element(By.ID, "sourceText")
    input_area.clear()
    # Assign the value through JavaScript in a single step rather than
    # sending it as keystrokes.
    driver.execute_script("arguments[0].value = arguments[1];", input_area, text)

    # Trigger conversion
    trigger_conversion(driver)

    # Get output
    return driver.find_element(By.ID, "targetText").get_attribute("value")
def auto_convert_pdf(pdf_path, output_docx):
    """Extract text from a PDF, convert it page by page, and save a DOCX.

    Each page's text is pushed through the Burglish web converter (source font
    "WinInnwa", target font "Myanmar3") in a headless Chrome session. The
    resulting document holds, for every page, a "Page N" paragraph followed by
    the converted text, with a page break between pages.

    Args:
        pdf_path: Path of the PDF to read.
        output_docx: Path of the DOCX file to write. Its parent directory is
            created if it does not exist.
    """
    # 1. Extract text from PDF
    with fitz.open(pdf_path) as pdf:
        pdf_texts = [page.get_text() for page in pdf]

    # 2. Configure Selenium
    options = Options()
    for flag in (
        "--headless=new",
        "--disable-gpu",
        "--no-sandbox",
        "--disable-extensions",
        "--blink-settings=imagesEnabled=false",
    ):
        options.add_argument(flag)

    doc = Document()
    driver = webdriver.Chrome(options=options)
    try:
        driver.get("https://scriptive.github.io/burglish/")
        driver.maximize_window()

        # 3. Set conversion options
        set_font_selection(driver, "source", "WinInnwa")  # Input font
        set_font_selection(driver, "target", "Myanmar3")  # Output font

        # 4. Process each page
        last_index = len(pdf_texts) - 1
        for i, text in enumerate(pdf_texts):
            converted_text = convert_text(driver, text)

            # Add to DOCX
            doc.add_paragraph(f"Page {i+1}")
            doc.add_paragraph(converted_text)
            # No trailing page break after the final page
            if i < last_index:
                doc.add_page_break()
    finally:
        # 5. Cleanup -- always close the browser, even if a page fails
        driver.quit()

    # Make sure the destination folder exists so the finished conversion is
    # not lost to a missing directory at save time.
    os.makedirs(os.path.dirname(os.path.abspath(output_docx)), exist_ok=True)
    doc.save(output_docx)
    print(f"Converted DOCX saved to {os.path.abspath(output_docx)}")
# Usage: run the conversion only when this file is executed as a script, so
# that importing it does not start a browser and process the PDF.
if __name__ == "__main__":
    auto_convert_pdf("../input/body.pdf", "../output/body.docx")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment