Last active
March 6, 2018 03:20
-
-
Save jojonas/094a8d426c24aa35a0944b6de13bbe29 to your computer and use it in GitHub Desktop.
Change the language of Microsoft Office documents
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Changing the spelling language for Microsoft Office documents is a pain. | |
You can set the default spell-checker language as you desire, but this will | |
not modify existing templates. Templates are copied during installation | |
to the path '%appdata%\\Microsoft\\Templates' and done. Text elements within | |
these templates will retain their default spelling language. | |
This leads to a quite painful user experience when working with templates | |
and multiple languages. | |
Luckily, Office documents are nowadays (since Office 2007) stored in the | |
"Office Open XML" format, which is a ZIP file containing a bunch of XML | |
files. The spelling language is an attribute on some of these XML elements. | |
This script changes the spelling language of all text boxes, etc., by iterating | |
through the XML files and changing the content of every "lang" attribute. | |
It will create a new Office Open XML document with the modified XML files | |
(and all other non-XML files), suffixed by the desired language. | |
Example: Translating "my_slides.pptx" to American English will result in a | |
file called "my_slides_en-US.pptx". | |
The target language can be given with the "--lang" argument. See "--help" | |
for details. | |
(C) 2016 Jonas Lieb | |
""" | |
import os, os.path | |
import zipfile | |
import tempfile | |
import re | |
def translate_xml(filename, lang="en-US", only=None):
    """Rewrite the ``lang="..."`` attributes of an XML file in place.

    Every attribute whose name ends in ``lang`` (for example ``lang`` or
    ``xml:lang``) and that sits inside a tag is set to *lang*.

    Args:
        filename: Path of the UTF-8 encoded XML file to modify in place.
        lang: Language code to write into the attributes (e.g. "en-US").
        only: Optional collection of language codes. When given, only
            attributes currently holding one of these codes are changed;
            ``None`` means every language is replaced.
    """
    # We are using "simple" regular expressions here instead of a real XML
    # parser because the Python Standard Library parser (ElementTree) changes
    # the namespace prefixes within the XML files. That is equivalent as far
    # as XML is concerned, but causes Microsoft Office to throw an error and
    # offer to repair the file on opening it.
    #
    # The work is split in two steps (find a tag, then rewrite each attribute
    # inside it) so that ALL matching attributes of a tag are handled. A
    # single greedy pattern would only ever reach the last one per tag.
    tag_pattern = re.compile(r'<[^>]*>')
    attribute_pattern = re.compile(r'(lang=")([^"]*)(")')

    # ugly: Read entire file content to memory.
    # newline="" disables newline translation so the line endings of the
    # document are written back exactly as they were read.
    with open(filename, 'r', encoding="utf-8", newline="") as file:
        text = file.read()

    # Maintain a set of replaced languages (neat for debugging)
    seen = set()

    # Callback deciding the fate of a single lang attribute
    def replace_attribute(match):
        old_language = match.group(2)
        if only is not None and old_language not in only:
            return match.group(0)
        if old_language == lang:
            return match.group(0)
        seen.add(old_language)
        # Plain concatenation: the language must not be interpreted as a
        # regular-expression replacement template.
        return match.group(1) + lang + match.group(3)

    # Callback applied to every tag: rewrite the attributes it contains
    def replace_tag(match):
        return attribute_pattern.sub(replace_attribute, match.group(0))

    translated = tag_pattern.sub(replace_tag, text)

    # Dump everything back to the same (temporary!) XML file, but only if
    # something actually changed.
    if translated != text:
        with open(filename, 'w', encoding="utf-8", newline="") as file:
            file.write(translated)

    # Print that neat debugging info (sorted for reproducible output)
    if seen:
        print("Modified file '%s'. Observed languages (before): %s" \
              % (os.path.basename(filename), ",".join(sorted(seen))))
def translate_archive(filename, lang="en_US", only=None):
    """Create a copy of an Office Open XML document with a new spelling language.

    The copy is written next to the original, with "_<lang>" inserted before
    the file extension. Every ".xml" member of the archive is passed through
    translate_xml(); all other members are copied unchanged.

    Args:
        filename: Path of the source document (a ZIP archive).
        lang: Target language code. NOTE: the default "en_US" (underscore)
            differs from translate_xml's "en-US"; it is kept unchanged for
            backward compatibility, so pass the language explicitly.
        only: Forwarded to translate_xml(); restricts which existing
            languages are replaced (``None`` means all).

    Raises:
        FileExistsError: If the destination file already exists (it is opened
            in exclusive-creation mode and never overwritten).
    """
    # Append language suffix
    old_name, old_extension = os.path.splitext(filename)
    new_filename = old_name + "_" + lang + old_extension
    print("Translating document '%s' to '%s'." % (filename, new_filename))

    # Obtain the name of a temporary directory, usually resides within a
    # users %appdata% (or /tmp on Linux). Python deletes it when the context
    # manager is left.
    with tempfile.TemporaryDirectory() as tmpdir:
        print("Using temporary directory '%s'." % tmpdir)

        # Use the zipfile module for opening Office Open XML files (yes,
        # they're just plain old .zip files with a fancy extension).
        with zipfile.ZipFile(filename, 'r') as source_zip:
            # Opened outside the try-block on purpose: if the destination
            # already exists this raises, and the existing file must not be
            # removed by the cleanup below.
            destination_zip = zipfile.ZipFile(new_filename, 'x')
            try:
                with destination_zip:
                    # We extract the files one-by-one to keep track of the
                    # ZipInfo objects during translation.
                    for fileinfo in source_zip.infolist():
                        extracted_filename = source_zip.extract(fileinfo, path=tmpdir)

                        # Only touch .xml files
                        extension = os.path.splitext(fileinfo.filename)[1].lower()
                        if extension == ".xml":
                            translate_xml(extracted_filename, lang=lang, only=only)

                        # Write to destination file
                        destination_zip.write(extracted_filename,
                                              arcname=fileinfo.filename,
                                              compress_type=fileinfo.compress_type)
            except BaseException:
                # Do not leave a truncated document behind: besides being
                # unusable, it would make every retry fail because of the
                # exclusive-creation mode used above.
                try:
                    os.remove(new_filename)
                except OSError:
                    pass
                raise
if __name__ == "__main__":
    import argparse

    # File extensions this tool accepts. Wikipedia lists only .docx, .pptx
    # and .xlsx as valid Office Open XML extensions, but Microsoft ships its
    # templates as .dotx, .potx and .xltx files.
    supported = (".docx", ".dotx", ".xlsx", ".xltx", ".pptx", ".potx", ".ppsx")
    supported_listing = ", ".join(supported)

    parser = argparse.ArgumentParser(
        description="Change spelling language of all elements in an Office document. "
                    "Supported file types: %s" % supported_listing)

    # Used as the argparse "type" of the positional argument, so the parser
    # invokes it for every file name and we can reject unknown extensions.
    def file_type(filename):
        _, suffix = os.path.splitext(filename)
        if suffix.lower() not in supported:
            parser.error("File type must be one of: %s." % supported_listing)
        return filename

    parser.add_argument(
        "filename",
        nargs="+",
        type=file_type,
        help="Files to process",
    )
    parser.add_argument(
        "--lang",
        default="en-US",
        type=str,
        help="Destination language (e.g. en-US or de-DE)",
    )
    parser.add_argument(
        "--only",
        nargs="*",
        default=None,
        type=str,
        help="Only substitute these languages, default: all.",
    )

    args = parser.parse_args()

    # Each input document is converted independently
    for document in args.filename:
        translate_archive(document, lang=args.lang, only=args.only)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment