Last active
May 7, 2019 15:30
-
-
Save dlukes/2b5c2a163cd8adba420aaae0c8ea2c00 to your computer and use it in GitHub Desktop.
Remove dates from comments and tracked edits in docx. Also, a cheatsheet for namespaces in lxml.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""Usage: {} AUTHOR_SUBSTRING INPUT.DOCX OUTPUT.DOCX

Remove date metadata from a Word document for authors matching
AUTHOR_SUBSTRING. Handy if you don't want other people to know when
exactly you found time to work on their document ;)

In more detail: Read INPUT.DOCX, extract the comments and tracked edits,
manipulate them (cf. functions `modify_comments()` and
`modify_tracked_edits()` -- by default, they remove date metadata
when the author contains AUTHOR_SUBSTRING, but you can tweak them
depending on your needs), then create OUTPUT.DOCX with the modified
comments and tracked edits.

Note that OUTPUT.DOCX tends to be larger than the input file for some
reason. To get back to a reasonable size, just re-save OUTPUT.DOCX from
Word.
"""
import sys | |
import zipfile | |
from itertools import chain | |
from lxml import etree | |
# XML namespace URI of WordprocessingML, the vocabulary queried below in
# word/comments.xml and word/document.xml.
W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
# Prefix-to-URI mapping passed as `namespaces=` to lxml's xpath(), so that
# queries can be written with the short prefix, e.g. "//w:comment".
NSMAP = {"w": W_NS}
# The namespace in "{uri}" form, which is how qualified attribute names
# are spelled when indexing `element.attrib`: W + "author" is w:author.
W = "{" + W_NS + "}"
def modify_comments(comments, author_substr):
    """Strip the date from Word comments written by a matching author.

    Args:
        comments: Serialized XML of the comments part, in any form accepted
            by ``etree.fromstring`` (``main()`` passes the raw bytes of
            ``word/comments.xml``).
        author_substr: A comment is modified when this string occurs
            anywhere in its ``w:author`` attribute.

    Returns:
        The modified XML, re-serialized with ``etree.tostring``.
    """
    et = etree.fromstring(comments)
    for c in et.xpath("//w:comment", namespaces=NSMAP):
        # Look the author up with a default rather than by indexing, so a
        # comment without a w:author attribute is skipped instead of
        # aborting the whole run with a KeyError.
        if author_substr in c.attrib.get(W + "author", ""):
            # pop() with a default makes the removal a no-op when the date
            # is already gone, e.g. when the script is re-run on its own
            # output.
            c.attrib.pop(W + "date", None)
        # NOTE: basically do whatever you want here with the comment
        # based on its XML attributes and content
        # author = c.attrib[W + "author"]
        # date = c.attrib[W + "date"]
        # print(f"{author} commented on {date}:")
        # print(c.xpath("string(.)"))
        # NOTE: if you want to completely delete the comment, you can't
        # just do `et.remove(c)`, because there's also a comment marker
        # within the document itself, so that's more of a hassle
    return etree.tostring(et)
def modify_tracked_edits(document, author_substr):
    """Strip the date from tracked insertions/deletions by a matching author.

    Only ``w:ins`` and ``w:del`` elements are examined; any other element
    is left untouched.

    Args:
        document: Serialized XML of the main document part, in any form
            accepted by ``etree.fromstring`` (``main()`` passes the raw
            bytes of ``word/document.xml``).
        author_substr: An edit is modified when this string occurs anywhere
            in its ``w:author`` attribute.

    Returns:
        The modified XML, re-serialized with ``etree.tostring``.
    """
    et = etree.fromstring(document)
    tracked_edits = chain(
        et.xpath("//w:ins", namespaces=NSMAP),
        et.xpath("//w:del", namespaces=NSMAP),
    )
    for te in tracked_edits:
        # NOTE: like a comment, a tracked edit can be manipulated here in
        # any way you like based on its XML attributes and content.
        # Missing attributes are tolerated: an edit without w:author never
        # matches a non-empty substring, and removing an absent w:date is a
        # no-op (so re-running the script on its own output is safe).
        if author_substr in te.attrib.get(W + "author", ""):
            te.attrib.pop(W + "date", None)
    return etree.tostring(et)
def main():
    """Command-line entry point: copy INPUT.DOCX to OUTPUT.DOCX, scrubbed.

    Expects exactly three command-line arguments (AUTHOR_SUBSTRING,
    INPUT.DOCX, OUTPUT.DOCX); otherwise prints the usage text to stderr and
    exits with a non-zero status.

    Every archive member is copied to the output unchanged, except for
    ``word/comments.xml`` and ``word/document.xml``, which are passed
    through ``modify_comments()`` and ``modify_tracked_edits()``
    respectively. A part that is absent from the input (e.g. a document
    without any comments has no comments part) is simply skipped.
    """
    args = sys.argv[1:]
    if len(args) != 3:
        # The module docstring doubles as the usage text; it is None when
        # docstrings are stripped (python -OO), hence the fallback.
        usage = __doc__ or "Usage: {} AUTHOR_SUBSTRING INPUT.DOCX OUTPUT.DOCX"
        sys.exit(usage.format(sys.argv[0]))
    author_substr, in_fname, out_fname = args

    # archive member -> function producing its replacement content
    rewriters = {
        "word/comments.xml": modify_comments,
        "word/document.xml": modify_tracked_edits,
    }

    with zipfile.ZipFile(in_fname) as docx_in:
        # Rewrite the XML parts *before* creating the output file, so that
        # a parse error does not leave a truncated OUTPUT.DOCX behind.
        present = set(docx_in.namelist())
        updated = {
            path: rewrite(docx_in.read(path), author_substr)
            for path, rewrite in rewriters.items()
            if path in present
        }

        with zipfile.ZipFile(out_fname, "w") as docx_out:
            # NOTE: this is some kind of ZIP file thing, NOT Word
            # comments...
            docx_out.comment = docx_in.comment
            for item in docx_in.infolist():
                data = updated.get(item.filename)
                if data is None:
                    data = docx_in.read(item)
                # Always write via the original ZipInfo: this keeps the
                # member's position in the archive, its metadata and its
                # compression method. Writing by bare file name instead
                # would use the archive default (ZIP_STORED), leaving the
                # rewritten XML uncompressed and the output needlessly
                # large.
                docx_out.writestr(item, data)
# Run the conversion only when this file is executed as a script, not when
# it is imported (e.g. to reuse the modify_* helpers).
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi, do you know of a way to add a new comment to a .docx file?