dlukes · May 7, 2019 15:30 · leyiwang · Mar 7, 2019
diff --git a/remove_docx_dates.py b/remove_docx_dates.py
 #!/usr/bin/env python3
 """Usage: {} AUTHOR_SUBSTRING INPUT.DOCX OUTPUT.DOCX

 Remove date metadata from Word document for authors matching
 AUTHOR_SUBSTRING. Handy if you don't want other people to know when
 exactly you found time to work on their document ;)

 In more detail: Read INPUT.DOCX, extract the comments and tracked edits,
 manipulate them (cf. functions `modify_comments()` and
 `modify_tracked_edits()` -- by default, they remove date metadata
 when the author contains AUTHOR_SUBSTRING, but you can tweak them
 depending on your needs), then create OUTPUT.DOCX with the modified
 comments and tracked edits.

 Note that OUTPUT.DOCX tends to be larger than the input file for some
 reason. To get back to a reasonable size, just re-save OUTPUT.DOCX from
 Word.

 """

 import sys
 import zipfile
 from itertools import chain

 from lxml import etree

 W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
 NSMAP = {"w": W_NS}
 W = "{" + W_NS + "}"


 def modify_comments(comments, author_substr):
     """Strip the date from comments written by matching authors.

     Parses the serialized comments part, visits every ``w:comment``
     element and removes its ``w:date`` attribute when its ``w:author``
     attribute contains ``author_substr``.

     Args:
         comments: Serialized XML of the comments part, as accepted by
             ``etree.fromstring``.
         author_substr: Substring to look for in each comment's author.

     Returns:
         The modified XML as UTF-8 encoded bytes, with an XML declaration.
     """
     et = etree.fromstring(comments)
     for c in et.xpath("//w:comment", namespaces=NSMAP):
         # Never index the attributes directly: ``w:date`` may be absent
         # (e.g. the file was already cleaned once), and a comment with
         # no author should simply not match instead of raising KeyError.
         if author_substr in c.get(W + "author", ""):
             c.attrib.pop(W + "date", None)
         # NOTE: basically do whatever you want here with the comment
         # based on its XML attributes and content, e.g.:
         # author = c.get(W + "author")
         # date = c.get(W + "date")
         # print(f"{author} commented on {date}:")
         # print(c.xpath("string(.)"))
         # NOTE: if you want to completely delete the comment, you can't
         # just remove the element here, because there's also a comment
         # marker within the document itself, so that's more of a hassle
     # Serialize as UTF-8 with a declaration; the default is ASCII without
     # one, which turns every non-ASCII character into a numeric reference.
     return etree.tostring(
         et, xml_declaration=True, encoding="UTF-8", standalone=True
     )


 def modify_tracked_edits(document, author_substr):
     """Strip the date from tracked insertions/deletions by matching authors.

     Parses the serialized main document part, visits every ``w:ins`` and
     ``w:del`` element and removes its ``w:date`` attribute when its
     ``w:author`` attribute contains ``author_substr``.

     Args:
         document: Serialized XML of the document part, as accepted by
             ``etree.fromstring``.
         author_substr: Substring to look for in each edit's author.

     Returns:
         The modified XML as UTF-8 encoded bytes, with an XML declaration.
     """
     et = etree.fromstring(document)
     # One union query covers both kinds of tracked edit.
     for te in et.xpath("//w:ins | //w:del", namespaces=NSMAP):
         # Never index the attributes directly: ``w:date`` may be absent
         # (e.g. the file was already cleaned once), and an edit with no
         # author should simply not match instead of raising KeyError.
         if author_substr in te.get(W + "author", ""):
             te.attrib.pop(W + "date", None)
     # Serialize as UTF-8 with a declaration; the default is ASCII without
     # one, which turns every non-ASCII character into a numeric reference.
     return etree.tostring(
         et, xml_declaration=True, encoding="UTF-8", standalone=True
     )


 def main():
     """Command-line entry point: rewrite INPUT.DOCX into OUTPUT.DOCX.

     Reads ``AUTHOR_SUBSTRING INPUT.DOCX OUTPUT.DOCX`` from ``sys.argv``,
     runs ``modify_comments`` on ``word/comments.xml`` and
     ``modify_tracked_edits`` on ``word/document.xml`` (each only if the
     part exists in the input), and writes a new archive in which every
     other member is copied unchanged.

     Exits with a usage message when the argument count is wrong, and
     refuses to run when input and output are the same file.
     """
     import os

     if len(sys.argv) != 4:
         # The module docstring doubles as the usage text; its ``{}``
         # placeholder is filled with the program name.
         usage = __doc__ or "Usage: {} AUTHOR_SUBSTRING INPUT.DOCX OUTPUT.DOCX"
         sys.exit(usage.format(sys.argv[0]))
     author_substr, in_fname, out_fname = sys.argv[1:]

     # Opening the output for writing truncates it, which would destroy the
     # input before it has been copied if both names point at the same file.
     if os.path.realpath(in_fname) == os.path.realpath(out_fname):
         sys.exit("INPUT.DOCX and OUTPUT.DOCX must be different files")

     # archive member -> function that produces its modified XML
     modifiers = {
         "word/comments.xml": modify_comments,
         "word/document.xml": modify_tracked_edits,
     }

     with zipfile.ZipFile(in_fname) as docx_in:
         # A document without comments has no comments part at all, so
         # only process the parts that are actually present.
         present = set(docx_in.namelist())
         replacements = {
             path: modify(docx_in.read(path), author_substr)
             for path, modify in modifiers.items()
             if path in present
         }

         # All parsing is done above, so a malformed part fails before the
         # output file is created.
         with zipfile.ZipFile(out_fname, "w") as docx_out:
             # NOTE: this is the ZIP archive comment, NOT Word comments
             docx_out.comment = docx_in.comment
             for item in docx_in.infolist():
                 data = replacements.get(item.filename)
                 if data is None:
                     data = docx_in.read(item)
                 # Writing through the original ZipInfo keeps the member's
                 # position and compression method; adding the rewritten
                 # parts by name would store them uncompressed.
                 docx_out.writestr(item, data)


 if __name__ == "__main__":
     # Run the command-line entry point only when this file is executed
     # as a script, not when it is imported as a module.
     main()
	#!/usr/bin/env python3
	"""Usage: {} AUTHOR_SUBSTRING INPUT.DOCX OUTPUT.DOCX

	Remove date metadata from Word document for authors matching
	AUTHOR_SUBSTRING. Handy if you don't want other people to know when
	exactly you found time to work on their document ;)

	In more detail: Read INPUT.DOCX, extract the comments and tracked edits,
	manipulate them (cf. functions `modify_comments()` and
	`modify_tracked_edits()` -- by default, they remove date metadata
	when the author contains AUTHOR_SUBSTRING, but you can tweak them
	depending on your needs), then create OUTPUT.DOCX with the modified
	comments and tracked edits.

	Note that OUTPUT.DOCX tends to be larger than the input file for some
	reason. To get back to a reasonable size, just re-save OUTPUT.DOCX from
	Word.

	"""

	import sys
	import zipfile
	from itertools import chain

	from lxml import etree

	W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
	NSMAP = {"w": W_NS}
	W = "{" + W_NS + "}"


	def modify_comments(comments, author_substr):
	    """Strip the date from comments written by matching authors.

	    Parses the serialized comments part, visits every ``w:comment``
	    element and removes its ``w:date`` attribute when its ``w:author``
	    attribute contains ``author_substr``.

	    Args:
	        comments: Serialized XML of the comments part, as accepted by
	            ``etree.fromstring``.
	        author_substr: Substring to look for in each comment's author.

	    Returns:
	        The modified XML as UTF-8 encoded bytes, with an XML declaration.
	    """
	    et = etree.fromstring(comments)
	    for c in et.xpath("//w:comment", namespaces=NSMAP):
	        # Never index the attributes directly: ``w:date`` may be absent
	        # (e.g. the file was already cleaned once), and a comment with
	        # no author should simply not match instead of raising KeyError.
	        if author_substr in c.get(W + "author", ""):
	            c.attrib.pop(W + "date", None)
	        # NOTE: basically do whatever you want here with the comment
	        # based on its XML attributes and content, e.g.:
	        # author = c.get(W + "author")
	        # date = c.get(W + "date")
	        # print(f"{author} commented on {date}:")
	        # print(c.xpath("string(.)"))
	        # NOTE: if you want to completely delete the comment, you can't
	        # just remove the element here, because there's also a comment
	        # marker within the document itself, so that's more of a hassle
	    # Serialize as UTF-8 with a declaration; the default is ASCII without
	    # one, which turns every non-ASCII character into a numeric reference.
	    return etree.tostring(
	        et, xml_declaration=True, encoding="UTF-8", standalone=True
	    )


	def modify_tracked_edits(document, author_substr):
	    """Strip the date from tracked insertions/deletions by matching authors.

	    Parses the serialized main document part, visits every ``w:ins`` and
	    ``w:del`` element and removes its ``w:date`` attribute when its
	    ``w:author`` attribute contains ``author_substr``.

	    Args:
	        document: Serialized XML of the document part, as accepted by
	            ``etree.fromstring``.
	        author_substr: Substring to look for in each edit's author.

	    Returns:
	        The modified XML as UTF-8 encoded bytes, with an XML declaration.
	    """
	    et = etree.fromstring(document)
	    # One union query covers both kinds of tracked edit.
	    for te in et.xpath("//w:ins | //w:del", namespaces=NSMAP):
	        # Never index the attributes directly: ``w:date`` may be absent
	        # (e.g. the file was already cleaned once), and an edit with no
	        # author should simply not match instead of raising KeyError.
	        if author_substr in te.get(W + "author", ""):
	            te.attrib.pop(W + "date", None)
	    # Serialize as UTF-8 with a declaration; the default is ASCII without
	    # one, which turns every non-ASCII character into a numeric reference.
	    return etree.tostring(
	        et, xml_declaration=True, encoding="UTF-8", standalone=True
	    )


	def main():
	    """Command-line entry point: rewrite INPUT.DOCX into OUTPUT.DOCX.

	    Reads ``AUTHOR_SUBSTRING INPUT.DOCX OUTPUT.DOCX`` from ``sys.argv``,
	    runs ``modify_comments`` on ``word/comments.xml`` and
	    ``modify_tracked_edits`` on ``word/document.xml`` (each only if the
	    part exists in the input), and writes a new archive in which every
	    other member is copied unchanged.

	    Exits with a usage message when the argument count is wrong, and
	    refuses to run when input and output are the same file.
	    """
	    import os

	    if len(sys.argv) != 4:
	        # The module docstring doubles as the usage text; its ``{}``
	        # placeholder is filled with the program name.
	        usage = __doc__ or "Usage: {} AUTHOR_SUBSTRING INPUT.DOCX OUTPUT.DOCX"
	        sys.exit(usage.format(sys.argv[0]))
	    author_substr, in_fname, out_fname = sys.argv[1:]

	    # Opening the output for writing truncates it, which would destroy the
	    # input before it has been copied if both names point at the same file.
	    if os.path.realpath(in_fname) == os.path.realpath(out_fname):
	        sys.exit("INPUT.DOCX and OUTPUT.DOCX must be different files")

	    # archive member -> function that produces its modified XML
	    modifiers = {
	        "word/comments.xml": modify_comments,
	        "word/document.xml": modify_tracked_edits,
	    }

	    with zipfile.ZipFile(in_fname) as docx_in:
	        # A document without comments has no comments part at all, so
	        # only process the parts that are actually present.
	        present = set(docx_in.namelist())
	        replacements = {
	            path: modify(docx_in.read(path), author_substr)
	            for path, modify in modifiers.items()
	            if path in present
	        }

	        # All parsing is done above, so a malformed part fails before the
	        # output file is created.
	        with zipfile.ZipFile(out_fname, "w") as docx_out:
	            # NOTE: this is the ZIP archive comment, NOT Word comments
	            docx_out.comment = docx_in.comment
	            for item in docx_in.infolist():
	                data = replacements.get(item.filename)
	                if data is None:
	                    data = docx_in.read(item)
	                # Writing through the original ZipInfo keeps the member's
	                # position and compression method; adding the rewritten
	                # parts by name would store them uncompressed.
	                docx_out.writestr(item, data)


	if __name__ == "__main__":
	    # Run the command-line entry point only when this file is executed
	    # as a script, not when it is imported as a module.
	    main()