Last active
August 29, 2015 14:08
-
-
Save r0yfire/ef7e78aa649e0c03fa9e to your computer and use it in GitHub Desktop.
Patch docx file with a tracking URL | www.docping.me
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Patch docx file with a tracking URL | |
Author: Roy Firestein (roy[at]firestein[dot]net) | |
Date: October 28, 2014 | |
Based on code from https://docping.me | |
""" | |
from os import path | |
import os | |
from tempfile import mkstemp, mkdtemp | |
from xml.dom.minidom import parse, parseString | |
import zipfile | |
import logging | |
from lxml import etree | |
def patch(docx_file, tracking_url):
    """Embed an externally linked tracking image in a docx file, in place.

    The archive is unpacked into a temporary directory, a new external image
    relationship targeting ``tracking_url`` is registered, a drawing that
    references that relationship is appended to the document body, and the
    directory is zipped back over ``docx_file``.

    The temporary extraction directory is not removed by this function.

    Args:
        docx_file: Path to the .docx file to patch; the file is overwritten.
        tracking_url: URL stored as the target of the external image.

    Returns:
        The absolute path of the patched file.

    Raises:
        IOError: If ``docx_file`` does not exist.
    """
    logging.info('Patching document: %s' % docx_file)
    docx_file = path.abspath(docx_file)

    # Extract
    logging.info('Extracting docx')
    docx_path = extract(docx_file)
    if docx_path is None:
        # A string cannot be raised (doing so is itself a TypeError), so
        # report the missing file with a real exception.
        raise IOError("No such file: %s" % docx_file)

    # Patch: the relationship id and the drawing must use the same number.
    safe_id = get_safe_id(docx_path)
    rel = makeelement("Relationship", attributes={
        "Id": "rId%s" % safe_id,
        "Type": "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image",
        "TargetMode": "External",
        "Target": tracking_url,
    })
    update_relationships(docx_path, rel)
    update_document(docx_path, safe_id)

    # Compress
    logging.info('Compressing docx')
    docx_path = compress(docx_path, docx_file)
    return docx_path
def extract(docx_path):
    """Unpack a docx archive into a freshly created temporary directory.

    Args:
        docx_path: Path of the archive to unpack.

    Returns:
        The path of the temporary directory holding the extracted files, or
        None when ``docx_path`` does not exist.
    """
    if path.exists(docx_path):
        workdir = mkdtemp()
        logging.info('Creating temp dir: %s' % workdir)
        target = path.abspath(workdir)
        with zipfile.ZipFile(docx_path) as archive:
            archive.extractall(target)
        return workdir
    return None
def compress(docx_path, docx_file):
    """Zip the contents of an extracted docx directory into ``docx_file``.

    Every file below ``docx_path`` is stored under its path relative to
    ``docx_path``. The process working directory is left untouched, so a
    relative ``docx_file`` is resolved against the caller's working
    directory.

    Args:
        docx_path: Directory holding the extracted docx contents.
        docx_file: Path of the archive to create; overwritten if present.

    Returns:
        ``docx_file``, unchanged.

    Raises:
        AssertionError: If ``docx_path`` is not a directory.
    """
    assert path.isdir(docx_path), "Not a directory: %s" % docx_path
    # The context manager closes the archive even if a write fails.
    with zipfile.ZipFile(docx_file, mode='w',
                         compression=zipfile.ZIP_DEFLATED) as archive:
        for root, _dirs, files in os.walk(docx_path):
            for name in files:
                full_path = path.join(root, name)
                # Explicit archive names replace the need to chdir into the
                # directory before walking it.
                archive.write(full_path, path.relpath(full_path, docx_path))
    return docx_file
def get_safe_id(extracted_path):
    """Return a relationship number not yet used by the document.

    Reads ``word/_rels/document.xml.rels`` below ``extracted_path`` and looks
    at the ``Id`` attribute of every ``Relationship`` element. Only ids of
    the form ``rId<number>`` are considered, because the caller builds the
    new id as ``"rId%s" % number``; ids with any other shape cannot collide
    with it and are skipped.

    Args:
        extracted_path: Directory holding the extracted docx contents.

    Returns:
        One more than the highest ``rId`` number found, or 1 when the file
        contains no numbered relationship.
    """
    rels_path = path.join(extracted_path, 'word', '_rels', 'document.xml.rels')
    dom = parse(rels_path)
    used = []
    for rel in dom.getElementsByTagName('Relationship'):
        rel_id = rel.getAttribute('Id')
        if not rel_id.startswith('rId'):
            continue
        try:
            used.append(int(rel_id[len('rId'):]))
        except ValueError:
            # Non-numeric suffix such as "rIdLogo": cannot clash with ours.
            continue
    safe_id = max(used) + 1 if used else 1
    logging.info('Safe ID: %s' % safe_id)
    return safe_id
def update_relationships(extracted_path, new_rel):
    """Append a relationship element to the document's relationships file.

    Rewrites ``word/_rels/document.xml.rels`` below ``extracted_path`` with
    ``new_rel`` added as the last child of the root element.

    Args:
        extracted_path: Directory holding the extracted docx contents.
        new_rel: lxml element to append to the relationships root.

    Returns:
        True once the file has been rewritten.
    """
    file_path = path.join(extracted_path, 'word', '_rels', 'document.xml.rels')
    # Read as bytes: lxml rejects unicode input that carries an encoding
    # declaration, and the parser handles the file's encoding itself.
    with open(file_path, 'rb') as fh:
        document = etree.fromstring(fh.read())
    document.append(new_rel)
    # Serialize to bytes without lxml's own declaration so the header written
    # below is the only one in the file.
    treestring = etree.tostring(document.getroottree(), pretty_print=True,
                                encoding='UTF-8', xml_declaration=False)
    with open(file_path, 'wb') as fh:
        fh.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n')
        fh.write(treestring)
    return True
def update_document(extracted_path, safe_id):
    """Append the tracking drawing to the body of ``word/document.xml``.

    The drawing is produced by ``make_tracking_code`` with ``safe_id`` as the
    relationship number and a random UUID as the picture name, then added as
    the last child of the ``w:body`` element and written back to disk.

    Args:
        extracted_path: Directory holding the extracted docx contents.
        safe_id: Relationship number the drawing links to (``rId<safe_id>``).

    Returns:
        True once the file has been rewritten.

    Raises:
        IndexError: If the document has no ``/w:document/w:body`` element.
    """
    # Imported here because the module-level imports do not provide uuid.
    import uuid

    file_path = path.join(extracted_path, 'word', 'document.xml')
    # Read as bytes: lxml rejects unicode input that carries an encoding
    # declaration, and the parser handles the file's encoding itself.
    with open(file_path, 'rb') as fh:
        document = etree.fromstring(fh.read())
    body = document.xpath('/w:document/w:body', namespaces=nsprefixes)[0]
    ctx = {"id": safe_id, "uuid": uuid.uuid4()}
    tracking_image = etree.fromstring(make_tracking_code(ctx))
    # NOTE(review): the drawing becomes the last direct child of w:body;
    # whether Word accepts a w:drawing in that position is not verified
    # here - confirm against real documents.
    body.append(tracking_image)
    # Serialize to bytes without lxml's own declaration so the header written
    # below is the only one in the file.
    treestring = etree.tostring(document.getroottree(), pretty_print=True,
                                encoding='UTF-8', xml_declaration=False)
    with open(file_path, 'wb') as fh:
        fh.write(b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>\n')
        fh.write(treestring)
    return True
def makeelement(tagname, tagtext=None, nsprefix='w', attributes=None, attrnsprefix=None):
    """Build and return a new element named ``tagname``.

    Args:
        tagname: Tag of the element; used as given, without a namespace.
        tagtext: Text content for the element; skipped when empty or None.
        nsprefix: Accepted but not used by this function.
        attributes: Optional mapping of attribute names to values, set on
            the element as given.
        attrnsprefix: Accepted but not used by this function.

    Returns:
        The newly created element.
    """
    element = etree.Element(tagname)
    if attributes:
        for name, value in attributes.items():
            element.set(name, value)
    if tagtext:
        element.text = tagtext
    return element
def make_tracking_code(ctx):
    """Render the ``w:drawing`` XML for a 1x1 externally linked picture.

    Args:
        ctx: Mapping with two entries: ``'uuid'`` becomes the ``name`` of the
            ``pic:cNvPr`` element and ``'id'`` is appended to ``rId`` to form
            the ``r:link`` relationship reference of the ``a:blip`` element.

    Returns:
        The drawing markup as a string.
    """
    template = """<w:drawing xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 wp14">
<wp:inline distT="0" distB="0" distL="0" distR="0">
<wp:extent cx="1" cy="1"/>
<wp:effectExtent l="0" t="0" r="0" b="0"/>
<wp:docPr id="4" name="gh6RuSZpJaptfEulBXmWORNwPEUbgy"/>
<wp:cNvGraphicFramePr>
<a:graphicFrameLocks xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" noChangeAspect="1"/>
</wp:cNvGraphicFramePr>
<a:graphic xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">
<a:graphicData uri="http://schemas.openxmlformats.org/drawingml/2006/picture">
<pic:pic xmlns:pic="http://schemas.openxmlformats.org/drawingml/2006/picture">
<pic:nvPicPr>
<pic:cNvPr id="0" name="%(picture_name)s"/>
<pic:cNvPicPr/>
</pic:nvPicPr>
<pic:blipFill>
<a:blip r:link="rId%(link_number)s"/>
<a:stretch>
<a:fillRect/>
</a:stretch>
</pic:blipFill>
<pic:spPr>
<a:xfrm>
<a:off x="0" y="0"/>
<a:ext cx="1" cy="1"/>
</a:xfrm>
<a:prstGeom prst="rect">
<a:avLst/>
</a:prstGeom>
</pic:spPr>
</pic:pic>
</a:graphicData>
</a:graphic>
</wp:inline>
</w:drawing>"""
    # Look the values up in the same order as the placeholders appear.
    picture_name = ctx['uuid']
    link_number = ctx['id']
    return template % {
        "picture_name": picture_name,
        "link_number": link_number,
    }
# Namespace prefix -> namespace URI, used to resolve prefixes in XPath
# queries against the document parts.
nsprefixes = {
    'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
    'o': 'urn:schemas-microsoft-com:office:office',
    've': 'http://schemas.openxmlformats.org/markup-compatibility/2006',

    # Text content
    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
    'w10': 'urn:schemas-microsoft-com:office:word',
    'wne': 'http://schemas.microsoft.com/office/word/2006/wordml',

    # Drawing
    'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
    'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math',
    'mv': 'urn:schemas-microsoft-com:mac:vml',
    'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
    'v': 'urn:schemas-microsoft-com:vml',
    'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',

    # Properties (core and extended)
    'cp': 'http://schemas.openxmlformats.org/package/2006/metadata/core-properties',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'ep': 'http://schemas.openxmlformats.org/officeDocument/2006/extended-properties',
    'xsi': 'http://www.w3.org/2001/XMLSchema-instance',

    # Content types
    'ct': 'http://schemas.openxmlformats.org/package/2006/content-types',

    # Package relationships
    'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
    'pr': 'http://schemas.openxmlformats.org/package/2006/relationships',

    # Dublin Core document properties
    'dcmitype': 'http://purl.org/dc/dcmitype/',
    'dcterms': 'http://purl.org/dc/terms/',
}
def main():
    """Command-line entry point: patch a docx file with a tracking URL.

    Expects exactly two positional arguments, the docx file and the tracking
    URL. With any other argument count the usage text is printed and nothing
    is patched.
    """
    from optparse import OptionParser

    usage_text = "usage: %prog <file.docx> <tracking URL>"
    parser = OptionParser(usage_text)
    (_options, args) = parser.parse_args()
    if len(args) != 2:
        parser.print_usage()
    else:
        res = patch(args[0], args[1])
        # Single-argument call form: valid on both Python 2 and Python 3.
        print("File patched (%s)" % res)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment