Skip to content

Instantly share code, notes, and snippets.

@r0yfire
Last active August 29, 2015 14:08
Show Gist options
  • Save r0yfire/ef7e78aa649e0c03fa9e to your computer and use it in GitHub Desktop.
Save r0yfire/ef7e78aa649e0c03fa9e to your computer and use it in GitHub Desktop.
Patch docx file with a tracking URL | www.docping.me
#!/usr/bin/env python
"""
Patch docx file with a tracking URL
Author: Roy Firestein (roy[at]firestein[dot]net)
Date: October 28, 2014
Based on code from https://docping.me
"""
from os import path
import os
from tempfile import mkstemp, mkdtemp
from xml.dom.minidom import parse, parseString
import zipfile
import logging
from lxml import etree
def patch(docx_file, tracking_url):
    """Patch a .docx file in place so that it links to ``tracking_url``.

    The archive is unpacked to a temporary directory, an external image
    relationship targeting ``tracking_url`` is registered, a drawing that
    references that relationship is added to the document, and the result
    is zipped back over ``docx_file``.

    Args:
        docx_file: Path to the .docx file to patch (overwritten in place).
        tracking_url: URL used as the external image target.

    Returns:
        The absolute path of the patched file.

    Raises:
        IOError: If ``docx_file`` does not exist.
    """
    import shutil

    logging.info('Patching document: %s', docx_file)
    docx_file = path.abspath(docx_file)

    # Extract
    logging.info('Extracting docx')
    docx_path = extract(docx_file)
    if docx_path is None:
        # Raising a plain string is not a valid exception; use a real one.
        raise IOError("No such file: %s" % docx_file)

    original_cwd = os.getcwd()
    try:
        # Patch
        safe_id = get_safe_id(docx_path)
        rel = makeelement("Relationship", attributes={
            "Id": "rId%s" % safe_id,
            "Type": "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image",
            "TargetMode": "External",
            "Target": tracking_url,
        })
        update_relationships(docx_path, rel)
        update_document(docx_path, safe_id)

        # Compress
        logging.info('Compressing docx')
        return compress(docx_path, docx_file)
    finally:
        # Restore the working directory in case a helper changed it, so the
        # temporary tree can be removed without leaving the process inside
        # a deleted directory.
        os.chdir(original_cwd)
        shutil.rmtree(docx_path, ignore_errors=True)
def extract(docx_path):
    """Unpack the archive at ``docx_path`` into a fresh temporary directory.

    Returns:
        The path of the temporary directory holding the extracted files,
        or ``None`` when ``docx_path`` does not exist.
    """
    if path.exists(docx_path):
        workdir = mkdtemp()
        logging.info('Creating temp dir: %s', workdir)
        with zipfile.ZipFile(docx_path) as archive:
            archive.extractall(path.abspath(workdir))
        return workdir
    return None
def compress(docx_path, docx_file):
    """Zip the directory tree at ``docx_path`` into the archive ``docx_file``.

    Every file below ``docx_path`` is stored under its path relative to
    ``docx_path``. The process working directory is left untouched.

    Args:
        docx_path: Directory containing the extracted document parts.
        docx_file: Path of the archive to create (overwritten if present).

    Returns:
        ``docx_file``, unchanged.

    Raises:
        AssertionError: If ``docx_path`` is not an existing directory.
    """
    if not path.isdir(docx_path):
        # Explicit raise (rather than ``assert``) so the check survives
        # ``python -O``; AssertionError is kept for backward compatibility.
        raise AssertionError("Not a directory: %s" % docx_path)

    target = path.abspath(docx_file)
    # The context manager closes the archive even if a write fails.
    with zipfile.ZipFile(docx_file, mode='w',
                         compression=zipfile.ZIP_DEFLATED) as archive:
        for root, _dirs, files in os.walk(docx_path):
            for name in files:
                full_path = path.join(root, name)
                if path.abspath(full_path) == target:
                    # Never add the archive being written to itself.
                    continue
                # Use an explicit archive name instead of chdir()-ing into
                # the tree, which would change global process state.
                archive.write(full_path, path.relpath(full_path, docx_path))
    return docx_file
def get_safe_id(extracted_path):
    """Return a numeric relationship id not yet used by the document.

    Reads ``word/_rels/document.xml.rels`` below ``extracted_path`` and
    returns one more than the highest ``N`` among ``Id`` attributes of the
    form ``rIdN``. Ids whose remainder is not an integer are ignored, since
    they cannot clash with a generated ``rIdN``. When no numeric id exists,
    ``1`` is returned.

    Args:
        extracted_path: Root directory of the extracted .docx archive.

    Returns:
        An integer suitable for building a new ``"rId%s"`` identifier.
    """
    dom = parse(path.join(extracted_path, 'word', '_rels', 'document.xml.rels'))
    highest = 0
    for rel in dom.getElementsByTagName('Relationship'):
        raw = rel.getAttribute('Id').replace("rId", "")
        try:
            number = int(raw)
        except ValueError:
            # Relationship ids are arbitrary strings; skip non-numeric ones.
            continue
        highest = max(highest, number)
    safe_id = highest + 1
    logging.info('Safe ID: %s', safe_id)
    return safe_id
def update_relationships(extracted_path, new_rel):
    """Append ``new_rel`` to the document's relationship part and save it.

    The file ``word/_rels/document.xml.rels`` below ``extracted_path`` is
    parsed, ``new_rel`` is appended to its root element, and the file is
    rewritten with a UTF-8, standalone XML declaration.

    Args:
        extracted_path: Root directory of the extracted .docx archive.
        new_rel: Element to append to the relationships root.

    Returns:
        ``True`` once the file has been rewritten.
    """
    file_path = path.join(extracted_path, 'word', '_rels', 'document.xml.rels')

    # Read as bytes so the parser honours the file's own XML declaration.
    with open(file_path, 'rb') as fh:
        document = etree.fromstring(fh.read())
    document.append(new_rel)

    # Serialise fully before reopening the file for writing, so that a
    # serialisation error cannot leave a truncated file behind. Letting
    # the serialiser emit the declaration yields bytes on both Python 2
    # and 3 (formatting bytes with '%s' would write a b'...' repr on 3).
    xml_bytes = etree.tostring(
        document.getroottree(),
        xml_declaration=True,
        encoding='UTF-8',
        standalone=True,
        pretty_print=True,
    )
    with open(file_path, 'wb') as fh:
        fh.write(xml_bytes)
    return True
def update_document(extracted_path, safe_id):
    """Append the tracking drawing to the body of ``word/document.xml``.

    The drawing markup comes from ``make_tracking_code`` and references the
    relationship ``rId<safe_id>``; a random UUID is used as the picture name.

    Args:
        extracted_path: Root directory of the extracted .docx archive.
        safe_id: Numeric part of the relationship id the drawing links to.

    Returns:
        ``True`` once the file has been rewritten.

    Raises:
        IndexError: If the document has no ``/w:document/w:body`` element.
    """
    # Imported locally: this function is the only user of the module.
    import uuid

    file_path = path.join(extracted_path, 'word', 'document.xml')

    # Read as bytes so the parser honours the file's own XML declaration.
    with open(file_path, 'rb') as fh:
        document = etree.fromstring(fh.read())

    body = document.xpath('/w:document/w:body', namespaces=nsprefixes)[0]
    ctx = {"id": safe_id, "uuid": uuid.uuid4()}
    tracking_image = etree.fromstring(make_tracking_code(ctx))
    # NOTE(review): the drawing becomes the last direct child of w:body,
    # i.e. it is not wrapped in a paragraph/run -- confirm that the target
    # word processors accept this placement.
    body.append(tracking_image)

    # Serialise fully before reopening the file for writing, so that a
    # serialisation error cannot leave a truncated file behind. Letting
    # the serialiser emit the declaration yields bytes on both Python 2
    # and 3 (formatting bytes with '%s' would write a b'...' repr on 3).
    xml_bytes = etree.tostring(
        document.getroottree(),
        xml_declaration=True,
        encoding='UTF-8',
        standalone=True,
        pretty_print=True,
    )
    with open(file_path, 'wb') as fh:
        fh.write(xml_bytes)
    return True
def makeelement(tagname, tagtext=None, nsprefix='w', attributes=None, attrnsprefix=None):
    """Build a new element named ``tagname`` and return it.

    Args:
        tagname: Tag name of the element, used verbatim.
        tagtext: Optional text content; only set when truthy.
        nsprefix: Accepted for interface compatibility; not used here.
        attributes: Optional mapping of attribute names to values, applied
            verbatim (no namespace is added to the names).
        attrnsprefix: Accepted for interface compatibility; not used here.

    Returns:
        The newly created element.
    """
    element = etree.Element(tagname)
    if attributes:
        for attr_name, attr_value in attributes.items():
            element.set(attr_name, attr_value)
    if tagtext:
        element.text = tagtext
    return element
def make_tracking_code(ctx):
    """Render the drawing markup for a 1x1 inline picture.

    Args:
        ctx: Mapping with ``'uuid'`` (inserted as the picture's ``name``)
            and ``'id'`` (inserted after ``rId`` in the blip's ``r:link``).

    Returns:
        The ``<w:drawing>`` XML fragment as a string.
    """
    # Two positional placeholders, in order: picture name, relationship id.
    template = """<w:drawing xmlns:wpc="http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas" xmlns:mc="http://schemas.openxmlformats.org/markup-compatibility/2006" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:wp14="http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:wpg="http://schemas.microsoft.com/office/word/2010/wordprocessingGroup" xmlns:wpi="http://schemas.microsoft.com/office/word/2010/wordprocessingInk" xmlns:wne="http://schemas.microsoft.com/office/word/2006/wordml" xmlns:wps="http://schemas.microsoft.com/office/word/2010/wordprocessingShape" mc:Ignorable="w14 wp14">
<wp:inline distT="0" distB="0" distL="0" distR="0">
<wp:extent cx="1" cy="1"/>
<wp:effectExtent l="0" t="0" r="0" b="0"/>
<wp:docPr id="4" name="gh6RuSZpJaptfEulBXmWORNwPEUbgy"/>
<wp:cNvGraphicFramePr>
<a:graphicFrameLocks xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" noChangeAspect="1"/>
</wp:cNvGraphicFramePr>
<a:graphic xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">
<a:graphicData uri="http://schemas.openxmlformats.org/drawingml/2006/picture">
<pic:pic xmlns:pic="http://schemas.openxmlformats.org/drawingml/2006/picture">
<pic:nvPicPr>
<pic:cNvPr id="0" name="%s"/>
<pic:cNvPicPr/>
</pic:nvPicPr>
<pic:blipFill>
<a:blip r:link="rId%s"/>
<a:stretch>
<a:fillRect/>
</a:stretch>
</pic:blipFill>
<pic:spPr>
<a:xfrm>
<a:off x="0" y="0"/>
<a:ext cx="1" cy="1"/>
</a:xfrm>
<a:prstGeom prst="rect">
<a:avLst/>
</a:prstGeom>
</pic:spPr>
</pic:pic>
</a:graphicData>
</a:graphic>
</wp:inline>
</w:drawing>"""
    picture_name = ctx['uuid']
    relationship_number = ctx['id']
    return template % (picture_name, relationship_number)
# Namespace prefix -> namespace URI map, passed as ``namespaces=`` to XPath
# queries in this file.
nsprefixes = {
    'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
    'o': 'urn:schemas-microsoft-com:office:office',
    've': 'http://schemas.openxmlformats.org/markup-compatibility/2006',

    # Text content
    'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
    'w10': 'urn:schemas-microsoft-com:office:word',
    'wne': 'http://schemas.microsoft.com/office/word/2006/wordml',

    # Drawing
    'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
    'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math',
    'mv': 'urn:schemas-microsoft-com:mac:vml',
    'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
    'v': 'urn:schemas-microsoft-com:vml',
    'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',

    # Properties (core and extended)
    'cp': 'http://schemas.openxmlformats.org/package/2006/metadata/core-properties',
    'dc': 'http://purl.org/dc/elements/1.1/',
    'ep': 'http://schemas.openxmlformats.org/officeDocument/2006/extended-properties',
    'xsi': 'http://www.w3.org/2001/XMLSchema-instance',

    # Content types
    'ct': 'http://schemas.openxmlformats.org/package/2006/content-types',

    # Package relationships
    'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
    'pr': 'http://schemas.openxmlformats.org/package/2006/relationships',

    # Dublin Core document properties
    'dcmitype': 'http://purl.org/dc/dcmitype/',
    'dcterms': 'http://purl.org/dc/terms/',
}
def main():
    """Command-line entry point.

    Expects exactly two positional arguments, ``<file.docx>`` and
    ``<tracking URL>``. With any other argument count the usage text is
    printed; otherwise the file is patched and its path is reported.
    """
    from optparse import OptionParser

    usage_text = "usage: %prog <file.docx> <tracking URL>"
    parser = OptionParser(usage_text)
    (_options, args) = parser.parse_args()
    if len(args) != 2:
        parser.print_usage()
        return

    res = patch(args[0], args[1])
    # The parenthesised single-argument form prints the same text on
    # Python 2 and Python 3; the bare print statement is Python 2 only.
    print("File patched (%s)" % res)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment