Created
April 20, 2015 10:31
-
-
Save bluec0re/eba991ba4fea8ef5a3e6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import absolute_import, print_function, unicode_literals | |
import jinja2 | |
from lxml import etree, html | |
from lxml.html import clean | |
import logging | |
import sys | |
import zipfile | |
from preprocess import preprocess | |
import json | |
from cgi import escape | |
__author__ = 'bluec0re' | |
log = logging.getLogger(__name__) | |
def transform_html(root, init=False, default_style=None):
    """Convert an HTML element (and its subtree) to WordprocessingML markup.

    The returned fragment is not a balanced document: whenever a new run
    or paragraph is started, the fragment first emits the closing tags of
    the previous ``<w:t>``/``<w:r>`` (and ``<w:p>``), and the last run it
    opens is left unclosed.  It is therefore presumably meant to be
    inserted inside an already open ``<w:p><w:r><w:t>`` of a template.

    :param root: element to convert.  It must provide ``tag``, ``attrib``,
        ``text``, ``tail`` and iteration over its children;
        ``getparent()`` is only called for ``li`` elements.
    :param init: ``True`` for the outermost call.  The outermost element
        does not start a new run/paragraph on its own (only a style, bold
        or italic forces one).
    :param default_style: paragraph style used when the element has no
        ``class`` attribute; recursive calls pass the parent's style.
    :return: the generated markup as a string.
    """
    inline_css = root.attrib.get('style', '')
    bold = root.tag == 'strong' or 'bold' in inline_css
    italic = root.tag == 'i' or 'italic' in inline_css

    # List items take the style from the class of the enclosing list,
    # preformatted blocks always use the 'poc' style.
    if root.tag == 'li':
        default_style = root.getparent().get('class', 'ListParagraph')
    if root.tag == 'pre':
        default_style = 'poc'
    style = root.attrib.get('class', default_style)

    block_tags = ('p', 'h1', 'h2', 'h3', 'h4', 'li', 'pre', 'br')
    # Any styled element starts a paragraph, even the outermost one.
    new_paragraph = (root.tag in block_tags and not init) or style
    new_r = not init or new_paragraph or bold or italic

    parts = []
    if new_r:
        # Close the run that is currently open in the surrounding markup.
        parts.append('</w:t></w:r>')
    if new_paragraph:
        parts.append('</w:p><w:p><w:pPr>')
        if style:
            # The value lands inside a double-quoted attribute, so quotes
            # have to be escaped in addition to &, < and >.
            safe_style = escape(style).replace('"', '&quot;')
            parts.append('<w:pStyle w:val="%s"/>' % safe_style)
        if root.tag == 'li':
            parts.append('<w:numPr><w:ilvl w:val="0"/>'
                         '<w:numId w:val="5"/></w:numPr>')
        # Empty paragraph-mark run properties; carries no formatting.
        parts.append('<w:rPr></w:rPr></w:pPr>')
    if new_r:
        parts.append('<w:r><w:rPr>')
        if bold:
            parts.append('<w:b />')
        if italic:
            parts.append('<w:i />')
        parts.append('</w:rPr><w:t>')

    if root.text is not None:
        parts.append(escape(root.text).strip())
    for child in root:
        parts.append(transform_html(child, default_style=style))
    # NOTE: the tail is written into whichever run is open at this point,
    # i.e. the run started for this element (or one of its descendants).
    if root.tail is not None:
        parts.append(escape(root.tail).strip())
    return ''.join(parts)
try:
    _STRING_TYPES = (str, unicode)  # noqa: F821 - Python 2 text and bytes
except NameError:
    _STRING_TYPES = (str,)  # Python 3 has no separate ``unicode`` type


def preprocess_html(context):
    """Recursively convert every HTML string of a template context.

    Containers are walked recursively; every string leaf is cleaned with
    ``lxml.html.clean.Cleaner`` and then either converted with
    ``transform_html`` or stripped of its enclosing root tag.  Values of
    any other type are returned untouched.

    :param context: a dict, list, tuple, string or arbitrary other value.
    :return: the processed value.  A dict is updated **in place** and the
        same object is returned; lists and tuples are rebuilt as new
        objects of the same type.
    """
    if isinstance(context, dict):
        # Only values of existing keys are replaced, so iterating over the
        # dict while assigning is safe.
        for key, value in context.items():
            context[key] = preprocess_html(value)
        return context
    if isinstance(context, list):
        return [preprocess_html(item) for item in context]
    if isinstance(context, tuple):
        # A bare generator expression here would hand a one-shot generator
        # to the template instead of a tuple.
        return tuple(preprocess_html(item) for item in context)
    if not isinstance(context, _STRING_TYPES):
        return context

    # clean html first
    cleaner = clean.Cleaner()
    cleaner.safe_attrs_only = True
    cleaner.safe_attrs = ('style', 'class')
    cleaner.allow_tags = ('p', 'a', 'br', 'span', 'strong', 'h1', 'h2',
                          'h3', 'h4', 'i', 'ul', 'li', 'br', 'pre')
    cleaner.remove_unknown_tags = False
    h = html.fromstring(cleaner.clean_html(context))

    # transform to docx code; only direct children of the root are checked
    if any(h.find(tag) is not None for tag in ('p', 'span', 'strong', 'a')):
        return transform_html(h, True)

    # remove enclosing tag: cut "<tag>" from the front and "</tag>" from
    # the end purely by length.
    # NOTE(review): this assumes the root is serialized without attributes
    # and without tail text - confirm against the cleaner's output.
    roottag = h.tag
    value = etree.tostring(h)
    return value[len(roottag) + 2:-(len(roottag) + 3)]
def render(doc, context, debug=False):
    """Render a document template with jinja2 and drop control nodes.

    :param doc: template source, either as text or as an lxml element
        (which is serialized to text first).
    :param context: mapping of template variables; it is passed through
        ``preprocess_html`` before rendering and expanded as keyword
        arguments to ``Template.render``.
    :param debug: if true, the rendered XML is also written to
        ``templated.xml`` in the current working directory.
    :return: the final document serialized as UTF-8 encoded XML with an
        XML declaration.
    """
    if isinstance(doc, etree._Element):
        doc = etree.tostring(doc,
                             encoding='utf-8',
                             xml_declaration=True,
                             standalone=True).decode('utf-8')
    template = jinja2.Template(doc)
    context = preprocess_html(context)

    rendered = template.render(**context).encode('utf-8')
    if debug:
        # ``rendered`` is encoded bytes, so the dump must be opened in
        # binary mode.
        with open('templated.xml', 'wb') as fp:
            fp.write(rendered)
    doc = etree.XML(rendered)

    # cleanup control nodes
    for el in doc.xpath('//*[@is_control="true"]'):
        par = el.getparent()
        if par is None:
            # The root element itself cannot be removed from anything.
            continue
        par.remove(el)
        # Drop the parent as well once it no longer contains any text run.
        if par.find('w:r/w:t', par.nsmap) is None:
            grandparent = par.getparent()
            # ``par`` may already have been detached while handling an
            # earlier control node with the same parent.
            if grandparent is not None:
                grandparent.remove(par)

    return etree.tostring(doc,
                          encoding='utf-8',
                          xml_declaration=True,
                          standalone=True)
def main(preproc=True):
    """Command line entry point: template a docx file.

    Reads the archive named by ``sys.argv[1]``, renders its
    ``word/document.xml`` with the JSON context read from standard input,
    prints the rendered XML and writes a copy of the archive, prefixed
    with ``Processed_``, next to the input file.

    :param preproc: if true, ``word/document.xml`` is run through
        ``preprocess`` first; otherwise it is used as stored.
    """
    import os  # local import: only needed to build the output path

    source = sys.argv[1]
    with zipfile.ZipFile(source) as zipin:
        if preproc:
            doc = preprocess(zipin.open('word/document.xml'), debug=True)
        else:
            doc = zipin.read('word/document.xml').decode('utf-8')

        processed_doc = render(doc, json.load(sys.stdin))
        print(processed_doc)

        # Prefix the file name only, so an input such as "dir/a.docx"
        # becomes "dir/Processed_a.docx" rather than "Processed_dir/a.docx".
        target = os.path.join(os.path.dirname(source),
                              'Processed_' + os.path.basename(source))
        # The context manager guarantees the archive's central directory
        # is written even if copying a member fails.
        with zipfile.ZipFile(target, "w") as outzip:
            for fileinfo in zipin.infolist():
                if fileinfo.filename != 'word/document.xml':
                    outzip.writestr(fileinfo, zipin.read(fileinfo))
                else:
                    outzip.writestr('word/document.xml', processed_doc)
# Script entry point: configure logging at DEBUG level, then run main()
# with its default arguments.
if __name__ == '__main__':
    logging.basicConfig(level='DEBUG')
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment