Created
December 11, 2019 08:34
-
-
Save longhotsummer/08d1d2e9eb5494638986cc288ab91157 to your computer and use it in GitHub Desktop.
Tests xmldiff on some basic AKN HTML
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import lxml.etree
import lxml.html
from xmldiff import main, formatting
# XSLT stylesheet applied to the annotated tree produced by xmldiff's XML
# formatter (elements/attributes in the ``diff:`` namespace):
#
# * ``@diff:insert-formatting`` -> ``class="ins"``
# * ``@diff:insert``            -> ``classx="ins <existing class>"``
# * ``@diff:delete``            -> ``classx="del <existing class>"``
# * ``<diff:delete>``           -> ``<del>`` wrapping the processed children
# * ``<diff:insert>``           -> ``<ins>`` wrapping the processed children
# * everything else is copied through unchanged (identity template).
#
# The merged class value is written to a temporary ``classx`` attribute
# because the element's own ``class`` attribute is also copied through by the
# identity template.
#
# NOTE: the XML declaration deliberately carries no ``encoding``; lxml refuses
# to parse text (unicode) input whose declaration names an encoding.
XSLT = u'''<?xml version="1.0"?>
<xsl:stylesheet version="1.0"
    xmlns:diff="http://namespaces.shoobx.com/diff"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:template match="@diff:insert-formatting">
        <xsl:attribute name="class">
            <xsl:value-of select="'ins'"/>
        </xsl:attribute>
    </xsl:template>
    <xsl:template match="@diff:insert">
        <xsl:attribute name="classx">
            <xsl:value-of select="'ins '"/>
            <xsl:value-of select="../@class"/>
        </xsl:attribute>
    </xsl:template>
    <xsl:template match="@diff:delete">
        <xsl:attribute name="classx">
            <xsl:value-of select="'del '"/>
            <xsl:value-of select="../@class"/>
        </xsl:attribute>
    </xsl:template>
    <xsl:template match="diff:delete">
        <del><xsl:apply-templates /></del>
    </xsl:template>
    <xsl:template match="diff:insert">
        <ins><xsl:apply-templates /></ins>
    </xsl:template>
    <xsl:template match="@* | node()">
        <xsl:copy>
            <xsl:apply-templates select="@* | node()"/>
        </xsl:copy>
    </xsl:template>
</xsl:stylesheet>'''

# Parsed stylesheet document; compiled into an ``lxml.etree.XSLT`` transform
# where it is used.
XSLT_TEMPLATE = lxml.etree.fromstring(XSLT)
class HTMLFormatter(formatting.XMLFormatter):
    """XML formatter whose output is post-processed into HTML-style markup.

    The tree handed to :meth:`render` is run through ``XSLT_TEMPLATE`` so
    that the ``diff:`` annotations become ``<ins>``/``<del>`` elements and
    ``ins``/``del`` CSS classes.
    """

    def render(self, result):
        """Transform ``result`` with the stylesheet and delegate rendering.

        :param result: the annotated diff tree to transform.
        :return: whatever ``XMLFormatter.render`` returns for the
            transformed tree (presumably the serialized markup -- confirm
            against the xmldiff documentation).
        """
        transform = lxml.etree.XSLT(XSLT_TEMPLATE)
        result = transform(result)

        # The stylesheet writes the combined "ins"/"del" + original class
        # value to a temporary ``classx`` attribute; move it onto ``class``,
        # replacing the copied-through original value.
        for node in result.xpath('//*[@classx]'):
            # strip(): when the element had no class of its own the
            # stylesheet yields "ins " / "del " with a dangling space.
            node.set('class', node.attrib.pop('classx').strip())

        return super(HTMLFormatter, self).render(result)
# "Before" version of the sample AKN HTML: section 6 with list items (a), (b)
# and (c).
old_s = '''
<section class="akn-section" id="section-6" data-id="section-6"><h3>6. Power to control production, sale, etc., of drugs to which Part II applies</h3>
<section class="akn-paragraph akn--no-indent" id="section-6.paragraph0" data-id="section-6.paragraph0"><span class="akn-content"><span class="akn-blockList" id="section-6.paragraph0.list0" data-id="section-6.paragraph0.list0"><span class="akn-listIntroduction">The Minister may by regulations-</span><span class="akn-item" id="section-6.paragraph0.list0.a" data-id="section-6.paragraph0.list0.a"><span class="akn-num">(a)</span><span class="akn-p">provide for controlling or restricting the production, possession, sale and distribution of drugs to which this Part applies;</span></span><span class="akn-item" id="section-6.paragraph0.list0.b" data-id="section-6.paragraph0.list0.b"><span class="akn-num">(b)</span><span class="akn-p">provide for prohibiting the production, possession, sale or distribution of any drug to which this Part applies except by persons licensed or otherwise authorised in that behalf by the Minister, and the cultivation of plants from which such drugs are derived;</span></span><span class="akn-item" id="section-6.paragraph0.list0.c" data-id="section-6.paragraph0.list0.c"><span class="akn-num">(c)</span><span class="akn-p">prescribe measures to be taken for the eradication of plants, to which regulations made under paragraph (b) apply, found to be growing wild.</span></span></span></span></section></section>
'''

# "After" version: item (b) is renumbered (aa), item (c) gains the word
# "really", and a trailing "and some closeout." paragraph is appended.
new_s = '''
<section class="akn-section" id="section-6" data-id="section-6"><h3>6. Power to control production, sale, etc., of drugs to which Part II applies</h3>
<section class="akn-paragraph akn--no-indent" id="section-6.paragraph0" data-id="section-6.paragraph0"><span class="akn-content"><span class="akn-blockList" id="section-6.paragraph0.list0" data-id="section-6.paragraph0.list0"><span class="akn-listIntroduction">The Minister may by regulations-</span><span class="akn-item" id="section-6.paragraph0.list0.a" data-id="section-6.paragraph0.list0.a"><span class="akn-num">(a)</span><span class="akn-p">provide for controlling or restricting the production, possession, sale and distribution of drugs to which this Part applies;</span></span><span class="akn-item" id="section-6.paragraph0.list0.aa" data-id="section-6.paragraph0.list0.aa"><span class="akn-num">(aa)</span><span class="akn-p">provide for prohibiting the production, possession, sale or distribution of any drug to which this Part applies except by persons licensed or otherwise authorised in that behalf by the Minister, and the cultivation of plants from which such drugs are derived;</span></span><span class="akn-item" id="section-6.paragraph0.list0.c" data-id="section-6.paragraph0.list0.c"><span class="akn-num">(c)</span><span class="akn-p">prescribe measures to be really taken for the eradication of plants, to which regulations made under paragraph (b) apply, found to be growing wild.</span></span></span><span class="akn-p">and some closeout.</span></span></section></section>
'''

old_html = lxml.etree.fromstring(old_s)
new_html = lxml.etree.fromstring(new_s)

# Remove the identifier attributes from both trees before diffing
# (presumably so renumbered ids do not show up as attribute changes).
# pop(..., None) tolerates elements that carry only one of the two
# attributes, and the XPath also catches elements with only ``data-id``.
for html in [old_html, new_html]:
    for node in html.xpath('//*[@id or @data-id]'):
        node.attrib.pop('id', None)
        node.attrib.pop('data-id', None)

# Diff the two trees and render the result through the HTML formatter.
formatter = HTMLFormatter(normalize=formatting.WS_NONE, pretty_print=True)
diff = main.diff_trees(old_html, new_html, formatter=formatter, diff_options={
    'F': 0.75,
    'uniqueattrs': [],
    'ratio_mode': 'fast',
})
print(diff)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment