Skip to content

Instantly share code, notes, and snippets.

@shellexy
Created March 26, 2012 06:43
Show Gist options
  • Save shellexy/2203495 to your computer and use it in GitHub Desktop.
Save shellexy/2203495 to your computer and use it in GitHub Desktop.
html 易读格式显示 html 版本间差异
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# vim:set shiftwidth=4 tabstop=4 expandtab textwidth=79:
from difflib import SequenceMatcher
import re
from StringIO import StringIO
import cgi
import base64
try: import i18n
except: from gettext import gettext as _
try:
    # Python 2 (and Python 3 before 3.8) still provide cgi.escape.
    from cgi import escape as _defaultEscape
except ImportError:
    # cgi.escape was removed in Python 3.8; html.escape is its replacement.
    # Note that html.escape additionally escapes the single quote.
    from html import escape as _defaultEscape


def htmlEncode(s, esc=_defaultEscape):
    """Return *s* with HTML special characters escaped.

    ``esc`` is the escaping callable; it is invoked as ``esc(s, 1)``, the
    second argument requesting that double quotes be escaped as well.
    """
    return esc(s, 1)
# A whole HTML comment; re.S lets it span several lines.
commentRE = re.compile(r'<!--.*?-->', re.S)
# One tag: from '<' up to the nearest following '>'.
tagRE = re.compile(r'<.*?>', re.S)
# The opening <head> tag, with or without attributes (but not e.g. <header>).
headRE = re.compile(r'<\s*head(?:\s[^>]*)?>', re.S | re.I)
# One CJK punctuation mark together with at most one space on either side.
# The full-width forms are spelled as escapes so they cannot be folded to
# their ASCII look-alikes; matching ASCII ',', '!', ';', ':', '(' or ')'
# would strip spaces from the generated markup (e.g. 'insert: <tt>').
# Marks, in order: full-width comma, ideographic full stop, full-width
# exclamation mark, double angle brackets, full-width semicolon and colon,
# curly single and double quotes, white corner brackets, full-width
# parentheses.
uniPunctuationRE = re.compile(
    u' ?([\uff0c\u3002\uff01\u300a\u300b\uff1b\uff1a'
    u'\u2018\u2019\u201c\u201d\u300e\u300f\uff08\uff09]) ?')
class HTMLMatcher(SequenceMatcher):
    """Diff two HTML sources token by token and render the diff as HTML.

    Both sources are split into tags and whitespace-separated words (see
    splitHTML); SequenceMatcher then compares the two token lists.
    Subclasses customise the markup by overriding the start*/end*/format*
    hooks or textInsert/textDelete.
    """

    def __init__(self, source1, source2):
        # isjunk=None: no token is treated as junk.
        SequenceMatcher.__init__(self, None, source1, source2)

    def set_seq1(self, a):
        """Tokenise *a* and install the token list as the first sequence."""
        SequenceMatcher.set_seq1(self, self.splitHTML(a))

    def set_seq2(self, b):
        """Tokenise *b* and install the token list as the second sequence."""
        SequenceMatcher.set_seq2(self, self.splitHTML(b))

    def _subPunctuation(self, repl, text):
        """Apply uniPunctuationRE.sub(repl, ...) to *text*, keeping its type.

        Byte strings are decoded from UTF-8 for the substitution and encoded
        back afterwards; text that is already unicode is substituted
        directly instead of being passed through a failing ``.decode``.
        """
        if isinstance(text, bytes):
            decoded = text.decode('utf8')
            return uniPunctuationRE.sub(repl, decoded).encode('utf8')
        return uniPunctuationRE.sub(repl, text)

    def splitTags(self, t):
        """Split *t* into alternating text and tag pieces.

        The list always starts and ends with a (possibly empty) text piece.
        """
        result = []
        pos = 0
        for match in tagRE.finditer(t):
            result.append(t[pos:match.start()])
            result.append(match.group(0))
            pos = match.end()
        result.append(t[pos:])
        return result

    def splitWords(self, t):
        """Split a text piece into whitespace-separated words."""
        # Surround the punctuation matched by uniPunctuationRE with spaces
        # so that Chinese text is compared sentence by sentence.
        padded = self._subPunctuation(u' \\1 ', t.strip())
        return padded.strip().split()

    def splitHTML(self, t):
        """Return the token list for *t*: whole tags and single words.

        HTML comments are removed before tokenising.
        """
        result = []
        for item in self.splitTags(commentRE.sub('', t)):
            if item.startswith('<'):
                result.append(item)
            else:
                result.extend(self.splitWords(item))
        return result

    def htmlDiff(self, addStylesheet=False):
        """Render the diff of the two sequences as an HTML string.

        When *addStylesheet* is true, the rules from stylesheet() are
        embedded in the result via the addStylesheet() method.
        """
        a = self.a
        b = self.b
        out = StringIO()
        try:
            for tag, i1, i2, j1, j2 in self.get_opcodes():
                if tag == 'equal':
                    for item in a[i1:i2]:
                        out.write(item)
                        out.write(' ')
                # A 'replace' is rendered as a deletion followed by an
                # insertion.
                if tag in ('delete', 'replace'):
                    self.textDelete(a[i1:i2], out)
                if tag in ('insert', 'replace'):
                    self.textInsert(b[j1:j2], out)
            html = out.getvalue()
        finally:
            out.close()
        if addStylesheet:
            html = self.addStylesheet(html, self.stylesheet())
        # Drop the spaces that tokenising put around Chinese punctuation.
        return self._subPunctuation(u'\\1', html)

    def textDelete(self, lst, out):
        """Write the deleted tokens *lst* to *out*.

        Runs of words are wrapped in startDeleteText()/endDeleteText().
        A deleted tag is not written itself; only formatDeleteTag(tag) is.
        """
        inSpan = False
        for item in lst:
            if item.startswith('<'):
                if inSpan:
                    out.write(self.endDeleteText())
                    inSpan = False
                out.write(self.formatDeleteTag(item))
            else:
                if not inSpan:
                    out.write(self.startDeleteText())
                    inSpan = True
                out.write(item)
                out.write(' ')
        if inSpan:
            out.write(self.endDeleteText())

    def textInsert(self, lst, out):
        """Write the inserted tokens *lst* to *out*.

        Runs of words are wrapped in startInsertText()/endInsertText().
        An inserted tag is written after its formatInsertTag(tag) marker.
        """
        inSpan = False
        for item in lst:
            if item.startswith('<'):
                if inSpan:
                    out.write(self.endInsertText())
                    inSpan = False
                out.write(self.formatInsertTag(item))
            elif not inSpan:
                out.write(self.startInsertText())
                inSpan = True
            out.write(item)
            out.write(' ')
        if inSpan:
            out.write(self.endInsertText())

    def stylesheet(self):
        """Return the CSS rules for the classes used in the diff markup."""
        return '''
.insert { background-color: #aaffaa }
.delete { background-color: #ff8888; text-decoration: line-through }
.tagInsert { background-color: #007700; color: #ffffff }
.tagDelete { background-color: #770000; color: #ffffff }
'''

    def addStylesheet(self, html, ss):
        """Return *html* with *ss* embedded in a <style> element.

        The element goes right after the first match of headRE, or at the
        very beginning of *html* when headRE does not match.
        """
        match = headRE.search(html)
        pos = match.end() if match else 0
        return ('%s<style type="text/css"><!--\n%s\n--></style>%s'
                % (html[:pos], ss, html[pos:]))

    def startInsertText(self):
        """Return the markup that opens a run of inserted words."""
        return '<span class="insert">'

    def endInsertText(self):
        """Return the markup that closes a run of inserted words."""
        return '</span> '

    def startDeleteText(self):
        """Return the markup that opens a run of deleted words."""
        return '<span class="delete">'

    def endDeleteText(self):
        """Return the markup that closes a run of deleted words."""
        return '</span> '

    def formatInsertTag(self, tag):
        """Return a visible marker showing the inserted *tag*, escaped."""
        return '<span class="tagInsert">insert: <tt>%s</tt></span> ' % htmlEncode(tag)

    def formatDeleteTag(self, tag):
        """Return a visible marker showing the deleted *tag*, escaped."""
        return '<span class="tagDelete">delete: <tt>%s</tt></span> ' % htmlEncode(tag)
class NoTagHTMLMatcher(HTMLMatcher):
    """HTMLMatcher variant whose tag markers are empty strings."""

    def formatInsertTag(self, tag):
        """Return no marker for an inserted tag."""
        return ''

    def formatDeleteTag(self, tag):
        """Return no marker for a deleted tag."""
        return ''
def addStyleToTag(tag, style):
    """Return *tag* with the CSS declarations in *style* added to it.

    If the tag has a double- or single-quoted ``style`` attribute, *style*
    is appended to its value, with trailing semicolons and spaces of the
    old value replaced by a single ``'; '`` separator (an empty value gets
    no separator). Otherwise a new ``style="..."`` attribute is inserted
    right after the tag name, which also covers tags without any
    attribute such as ``<img>`` or ``<img/>``.
    """
    def appendStyle(match):
        # A callable replacement keeps backslashes in `style` literal
        # instead of having them read as regex template escapes.
        existing = match.group(2)
        if existing:
            value = '%s; %s' % (existing, style)
        else:
            value = style
        return match.group(1) + value + match.group(3)

    # Double-quoted attributes are tried first, then single-quoted ones.
    for quote in ('"', "'"):
        pattern = '(style=%s)(.*?);* *?(%s)' % (quote, quote)
        if re.search(pattern, tag):
            return re.sub(pattern, appendStyle, tag)

    def insertStyle(match):
        return '%s style="%s"' % (match.group(1), style)

    # count=1: only the tag name itself gets the attribute, never a
    # '<word' that happens to occur inside an attribute value.
    return re.sub(r'(<\w+)', insertStyle, tag, count=1)
class TagHTMLMatcher(HTMLMatcher):
    """HTMLMatcher variant that highlights images and labels tables.

    Inserted and deleted <img> tags receive a coloured border through
    addStyleToTag; <table> tags get a short label; every other tag gets
    no marker.
    """

    def textInsert(self, lst, out):
        """Write the inserted tokens *lst* to *out*."""
        spanOpen = False
        for token in lst:
            if token.startswith('<img'):
                # Images are restyled in place; an open text span is left
                # open around them.
                imgStyle = 'padding: 5px; border: 12px; border-style: solid; border-color: #aaffaa; background-color: #ccffcc;'
                out.write(addStyleToTag(token, imgStyle))
                out.write(' ')
                continue
            if token.startswith('<'):
                if spanOpen:
                    out.write(self.endInsertText())
                    spanOpen = False
                out.write(self.formatInsertTag(token))
            elif not spanOpen:
                out.write(self.startInsertText())
                spanOpen = True
            out.write(token)
            out.write(' ')
        if spanOpen:
            out.write(self.endInsertText())

    def formatInsertTag(self, tag):
        """Return a label for an inserted <table> tag, '' for other tags."""
        # `_` is the translation function set up at the top of the file.
        if not tag.startswith('<table'):
            return ''
        return '<span class="tagInsert"><tt>%s</tt></span> ' % _('Table')

    def formatDeleteTag(self, tag):
        """Return the markup standing in for a deleted *tag*.

        A deleted <img> is returned restyled with a red border, a deleted
        <table> as a label, anything else as ''.
        """
        if tag.startswith('<img'):
            imgStyle = 'padding: 5px; border: 12px; border-style: solid; border-color: #ff8888; background-color: #ffcccc;'
            return addStyleToTag(tag, imgStyle)
        if tag.startswith('<table'):
            return '<span class="tagDelete"><tt>%s</tt></span> ' % _('Table')
        return ''
class _LeftHTMLMatcher(NoTagHTMLMatcher):
    """Matcher for the left column of comparisonhtmldiff.

    Inserted text is marked with the "delete" class, and deleted text is
    wrapped in a hidden span.
    """

    def startInsertText(self):
        """Open inserted text with the "delete" class."""
        return '<span class="delete">'

    def endInsertText(self):
        """Close a run of inserted text."""
        return '</span> '

    def startDeleteText(self):
        """Open deleted text in a span that is not displayed."""
        return '<span class="insert" style="display: none">'

    def endDeleteText(self):
        """Close a run of deleted text."""
        return '</span> '
class _RightHTMLMatcher(NoTagHTMLMatcher):
    """Matcher for the right column of comparisonhtmldiff.

    Deleted text is wrapped in a hidden span; inserted text keeps the
    inherited markup.
    """

    def startDeleteText(self):
        """Open deleted text in a span that is not displayed."""
        return '<span class="delete" style="display: none">'

    def endDeleteText(self):
        """Close a run of deleted text."""
        return '</span> '
def comparisonhtmldiff(source1, source2, addStylesheet=False):
    '''Generate a two-column comparison page.

    Returns a frameset document with two frames; each frame loads one
    rendered diff from a base64 ``data:`` URI. The left frame is built by
    _LeftHTMLMatcher(source2, source1), the right one by
    _RightHTMLMatcher(source1, source2).

    TODO: find a way to keep the scrollbars of the two columns in sync.
    NOTE: options to consider: 1. switch to auto-height iframe/object;
    2. synchronise the scrollbars with JavaScript; 3. give the
    insert/delete markers a click handler that moves the other column
    to the matching position.
    '''
    html = '''<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<frameset cols="50%%, 50%%">
<frame src="data:text/html;charset=UTF-8;base64,%s">
<frame src="data:text/html;charset=UTF-8;base64,%s">
</frameset>
</head></html>'''

    def toBase64(page):
        # b64encode emits one unbroken line; encodestring wrapped its
        # output every 76 characters, putting newlines inside the URI.
        if not isinstance(page, bytes):
            page = page.encode('utf8')
        encoded = base64.b64encode(page)
        if not isinstance(encoded, str):
            # Python 3 returns bytes; make it text for the template.
            encoded = encoded.decode('ascii')
        return encoded

    left = _LeftHTMLMatcher(source2, source1).htmlDiff(addStylesheet)
    right = _RightHTMLMatcher(source1, source2).htmlDiff(addStylesheet)
    return html % (toBase64(left), toBase64(right))
def htmldiff(source1, source2, addStylesheet=False):
    """
    Diff two pieces of HTML with HTMLMatcher and return the marked-up result

    >>> htmldiff('test1', 'test2')
    '<span class="delete">test1 </span> <span class="insert">test2 </span> '
    >>> htmldiff('test1', 'test1')
    'test1 '
    >>> htmldiff('<b>test1</b>', '<i>test1</i>')
    '<span class="tagDelete">delete: <tt>&lt;b&gt;</tt></span> <span class="tagInsert">insert: <tt>&lt;i&gt;</tt></span> <i> test1 <span class="tagDelete">delete: <tt>&lt;/b&gt;</tt></span> <span class="tagInsert">insert: <tt>&lt;/i&gt;</tt></span> </i> '
    """
    return HTMLMatcher(source1, source2).htmlDiff(addStylesheet)
def notaghtmldiff(source1, source2, addStylesheet=False):
    """Diff two pieces of HTML with NoTagHTMLMatcher and return the result."""
    return NoTagHTMLMatcher(source1, source2).htmlDiff(addStylesheet)
def taghtmldiff(source1, source2, addStylesheet=False):
    """Diff two pieces of HTML with TagHTMLMatcher and return the result."""
    return TagHTMLMatcher(source1, source2).htmlDiff(addStylesheet)
def diffFiles(f1, f2):
    """Return the taghtmldiff of the files at paths *f1* and *f2*.

    The stylesheet is always included in the result.
    """
    # `with` closes each file as soon as it has been read.
    with open(f1) as handle:
        source1 = handle.read()
    with open(f2) as handle:
        source2 = handle.read()
    # comparisonhtmldiff(source1, source2, True) is the side-by-side
    # alternative to the inline diff returned here.
    return taghtmldiff(source1, source2, True)
class SimpleHTMLMatcher(HTMLMatcher):
    """
    HTMLMatcher variant with plain-text markers: insertions appear as
    +[...] and deletions as -[...]
    """

    def startInsertText(self):
        """Open a run of inserted words."""
        return '+['

    def endInsertText(self):
        """Close a run of inserted words."""
        return ']'

    def startDeleteText(self):
        """Open a run of deleted words."""
        return '-['

    def endDeleteText(self):
        """Close a run of deleted words."""
        return ']'

    def formatInsertTag(self, tag):
        """Return the inserted tag, unescaped, inside +[...]."""
        return '+[%s]' % tag

    def formatDeleteTag(self, tag):
        """Return the deleted tag, unescaped, inside -[...]."""
        return '-[%s]' % tag
def simplehtmldiff(source1, source2):
    """
    Diff two pieces of HTML with SimpleHTMLMatcher; handy for tests:

    >>> simplehtmldiff('test1', 'test2')
    '-[test1 ]+[test2 ]'
    >>> simplehtmldiff('<b>Hello world!</b>', '<i>Hello you!</i>')
    '-[<b>]+[<i>]<i> Hello -[world! ]-[</b>]+[you! ]+[</i>]</i> '
    """
    return SimpleHTMLMatcher(source1, source2).htmlDiff()
class TextMatcher(HTMLMatcher):
    """Diff two plain-text sources line by line and render them as HTML."""

    def set_seq1(self, a):
        """Install the lines of *a* as the first sequence."""
        SequenceMatcher.set_seq1(self, a.split('\n'))

    def set_seq2(self, b):
        """Install the lines of *b* as the second sequence."""
        SequenceMatcher.set_seq2(self, b.split('\n'))

    def htmlDiff(self, addStylesheet=False):
        """Render the line diff as an HTML string.

        Deleted and inserted groups of lines are wrapped in the
        start/end delete/insert markup. When *addStylesheet* is true the
        rules from stylesheet() are embedded in the result.
        """
        a = self.a
        b = self.b
        out = StringIO()
        try:
            for tag, i1, i2, j1, j2 in self.get_opcodes():
                if tag == 'equal':
                    self.writeLines(a[i1:i2], out)
                # A 'replace' is rendered as a deletion followed by an
                # insertion.
                if tag in ('delete', 'replace'):
                    out.write(self.startDeleteText())
                    self.writeLines(a[i1:i2], out)
                    out.write(self.endDeleteText())
                if tag in ('insert', 'replace'):
                    out.write(self.startInsertText())
                    self.writeLines(b[j1:j2], out)
                    out.write(self.endInsertText())
            html = out.getvalue()
        finally:
            out.close()
        if addStylesheet:
            html = self.addStylesheet(html, self.stylesheet())
        return html

    def writeLines(self, lines, out):
        """Write each line to *out*, escaped and wrapped in <tt>...</tt><br>."""
        for line in lines:
            line = htmlEncode(line)
            # Turn each PAIR of spaces into '&nbsp; ' so runs of spaces
            # keep their width; replacing single spaces would double all
            # spacing and leave the startswith(' ') check below dead.
            line = line.replace(' ' * 2, '&nbsp; ')
            line = line.replace('\t', '&nbsp; &nbsp; &nbsp; &nbsp; ')
            # A leftover single leading space would be collapsed by the
            # browser, so make it non-breaking.
            if line.startswith(' '):
                line = '&nbsp;' + line[1:]
            out.write('<tt>%s</tt><br>\n' % line)
if __name__ == '__main__':
    # Command line entry point:
    #   <script> file1 file2   print the HTML diff of the two files
    #   <script> test          run the module's doctests
    import sys
    args = sys.argv[1:]
    if args == ['test']:
        import doctest
        doctest.testmod()
    elif len(args) >= 2:
        # Arguments after the second one are ignored.
        print(diffFiles(args[0], args[1]))
    else:
        # No arguments, or a single one that is not 'test': show the usage
        # text instead of failing with an IndexError.
        print("Usage: %s file1 file2" % sys.argv[0])
        print("or to test: %s test" % sys.argv[0])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment