Created
March 26, 2012 06:43
-
-
Save shellexy/2203495 to your computer and use it in GitHub Desktop.
html 易读格式显示 html 版本间差异
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: UTF-8 -*- | |
# vim:set shiftwidth=4 tabstop=4 expandtab textwidth=79: | |
from difflib import SequenceMatcher | |
import re | |
from StringIO import StringIO | |
import cgi | |
import base64 | |
try: import i18n | |
except: from gettext import gettext as _ | |
def htmlEncode(s, esc=cgi.escape): | |
return esc(s, 1) | |
# Matches one HTML comment; re.S lets it span several lines.
commentRE = re.compile(r'<!--.*?-->', re.S)
# Matches one tag: the shortest run from '<' up to the next '>'.
tagRE = re.compile(r'<.*?>', re.S)
# Matches an opening <head> tag without attributes, in any letter case.
# Raw string: '\s' inside a plain literal is an invalid string escape.
headRE = re.compile(r'<\s*head\s*>', re.S | re.I)
# Matches one punctuation mark together with at most one space on each
# side; the mark itself is captured as group 1.
# NOTE(review): the ASCII marks , ! ; : ( ) in this class may originally
# have been their full-width CJK forms -- confirm against the upstream
# file, because as written they also affect ordinary Latin text.
uniPunctuationRE = re.compile(u' ?([,。!《》;:‘’“”『』()]) ?')
class HTMLMatcher(SequenceMatcher):
    """SequenceMatcher that compares two HTML sources token by token.

    Both sources are tokenised by splitHTML() into whole tags and single
    words.  htmlDiff() walks the resulting opcodes and renders them as
    HTML, delegating the markup to the start*/end*/format* hook methods
    so that subclasses can change the presentation.
    """

    def __init__(self, source1, source2):
        # No junk predicate is used (first argument is None).
        SequenceMatcher.__init__(self, None, source1, source2)

    def set_seq1(self, a):
        """Tokenise *a* and install it as the first sequence."""
        tokens = self.splitHTML(a)
        SequenceMatcher.set_seq1(self, tokens)

    def set_seq2(self, b):
        """Tokenise *b* and install it as the second sequence."""
        tokens = self.splitHTML(b)
        SequenceMatcher.set_seq2(self, tokens)

    def splitTags(self, t):
        """Split *t* into an alternating list: text, tag, text, ... text.

        Text pieces may be empty strings; the list always ends with the
        text that follows the last tag.
        """
        pieces = []
        cursor = 0
        for found in tagRE.finditer(t):
            pieces.append(t[cursor:found.start()])
            pieces.append(found.group(0))
            cursor = found.end()
        pieces.append(t[cursor:])
        return pieces

    def splitWords(self, t):
        """Split UTF-8 encoded text *t* into whitespace-separated words.

        Punctuation matched by uniPunctuationRE is surrounded with spaces
        first, so Chinese text is cut into sentence pieces at punctuation
        and those pieces can be compared individually.
        """
        text = t.strip().decode('utf8')
        spaced = uniPunctuationRE.sub(u' \\1 ', text)
        return spaced.encode('utf8').strip().split()

    def splitHTML(self, t):
        """Return the token list for *t*: tags kept whole, text as words.

        HTML comments are removed before tokenising.
        """
        tokens = []
        for piece in self.splitTags(commentRE.sub('', t)):
            if piece.startswith('<'):
                tokens.append(piece)
            else:
                tokens.extend(self.splitWords(piece))
        return tokens

    def htmlDiff(self, addStylesheet=False):
        """Render the difference between the two sequences as HTML.

        Equal tokens are written unchanged, each followed by a space;
        deleted and inserted runs go through textDelete()/textInsert().
        When *addStylesheet* is true the result of stylesheet() is
        embedded through addStylesheet().
        """
        opcodes = self.get_opcodes()
        old_tokens = self.a
        new_tokens = self.b
        out = StringIO()
        for tag, i1, i2, j1, j2 in opcodes:
            if tag == 'equal':
                for token in old_tokens[i1:i2]:
                    out.write(token)
                    out.write(' ')
            if tag in ('delete', 'replace'):
                self.textDelete(old_tokens[i1:i2], out)
            if tag in ('insert', 'replace'):
                self.textInsert(new_tokens[j1:j2], out)
        html = out.getvalue()
        out.close()
        if addStylesheet:
            html = self.addStylesheet(html, self.stylesheet())
        # Remove the redundant spaces before and after Chinese punctuation.
        cleaned = uniPunctuationRE.sub(u'\\1', html.decode('utf8'))
        return cleaned.encode('utf8')

    def textDelete(self, lst, out):
        """Write the deleted tokens *lst* to *out*.

        Consecutive words share one delete span; a tag closes any open
        span and is rendered by formatDeleteTag() only.
        """
        span_open = False
        for token in lst:
            if not token.startswith('<'):
                if not span_open:
                    out.write(self.startDeleteText())
                    span_open = True
                out.write(token)
                out.write(' ')
                continue
            if span_open:
                out.write(self.endDeleteText())
                span_open = False
            out.write(self.formatDeleteTag(token))
        if span_open:
            out.write(self.endDeleteText())

    def textInsert(self, lst, out):
        """Write the inserted tokens *lst* to *out*.

        Consecutive words share one insert span; a tag closes any open
        span, is announced by formatInsertTag() and is then written
        itself, followed by a space.
        """
        span_open = False
        for token in lst:
            if not token.startswith('<'):
                if not span_open:
                    out.write(self.startInsertText())
                    span_open = True
                out.write(token)
                out.write(' ')
                continue
            if span_open:
                out.write(self.endInsertText())
                span_open = False
            out.write(self.formatInsertTag(token))
            out.write(token)
            out.write(' ')
        if span_open:
            out.write(self.endInsertText())

    def stylesheet(self):
        """Return the CSS rules for the classes used in the diff markup."""
        return (
            '\n'
            '.insert { background-color: #aaffaa }\n'
            '.delete { background-color: #ff8888; text-decoration: line-through }\n'
            '.tagInsert { background-color: #007700; color: #ffffff }\n'
            '.tagDelete { background-color: #770000; color: #ffffff }\n'
        )

    def addStylesheet(self, html, ss):
        """Return *html* with the CSS text *ss* embedded in a <style> tag.

        The tag goes right after the first match of headRE, or at the
        very beginning of the document when there is no match.
        """
        head = headRE.search(html)
        pos = head.end() if head else 0
        template = '%s<style type="text/css"><!--\n%s\n--></style>%s'
        return template % (html[:pos], ss, html[pos:])

    def startInsertText(self):
        """Return the markup that opens a run of inserted words."""
        return '<span class="insert">'

    def endInsertText(self):
        """Return the markup that closes a run of inserted words."""
        return '</span> '

    def startDeleteText(self):
        """Return the markup that opens a run of deleted words."""
        return '<span class="delete">'

    def endDeleteText(self):
        """Return the markup that closes a run of deleted words."""
        return '</span> '

    def formatInsertTag(self, tag):
        """Return a visible, HTML-escaped note about an inserted tag."""
        note = '<span class="tagInsert">insert: <tt>%s</tt></span> '
        return note % htmlEncode(tag)

    def formatDeleteTag(self, tag):
        """Return a visible, HTML-escaped note about a deleted tag."""
        note = '<span class="tagDelete">delete: <tt>%s</tt></span> '
        return note % htmlEncode(tag)
class NoTagHTMLMatcher(HTMLMatcher):
    """HTMLMatcher variant that writes no note for changed tags."""

    def formatInsertTag(self, tag):
        """Return an empty string: inserted tags get no note."""
        return ""

    def formatDeleteTag(self, tag):
        """Return an empty string: deleted tags get no note."""
        return ""
def addStyleToTag(tag, style):
    """Return *tag* with the CSS declarations *style* added to it.

    If the tag already carries a double-quoted (checked first) or
    single-quoted ``style`` attribute, *style* is appended to its value
    after a ``"; "`` separator, dropping any trailing ``;`` and spaces of
    the old value.  Otherwise a new ``style="..."`` attribute is inserted
    directly after the tag name.

    tag   -- the text of one opening tag, e.g. '<img src="a.png">'
    style -- CSS declarations, e.g. 'border: 1px solid'
    """
    def append_style(found):
        # Callable replacement: *style* is inserted literally, so any
        # backslash in it is not read as a regex template escape.
        return '%s%s; %s%s' % (found.group(1), found.group(2), style,
                               found.group(3))

    existing_style_patterns = (
        r'(style=")(.*?);* *?(")',
        r"(style=')(.*?);* *?(')",
    )
    for pattern in existing_style_patterns:
        if re.search(pattern, tag):
            return re.sub(pattern, append_style, tag)

    def add_attribute(found):
        return '%s style="%s"' % (found.group(1), style)

    # The lookahead accepts whitespace, '/' or '>' after the tag name, so
    # attribute-less tags such as '<img>' and '<img/>' are handled too.
    # count=1 keeps a '<word ' inside an attribute value untouched.
    return re.sub(r'(<\w+)(?=[\s/>])', add_attribute, tag, count=1)
class TagHTMLMatcher(HTMLMatcher):
    """HTMLMatcher variant that highlights changed images and tables.

    Changed <img> tags are given a coloured frame through addStyleToTag();
    changed <table> tags are announced with a translated 'Table' label.
    All other tags get no note.
    """

    # Inline CSS applied to inserted and deleted images respectively.
    _INSERTED_IMG_STYLE = 'padding: 5px; border: 12px; border-style: solid; border-color: #aaffaa; background-color: #ccffcc;'
    _DELETED_IMG_STYLE = 'padding: 5px; border: 12px; border-style: solid; border-color: #ff8888; background-color: #ffcccc;'

    def textInsert(self, lst, out):
        """Write the inserted tokens *lst* to *out*.

        Images are written with the insert frame and leave an open insert
        span untouched.  Any other tag closes the open span, is announced
        by formatInsertTag() and is then written itself.  Words are
        collected in an insert span.
        """
        span_open = False
        for token in lst:
            if token.startswith('<img'):
                styled = addStyleToTag(token, self._INSERTED_IMG_STYLE)
                out.write(styled)
                out.write(' ')
                continue
            if token.startswith('<'):
                if span_open:
                    out.write(self.endInsertText())
                    span_open = False
                out.write(self.formatInsertTag(token))
                out.write(token)
                out.write(' ')
                continue
            if not span_open:
                out.write(self.startInsertText())
                span_open = True
            out.write(token)
            out.write(' ')
        if span_open:
            out.write(self.endInsertText())

    def formatInsertTag(self, tag):
        """Return a 'Table' label for an inserted <table>, else ''."""
        if not tag.startswith('<table'):
            return ''
        return '<span class="tagInsert"><tt>%s</tt></span> ' % _('Table')

    def formatDeleteTag(self, tag):
        """Return the markup shown for a deleted tag.

        A deleted image is returned itself, with the delete frame added;
        a deleted <table> yields a 'Table' label; other tags yield ''.
        """
        if tag.startswith('<img'):
            return addStyleToTag(tag, self._DELETED_IMG_STYLE)
        if tag.startswith('<table'):
            return '<span class="tagDelete"><tt>%s</tt></span> ' % _('Table')
        return ''
class _LeftHTMLMatcher(NoTagHTMLMatcher):
    """Matcher for the left column built by comparisonhtmldiff().

    comparisonhtmldiff() passes the sources in swapped order, so words
    reported as inserted are marked with the "delete" class, and words
    reported as deleted are emitted in a hidden "insert" span.
    """

    def startInsertText(self):
        """Open a span styled as deleted text."""
        return '<span class="delete">'

    def endInsertText(self):
        """Close the span opened by startInsertText()."""
        return '</span> '

    def startDeleteText(self):
        """Open a hidden span carrying the "insert" class."""
        return '<span class="insert" style="display: none">'

    def endDeleteText(self):
        """Close the span opened by startDeleteText()."""
        return '</span> '
class _RightHTMLMatcher(NoTagHTMLMatcher):
    """Matcher for the right column built by comparisonhtmldiff().

    Deleted words are still emitted, but inside a hidden span; the
    insert markup is inherited unchanged.
    """

    def startDeleteText(self):
        """Open a hidden span carrying the "delete" class."""
        return '<span class="delete" style="display: none">'

    def endDeleteText(self):
        """Close the span opened by startDeleteText()."""
        return '</span> '
def comparisonhtmldiff(source1, source2, addStylesheet=False):
    '''Generate a two-column (side-by-side) comparison page.

    Each column is a frame whose content is embedded as a base64
    ``data:`` URI.  The left frame is rendered by _LeftHTMLMatcher with
    the sources swapped, the right frame by _RightHTMLMatcher.

    source1, source2 -- the two HTML sources to compare
    addStylesheet    -- passed on to htmlDiff() for both columns

    @TODO: find a way to synchronise the scroll bars of the two columns
    @NOTE: options to consider: 1. switch to auto-height iframe/object;
           2. synchronise the scroll bars with JavaScript; 3. give the
           insert/delete markers a click event that moves the other
           column to the matching position
    '''
    html = '''<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<frameset cols="50%%, 50%%">
<frame src="data:text/html;charset=UTF-8;base64,%s">
<frame src="data:text/html;charset=UTF-8;base64,%s">
</frameset>
</head></html>'''
    left = _LeftHTMLMatcher(source2, source1).htmlDiff(addStylesheet)
    right = _RightHTMLMatcher(source1, source2).htmlDiff(addStylesheet)
    # b64encode() rather than the legacy encodestring(): the latter wraps
    # its output with a newline every 76 characters, which would put line
    # breaks inside the src attribute values.
    return html % (base64.b64encode(left), base64.b64encode(right))
def htmldiff(source1, source2, addStylesheet=False):
    """
    Return the difference between two pieces of HTML

    Tags that differ are announced with an HTML-escaped note, so the
    expected output below contains ``&lt;`` / ``&gt;`` entities.

        >>> htmldiff('test1', 'test2')
        '<span class="delete">test1 </span> <span class="insert">test2 </span> '
        >>> htmldiff('test1', 'test1')
        'test1 '
        >>> htmldiff('<b>test1</b>', '<i>test1</i>')
        '<span class="tagDelete">delete: <tt>&lt;b&gt;</tt></span> <span class="tagInsert">insert: <tt>&lt;i&gt;</tt></span> <i> test1 <span class="tagDelete">delete: <tt>&lt;/b&gt;</tt></span> <span class="tagInsert">insert: <tt>&lt;/i&gt;</tt></span> </i> '
    """
    return HTMLMatcher(source1, source2).htmlDiff(addStylesheet)
def notaghtmldiff(source1, source2, addStylesheet=False):
    """Return the HTML diff of two sources without notes for changed tags.

    Thin wrapper around NoTagHTMLMatcher.htmlDiff().
    """
    return NoTagHTMLMatcher(source1, source2).htmlDiff(addStylesheet)
def taghtmldiff(source1, source2, addStylesheet=False):
    """Return the HTML diff of two sources, highlighting images and tables.

    Thin wrapper around TagHTMLMatcher.htmlDiff().
    """
    return TagHTMLMatcher(source1, source2).htmlDiff(addStylesheet)
def diffFiles(f1, f2):
    """Return the HTML diff, stylesheet included, of two files.

    f1, f2 -- paths of the old and the new HTML file

    The files are read through ``with`` blocks so both handles are closed
    as soon as their content has been read.
    """
    with open(f1) as handle:
        source1 = handle.read()
    with open(f2) as handle:
        source2 = handle.read()
    # comparisonhtmldiff(source1, source2, True) is the side-by-side
    # alternative to the inline diff returned here.
    return taghtmldiff(source1, source2, True)
class SimpleHTMLMatcher(HTMLMatcher):
    """
    Like HTMLMatcher, but returns a simpler diff

    Insertions are written as ``+[...]`` and deletions as ``-[...]``;
    tags are put into the brackets without HTML escaping.
    """

    def startInsertText(self):
        """Open an insertion marker."""
        return '+['

    def endInsertText(self):
        """Close an insertion marker."""
        return ']'

    def startDeleteText(self):
        """Open a deletion marker."""
        return '-['

    def endDeleteText(self):
        """Close a deletion marker."""
        return ']'

    def formatInsertTag(self, tag):
        """Return *tag* wrapped in an insertion marker."""
        return '+[' + '%s' % tag + ']'

    def formatDeleteTag(self, tag):
        """Return *tag* wrapped in a deletion marker."""
        return '-[' + '%s' % tag + ']'
def simplehtmldiff(source1, source2):
    """
    Simpler form of htmldiff; mostly for testing, like:

        >>> simplehtmldiff('test1', 'test2')
        '-[test1 ]+[test2 ]'
        >>> simplehtmldiff('<b>Hello world!</b>', '<i>Hello you!</i>')
        '-[<b>]+[<i>]<i> Hello -[world! ]-[</b>]+[you! ]+[</i>]</i> '
    """
    return SimpleHTMLMatcher(source1, source2).htmlDiff()
class TextMatcher(HTMLMatcher):
    """Matcher that compares plain text line by line, rendered as HTML.

    The sources are split at newline characters instead of being
    tokenised as HTML; each output line is HTML-escaped and written in
    its own <tt> element followed by <br>.
    """

    def set_seq1(self, a):
        """Install the lines of *a* as the first sequence."""
        SequenceMatcher.set_seq1(self, a.split('\n'))

    def set_seq2(self, b):
        """Install the lines of *b* as the second sequence."""
        SequenceMatcher.set_seq2(self, b.split('\n'))

    def htmlDiff(self, addStylesheet=False):
        """Render the line difference between the two texts as HTML.

        Equal lines are written as they are; deleted and inserted runs
        are wrapped in the markup of the start*/end* hook methods.  When
        *addStylesheet* is true the result of stylesheet() is embedded
        through addStylesheet().
        """
        opcodes = self.get_opcodes()
        a = self.a
        b = self.b
        out = StringIO()
        for tag, i1, i2, j1, j2 in opcodes:
            if tag == 'equal':
                self.writeLines(a[i1:i2], out)
            if tag == 'delete' or tag == 'replace':
                out.write(self.startDeleteText())
                self.writeLines(a[i1:i2], out)
                out.write(self.endDeleteText())
            if tag == 'insert' or tag == 'replace':
                out.write(self.startInsertText())
                self.writeLines(b[j1:j2], out)
                out.write(self.endInsertText())
        html = out.getvalue()
        out.close()
        if addStylesheet:
            html = self.addStylesheet(html, self.stylesheet())
        return html

    def writeLines(self, lines, out):
        """Write *lines* to *out* as HTML, preserving their whitespace.

        Escaping happens first, so the ``&nbsp;`` entities added
        afterwards are not escaped a second time.  Explicit entities are
        used because HTML collapses runs of ordinary spaces and a literal
        space-for-space replacement has no effect.
        """
        for line in lines:
            line = htmlEncode(line)
            # Every pair of spaces keeps its width: one non-breaking
            # space plus one ordinary space that still allows wrapping.
            line = line.replace('  ', '&nbsp; ')
            # A tab is rendered as eight columns.
            line = line.replace('\t', '&nbsp; &nbsp; &nbsp; &nbsp; ')
            # A single leading space would otherwise be dropped.
            if line.startswith(' '):
                line = '&nbsp;' + line[1:]
            out.write('<tt>%s</tt><br>\n' % line)
if __name__ == '__main__':
    # Command line use:
    #   <script> file1 file2   print the HTML diff of the two files
    #   <script> test          run the doctests of this module
    import sys
    args = sys.argv[1:]
    if args == ['test']:
        import doctest
        doctest.testmod()
    elif len(args) >= 2:
        # Arguments beyond the second one are ignored.
        print(diffFiles(args[0], args[1]))
    else:
        # No argument, or a single file name: show the usage text rather
        # than failing with an IndexError on the missing second file.
        print("Usage: %s file1 file2" % sys.argv[0])
        print("or to test: %s test" % sys.argv[0])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment