shellexy · March 26, 2012 06:43
diff --git a/htmldiff.py b/htmldiff.py
 #!/usr/bin/python
 # -*- coding: UTF-8 -*-
 # vim:set shiftwidth=4 tabstop=4 expandtab textwidth=79:

 from difflib import SequenceMatcher
 import re
 from StringIO import StringIO
 import cgi
 import base64

 try: import i18n
 except: from gettext import gettext as _

 def htmlEncode(s, esc=cgi.escape):
    return esc(s, 1)

 # HTML comments; DOTALL lets a comment span several lines.
 commentRE = re.compile('<!--.*?-->', re.DOTALL)
 # Any tag-like span from '<' to the nearest following '>'.
 tagRE = re.compile('<.*?>', re.DOTALL)
 # An attribute-less <head> tag, in any letter case.
 headRE = re.compile('<\s*head\s*>', re.IGNORECASE | re.DOTALL)

 # Full-width (Chinese) punctuation together with one optional space on
 # either side; the punctuation mark itself is captured as group 1.
 uniPunctuationRE = re.compile(u' ?([，。！《》；：‘’“”『』（）]) ?')

 class HTMLMatcher(SequenceMatcher):
    """Diff two HTML strings token by token and render the result as HTML.

    Each source is split into a list whose items are either whole tags or
    whitespace-separated words (comments are dropped first).  The inherited
    SequenceMatcher machinery compares the two lists and `htmlDiff` turns
    the resulting opcodes into marked-up HTML.

    Text is passed through ``.decode('utf8')`` / ``.encode('utf8')``, so the
    sources are expected to be UTF-8 encoded byte strings.
    """

    def __init__(self, source1, source2):
        """Create a matcher for `source1` -> `source2` with no junk filter."""
        SequenceMatcher.__init__(self, None, source1, source2)

    def set_seq1(self, a):
        """Tokenize `a` and install the tokens as the first sequence."""
        tokens = self.splitHTML(a)
        SequenceMatcher.set_seq1(self, tokens)

    def set_seq2(self, b):
        """Tokenize `b` and install the tokens as the second sequence."""
        tokens = self.splitHTML(b)
        SequenceMatcher.set_seq2(self, tokens)

    def splitTags(self, t):
        """Split `t` into alternating text and tag pieces.

        The returned list always starts and ends with a (possibly empty)
        text piece; every tagRE match is an item of its own in between.
        """
        pieces = []
        cursor = 0
        for found in tagRE.finditer(t):
            pieces.append(t[cursor:found.start()])
            pieces.append(found.group(0))
            cursor = found.end()
        # Whatever follows the last tag (or the whole string if no tag).
        pieces.append(t[cursor:])
        return pieces

    def splitWords(self, t):
        """Split the text `t` into whitespace-separated words."""
        # Surround full-width punctuation with spaces so that Chinese text
        # is compared sentence fragment by sentence fragment.
        decoded = t.strip().decode('utf8')
        spaced = uniPunctuationRE.sub(u' \\1 ', decoded)
        return spaced.encode('utf8').strip().split()

    def splitHTML(self, t):
        """Return the token list for `t`: tags kept whole, text split in words."""
        tokens = []
        for piece in self.splitTags(commentRE.sub('', t)):
            if piece.startswith('<'):
                tokens.append(piece)
            else:
                tokens.extend(self.splitWords(piece))
        return tokens

    def htmlDiff(self, addStylesheet=False):
        """Render the diff of the two sequences as an HTML string.

        Unchanged tokens are written as they are, each followed by a space;
        removed and added runs go through `textDelete` and `textInsert`.
        When `addStylesheet` is true the CSS from `stylesheet()` is embedded
        in the result.
        """
        buf = StringIO()
        for op, a_lo, a_hi, b_lo, b_hi in self.get_opcodes():
            if op == 'equal':
                for token in self.a[a_lo:a_hi]:
                    buf.write(token)
                    buf.write(' ')
            # A 'replace' is rendered as a deletion followed by an insertion.
            if op in ('delete', 'replace'):
                self.textDelete(self.a[a_lo:a_hi], buf)
            if op in ('insert', 'replace'):
                self.textInsert(self.b[b_lo:b_hi], buf)
        html = buf.getvalue()
        buf.close()
        if addStylesheet:
            html = self.addStylesheet(html, self.stylesheet())
        # Drop the extra spaces around full-width (Chinese) punctuation.
        html = uniPunctuationRE.sub(u'\\1', html.decode('utf8')).encode('utf8')
        return html

    def textDelete(self, lst, out):
        """Write the removed tokens `lst` to the file-like object `out`.

        Consecutive words share one delete span; a tag closes the open span
        and is replaced by whatever `formatDeleteTag` returns for it (the
        tag itself is not written).
        """
        span_open = False
        for token in lst:
            if not token.startswith('<'):
                if not span_open:
                    out.write(self.startDeleteText())
                    span_open = True
                out.write(token)
                out.write(' ')
                continue
            if span_open:
                out.write(self.endDeleteText())
                span_open = False
            out.write(self.formatDeleteTag(token))
        if span_open:
            out.write(self.endDeleteText())

    def textInsert(self, lst, out):
        """Write the added tokens `lst` to the file-like object `out`.

        Consecutive words share one insert span; a tag closes the open span
        and is written after its `formatInsertTag` annotation.
        """
        span_open = False
        for token in lst:
            if not token.startswith('<'):
                if not span_open:
                    out.write(self.startInsertText())
                    span_open = True
                out.write(token)
                out.write(' ')
                continue
            if span_open:
                out.write(self.endInsertText())
                span_open = False
            out.write(self.formatInsertTag(token))
            out.write(token)
            out.write(' ')
        if span_open:
            out.write(self.endInsertText())

    def stylesheet(self):
        """Return the CSS rules for the classes used in the diff markup."""
        return '''
 .insert { background-color: #aaffaa }
 .delete { background-color: #ff8888; text-decoration: line-through }
 .tagInsert { background-color: #007700; color: #ffffff }
 .tagDelete { background-color: #770000; color: #ffffff }
 '''

    def addStylesheet(self, html, ss):
        """Return `html` with the CSS text `ss` embedded in a <style> element.

        The element goes right after the first headRE match, or at the very
        start of the document when there is none.
        """
        found = headRE.search(html)
        pos = found.end() if found else 0
        before, after = html[:pos], html[pos:]
        return ('%s<style type="text/css"><!--\n%s\n--></style>%s'
                % (before, ss, after))

    def startInsertText(self):
        """Markup that opens a run of inserted words."""
        return '<span class="insert">'

    def endInsertText(self):
        """Markup that closes a run of inserted words."""
        return '</span> '

    def startDeleteText(self):
        """Markup that opens a run of deleted words."""
        return '<span class="delete">'

    def endDeleteText(self):
        """Markup that closes a run of deleted words."""
        return '</span> '

    def formatInsertTag(self, tag):
        """Return a visible, HTML-escaped note that `tag` was inserted."""
        escaped = htmlEncode(tag)
        return '<span class="tagInsert">insert: <tt>%s</tt></span> ' % escaped

    def formatDeleteTag(self, tag):
        """Return a visible, HTML-escaped note that `tag` was deleted."""
        escaped = htmlEncode(tag)
        return '<span class="tagDelete">delete: <tt>%s</tt></span> ' % escaped

 class NoTagHTMLMatcher(HTMLMatcher):
    """HTMLMatcher variant that emits no annotation for changed tags."""

    def formatDeleteTag(self, tag):
        """Return an empty string: deleted tags leave no trace."""
        return ''

    def formatInsertTag(self, tag):
        """Return an empty string: inserted tags get no annotation."""
        return ''

 def addStyleToTag(tag, style):
    """Return `tag` with the CSS declarations in `style` added to it.

    If the tag already has a double- or single-quoted ``style`` attribute,
    `style` is appended to its value (after dropping trailing ``;`` and
    spaces).  Otherwise a new ``style="..."`` attribute is inserted right
    after the tag name; this also works for tags that have no attributes at
    all, such as ``<img>`` or ``<img/>``.
    """
    def extend(found):
        # A callable replacement keeps `style` literal: backslashes in it
        # are not read as regex template escapes.
        separator = '; ' if found.group(2) else ''
        return '%s%s%s%s%s' % (found.group(1), found.group(2), separator,
                               style, found.group(3))

    def create(found):
        return '%s style="%s"' % (found.group(1), style)

    for pattern in ('(style=")(.*?);* *?(")', "(style=')(.*?);* *?(')"):
        if re.search(pattern, tag):
            return re.sub(pattern, extend, tag)
    # No style attribute yet: add one after the tag name, whatever follows
    # the name (whitespace, '/' or '>').
    return re.sub(r'(<\w+)(?=[\s/>])', create, tag, count=1)

 class TagHTMLMatcher(HTMLMatcher):
    """HTMLMatcher variant that highlights changed images and tables.

    Changed ``<img`` tags get a coloured border via an inline style, changed
    ``<table`` tags get a small label; all other tags are not annotated.
    """

    def textInsert(self, lst, out):
        """Write the added tokens `lst` to the file-like object `out`."""
        span_open = False
        for token in lst:
            if token.startswith('<img'):
                # Inserted image: emit it with a green frame.  An insert
                # span that is currently open is left open around it.
                style = 'padding: 5px; border: 12px; border-style: solid; border-color: #aaffaa; background-color: #ccffcc;'
                out.write(addStyleToTag(token, style))
                out.write(' ')
                continue
            if token.startswith('<'):
                if span_open:
                    out.write(self.endInsertText())
                    span_open = False
                out.write(self.formatInsertTag(token))
                out.write(token)
                out.write(' ')
                continue
            if not span_open:
                out.write(self.startInsertText())
                span_open = True
            out.write(token)
            out.write(' ')
        if span_open:
            out.write(self.endInsertText())

    def formatInsertTag(self, tag):
        """Return a translated 'Table' label for inserted tables, else ''."""
        if not tag.startswith('<table'):
            return ''
        return '<span class="tagInsert"><tt>%s</tt></span> ' % _('Table')

    def formatDeleteTag(self, tag):
        """Return the markup that stands in for the deleted `tag`.

        Deleted images are shown with a red frame, deleted tables as a
        translated 'Table' label; every other tag yields ''.
        """
        if tag.startswith('<img'):
            style = 'padding: 5px; border: 12px; border-style: solid; border-color: #ff8888; background-color: #ffcccc;'
            return addStyleToTag(tag, style)
        if tag.startswith('<table'):
            return '<span class="tagDelete"><tt>%s</tt></span> ' % _('Table')
        return ''


 class _LeftHTMLMatcher(NoTagHTMLMatcher):
    """Matcher for the left pane of `comparisonhtmldiff`.

    It is constructed with the sources swapped, so its "inserted" text is
    shown with the delete style and its "deleted" text is kept in the
    output but hidden.
    """

    def startDeleteText(self):
        """Open a hidden span for text that exists only in the other pane."""
        return '<span class="insert" style="display: none">'

    def endDeleteText(self):
        """Close the hidden span."""
        return '</span> '

    def startInsertText(self):
        """Open a span that uses the delete style."""
        return '<span class="delete">'

    def endInsertText(self):
        """Close the delete-styled span."""
        return '</span> '

 class _RightHTMLMatcher(NoTagHTMLMatcher):
    """Matcher for the right pane of `comparisonhtmldiff`.

    Inserted text keeps the inherited insert markup; deleted text is kept
    in the output but hidden.
    """

    def endDeleteText(self):
        """Close the hidden span."""
        return '</span> '

    def startDeleteText(self):
        """Open a hidden span for deleted text."""
        return '<span class="delete" style="display: none">'

 def comparisonhtmldiff(source1, source2, addStylesheet=False):
    '''Build a two-column (side by side) comparison page.

    Returns an HTML frameset document with two frames.  Each frame loads a
    base64 ``data:`` URL: the left one holds the `_LeftHTMLMatcher` diff
    (built with the sources swapped), the right one the `_RightHTMLMatcher`
    diff.  `addStylesheet` is passed on to both `htmlDiff` calls.

    @TODO: find a way to keep the scroll bars of the two columns in sync.
    @NOTE: options to consider: 1. switch to auto-height iframe/object
           elements; 2. synchronise the scroll bars with JavaScript;
           3. give the insert/delete markers a click handler that moves the
           other column to the matching position.
    '''
    html = '''<html><head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    <frameset cols="50%%, 50%%">
    <frame src="data:text/html;charset=UTF-8;base64,%s">
    <frame src="data:text/html;charset=UTF-8;base64,%s">
    </frameset>
    </head></html>'''
    left = _LeftHTMLMatcher(source2, source1).htmlDiff(addStylesheet)
    right = _RightHTMLMatcher(source1, source2).htmlDiff(addStylesheet)
    # b64encode returns the payload on a single line.  encodestring breaks
    # it every 76 characters, which put newlines inside the src attributes.
    return html % (base64.b64encode(left), base64.b64encode(right))

 def htmldiff(source1, source2, addStylesheet=False):
    """
    Diff two HTML fragments with HTMLMatcher and return the marked-up HTML.

    Changed tags are annotated; `addStylesheet` embeds the CSS as well.

        >>> htmldiff('test1', 'test2')
        '<span class="delete">test1 </span> <span class="insert">test2 </span> '
        >>> htmldiff('test1', 'test1')
        'test1 '
        >>> htmldiff('<b>test1</b>', '<i>test1</i>')
        '<span class="tagDelete">delete: <tt>&lt;b&gt;</tt></span> <span class="tagInsert">insert: <tt>&lt;i&gt;</tt></span> <i> test1 <span class="tagDelete">delete: <tt>&lt;/b&gt;</tt></span> <span class="tagInsert">insert: <tt>&lt;/i&gt;</tt></span> </i> '
    """
    return HTMLMatcher(source1, source2).htmlDiff(addStylesheet)

 def notaghtmldiff(source1, source2, addStylesheet=False):
    """Diff two HTML fragments with NoTagHTMLMatcher (no tag annotations)."""
    return NoTagHTMLMatcher(source1, source2).htmlDiff(addStylesheet)

 def taghtmldiff(source1, source2, addStylesheet=False):
    """Diff two HTML fragments with TagHTMLMatcher (images/tables marked)."""
    return TagHTMLMatcher(source1, source2).htmlDiff(addStylesheet)

 def diffFiles(f1, f2):
    """Return the `taghtmldiff` of the files at paths `f1` and `f2`.

    Both files are read completely and the stylesheet is always embedded.
    `comparisonhtmldiff(source1, source2, True)` is the side-by-side
    alternative for the final call.
    """
    # Context managers close both handles even when a read fails.
    with open(f1) as first:
        source1 = first.read()
    with open(f2) as second:
        source2 = second.read()
    return taghtmldiff(source1, source2, True)

 class SimpleHTMLMatcher(HTMLMatcher):
    """
    HTMLMatcher variant with plain-text markers instead of <span> markup:
    insertions appear as ``+[...]`` and deletions as ``-[...]``.
    """

    def startDeleteText(self):
        """Open a run of deleted words."""
        return '-['

    def endDeleteText(self):
        """Close a run of deleted words."""
        return ']'

    def startInsertText(self):
        """Open a run of inserted words."""
        return '+['

    def endInsertText(self):
        """Close a run of inserted words."""
        return ']'

    def formatDeleteTag(self, tag):
        """Return the deleted tag wrapped as ``-[tag]`` (not escaped)."""
        return '-[%s]' % tag

    def formatInsertTag(self, tag):
        """Return the inserted tag wrapped as ``+[tag]`` (not escaped)."""
        return '+[%s]' % tag

 def simplehtmldiff(source1, source2):
    """
    Diff two HTML fragments with SimpleHTMLMatcher; handy for tests:

        >>> simplehtmldiff('test1', 'test2')
        '-[test1 ]+[test2 ]'
        >>> simplehtmldiff('<b>Hello world!</b>', '<i>Hello you!</i>')
        '-[<b>]+[<i>]<i> Hello -[world! ]-[</b>]+[you! ]+[</i>]</i> '
    """
    return SimpleHTMLMatcher(source1, source2).htmlDiff()

 class TextMatcher(HTMLMatcher):
    """Line-based diff of plain text, rendered as HTML.

    The sources are split on newlines instead of being tokenized as HTML,
    and every line is written HTML-escaped inside ``<tt>`` with a ``<br>``.
    """

    def set_seq1(self, a):
        """Install the lines of `a` as the first sequence."""
        lines = a.split('\n')
        SequenceMatcher.set_seq1(self, lines)

    def set_seq2(self, b):
        """Install the lines of `b` as the second sequence."""
        lines = b.split('\n')
        SequenceMatcher.set_seq2(self, lines)

    def htmlDiff(self, addStylesheet=False):
        """Render the line diff as an HTML string.

        Unchanged lines are written plainly; removed and added runs are
        wrapped in the delete/insert markup.  When `addStylesheet` is true
        the CSS from `stylesheet()` is embedded in the result.
        """
        buf = StringIO()
        for op, a_lo, a_hi, b_lo, b_hi in self.get_opcodes():
            if op == 'equal':
                self.writeLines(self.a[a_lo:a_hi], buf)
            # A 'replace' is rendered as a deletion followed by an insertion.
            if op in ('delete', 'replace'):
                buf.write(self.startDeleteText())
                self.writeLines(self.a[a_lo:a_hi], buf)
                buf.write(self.endDeleteText())
            if op in ('insert', 'replace'):
                buf.write(self.startInsertText())
                self.writeLines(self.b[b_lo:b_hi], buf)
                buf.write(self.endInsertText())
        html = buf.getvalue()
        buf.close()
        if addStylesheet:
            html = self.addStylesheet(html, self.stylesheet())
        return html

    def writeLines(self, lines, out):
        """Write each of `lines` to `out` as one escaped ``<tt>`` line."""
        for raw in lines:
            text = htmlEncode(raw)
            # Keep runs of spaces and tabs visible in the rendered HTML.
            text = text.replace('  ', '&nbsp; ')
            text = text.replace('\t', '&nbsp; &nbsp; &nbsp; &nbsp; ')
            if text.startswith(' '):
                text = '&nbsp;' + text[1:]
            out.write('<tt>%s</tt><br>\n' % text)

 if __name__ == '__main__':
    import sys
    if not sys.argv[1:]:
        print "Usage: %s file1 file2" % sys.argv[0]
        print "or to test: %s test" % sys.argv[0]
    elif sys.argv[1] == 'test' and not sys.argv[2:]:
        import doctest
        doctest.testmod()
    else:
        print diffFiles(sys.argv[1], sys.argv[2])
	#!/usr/bin/python
	# -*- coding: UTF-8 -*-
	# vim:set shiftwidth=4 tabstop=4 expandtab textwidth=79:

	from difflib import SequenceMatcher
	import re
	from StringIO import StringIO
	import cgi
	import base64

	try: import i18n
	except: from gettext import gettext as _

	def htmlEncode(s, esc=cgi.escape):
	return esc(s, 1)

	# HTML comments; re.S lets a comment span several lines.
	commentRE = re.compile('<!--.*?-->', re.S)
	# Any tag-like span from '<' to the nearest following '>'.
	tagRE = re.compile('<.*?>', re.S)
	# An attribute-less <head> tag in any letter case, with optional
	# whitespace inside the angle brackets.
	headRE = re.compile('<\s*head\s*>', re.S | re.I)

	# Full-width (Chinese) punctuation together with one optional space on
	# either side; the punctuation mark itself is captured as group 1.
	uniPunctuationRE = re.compile(u' ?([，。！《》；：‘’“”『』（）]) ?')

	class HTMLMatcher(SequenceMatcher):
	    """Diff two HTML strings token by token and render the result as HTML.

	    Each source is split into a list whose items are either whole tags or
	    whitespace-separated words (comments are dropped first).  The inherited
	    SequenceMatcher machinery compares the two lists and `htmlDiff` turns
	    the resulting opcodes into marked-up HTML.

	    Text is passed through ``.decode('utf8')`` / ``.encode('utf8')``, so the
	    sources are expected to be UTF-8 encoded byte strings.
	    """

	    def __init__(self, source1, source2):
	        """Create a matcher for `source1` -> `source2` with no junk filter."""
	        SequenceMatcher.__init__(self, None, source1, source2)

	    def set_seq1(self, a):
	        """Tokenize `a` and install the tokens as the first sequence."""
	        SequenceMatcher.set_seq1(self, self.splitHTML(a))

	    def set_seq2(self, b):
	        """Tokenize `b` and install the tokens as the second sequence."""
	        SequenceMatcher.set_seq2(self, self.splitHTML(b))

	    def splitTags(self, t):
	        """Split `t` into alternating text and tag pieces.

	        The returned list always starts and ends with a (possibly empty)
	        text piece; every tagRE match is an item of its own in between.
	        """
	        result = []
	        pos = 0
	        for match in tagRE.finditer(t):
	            result.append(t[pos:match.start()])
	            result.append(match.group(0))
	            pos = match.end()
	        # Whatever follows the last tag (or the whole string if no tag).
	        result.append(t[pos:])
	        return result

	    def splitWords(self, t):
	        """Split the text `t` into whitespace-separated words."""
	        # Surround full-width punctuation with spaces so that Chinese text
	        # is compared sentence fragment by sentence fragment.
	        t = uniPunctuationRE.sub(u' \\1 ', t.strip().decode('utf8')).encode('utf8')
	        return t.strip().split()

	    def splitHTML(self, t):
	        """Return the token list for `t`: tags kept whole, text split in words."""
	        result = []
	        for item in self.splitTags(commentRE.sub('', t)):
	            if item.startswith('<'):
	                result.append(item)
	            else:
	                result.extend(self.splitWords(item))
	        return result

	    def htmlDiff(self, addStylesheet=False):
	        """Render the diff of the two sequences as an HTML string.

	        Unchanged tokens are written as they are, each followed by a space;
	        removed and added runs go through `textDelete` and `textInsert`.
	        When `addStylesheet` is true the CSS from `stylesheet()` is embedded
	        in the result.
	        """
	        a = self.a
	        b = self.b
	        out = StringIO()
	        for tag, i1, i2, j1, j2 in self.get_opcodes():
	            if tag == 'equal':
	                for item in a[i1:i2]:
	                    out.write(item)
	                    out.write(' ')
	            # A 'replace' is rendered as a deletion followed by an insertion.
	            if tag == 'delete' or tag == 'replace':
	                self.textDelete(a[i1:i2], out)
	            if tag == 'insert' or tag == 'replace':
	                self.textInsert(b[j1:j2], out)
	        html = out.getvalue()
	        out.close()
	        if addStylesheet:
	            html = self.addStylesheet(html, self.stylesheet())
	        # Drop the extra spaces around full-width (Chinese) punctuation.
	        html = uniPunctuationRE.sub(u'\\1', html.decode('utf8')).encode('utf8')
	        return html

	    def textDelete(self, lst, out):
	        """Write the removed tokens `lst` to the file-like object `out`.

	        Consecutive words share one delete span; a tag closes the open span
	        and is replaced by whatever `formatDeleteTag` returns for it (the
	        tag itself is not written).
	        """
	        inSpan = False
	        for item in lst:
	            if item.startswith('<'):
	                if inSpan:
	                    out.write(self.endDeleteText())
	                    inSpan = False
	                out.write(self.formatDeleteTag(item))
	            else:
	                if not inSpan:
	                    out.write(self.startDeleteText())
	                    inSpan = True
	                out.write(item)
	                out.write(' ')
	        if inSpan:
	            out.write(self.endDeleteText())

	    def textInsert(self, lst, out):
	        """Write the added tokens `lst` to the file-like object `out`.

	        Consecutive words share one insert span; a tag closes the open span
	        and is written after its `formatInsertTag` annotation.
	        """
	        inSpan = False
	        for item in lst:
	            if item.startswith('<'):
	                if inSpan:
	                    out.write(self.endInsertText())
	                    inSpan = False
	                out.write(self.formatInsertTag(item))
	                out.write(item)
	                out.write(' ')
	            else:
	                if not inSpan:
	                    out.write(self.startInsertText())
	                    inSpan = True
	                out.write(item)
	                out.write(' ')
	        if inSpan:
	            out.write(self.endInsertText())

	    def stylesheet(self):
	        """Return the CSS rules for the classes used in the diff markup."""
	        return '''
	.insert { background-color: #aaffaa }
	.delete { background-color: #ff8888; text-decoration: line-through }
	.tagInsert { background-color: #007700; color: #ffffff }
	.tagDelete { background-color: #770000; color: #ffffff }
	'''

	    def addStylesheet(self, html, ss):
	        """Return `html` with the CSS text `ss` embedded in a <style> element.

	        The element goes right after the first headRE match, or at the very
	        start of the document when there is none.
	        """
	        match = headRE.search(html)
	        if match:
	            pos = match.end()
	        else:
	            pos = 0
	        return ('%s<style type="text/css"><!--\n%s\n--></style>%s'
	                % (html[:pos], ss, html[pos:]))

	    def startInsertText(self):
	        """Markup that opens a run of inserted words."""
	        return '<span class="insert">'

	    def endInsertText(self):
	        """Markup that closes a run of inserted words."""
	        return '</span> '

	    def startDeleteText(self):
	        """Markup that opens a run of deleted words."""
	        return '<span class="delete">'

	    def endDeleteText(self):
	        """Markup that closes a run of deleted words."""
	        return '</span> '

	    def formatInsertTag(self, tag):
	        """Return a visible, HTML-escaped note that `tag` was inserted."""
	        return '<span class="tagInsert">insert: <tt>%s</tt></span> ' % htmlEncode(tag)

	    def formatDeleteTag(self, tag):
	        """Return a visible, HTML-escaped note that `tag` was deleted."""
	        return '<span class="tagDelete">delete: <tt>%s</tt></span> ' % htmlEncode(tag)

	class NoTagHTMLMatcher(HTMLMatcher):
	    """HTMLMatcher variant that emits no annotation for changed tags."""

	    def formatInsertTag(self, tag):
	        """Return an empty string: inserted tags get no annotation."""
	        return ''

	    def formatDeleteTag(self, tag):
	        """Return an empty string: deleted tags leave no trace."""
	        return ''

	def addStyleToTag(tag, style):
	    """Return `tag` with the CSS declarations in `style` added to it.

	    If the tag already has a double- or single-quoted ``style`` attribute,
	    `style` is appended to its value (after dropping trailing ``;`` and
	    spaces).  Otherwise a new ``style="..."`` attribute is inserted right
	    after the tag name; this also works for tags that have no attributes at
	    all, such as ``<img>`` or ``<img/>``.
	    """
	    def extend(found):
	        # A callable replacement keeps `style` literal: backslashes in it
	        # are not read as regex template escapes.
	        separator = '; ' if found.group(2) else ''
	        return '%s%s%s%s%s' % (found.group(1), found.group(2), separator,
	                               style, found.group(3))

	    def create(found):
	        return '%s style="%s"' % (found.group(1), style)

	    # The attribute value must be matched with '.*?' (any length).
	    for pattern in ('(style=")(.*?);* *?(")', "(style=')(.*?);* *?(')"):
	        if re.search(pattern, tag):
	            return re.sub(pattern, extend, tag)
	    # No style attribute yet: add one after the tag name, whatever follows
	    # the name (whitespace, '/' or '>').
	    return re.sub(r'(<\w+)(?=[\s/>])', create, tag, count=1)

	class TagHTMLMatcher(HTMLMatcher):
	    """HTMLMatcher variant that highlights changed images and tables.

	    Changed ``<img`` tags get a coloured border via an inline style, changed
	    ``<table`` tags get a small label; all other tags are not annotated.
	    """

	    def textInsert(self, lst, out):
	        """Write the added tokens `lst` to the file-like object `out`."""
	        inSpan = False
	        for item in lst:
	            if item.startswith('<img'):
	                # Inserted image: emit it with a green frame.  An insert
	                # span that is currently open is left open around it.
	                style = 'padding: 5px; border: 12px; border-style: solid; border-color: #aaffaa; background-color: #ccffcc;'
	                out.write(addStyleToTag(item, style))
	                out.write(' ')
	            elif item.startswith('<'):
	                if inSpan:
	                    out.write(self.endInsertText())
	                    inSpan = False
	                out.write(self.formatInsertTag(item))
	                out.write(item)
	                out.write(' ')
	            else:
	                if not inSpan:
	                    out.write(self.startInsertText())
	                    inSpan = True
	                out.write(item)
	                out.write(' ')
	        if inSpan:
	            out.write(self.endInsertText())

	    def formatInsertTag(self, tag):
	        """Return a translated 'Table' label for inserted tables, else ''."""
	        if tag.startswith('<table'):
	            return '<span class="tagInsert"><tt>%s</tt></span> ' % _('Table')
	        return ''

	    def formatDeleteTag(self, tag):
	        """Return the markup that stands in for the deleted `tag`.

	        Deleted images are shown with a red frame, deleted tables as a
	        translated 'Table' label; every other tag yields ''.
	        """
	        if tag.startswith('<img'):
	            style = 'padding: 5px; border: 12px; border-style: solid; border-color: #ff8888; background-color: #ffcccc;'
	            return addStyleToTag(tag, style)
	        elif tag.startswith('<table'):
	            return '<span class="tagDelete"><tt>%s</tt></span> ' % _('Table')
	        return ''


	class _LeftHTMLMatcher(NoTagHTMLMatcher):
	    """Matcher for the left pane of `comparisonhtmldiff`.

	    It is constructed with the sources swapped, so its "inserted" text is
	    shown with the delete style and its "deleted" text is kept in the
	    output but hidden.
	    """

	    def startInsertText(self):
	        """Open a span that uses the delete style."""
	        return '<span class="delete">'

	    def endInsertText(self):
	        """Close the delete-styled span."""
	        return '</span> '

	    def startDeleteText(self):
	        """Open a hidden span for text that exists only in the other pane."""
	        return '<span class="insert" style="display: none">'

	    def endDeleteText(self):
	        """Close the hidden span."""
	        return '</span> '

	class _RightHTMLMatcher(NoTagHTMLMatcher):
	    """Matcher for the right pane of `comparisonhtmldiff`.

	    Inserted text keeps the inherited insert markup; deleted text is kept
	    in the output but hidden.
	    """

	    def startDeleteText(self):
	        """Open a hidden span for deleted text."""
	        return '<span class="delete" style="display: none">'

	    def endDeleteText(self):
	        """Close the hidden span."""
	        return '</span> '

	def comparisonhtmldiff(source1, source2, addStylesheet=False):
	    '''Build a two-column (side by side) comparison page.

	    Returns an HTML frameset document with two frames.  Each frame loads a
	    base64 ``data:`` URL: the left one holds the `_LeftHTMLMatcher` diff
	    (built with the sources swapped), the right one the `_RightHTMLMatcher`
	    diff.  `addStylesheet` is passed on to both `htmlDiff` calls.

	    @TODO: find a way to keep the scroll bars of the two columns in sync.
	    @NOTE: options to consider: 1. switch to auto-height iframe/object
	           elements; 2. synchronise the scroll bars with JavaScript;
	           3. give the insert/delete markers a click handler that moves the
	           other column to the matching position.
	    '''
	    html = '''<html><head>
	<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
	<frameset cols="50%%, 50%%">
	<frame src="data:text/html;charset=UTF-8;base64,%s">
	<frame src="data:text/html;charset=UTF-8;base64,%s">
	</frameset>
	</head></html>'''
	    left = _LeftHTMLMatcher(source2, source1).htmlDiff(addStylesheet)
	    right = _RightHTMLMatcher(source1, source2).htmlDiff(addStylesheet)
	    # b64encode returns the payload on a single line.  encodestring breaks
	    # it every 76 characters, which put newlines inside the src attributes.
	    return html % (base64.b64encode(left), base64.b64encode(right))

	def htmldiff(source1, source2, addStylesheet=False):
	    """
	    Return the difference between two pieces of HTML

	    The diff is produced by HTMLMatcher, so changed tags are annotated
	    with their HTML-escaped source; `addStylesheet` embeds the CSS too.

	        >>> htmldiff('test1', 'test2')
	        '<span class="delete">test1 </span> <span class="insert">test2 </span> '
	        >>> htmldiff('test1', 'test1')
	        'test1 '
	        >>> htmldiff('<b>test1</b>', '<i>test1</i>')
	        '<span class="tagDelete">delete: <tt>&lt;b&gt;</tt></span> <span class="tagInsert">insert: <tt>&lt;i&gt;</tt></span> <i> test1 <span class="tagDelete">delete: <tt>&lt;/b&gt;</tt></span> <span class="tagInsert">insert: <tt>&lt;/i&gt;</tt></span> </i> '
	    """
	    h = HTMLMatcher(source1, source2)
	    return h.htmlDiff(addStylesheet)

	def notaghtmldiff(source1, source2, addStylesheet=False):
	    """Diff two HTML fragments with NoTagHTMLMatcher (no tag annotations)."""
	    h = NoTagHTMLMatcher(source1, source2)
	    return h.htmlDiff(addStylesheet)

	def taghtmldiff(source1, source2, addStylesheet=False):
	    """Diff two HTML fragments with TagHTMLMatcher (images/tables marked)."""
	    h = TagHTMLMatcher(source1, source2)
	    return h.htmlDiff(addStylesheet)

	def diffFiles(f1, f2):
	    """Return the `taghtmldiff` of the files at paths `f1` and `f2`.

	    Both files are read completely and the stylesheet is always embedded.
	    `comparisonhtmldiff(source1, source2, True)` is the side-by-side
	    alternative for the final call.
	    """
	    # Context managers close both handles even when a read fails.
	    with open(f1) as first:
	        source1 = first.read()
	    with open(f2) as second:
	        source2 = second.read()
	    return taghtmldiff(source1, source2, True)

	class SimpleHTMLMatcher(HTMLMatcher):
	    """
	    Like HTMLMatcher, but returns a simpler diff: insertions appear as
	    ``+[...]`` and deletions as ``-[...]`` instead of <span> markup.
	    """

	    def startInsertText(self):
	        """Open a run of inserted words."""
	        return '+['

	    def endInsertText(self):
	        """Close a run of inserted words."""
	        return ']'

	    def startDeleteText(self):
	        """Open a run of deleted words."""
	        return '-['

	    def endDeleteText(self):
	        """Close a run of deleted words."""
	        return ']'

	    def formatInsertTag(self, tag):
	        """Return the inserted tag wrapped as ``+[tag]`` (not escaped)."""
	        return '+[%s]' % tag

	    def formatDeleteTag(self, tag):
	        """Return the deleted tag wrapped as ``-[tag]`` (not escaped)."""
	        return '-[%s]' % tag

	def simplehtmldiff(source1, source2):
	    """
	    Simpler form of htmldiff; mostly for testing, like:

	        >>> simplehtmldiff('test1', 'test2')
	        '-[test1 ]+[test2 ]'
	        >>> simplehtmldiff('<b>Hello world!</b>', '<i>Hello you!</i>')
	        '-[<b>]+[<i>]<i> Hello -[world! ]-[</b>]+[you! ]+[</i>]</i> '
	    """
	    h = SimpleHTMLMatcher(source1, source2)
	    return h.htmlDiff()

	class TextMatcher(HTMLMatcher):
	    """Line-based diff of plain text, rendered as HTML.

	    The sources are split on newlines instead of being tokenized as HTML,
	    and every line is written HTML-escaped inside ``<tt>`` with a ``<br>``.
	    """

	    def set_seq1(self, a):
	        """Install the lines of `a` as the first sequence."""
	        SequenceMatcher.set_seq1(self, a.split('\n'))

	    def set_seq2(self, b):
	        """Install the lines of `b` as the second sequence."""
	        SequenceMatcher.set_seq2(self, b.split('\n'))

	    def htmlDiff(self, addStylesheet=False):
	        """Render the line diff as an HTML string.

	        Unchanged lines are written plainly; removed and added runs are
	        wrapped in the delete/insert markup.  When `addStylesheet` is true
	        the CSS from `stylesheet()` is embedded in the result.
	        """
	        a = self.a
	        b = self.b
	        out = StringIO()
	        for tag, i1, i2, j1, j2 in self.get_opcodes():
	            if tag == 'equal':
	                self.writeLines(a[i1:i2], out)
	            # A 'replace' is rendered as a deletion followed by an insertion.
	            if tag == 'delete' or tag == 'replace':
	                out.write(self.startDeleteText())
	                self.writeLines(a[i1:i2], out)
	                out.write(self.endDeleteText())
	            if tag == 'insert' or tag == 'replace':
	                out.write(self.startInsertText())
	                self.writeLines(b[j1:j2], out)
	                out.write(self.endInsertText())
	        html = out.getvalue()
	        out.close()
	        if addStylesheet:
	            html = self.addStylesheet(html, self.stylesheet())
	        return html

	    def writeLines(self, lines, out):
	        """Write each of `lines` to `out` as one escaped ``<tt>`` line."""
	        for line in lines:
	            line = htmlEncode(line)
	            # Non-breaking-space entities keep runs of spaces, tabs and a
	            # leading space visible in the rendered HTML.
	            line = line.replace('  ', '&nbsp; ')
	            line = line.replace('\t', '&nbsp; &nbsp; &nbsp; &nbsp; ')
	            if line.startswith(' '):
	                line = '&nbsp;' + line[1:]
	            out.write('<tt>%s</tt><br>\n' % line)

	if __name__ == '__main__':
	import sys
	if not sys.argv[1:]:
	print "Usage: %s file1 file2" % sys.argv[0]
	print "or to test: %s test" % sys.argv[0]
	elif sys.argv[1] == 'test' and not sys.argv[2:]:
	import doctest
	doctest.testmod()
	else:
	print diffFiles(sys.argv[1], sys.argv[2])