Created
May 5, 2012 09:11
-
-
Save alexzhan/2601133 to your computer and use it in GitHub Desktop.
Find the differences between two pieces of TEXT.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding:utf-8 -*- | |
""" | |
textdiff.py | |
Original is (C) Ian Bicking <[email protected]> | |
With changes from Richard Cyganiak <[email protected]> and Richard Cyganiak <[email protected]> | |
Modified from https://github.com/cygri/htmldiff/blob/master/htmldiff | |
Finds the differences between two pieces of TEXT. | |
*Not* line-by-line comparison (more word-by-word even letter by letter). | |
""" | |
from difflib import SequenceMatcher | |
class TextMatcher(SequenceMatcher): | |
def __init__(self, source1, source2): | |
SequenceMatcher.__init__(self, None, source1, source2) | |
def textDiff(self): | |
opcodes = self.get_opcodes() | |
a = self.a | |
b = self.b | |
out = [] | |
for tag, i1, i2, j1, j2 in opcodes: | |
if tag == 'equal': | |
for item in a[i1:i2]: | |
out.append(item) | |
if tag == 'delete' or tag == 'replace': | |
self.textDelete(a[i1:i2], out) | |
if tag == 'insert' or tag == 'replace': | |
self.textInsert(b[j1:j2], out) | |
text = ''.join(out) | |
return text | |
def textDelete(self, lst, out): | |
text = '' | |
for item in lst: | |
text += item | |
self.outDelete(text, out) | |
def textInsert(self, lst, out): | |
text = '' | |
for item in lst: | |
text += item | |
self.outInsert(text, out) | |
def outDelete(self, s, out): | |
if s.strip() == '': | |
out.append(s) | |
else: | |
out.append(self.startDeleteText()) | |
out.append(s) | |
out.append(self.endDeleteText()) | |
def outInsert(self, s, out): | |
if s.strip() == '': | |
out.append(s) | |
else: | |
out.append(self.startInsertText()) | |
out.append(s) | |
out.append(self.endInsertText()) | |
def startInsertText(self): | |
return '<span class="insert">' | |
def endInsertText(self): | |
return '</span>' | |
def startDeleteText(self): | |
return '<span class="delete">' | |
def endDeleteText(self): | |
return '</span>' | |
def textdiff(source1, source2): | |
""" | |
Return the difference between two pieces of TEXT | |
>>> textdiff('test1', 'test2') | |
'test<span class="delete">1</span><span class="insert">2</span>' | |
>>> textdiff('test1', 'test1') | |
'test1' | |
>>> textdiff(u"再回首",u"在回手") | |
'<span class="delete">再</span><span class="insert">在</span>回<span class="delete">首</span><span class="insert">手</span>' | |
>>> textdiff(u"this time",u"thi time") | |
'thi<span class="delete">s</span> time' | |
""" | |
h = TextMatcher(source1, source2) | |
return h.textDiff() | |
if __name__ == '__main__': | |
print textdiff(u"再回首",u"在回手") | |
print textdiff(u"this time",u"thi time") | |
print textdiff('test1', 'test2') | |
print textdiff('test1', 'test1') |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment