Created
March 10, 2012 07:39
-
-
Save elotroalex/2010729 to your computer and use it in GitHub Desktop.
A necessary preprocessing step for word-level diffing with Google's diff-match-patch library. The original code was meant for lines; this revision creates an array of words instead. The major challenge for me was incorporating the regex so that the iteration continues beyond the \n.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
def diff_wordsToChars(text1, text2):
    r"""Encode two texts as strings in which each character stands for one word.

    Each text is cut at every run of non-word characters (regex ``\W+``).
    A token runs from the previous cut up to and including the *first*
    character of the separator run; any further separator characters of that
    run start the next token. Whatever follows the last separator is emitted
    as a final token, so joining the decoded tokens reproduces the input
    exactly.

    Both texts share one token table, so equal tokens map to the same
    character in both encoded strings.

    Args:
        text1: First string.
        text2: Second string.

    Returns:
        Three element tuple, containing the encoded text1, the encoded text2
        and the array of unique tokens. ``wordArray[ord(c)]`` is the token
        represented by the encoded character ``c``.

    Raises:
        ValueError: If more unique tokens are found than ``chr()`` can
            represent (more than 0x110000).
    """
    wordArray = []  # e.g. wordArray[4] == "Hello "
    wordHash = {}   # e.g. wordHash["Hello "] == 4
    # Compiled once here instead of on every call of the helper below.
    separator = re.compile(r'\W+')

    def diff_wordsToCharsMunge(text):
        """Encode one text, registering unseen tokens in the shared table."""
        chars = []

        def encode(word):
            # First sighting: the token's index is its position in wordArray.
            if word not in wordHash:
                wordHash[word] = len(wordArray)
                wordArray.append(word)
            chars.append(chr(wordHash[word]))

        # Walk the text, pulling out a substring for each word.
        wordStart = 0
        for m in separator.finditer(text):
            # Keep only the first separator character with the word.
            wordEnd = m.start() + 1
            encode(text[wordStart:wordEnd])
            wordStart = wordEnd

        # Text after the last cut (a final word with no separator behind it,
        # or the rest of a trailing separator run) must not be dropped,
        # otherwise the encoding cannot be decoded back to the input.
        if wordStart < len(text):
            encode(text[wordStart:])
        return "".join(chars)

    chars1 = diff_wordsToCharsMunge(text1)
    chars2 = diff_wordsToCharsMunge(text2)
    return (chars1, chars2, wordArray)
# Demo: encode two sample texts that differ by a single word and print the
# resulting (encoded text1, encoded text2, word table) tuple.
text1, text2 = (
    "This is a test.\nThis is the second line.",
    "This is another test.\nThis is the second line.",
)
results = diff_wordsToChars(text1, text2)
print(results)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment