Skip to content

Instantly share code, notes, and snippets.

@elotroalex
Created March 10, 2012 07:39
Show Gist options
  • Save elotroalex/2010729 to your computer and use it in GitHub Desktop.
Save elotroalex/2010729 to your computer and use it in GitHub Desktop.
A necessary preprocessing step for the Google-diff to work at the word level. The original code was meant for lines; this revision creates an array of words instead. The major challenge for me was incorporating the regex so that splitting continues beyond the \n.
import re
def diff_wordsToChars(text1, text2):
    """Split two texts into an array of words. Reduce the texts to a string
    of hashes where each Unicode character represents one word.

    A "word" here is a token: a run of word characters together with the
    complete run of non-word characters (spaces, punctuation, line breaks)
    that follows it. Text left over after the last separator is emitted as a
    final token, so concatenating ``wordArray[ord(c)]`` for every character
    ``c`` of an encoded string reproduces the original text exactly.

    Args:
      text1: First string.
      text2: Second string.

    Returns:
      Three element tuple, containing the encoded text1, the encoded text2 and
      the array of unique strings.

    Raises:
      ValueError: If the number of distinct tokens exceeds what ``chr`` can
        represent (0x110000 code points).
    """
    wordArray = []  # e.g. wordArray[4] == "Hello\n"
    wordHash = {}   # e.g. wordHash["Hello\n"] == 4

    # A regex is needed in order to account for spaces and linebreaks.
    # Compiled once here because it is shared by both munge calls.
    separator = re.compile(r'\W+')

    def encodeWord(word):
        """Return the character for `word`, registering it on first sight."""
        index = wordHash.get(word)
        if index is None:
            index = len(wordArray)
            wordArray.append(word)
            wordHash[word] = index
        return chr(index)

    def diff_wordsToCharsMunge(text):
        """Encode `text` as one character per token, sharing the tables above."""
        chars = []
        # Walk the text, pulling out a substring for each word.
        wordStart = 0
        for m in separator.finditer(text):
            # Slice up to the END of the separator run so the whole run stays
            # attached to the word before it; the next token then starts on a
            # word character and identical words map to identical tokens.
            wordEnd = m.end()
            chars.append(encodeWord(text[wordStart:wordEnd]))
            wordStart = wordEnd
        # Text that ends in a word character has no closing separator; emit
        # the remainder so the last word is not silently dropped.
        if wordStart < len(text):
            chars.append(encodeWord(text[wordStart:]))
        return "".join(chars)

    chars1 = diff_wordsToCharsMunge(text1)
    chars2 = diff_wordsToCharsMunge(text2)
    return (chars1, chars2, wordArray)
# Demo: encode two sample texts that differ by a single word and print the
# resulting (encoded text1, encoded text2, word array) tuple.
text1, text2 = (
    "This is a test.\nThis is the second line.",
    "This is another test.\nThis is the second line.",
)
results = diff_wordsToChars(text1, text2)
print(results)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment