elotroalex · March 10, 2012 07:39
diff --git a/wordsToChars.py b/wordsToChars.py
 import re

 def diff_wordsToChars(text1, text2):
     """Encode two texts as strings in which each character stands for one word.

     Each text is split into tokens: a (possibly empty) run of word characters
     followed by the entire run of non-word characters (spaces, punctuation,
     line breaks) that comes after it.  Whatever remains after the last
     separator becomes a final token, so joining the tokens of a text
     reproduces that text exactly.

     Args:
       text1: First string.
       text2: Second string.

     Returns:
       Three element tuple, containing the encoded text1, the encoded text2 and
       the array of unique strings.  For every character c of an encoded text,
       wordArray[ord(c)] is the token that c represents.
     """
     wordArray = []  # e.g. wordArray[4] == "Hello\n"
     wordHash = {}   # e.g. wordHash["Hello\n"] == 4
     # A regex is needed in order to account for spaces and linebreaks.
     # Compiled once here and shared by both calls of the helper below.
     separator = re.compile(r'\W+')

     def diff_wordsToCharsMunge(text):
         """Return `text` encoded as one character per token."""
         chars = []

         def encode(word):
             # Reuse the index of a token seen before (in either text),
             # otherwise register it at the end of wordArray.
             index = wordHash.get(word)
             if index is None:
                 index = len(wordArray)
                 wordArray.append(word)
                 wordHash[word] = index
             # chr() restricts the number of distinct tokens to the range of
             # valid Unicode code points.
             chars.append(chr(index))

         # Walk the text, pulling out a substring for each word.
         wordStart = 0
         for m in separator.finditer(text):
             # Slice up to the END of the separator run so that the whole run
             # stays attached to the word before it; nothing leaks into the
             # next token and no character is skipped.
             wordEnd = m.end()
             encode(text[wordStart:wordEnd])
             wordStart = wordEnd

         # Text that does not end in a separator still has one last word.
         if wordStart < len(text):
             encode(text[wordStart:])
         return "".join(chars)

     chars1 = diff_wordsToCharsMunge(text1)
     chars2 = diff_wordsToCharsMunge(text2)
     return (chars1, chars2, wordArray)


 # Two sample inputs whose first lines differ by a single word.
 text1, text2 = (
     "This is a test.\nThis is the second line.",
     "This is another test.\nThis is the second line.",
 )

 # Encode both samples and print the (chars1, chars2, wordArray) tuple.
 results = diff_wordsToChars(text1, text2)
 print(results)
	import re

	def diff_wordsToChars(text1, text2):
	    """Encode two texts as strings in which each character stands for one word.

	    Each text is split into tokens: a (possibly empty) run of word characters
	    followed by the entire run of non-word characters (spaces, punctuation,
	    line breaks) that comes after it.  Whatever remains after the last
	    separator becomes a final token, so joining the tokens of a text
	    reproduces that text exactly.

	    Args:
	      text1: First string.
	      text2: Second string.

	    Returns:
	      Three element tuple, containing the encoded text1, the encoded text2 and
	      the array of unique strings.  For every character c of an encoded text,
	      wordArray[ord(c)] is the token that c represents.
	    """
	    wordArray = []  # e.g. wordArray[4] == "Hello\n"
	    wordHash = {}   # e.g. wordHash["Hello\n"] == 4
	    # A regex is needed in order to account for spaces and linebreaks.
	    # Compiled once here and shared by both calls of the helper below.
	    separator = re.compile(r'\W+')

	    def diff_wordsToCharsMunge(text):
	        """Return `text` encoded as one character per token."""
	        chars = []

	        def encode(word):
	            # Reuse the index of a token seen before (in either text),
	            # otherwise register it at the end of wordArray.
	            index = wordHash.get(word)
	            if index is None:
	                index = len(wordArray)
	                wordArray.append(word)
	                wordHash[word] = index
	            # chr() restricts the number of distinct tokens to the range of
	            # valid Unicode code points.
	            chars.append(chr(index))

	        # Walk the text, pulling out a substring for each word.
	        wordStart = 0
	        for m in separator.finditer(text):
	            # Slice up to the END of the separator run so that the whole run
	            # stays attached to the word before it; nothing leaks into the
	            # next token and no character is skipped.
	            wordEnd = m.end()
	            encode(text[wordStart:wordEnd])
	            wordStart = wordEnd

	        # Text that does not end in a separator still has one last word.
	        if wordStart < len(text):
	            encode(text[wordStart:])
	        return "".join(chars)

	    chars1 = diff_wordsToCharsMunge(text1)
	    chars2 = diff_wordsToCharsMunge(text2)
	    return (chars1, chars2, wordArray)


	# Two sample inputs whose first lines differ by a single word.
	text1, text2 = (
	    "This is a test.\nThis is the second line.",
	    "This is another test.\nThis is the second line.",
	)

	# Encode both samples and print the (chars1, chars2, wordArray) tuple.
	results = diff_wordsToChars(text1, text2)
	print(results)