Skip to content

Instantly share code, notes, and snippets.

@gameguy43
Created October 7, 2014 22:43
Show Gist options
  • Save gameguy43/a88a22f9cd393ff71998 to your computer and use it in GitHub Desktop.
Save gameguy43/a88a22f9cd393ff71998 to your computer and use it in GitHub Desktop.
class WordCloudData:
    """Count how often each word occurs in a string.

    The counts are built eagerly by the constructor and exposed as the
    ``words_to_counts`` dict (word -> number of occurrences).

    Splitting rules:
      * a space or an em dash ends the current word;
      * two periods in a row (an ellipsis) end the current word, a single
        period is ignored;
      * ASCII letters and apostrophes are part of a word;
      * a hyphen is part of a word only when it sits between two letters;
      * every other character is ignored and does not end the word.

    Case rules: a capitalized spelling is only kept while no lowercase
    spelling of the same word has been seen; once one is seen the counts
    are merged under the lowercase spelling.
    """

    # A set (rather than a string) so that only single characters match:
    # the empty string and multi-character strings are not letters.
    _LETTERS = frozenset(
        'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')
    _EM_DASH = u'\u2014'

    def __init__(self, input_string):
        """Store ``input_string`` and immediately count its words."""
        self.input_string = input_string
        self.words_to_counts = {}
        self.populate_hash()

    def populate_hash(self):
        """Scan ``self.input_string`` and record every word found.

        Each completed word is passed to ``add_word_to_hash``. Calling this
        again adds to the existing counts rather than resetting them.
        """
        text = self.input_string
        last_index = len(text) - 1
        current_word = ''

        for i, character in enumerate(text):
            if i == last_index:
                # End of input: take the final character only if it is a
                # letter, then flush whatever word is pending.
                if self.is_letter(character):
                    current_word += character
                if current_word:
                    self.add_word_to_hash(current_word)
            elif character == ' ' or character == self._EM_DASH:
                # A space or em dash always terminates the current word.
                if current_word:
                    self.add_word_to_hash(current_word)
                current_word = ''
            elif character == '.':
                # Only an ellipsis splits words. text[i + 1] is safe here
                # because the last index was handled by the first branch.
                if text[i + 1] == '.':
                    if current_word:
                        self.add_word_to_hash(current_word)
                    current_word = ''
            elif self.is_letter(character) or character == '\'':
                current_word += character
            elif character == '-':
                # Keep a hyphen only when it joins two letters, so that
                # stray dashes do not become part of a word.
                if (i > 0
                        and self.is_letter(text[i - 1])
                        and self.is_letter(text[i + 1])):
                    current_word += character

    def add_word_to_hash(self, word):
        """Record one occurrence of ``word`` in ``self.words_to_counts``.

        An exact match is incremented. Otherwise, if the lowercase spelling
        is already present, that entry is incremented instead. Otherwise,
        if ``word`` is itself lowercase and its capitalized spelling is
        present, the capitalized entry is replaced by a lowercase entry
        holding the combined count. Otherwise a new entry is created.
        """
        counts = self.words_to_counts
        lowercase = word.lower()
        capitalized = word.capitalize()

        if word in counts:
            counts[word] += 1
        elif lowercase in counts:
            # A lowercase spelling was seen earlier, so it wins.
            counts[lowercase] += 1
        elif word == lowercase and capitalized in counts:
            # Only a genuinely lowercase word may absorb the capitalized
            # entry; without this check a word such as 'HELLO' would take
            # over the count stored under 'Hello'.
            counts[word] = counts.pop(capitalized) + 1
        else:
            counts[word] = 1

    def is_letter(self, character):
        """Return True if ``character`` is a single ASCII letter."""
        return character in self._LETTERS
# Self-test: each entry is (description, input string, expected counts).
test_cases = [
    ('capitalized word', 'Hello hello', {'hello': 2}),
    ('stranded punctuation', 'Hello & world', {'Hello': 1, 'world': 1}),
    ('em dash', 'Hello' + u'\u2014' + 'world', {'Hello': 1, 'world': 1}),
    ('surrounding spaces', ' hello ', {'hello': 1}),
    ('hyphens', '-world- hello-world - -- -hello-',
     {'world': 1, 'hello-world': 1, 'hello': 1}),
    ('apostrophe', 'cake\'s', {'cake\'s': 1}),
    ('ellipses', '...hello...world ... cake...',
     {'hello': 1, 'world': 1, 'cake': 1}),
    ('quotation marks', '\"Hello!\" she yelled',
     {'Hello': 1, 'she': 1, 'yelled': 1}),
    ('parentheses and hyphenated words', 'We ate (Mille-Feuille) cake.',
     {'We': 1, 'ate': 1, 'Mille-Feuille': 1, 'cake': 1}),
]

for description, test_input, should_be in test_cases:
    result = WordCloudData(test_input).words_to_counts
    # Name the failing case so a broken rule is easy to locate.
    assert result == should_be, '%s: got %r, expected %r' % (
        description, result, should_be)

# Parenthesized single-argument form prints the same text on Python 2 and 3.
print("all tests pass")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment