Created
October 7, 2014 22:43
-
-
Save gameguy43/a88a22f9cd393ff71998 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class WordCloudData:
    """Count how often each word occurs in a piece of text.

    The counts are built once, at construction time, and exposed through
    ``words_to_counts`` (a dict mapping each word to its number of
    occurrences).

    Tokenising rules:
      * a word is a run of letters and apostrophes;
      * a hyphen is kept only when it sits directly between two letters
        ("hello-world"); any other hyphen ends the current word, so an ASCII
        "--" dash separates words just like an em dash does;
      * whitespace (spaces, tabs, newlines, ...), an em dash, and a period
        that is immediately followed by another period (an ellipsis) end the
        current word;
      * every other character (digits, quotes, a lone period, ...) is
        skipped without ending the word.

    Case rule: a spelling containing capitals is kept as its own key only
    while the all-lowercase spelling has never been seen.  As soon as the
    lowercase spelling shows up, every differently-cased spelling is folded
    into it, whatever order the spellings arrived in.
    """

    def __init__(self, input_string):
        """Store ``input_string`` and immediately count its words."""
        self.input_string = input_string
        self.words_to_counts = {}
        # lowercase spelling -> capitalised keys currently in words_to_counts.
        # Lets add_word_to_hash fold them together without scanning the dict.
        self._cased_spellings = {}
        self.populate_hash()

    def populate_hash(self):
        """Split ``input_string`` into words and record each one."""
        text = self.input_string
        last_index = len(text) - 1
        letters = []  # characters of the word currently being assembled

        for i, character in enumerate(text):
            if self.is_letter(character) or character == "'":
                letters.append(character)
                continue

            if (character == '-' and 0 < i < last_index
                    and self.is_letter(text[i - 1])
                    and self.is_letter(text[i + 1])):
                # hyphen joining two letters: part of a compound word
                letters.append(character)
                continue

            ends_word = (
                character == '-'
                or character.isspace()
                or character == u'\u2014'
                # slicing keeps the look-ahead safe on the final character
                or (character == '.' and text[i + 1:i + 2] == '.')
            )
            if ends_word and letters:
                self.add_word_to_hash(''.join(letters))
                letters = []

        # flush the word that runs up to the end of the input, if any
        if letters:
            self.add_word_to_hash(''.join(letters))

    def add_word_to_hash(self, word):
        """Record one occurrence of ``word``, applying the case rule."""
        counts = self.words_to_counts

        # exact spelling already tracked
        if word in counts:
            counts[word] += 1
            return

        # the lowercase spelling is tracked, so this capitalised occurrence
        # is counted under it
        lowered = word.lower()
        if lowered in counts:
            counts[lowered] += 1
            return

        if word != lowered:
            # new capitalised spelling; remember it so that it can be folded
            # in if the lowercase spelling appears later
            counts[word] = 1
            self._cased_spellings.setdefault(lowered, []).append(word)
            return

        # first lowercase sighting: absorb every capitalised spelling seen so
        # far, because capitalised keys survive only if always capitalised
        total = 1
        for spelling in self._cased_spellings.pop(lowered, ()):
            total += counts.pop(spelling, 0)
        counts[word] = total

    def is_letter(self, character):
        """Return True if ``character`` is exactly one alphabetic character."""
        return len(character) == 1 and character.isalpha()
# Smoke tests for WordCloudData: each case pairs an input string with the
# exact word -> count mapping the class is expected to produce for it.
test_cases = [
    ('Hello hello', {'hello': 2}),  # capitalized word
    ('Hello & world', {'Hello': 1, 'world': 1}),  # stranded punctuation
    ('Hello' + u'\u2014' + 'world', {'Hello': 1, 'world': 1}),  # em dash
    (' hello ', {'hello': 1}),  # surrounding spaces
    ('-world- hello-world - -- -hello-',
     {'world': 1, 'hello-world': 1, 'hello': 1}),  # hyphens
    ('cake\'s', {'cake\'s': 1}),  # apostrophe
    ('...hello...world ... cake...',
     {'hello': 1, 'world': 1, 'cake': 1}),  # ellipses
    ('\"Hello!\" she yelled',
     {'Hello': 1, 'she': 1, 'yelled': 1}),  # quotation marks
    ('We ate (Mille-Feuille) cake.',
     {'We': 1, 'ate': 1, 'Mille-Feuille': 1, 'cake': 1}),  # parentheses and hyphenated words
]

for test_input, should_be in test_cases:
    result = WordCloudData(test_input).words_to_counts
    # the message identifies which case failed instead of a bare AssertionError
    assert result == should_be, (
        'input %r: expected %r, got %r' % (test_input, should_be, result))

# parenthesised so the line is valid on both Python 2 and Python 3
print("all tests pass")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment