Last active
December 7, 2015 18:48
-
-
Save kevinrobinson/872c1e8285ccc2e5070a to your computer and use it in GitHub Desktop.
Naive dictionary approach to representing words
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# example input:
sentence = 'The quick brown fox jumped over the lazy dog.'


def tokenize(sentence):
    """Split a sentence into a list of lowercase word tokens.

    Surrounding whitespace and any trailing sentence punctuation ('.', '!',
    '?') are removed before splitting, so a sentence that does not end in
    punctuation keeps the last letter of its final word.  The text is split
    on runs of whitespace, so repeated spaces never yield empty tokens and
    an empty sentence yields an empty list.

    Punctuation inside the sentence (e.g. commas) is not removed.

    Args:
        sentence: the text to tokenize.

    Returns:
        A list of lowercased words, in their original order.
    """
    # only strip punctuation that is actually there, rather than blindly
    # dropping the last character
    trimmed = sentence.strip().rstrip('.!?')
    # return a real list (not a one-shot iterator) so callers can reuse it
    return [word.lower() for word in trimmed.split()]


# tokenize and normalize words, building the set of all vocabulary ever seen
# (a dict used as a set: the keys are the words, the values are always True)
words_set = {}
for word in tokenize(sentence):
    words_set[word] = True

# build an index for the vocabulary
# (indices follow the iteration order of words_set; on Python 3.7+ that is
# the order in which each word was first seen)
word2index = {word: index for index, word in enumerate(words_set)}
index2word = {index: word for word, index in word2index.items()}

# output (Python 3.7+; older interpreters may number the words differently):
# >>> word2index
# {'the': 0, 'quick': 1, 'brown': 2, 'fox': 3, 'jumped': 4, 'over': 5, 'lazy': 6, 'dog': 7}
# >>> index2word
# {0: 'the', 1: 'quick', 2: 'brown', 3: 'fox', 4: 'jumped', 5: 'over', 6: 'lazy', 7: 'dog'}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment