Skip to content

Instantly share code, notes, and snippets.

@leeyspaul
Created July 31, 2018 23:18
Show Gist options
  • Save leeyspaul/52985629924becf93427b0c68c695d66 to your computer and use it in GitHub Desktop.
Save leeyspaul/52985629924becf93427b0c68c695d66 to your computer and use it in GitHub Desktop.
Unique Vocab Generator for Facebook bAbI Project (Snippet - Missing other parts)
def get_unique_vocab(file_name):
    """Return the set of unique tokens found in a text file.

    The whole file is read into memory and tokenised with
    ``text_to_word_sequence`` (expected to be imported elsewhere in this
    file -- presumably from ``keras.preprocessing.text``; confirm).
    Newlines, tabs, question marks, digits and full stops are passed as
    ``filters`` so they do not end up inside tokens; a single ``'.'`` token
    is then added back explicitly.

    Args:
        file_name: Path of the text file to scan.

    Returns:
        set: The distinct tokens produced by the tokenizer, plus ``'.'``.
    """
    # Decode explicitly so the result does not depend on the platform's
    # default locale encoding.
    with open(file_name, 'r', encoding='utf-8') as corpus_file:
        raw_corpus = corpus_file.read()

    # The filter string repeats digits, but as a character set it covers
    # '\n', '\t', '?', '.', and every digit 0-9 (the '0' comes from "10").
    tokenized = text_to_word_sequence(raw_corpus, filters='\n\t?123456789101112131415.')

    # '.' is filtered out above, so add it back as a vocabulary entry.
    return set(tokenized + ['.'])
# Build the vocabulary from the training file and display it.
vocab = get_unique_vocab(task_training)
print(f'Vocabulary set\n---\n {vocab}')

# One more than the number of distinct tokens.
vocab_maxlen = 1 + len(vocab)

# Each item of training_data is unpacked as a 3-tuple; the length of its
# first element is taken as the story length and the length of its second
# element as the question length.
story_maxlen = max(len(story) for story, _, _ in training_data)
question_maxlen = max(len(question) for _, question, _ in training_data)

print("The vocabulary size is: {} unique words".format(vocab_maxlen))
print("The story max length is: {}".format(story_maxlen) + " words.")
print("The question max size is: {}".format(question_maxlen))
# Fix the vocabulary order before assigning indices. Iterating a set of
# strings can yield a different order on every interpreter run (string
# hashing is randomised by default), which would make the word <-> index
# mapping irreproducible between sessions.
ordered_vocab = sorted(vocab)

# Preview each (index, word) pair. Numbering starts at 1 so that the index
# shown here is the one actually stored in word_index below.
for vocab_word in enumerate(ordered_vocab, start=1):
    print(vocab_word)

# word -> index lookup; indices start at 1, leaving 0 unused by real words
# (presumably reserved for padding -- confirm against the vectorising code).
word_index = {word: idx for idx, word in enumerate(ordered_vocab, start=1)}
word_index  # bare expression: only displays the value in a notebook/REPL

# index -> word lookup; position 0 holds the empty string so that
# index_words[word_index[w]] == w for every word w.
index_words = [''] + ordered_vocab
index_words  # bare expression: only displays the value in a notebook/REPL
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment