Created
May 13, 2020 12:40
-
-
Save gauravbansal98/ac3a8246e91332edf3af02d0ec25c7d2 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# convert a dictionary of clean descriptions to a list of descriptions
def to_lines(descriptions):
    """Flatten a mapping of description collections into one flat list.

    Args:
        descriptions: Mapping whose values are iterables of descriptions
            (e.g. a list of caption strings per key).

    Returns:
        A new list holding every description from every value, in the
        mapping's iteration order. An empty mapping yields an empty list.
    """
    all_desc = []
    # Only the values are needed, so iterate them directly; extend() appends
    # each item without building a throwaway list of None results.
    for desc_list in descriptions.values():
        all_desc.extend(desc_list)
    return all_desc
# fit a tokenizer given caption descriptions
def create_tokenizer(descriptions):
    """Build a tokenizer fitted on every description in ``descriptions``.

    Args:
        descriptions: Mapping of description collections; it is flattened
            with ``to_lines`` before fitting.

    Returns:
        The ``Tokenizer`` instance after ``fit_on_texts`` has been called
        on the flattened descriptions.
    """
    # NOTE(review): Tokenizer is not imported in this snippet; presumably it
    # is the Keras text Tokenizer imported elsewhere -- confirm.
    lines = to_lines(descriptions)
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer
# prepare tokenizer
# train_descriptions is expected to be defined earlier in the script.
tokenizer = create_tokenizer(train_descriptions)
# NOTE(review): the +1 presumably accounts for index 0, which the Keras
# Tokenizer never assigns to a word (word_index starts at 1) -- confirm.
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment