Created
August 30, 2019 20:49
-
-
Save Lexie88rus/37dff389bbd8bfa3597ba6d77b3249f8 to your computer and use it in GitHub Desktop.
Encode words as tensors
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import torch | |
# Translate word to an index from vocabulary | |
def wordToIndex(word):
    """Return the entry stored in ``vocab`` for ``word``.

    Every word other than the ``end_of_sentence`` marker is first passed
    through ``clean_title``; the marker itself is looked up unchanged.
    NOTE(review): ``vocab`` is presumably a word -> integer index mapping
    defined elsewhere in the file -- confirm.
    """
    # The end-of-sentence marker must bypass cleaning so its key stays intact.
    key = clean_title(word) if word != end_of_sentence else word
    return vocab[key]
# Translate word to 1-hot tensor | |
def wordToTensor(word):
    """Encode a single word as a one-hot tensor of shape (1, 1, vocab_size).

    The position set to 1 along the last axis is the value returned by
    ``wordToIndex(word)``; every other entry is 0.
    """
    one_hot = torch.zeros(1, 1, vocab_size)
    position = wordToIndex(word)
    one_hot[0, 0, position] = 1
    return one_hot
# Turn a title into a <(title_length + 1) x 1 x vocab_size> tensor of one-hot vectors;
# the extra final row encodes the end-of-sentence marker
def titleToTensor(title):
    """Encode a title as a stack of one-hot rows, one per word plus a terminator.

    The title is split with ``extract_words``. The result has shape
    (number_of_words + 1, 1, vocab_size): row ``i`` is the one-hot encoding of
    the i-th word (via ``wordToIndex``), and the final row is the one-hot
    encoding of the ``end_of_sentence`` marker.
    """
    tokens = extract_words(title)
    eos_row = len(tokens)  # the row right after the last word holds the marker
    encoded = torch.zeros(eos_row + 1, 1, vocab_size)
    for row, token in enumerate(tokens):
        encoded[row, 0, wordToIndex(token)] = 1
    encoded[eos_row, 0, vocab[end_of_sentence]] = 1
    return encoded
# Turn a sequence of words from title into tensor <sequence_length x 1 x vocab_size> | |
def sequenceToTensor(sequence):
    """Encode a sequence of words as one-hot rows.

    Returns a tensor of shape (len(sequence), 1, vocab_size) in which row
    ``i`` is the one-hot encoding of ``sequence[i]`` (via ``wordToIndex``).
    Unlike ``titleToTensor``, no end-of-sentence row is appended here.
    """
    encoded = torch.zeros(len(sequence), 1, vocab_size)
    for row, token in enumerate(sequence):
        encoded[row, 0, wordToIndex(token)] = 1
    return encoded
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment