@cozek
Created September 21, 2019 19:20
sentence_to_embedding
import nltk
import numpy as np
from tqdm import tqdm


def sent_to_embedding(embedding, data, max_len=None):
    '''
    Creates a fastText embedding matrix for a given list of sentences.

    input:
        embedding: a loaded fastText model
        data: list of sentences
        max_len: maximum number of words to consider per sentence
    returns:
        emb_matrix: numpy array of shape (len(data), max_len, emb_size)
            containing word-level embeddings, zero-padded to max_len
    '''
    tknz = nltk.TweetTokenizer()

    def find_max_len(data):
        # Length, in tokens, of the longest sentence in the data
        l = 0
        for sent in data:
            sent = tknz.tokenize(sent)
            if len(sent) > l:
                l = len(sent)
        return l

    if max_len is None:
        max_len = find_max_len(data)

    # Embedding dimensionality, probed with an arbitrary word
    emb_size = embedding.get_word_vector('hi').shape[0]
    emb_matrix = np.zeros((len(data), max_len, emb_size))
    for i in tqdm(range(len(data))):
        # Tokenize and truncate each sentence to max_len words
        words = tknz.tokenize(data[i])[:max_len]
        for j in range(len(words)):
            emb_matrix[i, j, :] = embedding.get_word_vector(words[j])
    return emb_matrix
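
A minimal usage sketch, assuming a pretrained fastText binary model loaded with the official fasttext package ('model.bin' is a hypothetical path, and the sentences are illustrative):

import fasttext

# Hypothetical path; any fastText .bin model works here
model = fasttext.load_model('model.bin')

sentences = ['hello world', 'fasttext makes word vectors']
emb = sent_to_embedding(model, sentences)
print(emb.shape)  # (2, max_len, emb_size), e.g. (2, 4, 300)

Because max_len is inferred from the longest tokenized sentence when not given, sentences shorter than max_len are left zero-padded at the tail, which is convenient for feeding the matrix into fixed-length sequence models.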