Skip to content

Instantly share code, notes, and snippets.

@Eligijus112
Created March 9, 2020 18:38
Show Gist options
  • Save Eligijus112/1e66e8df2412a170514c664fe048e65d to your computer and use it in GitHub Desktop.
Save Eligijus112/1e66e8df2412a170514c664fe048e65d to your computer and use it in GitHub Desktop.
A class to read word embeddings
import numpy as np
class Embeddings():
    """
    A class to read the word embedding file and to create the word embedding matrix.

    The embedding file is read as text, one entry per line, in the form
    ``<word> <v1> <v2> ... <vN>`` with single spaces between the fields.
    """

    def __init__(self, path, vector_dimension, encoding="utf-8"):
        """
        Store the location and the shape of the embeddings.

        Args:
            path: Path of the text file that holds the word vectors.
            vector_dimension: Number of coefficients in every word vector.
            encoding: Text encoding used to read the file. Undecodable bytes
                are dropped rather than raising.
        """
        self.path = path
        self.vector_dimension = vector_dimension
        self.encoding = encoding

    @staticmethod
    def get_coefs(word, *arr):
        """
        Turn the split fields of one line into a ``(word, vector)`` pair.

        Args:
            word: The first field of the line.
            *arr: The remaining fields; they are converted to ``float32``.

        Returns:
            A tuple of the word and a one-dimensional ``float32`` array.
        """
        return word, np.asarray(arr, dtype='float32')

    def get_embedding_index(self):
        """
        Read the embedding file into a dictionary.

        Returns:
            A dict mapping each word to its ``float32`` coefficient array.
            If a word occurs on several lines, the last occurrence wins.
        """
        embeddings_index = {}
        # The context manager guarantees that the file handle is released,
        # even when a malformed line raises during parsing.
        with open(self.path, encoding=self.encoding, errors='ignore') as handle:
            for line in handle:
                # Drop the newline (and any trailing blanks) so the last
                # field is a clean number instead of e.g. "0.5\n" or "\n".
                word, coefs = self.get_coefs(*line.rstrip().split(" "))
                embeddings_index[word] = coefs
        return embeddings_index

    def create_embedding_matrix(self, tokenizer, max_features):
        """
        A method to create the embedding matrix.

        Args:
            tokenizer: An object exposing ``word_index``, a mapping from word
                to integer row index (presumably a fitted Keras ``Tokenizer``
                -- verify against the caller).
            max_features: The largest word index that receives a row.

        Returns:
            An array of shape ``(max_features + 1, vector_dimension)``. Rows
            of words that are missing from the embedding file, or whose
            stored vector does not have ``vector_dimension`` coefficients,
            are left as zeros.
        """
        model_embed = self.get_embedding_index()
        embedding_matrix = np.zeros((max_features + 1, self.vector_dimension))
        expected_shape = (self.vector_dimension,)

        for word, index in tokenizer.word_index.items():
            # Skip (rather than stop at) out-of-range indices so the result
            # does not depend on the iteration order of ``word_index``.
            if index > max_features:
                continue

            vector = model_embed.get(word)
            # A wrong-sized vector would either raise or silently broadcast
            # over the whole row, so it is treated like a missing word.
            if vector is None or vector.shape != expected_shape:
                continue

            embedding_matrix[index] = vector

        return embedding_matrix
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment