Created
March 9, 2020 18:38
A class to read word embeddings
import numpy as np


class Embeddings:
    """
    A class to read a word embedding file and to create the word embedding matrix.
    """

    def __init__(self, path, vector_dimension):
        self.path = path
        self.vector_dimension = vector_dimension

    @staticmethod
    def get_coefs(word, *arr):
        # Split an embedding-file line into the word and its float vector
        return word, np.asarray(arr, dtype='float32')

    def get_embedding_index(self):
        # Map every word in the embedding file to its vector
        with open(self.path, errors='ignore') as f:
            embeddings_index = dict(self.get_coefs(*line.split(" ")) for line in f)
        return embeddings_index

    def create_embedding_matrix(self, tokenizer, max_features):
        """
        A method to create the embedding matrix from a fitted tokenizer.
        """
        model_embed = self.get_embedding_index()

        # Row 0 is reserved for padding; rows 1..max_features hold the word vectors
        embedding_matrix = np.zeros((max_features + 1, self.vector_dimension))
        for word, index in tokenizer.word_index.items():
            # word_index is ordered by frequency, so indices come in ascending order
            if index > max_features:
                break
            try:
                embedding_matrix[index] = model_embed[word]
            except KeyError:
                # Words missing from the embedding file keep the zero vector
                continue

        return embedding_matrix
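A minimal usage sketch, assuming a GloVe-style text file (each line is a word followed by its space-separated float coefficients) and a Keras Tokenizer. The file name glove.6B.100d.txt and the toy corpus below are illustrative only; substitute your own embedding file and texts.

from tensorflow.keras.preprocessing.text import Tokenizer

# Toy corpus and parameters; replace with your own data.
texts = ["the cat sat on the mat", "the dog ate my homework"]
max_features = 10000      # number of vocabulary words to keep
vector_dimension = 100    # must match the embedding file dimension

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(texts)

embeddings = Embeddings("glove.6B.100d.txt", vector_dimension)
embedding_matrix = embeddings.create_embedding_matrix(tokenizer, max_features)

print(embedding_matrix.shape)  # (max_features + 1, vector_dimension)

The resulting matrix can be passed as the weights of a Keras Embedding layer, with the row index of each word matching the tokenizer's word index.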