Helper classes to work with embeddings in scikit-learn

import logging
from abc import ABC, abstractmethod

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors, Word2Vec
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer

logger = logging.getLogger(__name__)

class BaseEmbedding(ABC):
    """Adapter for different embeddings.

    You must implement all methods according to your embedding model.
    """

    @abstractmethod
    def __getitem__(self, word):
        """Returns the vector for the given word."""

    @abstractmethod
    def __contains__(self, word):
        """Returns True if the word is in the vocabulary."""

    @abstractmethod
    def vector_size(self):
        """Returns the embedding vector size."""
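
# A minimal sketch of a custom adapter, assuming any word -> vector
# mapping will do: the plain dict here is a stand-in for a real
# embedding model and is reused by the usage examples further down.
class DictEmbedding(BaseEmbedding):
    """Toy adapter backed by a dict of word -> np.ndarray."""

    def __init__(self, vectors):
        self.vectors = vectors
        # Infer the dimension from any stored vector.
        self.dim = len(next(iter(vectors.values())))

    def __getitem__(self, word):
        return self.vectors[word]

    def __contains__(self, word):
        return word in self.vectors

    def vector_size(self):
        return self.dim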

class GensimEmbedding(BaseEmbedding):
    """Embeddings from Gensim."""

    def __init__(self, model: KeyedVectors = None):
        """Creates embeddings from Gensim's KeyedVectors."""
        self.model = model

    def __getitem__(self, word):
        return self.model[word]

    def __contains__(self, word):
        return word in self.model

    def vector_size(self):
        return self.model.vector_size

    def load_file(self, file_path):
        """Load embeddings from Gensim format file."""
        self.model = KeyedVectors.load(file_path)
        return self
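
# Usage sketch: load vectors previously saved with KeyedVectors.save().
# The file name below is a placeholder, not a file shipped with this gist.
def demo_gensim_embedding():
    embedding = GensimEmbedding().load_file("vectors.kv")  # placeholder path
    if "dog" in embedding:
        print(embedding["dog"][:5], embedding.vector_size())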

class Word2VecEmbedding(BaseEmbedding):
    """Embeddings from Word2Vec.

    If no pretrained model file is provided, a new model can be trained
    with train().
    """

    def __getitem__(self, word):
        return self.model[word]

    def __contains__(self, word):
        return word in self.model

    def vector_size(self):
        return self.model.vector_size

    def train(self, tokens, **params):
        """Trains Word2Vec on a pandas Series of token lists."""
        temp_model = Word2Vec(tokens.to_list(), **params)
        # Keep only the word vectors; the full model is no longer needed.
        self.model = temp_model.wv
        del temp_model
        return self

    def load_file(self, file_path, file_binary=False):
        """Loads embeddings from a word2vec format file (text or binary)."""
        self.model = KeyedVectors.load_word2vec_format(file_path, binary=file_binary)
        self.trainable = False
        return self
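
# Usage sketch: train a tiny Word2Vec model from scratch. The keyword
# arguments are forwarded to gensim's Word2Vec and assume gensim 4.x
# (vector_size rather than the older size parameter).
def demo_word2vec_training():
    sentences = pd.Series([
        ["the", "cat", "sat"],
        ["the", "dog", "barked"],
    ])
    embedding = Word2VecEmbedding().train(
        sentences, vector_size=50, window=3, min_count=1, epochs=10
    )
    print(embedding.vector_size())  # 50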

class GloveEmbedding(BaseEmbedding):
    """Embeddings from the GloVe text format."""

    def __getitem__(self, word):
        return self.model[word]

    def __contains__(self, word):
        return word in self.model

    def vector_size(self):
        return self.vector_dim

    def load_file(self, file_path):
        """Loads embeddings from a GloVe text file (one word and its vector per line)."""
        embeddings_dict = {}
        logger.info("Loading GloVe embeddings from: %s", file_path)
        with open(file_path, mode="r", encoding="utf-8") as f:
            for line in f:
                values = line.split()
                word = values[0]
                vector = np.asarray(values[1:], "float32")
                embeddings_dict[word] = vector
        # Taking the vector size from the last word read
        self.vector_dim = len(vector)
        logger.info(
            "Loaded GloVe embeddings [%d, %d]!", len(embeddings_dict), self.vector_dim
        )
        self.model = embeddings_dict
        return self
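
# Usage sketch: load vectors from the standard GloVe text distribution.
# The file name is a placeholder for e.g. glove.6B.100d.txt.
def demo_glove_embedding():
    embedding = GloveEmbedding().load_file("glove.6B.100d.txt")  # placeholder path
    print(embedding.vector_size())  # 100 for the 100d file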

class MeanEmbeddingVectorizer(BaseEstimator, TransformerMixin):
    """Vectorizer that computes a sentence embedding as the average of
    its word vectors.
    """

    def __init__(self, model: BaseEmbedding, tokenizer=None):
        self.model = model
        self.tokenizer = str.split if tokenizer is None else tokenizer

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Average the vectors of in-vocabulary tokens; a sentence with no
        # known tokens falls back to a zero vector.
        return np.array(
            [
                np.mean(
                    [
                        self.model[token]
                        for token in self.tokenizer(sentence)
                        if token in self.model
                    ]
                    or [np.zeros(self.model.vector_size())],
                    axis=0,
                )
                for sentence in X
            ]
        )
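
# Usage sketch: the vectorizer works as a feature-extraction step in a
# scikit-learn Pipeline. DictEmbedding from above keeps the example
# self-contained; GensimEmbedding or GloveEmbedding work the same way.
def demo_mean_vectorizer():
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline

    embedding = DictEmbedding({
        "good": np.array([1.0, 0.0]),
        "bad": np.array([0.0, 1.0]),
    })
    pipeline = Pipeline([
        ("vectorizer", MeanEmbeddingVectorizer(embedding)),
        ("classifier", LogisticRegression()),
    ])
    pipeline.fit(["good movie", "bad movie"], [1, 0])
    print(pipeline.predict(["good good movie"]))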

class WeightedEmbeddingVectorizer(BaseEstimator, TransformerMixin):
    """Vectorizer that computes a sentence embedding as the tf-idf
    weighted average of its word vectors.

    If a label vector (y) is passed to fit(), tf-idf is computed over
    sentences grouped by label.
    """

    def __init__(self, model: BaseEmbedding, tokenizer=None):
        self.model = model
        self.tokenizer = str.split if tokenizer is None else tokenizer

    def fit(self, X, y=None):
        self.__build_tfidf_vectorizer(X, y)
        return self

    def __build_tfidf_vectorizer(self, X, y=None):
        if y is None:
            logger.info("Tf-idf using independent sentences")
            X_new = X
        else:
            logger.info("Tf-idf using grouped sentences")
            # Concatenate all sentences sharing a label into one document,
            # so tf-idf statistics are computed per class.
            X_new = (
                pd.DataFrame({"x": X, "y": y})
                .groupby("y", as_index=False)
                .agg(" ".join)["x"]
                .values.astype(str)
            )
        self.tfidf_vectorizer = TfidfVectorizer()
        self.tfidf_vectorizer.fit(X_new)
        # get_feature_names_out() requires scikit-learn >= 1.0; older
        # versions used get_feature_names() instead.
        self.feature_names = {
            w: i for (i, w) in enumerate(self.tfidf_vectorizer.get_feature_names_out())
        }
    def transform(self, X):
        # Weighted mean: each word vector is scaled by its tf-idf score in
        # the sentence; unknown words are skipped, and a sentence with no
        # known words falls back to a zero vector.
        tfidf_feats = self.tfidf_vectorizer.transform(X)
        return np.array(
            [
                np.mean(
                    [
                        self.model[w] * tfidf_feats[sent_id, self.feature_names[w]]
                        for w in self.tokenizer(words)
                        if w in self.model and w in self.feature_names
                    ]
                    or [np.zeros(self.model.vector_size())],
                    axis=0,
                )
                for sent_id, words in enumerate(X)
            ]
        )
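
# Usage sketch: grouped tf-idf weighting. Passing y to fit() makes the
# idf statistics class-based rather than per-sentence.
def demo_weighted_vectorizer():
    embedding = DictEmbedding({
        "good": np.array([1.0, 0.0]),
        "bad": np.array([0.0, 1.0]),
        "movie": np.array([0.5, 0.5]),
    })
    X = ["good movie", "bad movie", "good good movie"]
    y = [1, 0, 1]
    vectorizer = WeightedEmbeddingVectorizer(embedding).fit(X, y)
    print(vectorizer.transform(X).shape)  # (3, 2)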