Skip to content

Instantly share code, notes, and snippets.

@altescy
Created January 17, 2023 03:40
Show Gist options
  • Save altescy/b6528e43b5559d7058e9b0b65a46d40a to your computer and use it in GitHub Desktop.
Save altescy/b6528e43b5559d7058e9b0b65a46d40a to your computer and use it in GitHub Desktop.
from typing import Generic, Hashable, Iterable, Mapping, Optional, Sequence, TypeVar
import numpy
from sklearn.base import TransformerMixin
# Token type accepted by the vectorizer; restricted to hashable types because
# tokens are used as keys of the ``embedder`` mapping.
T_Token = TypeVar("T_Token", bound=Hashable)
# Type of the vectorizer instance itself, so that ``fit`` can be annotated as
# returning the same (sub)class it was called on.
Self = TypeVar("Self", bound="SwemVectorizer")
class SwemVectorizer(Generic[T_Token], TransformerMixin):  # type: ignore[misc]
    """Convert token sequences into fixed-size vectors by mean-then-max pooling.

    Every token of a sequence is looked up in ``embedder``; the embeddings are
    averaged over each contiguous window of ``window_size`` tokens, and the
    element-wise maximum over all window averages is the sequence vector.

    Args:
        embedder: Mapping from a token to its embedding vector. Each vector is
            expected to have length ``embedding_dim``.
        embedding_dim: Length of the produced sequence vectors.
        window_size: Number of consecutive tokens averaged per window.
            Expected to be a positive integer; it is not validated here.
    """

    def __init__(
        self,
        embedder: Mapping[T_Token, numpy.ndarray],
        embedding_dim: int,
        window_size: int = 3,
    ) -> None:
        self.embedder = embedder
        self.embedding_dim = embedding_dim
        self.window_size = window_size

    def _compute_swem_vector(self, tokens: Sequence[T_Token]) -> numpy.ndarray:
        """Return the pooled vector of a single token sequence.

        Sequences shorter than ``window_size`` are zero-padded symmetrically
        so that at least one full window exists. An empty sequence therefore
        yields an all-zero vector.

        Args:
            tokens: Tokens to embed and pool.

        Returns:
            A 1-D array of length ``embedding_dim``.

        Raises:
            KeyError: If a token is missing from ``embedder`` (for mappings
                that raise on unknown keys).
        """
        embeddings = [self.embedder[token] for token in tokens]
        if embeddings:
            vectors = numpy.array(embeddings)
        else:
            # ``numpy.array([])`` would be 1-D and break the 2-D padding
            # below, so build an explicit (0, embedding_dim) array instead.
            vectors = numpy.zeros((0, self.embedding_dim))

        shortfall = self.window_size - len(vectors)
        if shortfall > 0:
            # Integer ceiling division: ``numpy.pad`` only accepts integral
            # pad widths, so a float from ``numpy.ceil`` would raise TypeError.
            padding_size = (shortfall + 1) // 2
            vectors = numpy.pad(
                vectors,
                ((padding_size, padding_size), (0, 0)),
                "constant",
            )

        # Start from -inf so the first window average always wins the maximum.
        output = numpy.full(self.embedding_dim, -numpy.inf)
        for offset in range(len(vectors) - self.window_size + 1):
            window = vectors[offset : offset + self.window_size]
            output = numpy.maximum(output, window.mean(0))
        return output

    def fit(
        self: Self,
        X: Iterable[Sequence[T_Token]],
        y: Optional[numpy.ndarray] = None,
    ) -> Self:
        """Do nothing and return ``self``; the vectorizer has no fitted state.

        Args:
            X: Ignored.
            y: Ignored.

        Returns:
            This instance, unchanged.
        """
        return self

    def transform(self, X: Iterable[Sequence[T_Token]]) -> numpy.ndarray:
        """Vectorize every token sequence in ``X``.

        Args:
            X: Iterable of token sequences.

        Returns:
            A 2-D array with one row of length ``embedding_dim`` per sequence;
            its shape is ``(0, embedding_dim)`` when ``X`` is empty.
        """
        rows = [self._compute_swem_vector(tokens) for tokens in X]
        if not rows:
            # Keep the result 2-D even when there is nothing to stack.
            return numpy.zeros((0, self.embedding_dim))
        return numpy.array(rows)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment