Last active
May 16, 2018 06:25
-
-
Save soaxelbrooke/32318ba4fc963810ec91832fccfd1b68 to your computer and use it in GitHub Desktop.
Utility class for quickly loading word embeddings and transforming token lists into vector matrices.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas | |
import numpy | |
import csv | |
from typing import List, Optional | |
class WordVectorizer:
    """Load pre-trained word embeddings and map token lists to vector matrices.

    Tokens missing from the vocabulary map to an all-zeros vector, which is
    stored as the final row of ``self.vectors`` and reached via index ``-1``.
    """

    def __init__(self, embeddings_path: str, embedding_dim: int,
                 limit: Optional[int] = None):
        """Load embeddings from a whitespace-delimited text file.

        :param embeddings_path: path to a word2vec/fasttext-style text file
            with one ``word v1 v2 ... vN`` entry per line.
        :param embedding_dim: dimensionality N of each vector.
        :param limit: if given, read at most this many embedding rows.
        """
        with open(embeddings_path) as infile:
            # fasttext text dumps begin with a "num_words dim" metadata line
            # (2 fields), whereas a real data line has embedding_dim + 1
            # whitespace-separated fields (word + vector components).
            # Rewind unless the first line is that short metadata header.
            # NOTE: the original compared the split *list* itself to an int,
            # which was always False and silently dropped the first row.
            if len(next(infile).split(' ')) == embedding_dim + 1:
                infile.seek(0)
            # sep=r'\s+' is the non-deprecated equivalent of
            # delim_whitespace=True; QUOTE_NONE keeps quote characters that
            # appear as literal tokens from confusing the parser.
            wv_df = pandas.read_csv(infile, header=None, sep=r'\s+',
                                    names=list(range(embedding_dim + 1)),
                                    quoting=csv.QUOTE_NONE, nrows=limit)
        # Column 0 holds the word; columns 1..embedding_dim hold the vector.
        self.vectors = wv_df.drop(columns=[0]).values.astype(float)
        # Append a zero row so that index -1 acts as the "unknown token" vector.
        self.vectors = numpy.vstack([self.vectors, numpy.zeros((1, embedding_dim))])
        self.word_to_idx = {word: idx for idx, word in enumerate(wv_df[0])}

    def transform(self, tokens: List[str]) -> numpy.ndarray:
        """Transform a sentence into a (len(tokens), embedding_dim) matrix.

        Tokens absent from the embeddings become the all-zeros vector
        (index -1 selects the zero row appended in ``__init__``).
        """
        indexes = [self.word_to_idx.get(token, -1) for token in tokens]
        return self.vectors[indexes]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment