Skip to content

Instantly share code, notes, and snippets.

@timotta
Created July 29, 2021 12:42
Show Gist options
  • Save timotta/ceaec141cbb0b18740b5c48071d989ae to your computer and use it in GitHub Desktop.
Save timotta/ceaec141cbb0b18740b5c48071d989ae to your computer and use it in GitHub Desktop.
Gensim word2vec indexed words in a dataframe, dealing with padding and unknown values
# Needs EMBED_SIZE, SEQUENCE_SIZE and W2V_EPOCHS to be defined beforehand
# df.text_array is a column with a list of words in each cell
# Build the gensim Word2Vec model from the token lists stored in df.text_array.
w2v_model = Word2Vec(
sentences=df.text_array,
vector_size=EMBED_SIZE,
window=5,
# min_count=1: presumably chosen so that rare words are not dropped from the
# vocabulary -- confirm against the gensim documentation.
min_count=1,
# NOTE(review): gensim's docs state that a seed alone only gives reproducible
# runs with a single worker thread -- confirm whether determinism matters here.
workers=4,
seed=1982,
epochs=W2V_EPOCHS,
)
# Register all-zero vectors for the two special tokens used by the padding and
# index-lookup helpers defined further down in this file.
# NOTE(review): "<UNK>" is added before "<PAD>"; their indices presumably follow
# insertion order, so keep this order stable if the indices are ever persisted.
w2v_model.wv.add_vector("<UNK>", np.zeros(EMBED_SIZE))
w2v_model.wv.add_vector("<PAD>", np.zeros(EMBED_SIZE))
# Report the number of entries in the key -> index mapping after the additions.
print(f"vocabulary: {len(w2v_model.wv.key_to_index)}")
def pad(size, pad_token="<PAD>"):
    """Build a function that forces a token sequence to exactly ``size`` items.

    Args:
        size: Target length of every returned token list.
        pad_token: Token inserted in front of sequences that are too short.
            Defaults to ``"<PAD>"``.

    Returns:
        A one-argument callable suitable for ``Series.apply``. Given a
        sequence of tokens it returns a new list that is left-padded with
        ``pad_token`` when the input is shorter than ``size`` and cut down to
        its first ``size`` items otherwise.
    """

    def _pad(text_array):
        # Copy into a list so tuples, arrays and other sequences can be
        # concatenated with the padding; the caller's object is never mutated.
        tokens = list(text_array)
        missing = size - len(tokens)
        if missing > 0:
            # Padding goes in front, so the real tokens stay at the end.
            return [pad_token] * missing + tokens
        return tokens[:size]

    return _pad
def word_to_index_array(w2v_model):
    """Build a function that maps a list of tokens to vocabulary indices.

    Args:
        w2v_model: Object exposing ``wv.key_to_index``, a mapping from token
            to integer index.

    Returns:
        A one-argument callable suitable for ``Series.apply``. For each token
        it returns the token's index, or the index stored under ``"<UNK>"``
        when the token is not in the mapping (``None`` if ``"<UNK>"`` itself
        is missing).
    """

    def _w(text_array):
        # Resolve the mapping and the fallback once per call, not per word.
        key_to_index = w2v_model.wv.key_to_index
        unk_index = key_to_index.get("<UNK>")
        # Fall back only when the key is absent: index 0 is a valid position
        # and must not be mistaken for "not found" by a truthiness test.
        return [key_to_index.get(word, unk_index) for word in text_array]

    return _w
# Bring every token list to exactly SEQUENCE_SIZE entries (short lists are
# padded in front with "<PAD>", long ones are truncated).
df["text_array_padded"] = df["text_array"].apply(pad(size=SEQUENCE_SIZE))
# Replace each token of the padded lists with its index in the model vocabulary.
df["text_array_indexes"] = df["text_array_padded"].apply(
    word_to_index_array(w2v_model)
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment