Skip to content

Instantly share code, notes, and snippets.

@Steboss89
Created June 1, 2022 21:53
Show Gist options
  • Save Steboss89/3e86d02795b7fb63e239b53cd2909fee to your computer and use it in GitHub Desktop.
Save Steboss89/3e86d02795b7fb63e239b53cd2909fee to your computer and use it in GitHub Desktop.
Create word embeddings
# Wrap the list of documents in a single-column dataframe named "text".
df = pd.DataFrame(data=data, columns=["text"])
def get_word2vec(text):
    """Train a Word2Vec model on ``text`` and save it to ``w2v_file``.

    Parameters
    ----------
    text : list of str
        Documents to train on, e.g. ``df['text'].tolist()``. Each item is
        tokenized with ``nltk.word_tokenize`` and used as one training
        sentence.

    Returns
    -------
    The trained ``w2v.Word2Vec`` model (also written to ``w2v_file``).

    Notes
    -----
    Reads two module-level names that must be defined before the call:
    ``min_frequency_val`` (passed as ``min_count``) and ``w2v_file`` (the
    path the model is saved to).

    NOTE(review): ``w2v`` is presumably ``gensim.models.word2vec`` -- confirm
    against the file's imports. The version handling below relies on that.
    """
    import inspect

    num_workers = multiprocessing.cpu_count()
    num_features = 200  # embedding dimensionality; tune this bit
    epoch_count = 10

    tokens = [nltk.word_tokenize(sentence) for sentence in text]
    sentence_count = len(tokens)

    # gensim 4 renamed the dimensionality keyword from ``size`` to
    # ``vector_size``; pick whichever the installed constructor accepts so
    # the function runs on both major versions.
    init_params = inspect.signature(w2v.Word2Vec.__init__).parameters
    size_kwarg = "vector_size" if "vector_size" in init_params else "size"

    word2vec = w2v.Word2Vec(
        sg=1,
        seed=1,
        workers=num_workers,
        min_count=min_frequency_val,
        window=5,
        sample=0,
        **{size_kwarg: num_features},
    )

    print("Building vocab...")
    word2vec.build_vocab(tokens)

    # gensim 4 replaced ``wv.vocab`` with ``wv.key_to_index``; fall back to
    # the old attribute only when the new one is absent.
    vocab = getattr(word2vec.wv, "key_to_index", None)
    if vocab is None:
        vocab = word2vec.wv.vocab
    print("Word2Vec vocabulary length:", len(vocab))

    print("Training...")
    word2vec.train(tokens, total_examples=sentence_count, epochs=epoch_count)

    print("Saving model...")
    word2vec.save(w2v_file)
    return word2vec
# Save results in an analysis folder. exist_ok=True creates the folder only
# when it is missing, without a separate (race-prone) existence check.
save_dir = "analysis"
os.makedirs(save_dir, exist_ok=True)

# Minimal frequency value; get_word2vec reads this module-level name.
min_frequency_val = 6

# File the trained word2vec model is saved to; also read by get_word2vec.
w2v_file = os.path.join(save_dir, "word_vectors.w2v")

word2vec = get_word2vec(df['text'].tolist())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment