Skip to content

Instantly share code, notes, and snippets.

@naranjja
Created April 20, 2018 21:36
Show Gist options
  • Save naranjja/788d24a012e5fd982e33b66a55a5b23e to your computer and use it in GitHub Desktop.
Save naranjja/788d24a012e5fd982e33b66a55a5b23e to your computer and use it in GitHub Desktop.
Simple implementation of word2vec to find related words within a dataset
stop-words>=2015.2.23.1
nltk>=3.2.3
gensim>=3.4.0
import pandas as pd
from re import sub
from string import ascii_letters
from stop_words import get_stop_words
from nltk.corpus import stopwords as nltk_stopwords
from gensim.models.word2vec import Word2Vec
from gensim import corpora
def clean_words(words, stopwords):
    """Tokenize a sentence into lowercase alphabetic words, minus stopwords.

    Every character outside ``a-z``/``A-Z`` is replaced with a space, the
    result is lowercased and split on whitespace, and any token present in
    *stopwords* is dropped.

    Args:
        words: Raw sentence text (a ``str``).
        stopwords: Collection of stopword strings to exclude. Tokens are
            lowercased before the comparison, so entries should be lowercase.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    # A list makes every membership test O(len(stopwords)); hash the
    # stopwords once unless the caller already handed us a set.
    if not isinstance(stopwords, (set, frozenset)):
        stopwords = frozenset(stopwords)
    tokens = sub(r"[^a-zA-Z]", " ", words).lower().split()
    return [token for token in tokens if token not in stopwords]
def clean(paragraph, stopwords):
    """Split a paragraph into sentences and tokenize each one.

    The paragraph is split on ``"."`` and every fragment is passed through
    ``clean_words``.

    Args:
        paragraph: Text to process. Anything that is not a ``str`` (for
            example a NaN cell from a DataFrame) yields an empty result.
        stopwords: Collection of stopword strings, forwarded to
            ``clean_words``.

    Returns:
        A list of token lists, one per sentence that still contains at least
        one token after cleaning.
    """
    if not isinstance(paragraph, str):
        return []

    sentences = []
    for raw_sentence in paragraph.split("."):
        tokens = clean_words(raw_sentence, stopwords)
        # Check the cleaned tokens rather than the raw fragment: fragments
        # made only of whitespace, digits, punctuation or stopwords (very
        # common after ". " or "...") would otherwise add empty sentences.
        if tokens:
            sentences.append(tokens)
    return sentences
def find_words_related_to(
    word,
    csv_path="path/to/csv",
    language="target-language",
    custom_stopwords_path="./custom_stopwords.txt",
):
    """Train a skip-gram word2vec model on a CSV's text and query it.

    Reads the ``"text"`` column of the CSV, reduces it to ASCII, splits it
    into tokenized sentences with ``clean``, trains a ``Word2Vec`` model on
    those sentences and returns the words most similar to *word*.

    Args:
        word: Query word. It must occur in the cleaned corpus.
        csv_path: Path of the UTF-8 CSV file holding a ``"text"`` column.
        language: Language identifier passed to both ``get_stop_words`` and
            the NLTK stopwords corpus.
            NOTE(review): must be a name both libraries accept - confirm.
        custom_stopwords_path: Text file with one extra stopword per line.

    Returns:
        The result of ``most_similar_cosmul(positive=[word])`` on the trained
        model's word vectors.
    """
    df = pd.read_csv(csv_path, encoding="utf-8")

    # A set gives O(1) membership tests during cleaning and de-duplicates
    # the overlapping stopword sources.
    stopwords = set(get_stop_words(language))
    stopwords.update(nltk_stopwords.words(language))
    # Iterate the string to add each single letter; ``ascii_letters.split()``
    # would add one useless 52-character "word" because it has no whitespace.
    stopwords.update(ascii_letters)
    with open(custom_stopwords_path, "r") as f:
        stopwords.update(line.strip() for line in f)

    # ``dropna(inplace=True)`` on ``df["text"]`` only touches a temporary
    # Series, so work on an explicitly filtered Series instead.
    # NFKD splits accented characters into base letter + combining mark; the
    # ASCII round-trip then discards the marks and any other non-ASCII
    # characters.
    text = (
        df["text"]
        .dropna()
        .str.normalize("NFKD")
        .str.encode("ascii", errors="ignore")
        .str.decode("utf-8")
    )

    documents = []
    for doc in text:
        documents.extend(clean(doc, stopwords))

    # The vector dimensionality is left at the library default (100, the
    # value used before): the keyword is ``size`` in gensim 3.x but
    # ``vector_size`` in gensim 4.x, so naming it breaks one or the other.
    w2v = Word2Vec(documents, min_count=1, sg=1, window=20)
    return w2v.wv.most_similar_cosmul(positive=[word])
if __name__ == "__main__":
    # Script entry point: run one example query and print the result.
    related_words = find_words_related_to("something")
    print(related_words)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment