Created
April 20, 2018 21:36
-
-
Save naranjja/788d24a012e5fd982e33b66a55a5b23e to your computer and use it in GitHub Desktop.
Simple implementation of word2vec to find related words within a dataset
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
some
stop
words
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
stop-words>=2015.2.23.1
nltk>=3.2.3
gensim>=3.4.0
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from re import sub | |
from string import ascii_letters | |
from stop_words import get_stop_words | |
from nltk.corpus import stopwords as nltk_stopwords | |
from gensim.models.word2vec import Word2Vec | |
from gensim import corpora | |
def clean_words(words, stopwords):
    """Tokenize a raw text fragment into lowercase words, dropping stopwords.

    Parameters
    ----------
    words : str
        Raw text (a sentence or fragment).
    stopwords : iterable of str
        Words to exclude from the result.

    Returns
    -------
    list of str
        Lowercased, purely alphabetic tokens with stopwords removed.
    """
    # Build a set once so membership tests are O(1) instead of O(len(stopwords)).
    stopset = set(stopwords)
    # Replace every non-ASCII-letter character with a space, then split.
    tokens = sub("[^a-zA-Z]", " ", words).lower().split()
    return [w for w in tokens if w not in stopset]
def clean(paragraph, stopwords):
    """Split *paragraph* on '.' and clean each non-empty sentence.

    Non-string input (e.g. NaN coming out of pandas) produces an empty
    list. Returns a list of token lists, one per non-empty sentence.
    """
    if not isinstance(paragraph, str):
        return []
    return [
        clean_words(chunk, stopwords)
        for chunk in paragraph.split(".")
        if len(chunk) > 0
    ]
def find_words_related_to(word):
    """Train a Word2Vec model on the CSV's "text" column and return the
    words most similar to *word* (multiplicative cosine combination).

    Parameters
    ----------
    word : str
        Query word; must occur in the training vocabulary.

    Returns
    -------
    list of (str, float)
        Most similar words with their similarity scores.
    """
    df = pd.read_csv("path/to/csv", encoding="utf-8")
    # BUG FIX: `df["text"].dropna(inplace=True)` mutated a temporary Series,
    # leaving the DataFrame unchanged. Filter the frame itself instead.
    df = df.dropna(subset=["text"])

    # Combine several stopword sources for the target language.
    stopwords = get_stop_words("target-language")
    stopwords += nltk_stopwords.words("target-language")
    # BUG FIX: ascii_letters.split() returned the whole 52-char string as one
    # token; the intent was to drop stray single-letter tokens.
    stopwords += list(ascii_letters)
    with open("./custom_stopwords.txt", "r") as f:
        stopwords += [line.strip() for line in f]

    # Strip accents/diacritics: decompose (NFKD), then round-trip via ASCII.
    df.loc[:, "text"] = df["text"].str.normalize("NFKD")
    df.loc[:, "text"] = df["text"].str.encode("ascii", errors="ignore")
    df.loc[:, "text"] = df["text"].str.decode("utf-8")

    # Flatten every document into tokenized sentences.
    documents = []
    for doc in df["text"]:
        documents += clean(doc, stopwords)

    # Dead code removed: a corpora.Dictionary and a doc-term matrix were
    # built here but never used by the Word2Vec pipeline.
    # NOTE(review): `size` is the gensim < 4 keyword (requirements pin
    # gensim>=3.4.0); gensim 4+ renamed it to `vector_size`.
    w2v = Word2Vec(documents, size=100, min_count=1, sg=1, window=20)
    return w2v.wv.most_similar_cosmul(positive=[word])
if __name__ == '__main__':
    # Example query; swap in any word expected to be in the corpus vocabulary.
    related = find_words_related_to("something")
    print(related)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment