Skip to content

Instantly share code, notes, and snippets.

@MLWhiz
Created February 9, 2019 08:07
Show Gist options
  • Save MLWhiz/f1e0baf3e2bcfb7a6c40c812079ff109 to your computer and use it in GitHub Desktop.
Save MLWhiz/f1e0baf3e2bcfb7a6c40c812079ff109 to your computer and use it in GitHub Desktop.
# load the GloVe vectors in a dictionary:
def load_glove_index(embedding_file='../input/embeddings/glove.840B.300d/glove.840B.300d.txt'):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Every line is split on single ASCII spaces: the first field is the word,
    the remaining fields are parsed as float32 components and truncated to
    the first 300 values.

    Args:
        embedding_file: Path of the GloVe text file to read. Defaults to the
            glove.840B.300d location this script was written against.

    Returns:
        dict mapping each word to a 1-D float32 ``np.ndarray``.
    """
    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')[:300]

    # `with` guarantees the (very large) file is closed even if parsing fails;
    # the explicit encoding keeps non-ASCII tokens intact regardless of the
    # platform's default locale.
    with open(embedding_file, encoding='utf-8') as handle:
        # Strip only the line terminator so the last component parses cleanly.
        return dict(get_coefs(*line.rstrip("\n").split(" ")) for line in handle)
# Load the embedding lookup once at module level and report how many words it holds.
embeddings_index = load_glove_index()
print('Found %s word vectors.' % (len(embeddings_index),))
from nltk.corpus import stopwords
# English stop-word collection from NLTK; sent2vec() drops every token found in it.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be available
# locally before this line runs — confirm for the target environment.
stop_words = stopwords.words('english')
def sent2vec(s):
    """Return an L2-normalized sentence vector built from word embeddings.

    The input is converted with ``str()``, lower-cased and tokenized with
    ``word_tokenize``. Stop words and tokens that are not purely alphabetic
    are discarded; the embeddings of the remaining tokens that exist in
    ``embeddings_index`` are summed and scaled to unit Euclidean length.

    Args:
        s: The text to embed (any object; ``str(s)`` is used).

    Returns:
        A 1-D ``np.ndarray``. ``np.zeros(300)`` is returned when no token has
        an embedding, or when the summed vector has zero norm (which would
        otherwise yield NaNs from a 0/0 division).
    """
    tokens = word_tokenize(str(s).lower())

    vectors = []
    for token in tokens:
        if token in stop_words or not token.isalpha():
            continue
        # Out-of-vocabulary tokens are skipped; a single .get() avoids both a
        # double lookup and a blanket exception handler.
        vector = embeddings_index.get(token)
        if vector is not None:
            vectors.append(vector)

    if not vectors:
        return np.zeros(300)

    v = np.array(vectors).sum(axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Word vectors cancelled out exactly; avoid dividing by zero.
        return np.zeros(300)
    return v / norm
# Build the GloVe feature matrices: one sent2vec() row per cleaned text,
# with tqdm reporting progress over each split.
xtrain_glove = np.array(list(map(sent2vec, tqdm(train_df.cleaned_text.values))))
xtest_glove = np.array(list(map(sent2vec, tqdm(test_df.cleaned_text.values))))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment