Skip to content

Instantly share code, notes, and snippets.

@MLWhiz
Created February 21, 2019 17:03
Show Gist options
  • Save MLWhiz/4199b7c5894a271d6f54895175d71713 to your computer and use it in GitHub Desktop.
Save MLWhiz/4199b7c5894a271d6f54895175d71713 to your computer and use it in GitHub Desktop.
# Stemmer instances are built once at import time; load_glove uses them as
# fallbacks when a word's surface form has no embedding.
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer

ps = PorterStemmer()
lc = LancasterStemmer()
sb = SnowballStemmer("english")
def _candidate_forms(key, lemma_dict):
    """Lazily yield the spellings of *key* to look up, in priority order.

    Being a generator matters: the stemmers and the spelling corrector are
    only invoked when every cheaper form before them has missed.
    """
    yield key
    yield key.lower()
    yield key.upper()
    yield key.capitalize()
    yield ps.stem(key)
    yield lc.stem(key)
    yield sb.stem(key)
    # .get: a word without a lemma entry just skips this fallback
    # (a None candidate never matches the index) instead of raising KeyError.
    yield lemma_dict.get(key)
    # Single-character tokens are not sent to the spelling corrector.
    if len(key) > 1:
        yield correction(key)


def load_glove(word_dict, lemma_dict,
               embedding_file='../input/embeddings/glove.840B.300d/glove.840B.300d.txt'):
    """Build an embedding matrix for the vocabulary from a GloVe text file.

    For each word the lookup tries, in order: the word itself, its lower-,
    upper- and capitalized forms, its Porter, Lancaster and Snowball stems,
    its lemma, and finally (for words longer than one character) the result
    of ``correction``. The first form found in the embedding file supplies
    the row; words with no match get a vector filled with -1.

    Args:
        word_dict: mapping from word to row index in the returned matrix.
            Assumes indices lie in ``1..len(word_dict)`` (row 0 is left as
            zeros) -- TODO confirm against the code that builds it.
        lemma_dict: mapping from word to its lemma.
        embedding_file: path of the GloVe vectors, one
            ``word v1 v2 ... v300`` entry per line, space separated.

    Returns:
        Tuple ``(embedding_matrix, nb_words)`` where ``embedding_matrix`` is
        a float32 array of shape ``(nb_words, 300)`` and ``nb_words`` is
        ``len(word_dict) + 1``.
    """
    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # Context manager closes the handle once the file has been consumed;
    # explicit UTF-8 avoids depending on the platform's default encoding.
    with open(embedding_file, encoding='utf-8') as f:
        embeddings_index = dict(get_coefs(*o.split(" ")) for o in f)

    embed_size = 300
    nb_words = len(word_dict) + 1
    embedding_matrix = np.zeros((nb_words, embed_size), dtype=np.float32)
    # Out-of-vocabulary marker: all components are -1.
    unknown_vector = np.zeros((embed_size,), dtype=np.float32) - 1.
    print(unknown_vector[:5])

    for key in tqdm(word_dict):
        row = word_dict[key]
        for candidate in _candidate_forms(key, lemma_dict):
            embedding_vector = embeddings_index.get(candidate)
            if embedding_vector is not None:
                embedding_matrix[row] = embedding_vector
                break
        else:
            # No candidate form was found in the embedding index.
            embedding_matrix[row] = unknown_vector
    return embedding_matrix, nb_words
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment