Created
March 20, 2020 21:19
-
-
Save ravishchawla/3122e5620099d64e0633d4be597b4919 to your computer and use it in GitHub Desktop.
quora_data_augmentation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fit a nearest-neighbour index over every embedding row. One extra neighbour
# is requested because a row's own vector is normally its closest match and
# must not count as one of its synonyms.
nearest_syns = NearestNeighbors(n_neighbors=total_syns + 1).fit(embeddings_matrix)

# Query rows 1 .. top_k - 1 only.
# NOTE(review): row 0 is skipped, presumably because id 0 is the padding
# token -- confirm against the tokenizer that built `embeddings_matrix`.
query_rows = embeddings_matrix[1:top_k]
neighbours_mat = nearest_syns.kneighbors(query_rows)[1]

# Map each queried row id to the ids of its nearest neighbours.
# The key is the real row index rather than the first returned neighbour:
# when several rows share an identical vector (e.g. all-zero rows), the first
# neighbour is not guaranteed to be the query row itself, which would file the
# synonyms under the wrong id. The row's own id is filtered out explicitly and
# the result is capped at `total_syns` entries.
synonyms = {
    word_id: neighbours[neighbours != word_id][:total_syns]
    for word_id, neighbours in enumerate(neighbours_mat, start=1)
}
def augment_sentence(encoded_sentence, prob=0.5):
    """Randomly replace tokens of an encoded sentence with a synonym.

    Every position is considered independently. A token is selected for
    replacement when ``random.random() > prob``, so ``prob`` acts as the
    probability of *keeping* a token (replacement is attempted with
    probability ``1 - prob``). A selected token that has no entry in the
    module-level ``synonyms`` mapping is left unchanged.

    Args:
        encoded_sentence: Mutable, index-assignable sequence of token ids.
            It is modified in place.
        prob: Keep threshold compared against ``random.random()``; with
            ``prob >= 1`` nothing is ever replaced.

    Returns:
        The same ``encoded_sentence`` object that was passed in, after the
        in-place substitutions.
    """
    for posit, token in enumerate(encoded_sentence):
        if random.random() > prob:
            # Only the lookup can legitimately miss; keep the try body to
            # that single line so unrelated KeyErrors are not swallowed.
            try:
                syns = synonyms[token]
            except KeyError:
                continue
            encoded_sentence[posit] = np.random.choice(syns)
    return encoded_sentence
# Demo call: augment one example sequence. The function substitutes tokens in
# place, so `example_test_sequence` itself is modified by this call.
augment_sentence(example_test_sequence)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment