Skip to content

Instantly share code, notes, and snippets.

@dkav9
Created October 25, 2021 23:40
Show Gist options
  • Select an option

  • Save dkav9/6ebbe26e6171dd4ba44309e8e212ded5 to your computer and use it in GitHub Desktop.

Select an option

Save dkav9/6ebbe26e6171dd4ba44309e8e212ded5 to your computer and use it in GitHub Desktop.
def remove_similar_sentences(sentences, model, similarity_threshold):
    """Return a copy of *sentences* with near-duplicate sentences removed.

    Every sentence is embedded with ``model.encode`` and compared with every
    other sentence by cosine similarity. For each pair whose score is strictly
    greater than ``similarity_threshold``, the sentence that appears earlier
    in the input is dropped and the later one is kept.

    Args:
        sentences: the sentences to filter (e.g. taken from a website). The
            input itself is never modified.
        model: the NLP model from the sentence transformer library; it must
            provide ``encode(sentences, device=..., show_progress_bar=...)``.
        similarity_threshold: cosine-similarity score above which two
            sentences are treated as duplicates of each other.

    Returns:
        A new list holding the surviving sentences in their original order.

    Note:
        Relies on ``util`` being available at module level (presumably
        ``sentence_transformers.util`` -- confirm against the file's imports).
    """
    new_sentences = list(sentences)

    # With fewer than two sentences there is nothing to compare, so skip the
    # (comparatively expensive) embedding step entirely.
    if len(new_sentences) < 2:
        return new_sentences

    # Compute embeddings
    embeddings = model.encode(new_sentences, device='cpu', show_progress_bar=False)
    # Compute cosine-similarities for each sentence with each other sentence
    cosine_scores = util.pytorch_cos_sim(embeddings, embeddings)

    # Collect the index of the earlier sentence of every similar pair. Only
    # the upper triangle (j > i) is scanned, so each pair is visited once and
    # the lower index of the pair is always ``i``.
    count = len(cosine_scores)
    del_indices = set()
    for i in range(count - 1):
        row = cosine_scores[i]
        for j in range(i + 1, count):
            if row[j] > similarity_threshold:
                del_indices.add(i)
                # One similar later sentence is enough to drop sentence i.
                break

    # Build the result by filtering rather than deleting in place: deleting by
    # index shifts the remaining elements and would remove the wrong sentences.
    return [
        sentence
        for index, sentence in enumerate(new_sentences)
        if index not in del_indices
    ]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment