Created
October 25, 2021 23:40
-
-
Save dkav9/6ebbe26e6171dd4ba44309e8e212ded5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def remove_similar_sentences(sentences, model, similarity_threshold):
    """Return the sentences that are not too similar to a later sentence.

    Every sentence is embedded with ``model`` and compared against every
    other sentence by cosine similarity. Whenever a pair scores strictly
    above ``similarity_threshold``, the earlier sentence of the pair is
    dropped, so the last member of a group of near-duplicates is the one
    that survives. The input is never modified.

    Args:
        sentences: a list of sentences from a website.
        model: the NLP model to use from the sentence transformer library;
            only its ``encode(sentences, device=..., show_progress_bar=...)``
            method is called here.
        similarity_threshold: cosine-similarity value above which two
            sentences count as duplicates of each other.

    Returns:
        A new list with the surviving sentences in their original order.
    """
    # Fewer than two sentences means there is nothing to compare.
    if len(sentences) < 2:
        return list(sentences)

    # Compute embeddings
    embeddings = model.encode(sentences, device='cpu', show_progress_bar=False)
    # Compute cosine-similarities for each sentence with each other sentence.
    # NOTE(review): `util` is presumably `sentence_transformers.util`, imported
    # at module level outside this snippet -- confirm.
    cosine_scores = util.pytorch_cos_sim(embeddings, embeddings)

    # Walk the upper triangle only: the matrix is symmetric and the diagonal
    # is each sentence compared with itself.
    count = len(cosine_scores)
    drop_indices = set()
    for i in range(count - 1):
        row = cosine_scores[i]
        for j in range(i + 1, count):
            if row[j] > similarity_threshold:
                # Only the lower index of a similar pair is dropped, so one
                # match is enough to decide the fate of sentence i.
                drop_indices.add(i)
                break

    # Build a fresh list instead of deleting by index: in-place deletion
    # shifts the remaining positions and would remove the wrong sentences.
    return [
        sentence
        for index, sentence in enumerate(sentences)
        if index not in drop_indices
    ]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment