dkav9 · October 25, 2021 23:40
diff --git a/get_rid_of_similar.py b/get_rid_of_similar.py
 def remove_similar_sentences(sentences, model, similarity_threshold):
     """Return a copy of ``sentences`` with near-duplicate entries removed.

     Every sentence is embedded with ``model`` and compared with every later
     sentence by cosine similarity. Whenever a pair scores strictly above
     ``similarity_threshold``, the earlier sentence of that pair is dropped.

     sentences: a list of sentences from a website
     model: the nlp model to use from the sentence transformer library
     similarity_threshold: pairs scoring above this value count as similar

     Returns a new list of sentences which are not too similar to each other;
     the ``sentences`` argument itself is left unmodified.
     """
     candidates = list(sentences)
     # With fewer than two sentences there is no pair to compare.
     if len(candidates) < 2:
         return candidates

     # Compute embeddings
     embeddings = model.encode(candidates, device='cpu', show_progress_bar=False)
     # Compute cosine-similarities for each sentence with each other sentence
     cosine_scores = util.pytorch_cos_sim(embeddings, embeddings)

     # Only the upper triangle (j > i) is scanned, so each pair is checked once
     # and the earlier index of a similar pair is the one marked for removal.
     count = len(cosine_scores)
     drop_indices = set()
     for i in range(count - 1):
         row = cosine_scores[i]
         if any(row[j] > similarity_threshold for j in range(i + 1, count)):
             drop_indices.add(i)

     # Filter instead of deleting in place: the indices refer to positions in
     # the original list, and deleting one element would shift every later
     # position and remove the wrong sentences.
     return [
         sentence
         for index, sentence in enumerate(candidates)
         if index not in drop_indices
     ]
	def remove_similar_sentences(sentences, model, similarity_threshold):
	    """Return a copy of ``sentences`` with near-duplicate entries removed.

	    Every sentence is embedded with ``model`` and compared with every later
	    sentence by cosine similarity. Whenever a pair scores strictly above
	    ``similarity_threshold``, the earlier sentence of that pair is dropped.

	    sentences: a list of sentences from a website
	    model: the nlp model to use from the sentence transformer library
	    similarity_threshold: pairs scoring above this value count as similar

	    Returns a new list of sentences which are not too similar to each other;
	    the ``sentences`` argument itself is left unmodified.
	    """
	    candidates = list(sentences)
	    # With fewer than two sentences there is no pair to compare.
	    if len(candidates) < 2:
	        return candidates

	    # Compute embeddings
	    embeddings = model.encode(candidates, device='cpu', show_progress_bar=False)
	    # Compute cosine-similarities for each sentence with each other sentence
	    cosine_scores = util.pytorch_cos_sim(embeddings, embeddings)

	    # Only the upper triangle (j > i) is scanned, so each pair is checked once
	    # and the earlier index of a similar pair is the one marked for removal.
	    count = len(cosine_scores)
	    drop_indices = set()
	    for i in range(count - 1):
	        row = cosine_scores[i]
	        if any(row[j] > similarity_threshold for j in range(i + 1, count)):
	            drop_indices.add(i)

	    # Filter instead of deleting in place: the indices refer to positions in
	    # the original list, and deleting one element would shift every later
	    # position and remove the wrong sentences.
	    return [
	        sentence
	        for index, sentence in enumerate(candidates)
	        if index not in drop_indices
	    ]
No results found