benob · March 20, 2022 11:33
diff --git a/bm25.py b/bm25.py
 from collections import defaultdict
 from math import log

 stopwords = {'le', 'la', 'du'}
 documents = ['le chat boit du lait', 'le chien aime le chat', 'la souris aime le chien']

 index = defaultdict(list)
 doc_length = []

 # Create the index
 for doc_id, text in enumerate(documents):
  count = defaultdict(int)
  words = [word for word in text.split() if word not in stopwords]
  # Count the number of occurrences of each word in the document
  for word in words:
    count[word] += 1
  for word in count:
    index[word].append((doc_id, count[word]))
  doc_length.append(len(words))
    
 # Display the index
 for word in index:
  print(word, index[word])

 # Formulas for the BM25 model
 N = len(doc_length)
 avg_doc_length = sum(doc_length) / N
 k1 = 1.2
 b = 0.75

 def idf(word):
  df = len(index[word])
  return log(1 + (N - df + 0.5) / (df + 0.5))

 def tf(word, count, doc_id):
  return (count * (k1 + 1)) / (count + k1 * (1 - b + b * doc_length[doc_id] / avg_doc_length))

 # Score documents for a simple query
 query = 'chien et chat'
 scores = defaultdict(float)

 # Compute the score of documents from each word in the query
 for word in query.split():
  for doc_id, count in index[word]:
    scores[doc_id] += idf(word) * tf(word, count, doc_id)

 # Display the sorted results by decreasing relevance
 for doc_id, score in sorted(scores.items(), key=lambda pair: -pair[1]):
  print(score, doc_id, documents[doc_id])
	from collections import defaultdict
	from math import log

	stopwords = {'le', 'la', 'du'}
	documents = ['le chat boit du lait', 'le chien aime le chat', 'la souris aime le chien']

	index = defaultdict(list)
	doc_length = []

	# Create the index
	for doc_id, text in enumerate(documents):
	count = defaultdict(int)
	words = [word for word in text.split() if word not in stopwords]
	# Count the number of occurrences of each word in the document
	for word in words:
	count[word] += 1
	for word in count:
	index[word].append((doc_id, count[word]))
	doc_length.append(len(words))

	# Display the index
	for word in index:
	print(word, index[word])

	# Formulas for the BM25 model
	N = len(doc_length)
	avg_doc_length = sum(doc_length) / N
	k1 = 1.2
	b = 0.75

	def idf(word):
	df = len(index[word])
	return log(1 + (N - df + 0.5) / (df + 0.5))

	def tf(word, count, doc_id):
	return (count * (k1 + 1)) / (count + k1 * (1 - b + b * doc_length[doc_id] / avg_doc_length))

	# Score documents for a simple query
	query = 'chien et chat'
	scores = defaultdict(float)

	# Compute the score of documents from each word in the query
	for word in query.split():
	for doc_id, count in index[word]:
	scores[doc_id] += idf(word) * tf(word, count, doc_id)

	# Display the sorted results by decreasing relevance
	for doc_id, score in sorted(scores.items(), key=lambda pair: -pair[1]):
	print(score, doc_id, documents[doc_id])