vwood · December 9, 2011 05:02 · vwood · Dec 9, 2011
diff --git a/compact-search-engine.py b/compact-search-engine.py
 #!/usr/bin/env python
 # Short indexer and search engine, 2011 Vaughn Wood
 # Written in python 3
 # Assumes the collection is a newline separated file called index on the current path
 # Only uses TF (so perform queries accordingly)

 from functools import *

 def dict_join_add_values(d, e):
 	"""Merge the integer-valued dictionary e into d by summing values.

 	Every key of e ends up in d: a key d already holds gets the sum of both
 	values, a key new to d gets e's value. d is modified in place and is
 	also the return value; e is only read.
 	"""
 	summed = {}
 	for key, amount in e.items():
 		summed[key] = amount + d.get(key, 0)
 	# All sums are computed before d is touched, so a failing addition
 	# leaves d exactly as it was.
 	d.update(summed)
 	return d

 def make_tf_list(doc, docno):
 	"""Build the single-document postings for every term in doc.

 	doc is an iterable of terms (words) and docno is the identifier the
 	counts are filed under. The result maps each distinct term to a
 	one-entry dictionary {docno: number of occurrences in doc}, with terms
 	in order of first appearance. An empty document gives an empty
 	dictionary.
 	"""
 	# Imported here so the function does not depend on the file's import lines.
 	from collections import Counter
 	return {term: {docno: count} for term, count in Counter(doc).items()}

 def dict_join_documents(d, e):
 	"""Merge the postings dictionary e into d, unioning per-term postings.

 	Both arguments map a term to a dictionary of postings
 	({document id: term frequency}). After the call every term of e is
 	present in d with the union of both sides' postings. When both sides
 	hold the same term and document id, the entry already in d is kept.

 	d is updated in place and returned. e and its postings dictionaries are
 	left unmodified, and d never shares a postings dictionary with e.
 	"""
 	merged = {}
 	for term, postings in e.items():
 		# Work on a copy: updating e's postings in place would change the
 		# caller's data and leave the same object aliased inside d.
 		combined = dict(postings)
 		# d's entries are applied last, so they win on a document-id clash;
 		# e's document ids stay first in iteration order.
 		combined.update(d.get(term, {}))
 		merged[term] = combined
 	d.update(merged)
 	return d

 def index(docs):
 	"""Build the inverted index for a collection of tokenised documents.

 	docs is an iterable of documents, each an iterable of words. A
 	document's id is its position in docs, counted from zero. Returns a
 	dictionary mapping every term to its postings
 	({document id: term frequency}).
 	"""
 	inverted = {}
 	for docno, words in enumerate(docs):
 		# Fold each document's postings into the index built so far.
 		inverted = dict_join_documents(inverted, make_tf_list(words, docno))
 	return inverted

 def search(terms, index):
 	"""Rank the documents that contain any of the query terms.

 	terms is an iterable of query terms and index maps a term to its
 	postings ({document id: term frequency}); terms missing from the index
 	contribute nothing. A document's score is the sum of the frequencies of
 	the query terms it contains. Returns an iterator of
 	(document id, score) pairs, highest score first.
 	"""
 	scores = {}
 	for term in terms:
 		scores = dict_join_add_values(scores, index.get(term, {}))
 	ascending = sorted(scores.items(), key=lambda entry: entry[1])
 	# Reversing an ascending sort lists tied documents in reverse insertion
 	# order, which is not what sorted(..., reverse=True) would produce.
 	return reversed(ascending)

 # Build the index once at start-up: one document per line of the file "index"
 # in the current directory. The with-block closes the file once it is read.
 with open("index") as collection:
 	idx = index([d.split() for d in collection.read().split('\n')])
 # Interactive loop: read a query and print "docno<TAB>score" for every
 # matching document, best score first.
 while True:
 	try:
 		query = input('> ')
 	except EOFError:
 		# End of input (Ctrl-D or an exhausted pipe) ends the session
 		# instead of surfacing a traceback.
 		break
 	for docno, score in search(query.split(), idx):
 		print("%s\t%s"%(docno,score))
	#!/usr/bin/env python
	# Short indexer and search engine, 2011 Vaughn Wood
	# Written in python 3
	# Assumes the collection is a newline separated file called index on the current path
	# Only uses TF (so perform queries accordingly)

	from functools import *

	def dict_join_add_values(d, e):
		"""Merge the integer-valued dictionary e into d by summing values.

		Every key of e ends up in d: a key d already holds gets the sum of
		both values, a key new to d gets e's value. d is modified in place
		and is also the return value; e is only read.
		"""
		summed = {}
		for key, amount in e.items():
			summed[key] = amount + d.get(key, 0)
		# All sums are computed before d is touched, so a failing addition
		# leaves d exactly as it was.
		d.update(summed)
		return d

	def make_tf_list(doc, docno):
		"""Build the single-document postings for every term in doc.

		doc is an iterable of terms (words) and docno is the identifier the
		counts are filed under. The result maps each distinct term to a
		one-entry dictionary {docno: number of occurrences in doc}, with
		terms in order of first appearance. An empty document gives an
		empty dictionary.
		"""
		# Imported here so the function does not depend on the file's import lines.
		from collections import Counter
		return {term: {docno: count} for term, count in Counter(doc).items()}

	def dict_join_documents(d, e):
		"""Merge the postings dictionary e into d, unioning per-term postings.

		Both arguments map a term to a dictionary of postings
		({document id: term frequency}). After the call every term of e is
		present in d with the union of both sides' postings. When both
		sides hold the same term and document id, the entry already in d is
		kept.

		d is updated in place and returned. e and its postings dictionaries
		are left unmodified, and d never shares a postings dictionary with e.
		"""
		merged = {}
		for term, postings in e.items():
			# Work on a copy: updating e's postings in place would change
			# the caller's data and leave the same object aliased inside d.
			combined = dict(postings)
			# d's entries are applied last, so they win on a document-id
			# clash; e's document ids stay first in iteration order.
			combined.update(d.get(term, {}))
			merged[term] = combined
		d.update(merged)
		return d

	def index(docs):
		"""Build the inverted index for a collection of tokenised documents.

		docs is an iterable of documents, each an iterable of words. A
		document's id is its position in docs, counted from zero. Returns a
		dictionary mapping every term to its postings
		({document id: term frequency}).
		"""
		inverted = {}
		for docno, words in enumerate(docs):
			# Fold each document's postings into the index built so far.
			inverted = dict_join_documents(inverted, make_tf_list(words, docno))
		return inverted

	def search(terms, index):
		"""Rank the documents that contain any of the query terms.

		terms is an iterable of query terms and index maps a term to its
		postings ({document id: term frequency}); terms missing from the
		index contribute nothing. A document's score is the sum of the
		frequencies of the query terms it contains. Returns an iterator of
		(document id, score) pairs, highest score first.
		"""
		scores = {}
		for term in terms:
			scores = dict_join_add_values(scores, index.get(term, {}))
		ascending = sorted(scores.items(), key=lambda entry: entry[1])
		# Reversing an ascending sort lists tied documents in reverse
		# insertion order, which is not what sorted(..., reverse=True)
		# would produce.
		return reversed(ascending)

	# Build the index once at start-up: one document per line of the file
	# "index" in the current directory. The with-block closes the file once
	# it is read.
	with open("index") as collection:
		idx = index([d.split() for d in collection.read().split('\n')])
	# Interactive loop: read a query and print "docno<TAB>score" for every
	# matching document, best score first.
	while True:
		try:
			query = input('> ')
		except EOFError:
			# End of input (Ctrl-D or an exhausted pipe) ends the session
			# instead of surfacing a traceback.
			break
		for docno, score in search(query.split(), idx):
			print("%s\t%s"%(docno,score))