A Simple Search Engine in Python
Download: sample.tar.bz2 or sample.zip
^ Start by clicking one of those two links!
A Simple Search Engine in Python
^ Start by clicking one of those two links!
If you can't install Flask, use this instead. Drop it into the directory where tiny.py
is, and run it.
# cmdline.py - run tiny search queries in your terminal
#
# Usage: python cmdline.py DIR
#
# DIR is handed straight to tiny.Index. Each line typed at the prompt is run
# as a query, and every (document, score) result is printed with the score
# first. Press Ctrl-D or Ctrl-C to leave the prompt.
import sys

import tiny

# Fail with a usage line rather than an IndexError traceback when the
# directory argument is missing. Extra arguments are still ignored.
if len(sys.argv) < 2:
    sys.exit("usage: cmdline.py DIR")

my_index = tiny.Index(sys.argv[1])
while True:
    try:
        query = input("search:> ")
    except (EOFError, KeyboardInterrupt):
        # End of input or an interrupt finishes the session quietly; emit a
        # newline so the shell prompt does not land on the "search:>" line.
        print()
        break
    for doc, score in my_index.search(query):
        print(" {:7.1f} {}".format(score, doc.path))
This is another snippet of code we'll use. (One of the more boring bits.)
class Index:
    """Object for querying a .tiny index.

    Attributes:
        dir: The indexed directory, as a `pathlib.Path`.
        index_file: Path of `.tiny/index.dat`, the binary file `lookup` reads.
        documents: List of `Document` objects, one per row of
            `.tiny/documents.csv`, in file order.
        terms: Dict mapping each word in `.tiny/terms.csv` to a
            `(start, length)` pair: the byte range inside `index_file` that
            holds that word's hit records.
    """

    def __init__(self, dir):
        """Create an Index that reads `$DIR/.tiny`.

        Loads `documents.csv` (rows of `path, max_tf`) and `terms.csv` (rows
        of `word, start, length`) into memory. `index.dat` is not opened
        here; it is read on demand by `lookup`.

        Args:
            dir: Directory containing the `.tiny` subdirectory (str or path).

        Raises:
            OSError: If either CSV file cannot be opened.
            ValueError: If a row has the wrong number of columns or a numeric
                column is not an integer.
        """
        dir = pathlib.Path(dir)
        tiny_dir = dir / ".tiny"
        self.dir = dir
        self.index_file = tiny_dir / "index.dat"

        # `with` closes each CSV file promptly; newline="" is what the csv
        # module asks for so that it can do its own line-ending handling.
        self.documents = []
        with open(tiny_dir / "documents.csv", newline="") as f:
            for [line, max_tf] in csv.reader(f):
                self.documents.append(Document(pathlib.Path(line), int(max_tf)))

        self.terms = {}
        with open(tiny_dir / "terms.csv", newline="") as f:
            for word, start, length in csv.reader(f):
                self.terms[word] = (int(start), int(length))

    def lookup(self, word):
        """Return a list of Hits for the given word.

        Reads the word's byte range from `index_file` and decodes it as a
        sequence of records. Each record is two native-byte-order 32-bit
        unsigned ints (`doc_id`, `hit_count`) followed by `hit_count` 4-byte
        unsigned offsets.

        Args:
            word: The term to look up, exactly as it appears in `terms.csv`.

        Returns:
            A list of `Hit(doc_id, offsets)` objects, where `offsets` is an
            `array.array('I')`. The list is empty if the word is not indexed.

        Raises:
            struct.error: If a record header is truncated.
            AssertionError: If the records do not end exactly at the end of
                the byte range.
        """
        if word not in self.terms:
            return []

        start, length = self.terms[word]
        with open(self.index_file, 'rb') as f:
            f.seek(start)
            data = f.read(length)

        read_pos = 0
        hits = []
        while read_pos < len(data):
            # Record header: document id and number of offsets that follow.
            doc_id, hit_count = struct.unpack_from("=II", data, read_pos)
            read_pos += 8

            # NOTE(review): array typecode 'I' is the platform's C unsigned
            # int; the 4-byte stride below assumes it is 4 bytes wide.
            offset_bytes = data[read_pos:read_pos + 4 * hit_count]
            read_pos += 4 * hit_count
            offsets = array.array('I')
            offsets.frombytes(offset_bytes)
            hits.append(Hit(doc_id, offsets))

        # Sanity check: the last record must end exactly at the range's end.
        assert read_pos == len(data)
        return hits
No peeking, but... github.com/jorendorff/tinysearch