README.md

# cmdline.py - run tiny search queries in your terminal
import sys

import tiny

# The directory to search is the first command-line argument; it must
# already contain a `.tiny` index.
my_index = tiny.Index(sys.argv[1])

while True:
    try:
        query = input("search:> ")
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C ends the session without a traceback.
        print()
        break
    # Print one line per result: the score, then the document's path.
    for doc, score in my_index.search(query):
        print(" {:7.1f} {}".format(score, doc.path))

Code to read the index

This is another snippet of code we'll use. (One of the more boring bits.)

class Index:
    """Object for querying a .tiny index.

    The index lives in a `.tiny` directory and is made of three files:

    * `documents.csv` - one row per document: path, max_tf.
    * `terms.csv` - one row per term: word, start, length, where start and
      length give the byte range of that term's records in `index.dat`.
    * `index.dat` - binary hit records, read on demand by `lookup`.
    """

    def __init__(self, dir):
        """Create an Index that reads `$DIR/.tiny`.

        Loads `documents.csv` and `terms.csv` into memory; `index.dat` is
        only opened later, when `lookup` is called.
        """
        dir = pathlib.Path(dir)
        tiny_dir = dir / ".tiny"
        self.dir = dir
        self.index_file = tiny_dir / "index.dat"

        # The csv module expects files opened with newline=""; the `with`
        # blocks close each file as soon as it has been read.
        with open(tiny_dir / "documents.csv", newline="") as f:
            self.documents = [
                Document(pathlib.Path(path), int(max_tf))
                for path, max_tf in csv.reader(f)
            ]

        with open(tiny_dir / "terms.csv", newline="") as f:
            self.terms = {
                word: (int(start), int(length))
                for word, start, length in csv.reader(f)
            }

    def lookup(self, word):
        """Return a list of Hits for the given word.

        Returns an empty list if the word is not in the index. Raises
        `struct.error` if the term's data ends in the middle of a record
        header.
        """
        if word not in self.terms:
            return []

        start, length = self.terms[word]
        with open(self.index_file, "rb") as f:
            f.seek(start)
            data = f.read(length)

        # Each record is a header of two unsigned 32-bit ints (doc_id,
        # hit_count) followed by hit_count offsets of 4 bytes each.
        # NOTE: this assumes array type code 'I' is 4 bytes wide on this
        # platform, matching the "=II" header format.
        read_pos = 0
        hits = []
        while read_pos < len(data):
            doc_id, hit_count = struct.unpack_from("=II", data, read_pos)
            read_pos += 8
            offsets_end = read_pos + 4 * hit_count
            offsets = array.array("I")
            offsets.frombytes(data[read_pos:offsets_end])
            read_pos = offsets_end
            hits.append(Hit(doc_id, offsets))
        # Sanity check: the records must exactly fill the term's byte range.
        assert read_pos == len(data)
        return hits

jorendorff/README.md

bit.ly/pytnsearch

Download: sample.tar.bz2 or sample.zip

Code to search text from the command line

Code to read the index