Skip to content

Instantly share code, notes, and snippets.

@jorendorff
Last active February 5, 2017 20:02
Show Gist options
  • Save jorendorff/2d0243a11a03b9fe23d59ad4afdb4a28 to your computer and use it in GitHub Desktop.
Save jorendorff/2d0243a11a03b9fe23d59ad4afdb4a28 to your computer and use it in GitHub Desktop.

Code to search text from the command line

If you can't install Flask, use this instead. Drop it into the directory where tiny.py is, and run it.

# cmdline.py - run tiny search queries in your terminal
import tiny, sys

# Usage: python cmdline.py DIR
# DIR is handed straight to tiny.Index, which loads the index for that directory.
if len(sys.argv) < 2:
    sys.exit("usage: cmdline.py DIR")

my_index = tiny.Index(sys.argv[1])

# Simple read-eval-print loop: one query per line, one ranked result per line.
while True:
    try:
        query = input("search:> ")
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C at the prompt: finish the prompt line and leave
        # quietly instead of dumping a traceback on the user.
        print()
        break
    for doc, score in my_index.search(query):
        print("    {:7.1f} {}".format(score, doc.path))

Code to read the index

This is another snippet of code we'll use. (One of the more boring bits.)

class Index:
    """Object for querying a .tiny index.

    Attributes set by the constructor:
        dir: `pathlib.Path` of the indexed directory.
        index_file: path of `$DIR/.tiny/index.dat`, the binary hit data.
        documents: list of `Document` objects, one per row of
            `documents.csv`, in file order.
        terms: dict mapping each word to a `(start, length)` pair of ints,
            the byte range of that word's hit data inside `index_file`.
    """

    def __init__(self, dir):
        """Create an Index that reads `$DIR/.tiny`.

        Loads `documents.csv` and `terms.csv` eagerly; `index.dat` is only
        opened later, by `lookup`.

        Raises:
            OSError: if either CSV file cannot be opened.
            ValueError: if a row has the wrong number of columns or a
                numeric column is not an integer.
        """
        dir = pathlib.Path(dir)
        tiny_dir = dir / ".tiny"
        self.dir = dir
        self.index_file = tiny_dir / "index.dat"

        # `with` closes each file as soon as it has been read, and
        # newline="" lets the csv module do its own newline handling.
        with open(tiny_dir / "documents.csv", newline="") as f:
            self.documents = [
                Document(pathlib.Path(path), int(max_tf))
                for path, max_tf in csv.reader(f)
            ]

        with open(tiny_dir / "terms.csv", newline="") as f:
            self.terms = {
                word: (int(start), int(length))
                for word, start, length in csv.reader(f)
            }

    def lookup(self, word):
        """Return a list of Hits for the given word.

        Returns an empty list when the word is not in the index. Otherwise
        reads the word's byte range from `index.dat` and decodes it as a
        sequence of records, each made of two unsigned 32-bit ints
        (`doc_id`, `hit_count`) followed by `hit_count` unsigned 32-bit
        offsets, all in native byte order.

        Raises:
            OSError: if `index.dat` cannot be opened.
            struct.error: if a record header is truncated.
            ValueError: if the offset data is not a whole number of items.
        """
        entry = self.terms.get(word)
        if entry is None:
            return []

        start, length = entry
        with open(self.index_file, 'rb') as f:
            f.seek(start)
            data = f.read(length)

        read_pos = 0
        hits = []
        while read_pos < len(data):
            # Record header: (doc_id, hit_count), decoded in place.
            doc_id, hit_count = struct.unpack_from("=II", data, read_pos)
            read_pos += 8

            # NOTE(review): the 4-byte stride assumes array('I') has
            # itemsize 4 on this platform, matching struct's "=I" - confirm
            # if the index is ever read on an unusual platform.
            offsets_end = read_pos + 4 * hit_count
            offsets = array.array('I')
            offsets.frombytes(data[read_pos:offsets_end])
            read_pos = offsets_end

            hits.append(Hit(doc_id, offsets))

        # The records must fill the term's byte range exactly; overshooting
        # means the last record claimed more offsets than were stored.
        assert read_pos == len(data)
        return hits
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment