Created
December 17, 2012 18:17
-
-
Save turicas/4320536 to your computer and use it in GitHub Desktop.
Some tests with whoosh (full-text search library written entirely in Python)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
# To bootstrap the environment: | |
# mkvirtualenv whoosh | |
# pip install whoosh | |
import os | |
from whoosh.index import create_in, open_dir | |
from whoosh.fields import Schema, TEXT, ID | |
from whoosh.qparser import QueryParser | |
class Index(object):
    '''Integrate whoosh's indexer and searcher.

    Wraps a single on-disk whoosh index and exposes helpers to add
    documents to it and to run parsed queries against one of its fields.
    '''

    def __init__(self, index_path, schema=None):
        '''Open the index stored at `index_path`, creating it if missing.

        If `index_path` already exists it is opened with `open_dir` and the
        schema stored in the index is used (the `schema` argument is then
        ignored). Otherwise the directory is created and a new index is
        built in it using `schema`.

        Raises `ValueError` if `index_path` does not exist and no `schema`
        was given; nothing is created on disk in that case.
        '''
        if os.path.exists(index_path):
            self._index = open_dir(index_path)
            self._schema = self._index.schema
            return

        if schema is None:
            raise ValueError('You need to specify a `schema` when creating'
                             ' an index')
        # `makedirs` (rather than `mkdir`) so that a nested `index_path`
        # whose parent directories do not exist yet works as well.
        os.makedirs(index_path)
        self._index = create_in(index_path, schema)
        self._schema = schema

    def add_document(self, **document):
        '''Add a document to the index and commit it right away.

        Document properties should be passed as parameters, like in:

        >>> my_index.add_document(title=u'My Title', content=u'The content')
        '''
        self.add_documents([document])

    def add_documents(self, documents):
        '''Add a list of documents (`list` of `dict`s) to the index.

        It's an optimized version of `add_document` since it calls `commit`
        only in the end.
        '''
        # Used as a context manager, the writer commits when the block
        # finishes normally and is cancelled if a document is rejected, so
        # a failure does not leave the index write lock held.
        with self._index.writer() as writer:
            for document in documents:
                writer.add_document(**document)

    def search(self, query, field):
        '''Parse `query` against `field` and return the matching results.

        `query` is parsed by a `QueryParser` that uses `field` as the
        default field and this index's schema. The value returned by the
        searcher's `search` is handed back unchanged.
        '''
        query_object = QueryParser(field, self._schema).parse(query)
        # NOTE(review): the searcher is never closed here; the returned
        # results presumably still read from it, so closing it before
        # returning would break callers -- confirm before changing.
        searcher = self._index.searcher()
        return searcher.search(query_object)
if __name__ == '__main__':
    import shutil

    # Smoke test: build a throw-away index from scratch, add documents in
    # bulk and one at a time, and check what a search on `content` finds.
    index_path = 'index-test'
    try:
        # Drop whatever a previous run left behind.
        shutil.rmtree(index_path)
    except OSError:
        pass

    my_index = Index(
        index_path,
        Schema(filename=TEXT, id=ID(stored=True), content=TEXT),
    )

    # Bulk insertion: a single commit for the three documents.
    documents = [
        {u'filename': u'a.txt', u'id': u'1', u'content': u'first document'},
        {u'filename': u'b.txt', u'id': u'2', u'content': u'2nd document'},
        {u'filename': u'c.txt', u'id': u'3', u'content': u'3rd document'},
    ]
    my_index.add_documents(documents)

    hits = my_index.search(u'first', u'content')
    assert len(hits) == 1
    assert hits[0][u'id'] == u'1'

    # Single insertion: the new document must show up in the next search.
    extra_document = {
        u'filename': u'a.txt',
        u'id': u'4',
        u'content': u'not first',
    }
    my_index.add_document(**extra_document)

    hits = my_index.search(u'first', u'content')
    assert len(hits) == 2
    found_ids = set(hit[u'id'] for hit in hits)
    assert found_ids == set([u'1', u'4'])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment