Created
August 19, 2010 17:20
-
-
Save FSX/538415 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Small search engine wrapper around a Whoosh index.

Url: http://61924.nl/posts/00038-whoosh

Example:

    >>> from searchengine import *
    >>> search = SearchEngine('./index')
    >>> search.create_index()
    >>> search.add_document('http://example.org/somedocument',
    ...     'The document title', 'The content of the document')
    >>> search.commit()
""" | |
import os, os.path, re | |
import whoosh.index | |
import whoosh.fields | |
import whoosh.qparser | |
# Matches the <script> tag that embeds a numeric gist and captures the gist
# id in group 1.  Raw strings keep the backslashes for the regex engine, and
# the dots of the host name are escaped so they only match a literal ".".
RE_GIST_JS = re.compile(
    r'<script(?: type="text/javascript")? src="'
    r'http://gist\.github\.com/([0-9]+)\.js"></script>')


def replace_gist_js(text):
    """Replace embedded gist ``<script>`` tags in *text* with plain links.

    Every match of ``RE_GIST_JS`` is swapped for a paragraph containing a
    link to the gist, labelled ``[Gist <id>]``.  Text without such a tag is
    returned unchanged.

    :param text: HTML source to process.
    :returns: the HTML with gist embeds replaced by links.
    """
    # The replacement must be a raw string: in a normal literal ``\1`` is the
    # control character ``\x01``, not a reference to the captured gist id.
    return RE_GIST_JS.sub(
        r'<p><a href="http://gist.github.com/\1">[Gist \1]</a></p>', text)
# Anything shaped like a tag: "<", a lazy run of characters other than "<",
# an optional "/", then ">".
REGEX_HTML_TAG = re.compile('<[^<]*?/?>')


def strip_html_tags(text):
    """Return *text* with every match of ``REGEX_HTML_TAG`` removed."""
    without_tags = re.sub(REGEX_HTML_TAG, '', text)
    return without_tags
try:
    _text_type = unicode  # Python 2: convert values to unicode text
except NameError:
    _text_type = str  # Python 3: ``str`` already is unicode text


class SearchEngine(object):
    """Small convenience wrapper around a Whoosh index stored on disk.

    Documents have a unique, stored ``url``, a stored ``title`` and an
    indexed (not stored) ``content`` field.  Changes made with
    ``add_document``, ``update_document`` and ``delete_document`` are
    buffered in a single index writer and become visible after ``commit``;
    ``cancel`` throws them away.  The writer is created on first use and
    dropped after ``commit``/``cancel``, so the object can be used for any
    number of change batches and search-only use never opens a writer.

    ``create_index`` or ``open_index`` must be called before any other
    operation.
    """

    schema = whoosh.fields.Schema(
        url=whoosh.fields.ID(unique=True, stored=True),
        title=whoosh.fields.TEXT(stored=True, phrase=False),
        content=whoosh.fields.TEXT())

    # Queries without an explicit field name search the ``content`` field.
    _queryparser = whoosh.qparser.QueryParser('content', schema=schema)

    def __init__(self, index_path):
        """Remember *index_path* and create the directory if it is missing."""
        self.path = index_path
        self._index = None
        self._writer = None
        try:
            os.makedirs(index_path)
        except OSError:
            # Only an already existing path is acceptable here; creating the
            # directory directly avoids the check-then-create race.
            if not os.path.exists(index_path):
                raise

    def _get_writer(self):
        """Return the pending index writer, opening a new one if needed."""
        if self._writer is None:
            self._writer = self._index.writer()
        return self._writer

    def create_index(self):
        """Create a new, empty index in ``self.path`` and open it."""
        whoosh.index.create_in(self.path, self.schema)
        self.open_index()

    def open_index(self):
        """Open the index stored in ``self.path``."""
        # Discard a writer left over from a previously opened index so it
        # does not linger with uncommitted changes.
        self.cancel()
        self._index = whoosh.index.open_dir(self.path)

    def index_exists(self):
        """Return whether ``self.path`` already contains an index."""
        return whoosh.index.exists_in(self.path)

    def add_document(self, url, title, content):
        """Queue a new document for indexing (applied on ``commit``)."""
        self._get_writer().add_document(
            url=_text_type(url),
            title=_text_type(title),
            content=_text_type(content))

    def update_document(self, url, title, content):
        """Queue a replacement of the document stored under *url*.

        ``url`` is the schema's unique field, which is what the writer's
        ``update_document`` uses to find the document to replace.
        """
        self._get_writer().update_document(
            url=_text_type(url),
            title=_text_type(title),
            content=_text_type(content))

    def delete_document(self, url):
        """Queue deletion of the document stored under *url*."""
        # Deletions go through the same writer as additions and updates so
        # that ``commit`` and ``cancel`` cover them as well.
        self._get_writer().delete_by_term('url', _text_type(url))

    def commit(self):
        """Write all queued changes to the index and optimize it."""
        self._get_writer().commit(optimize=True)
        # A committed writer is finished; the next change opens a new one.
        self._writer = None

    def cancel(self):
        """Discard all queued changes; does nothing if none are pending."""
        if self._writer is not None:
            self._writer.cancel()
            self._writer = None

    def find(self, querystring):
        """Search the index and return at most 50 results.

        :param querystring: query in the syntax of the Whoosh query parser;
            terms without a field prefix are matched against ``content``.
        :returns: the result object produced by the searcher.
        """
        query = self._queryparser.parse(_text_type(querystring))
        # NOTE(review): the searcher is deliberately not closed here because
        # the returned results presumably still read from it -- confirm
        # before adding a close().
        searcher = self._index.searcher()
        return searcher.search(query, limit=50)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment