Created
December 28, 2010 20:46
-
-
Save acdha/757688 to your computer and use it in GitHub Desktop.
A django-haystack "hackend" (hack backend) that sends field values through the Solr extraction handler (ExtractingRequestHandler) for rich-content indexing.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# encoding: utf-8 | |
import logging | |
from poster.encode import multipart_encode | |
from pysolr import SolrError, safe_urlencode | |
from haystack.backends.solr_backend import * | |
# For sanity: | |
from haystack.backends.solr_backend import SearchBackend as StandardSolrBackend | |
class ExtractionSearchBackend(StandardSolrBackend):
    """Solr backend that can index documents through ``/update/extract``.

    Indexes which set ``use_extraction_handler = True`` have each object
    posted individually to the Solr extraction handler; every other index is
    passed straight through to the standard Solr backend.
    """

    def update(self, index, iterable, commit=True):
        """Index every object in ``iterable`` for ``index``.

        :param index: the search index describing how objects are prepared;
            the extraction handler is only used when it has a truthy
            ``use_extraction_handler`` attribute.
        :param iterable: the objects to index.
        :param commit: when true, commit on the Solr connection after all
            objects have been posted.
        :returns: the parent backend's result for non-opted-in indexes,
            otherwise ``None``.
        :raises SolrError: propagated from :meth:`extract`; only
            ``UnicodeDecodeError`` and ``IOError`` are logged and skipped.
        """
        # opt-in to non-standard handling:
        if not getattr(index, "use_extraction_handler", False):
            # Name this class explicitly rather than relying on the
            # module-level ``SearchBackend`` alias being rebound later.
            return super(ExtractionSearchBackend, self).update(
                index, iterable, commit=commit)

        # Unlike the default backend, we want to prepare the document like
        # normal but must then post it to a completely different handler with
        # a completely different call signature - see
        # http://wiki.apache.org/solr/ExtractingRequestHandler for details.
        #
        # In practice this means posting all of the non-text fields using
        # URL parameters and passing the file contents as multipart data
        for obj in iterable:
            try:
                self.extract(obj, index=index)
            except (UnicodeDecodeError, IOError) as e:
                # Best effort: one bad object must not abort the whole batch.
                logging.error("Unable to update index for %s: %s", obj, e,
                              exc_info=e)

        if commit:
            self.conn.commit()

    def extract(self, obj, index=None):
        """Post a single object to Solr's ``/update/extract`` handler.

        The prepared fields (minus ``text``) are sent as ``literal.<name>``
        parameters and the document itself as multipart file data.

        :param obj: the object to index.
        :param index: the search index used to prepare ``obj``; although it
            defaults to ``None`` it is dereferenced immediately, so callers
            must supply it.
        :returns: ``None``. Returns early, after logging, when preparing the
            metadata raises ``UnicodeDecodeError``.
        :raises IOError: when the source file cannot be opened or the request
            fails.
        :raises SolrError: when the request to Solr fails.
        """
        params = {
            "boost": index.get_field_weights(),
        }

        try:
            metadata = index.full_prepare(obj)
        except UnicodeDecodeError as e:
            logging.error("Error preparing metadata for %r: %s", obj, e,
                          exc_info=e)
            return

        # We don't actually want to use the prepared text field here:
        metadata.pop("text", None)

        for k, v in metadata.items():
            params["literal.%s" % k] = v

        # FIXME: Actually open source URL rather than reading test data:
        # The file stays open until the request body has been built and sent,
        # and is then closed even if the request fails.
        with open("qt5nz25228.pdf", "rb") as source_file:
            params["source_file"] = source_file

            body_generator, headers = multipart_encode(params)

            try:
                self.conn._send_request('POST',
                                        "%s/update/extract" % self.conn.path,
                                        "".join(body_generator), headers)
            except (IOError, SolrError) as e:
                self.log.error("Failed to add documents to Solr: %s", e,
                               exc_info=e)
                # Bare raise keeps the original traceback intact.
                raise
# Rebind the module-level ``SearchBackend`` name (originally brought in by the
# star import from the stock Solr backend) to the extraction-aware subclass.
# NOTE(review): presumably haystack looks the backend class up under this
# name when loading the module - confirm against the haystack version in use.
SearchBackend = ExtractionSearchBackend
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment