Last active
June 23, 2016 16:51
-
-
Save frague59/8ab2470ed133754a6327 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" | |
Search features for : | |
* :mod:`elasticsearch.elasticsearch` | |
* :mod:`haystack:haystack` | |
* :mod:`elasticstack:elasticstack` | |
:creationdate: 05/11/15 15:05 | |
:moduleauthor: François GUÉRIN <[email protected]> | |
:modulename: intrautils.search | |
""" | |
import base64 | |
import json | |
import logging | |
from copy import copy, deepcopy | |
import haystack | |
from django import forms | |
from django.conf import settings | |
from django.contrib.contenttypes.models import ContentType | |
from django.db import models as dj_models | |
from django.db.models.fields.files import FieldFile as dj_File | |
from django.utils.translation import ugettext_lazy as _ | |
from elasticsearch import NotFoundError | |
from elasticstack.backends import ConfigurableElasticBackend, ConfigurableElasticSearchEngine | |
from elasticstack.fields import FacetField | |
from elasticstack.forms import SearchForm | |
from filer.models import File as fi_File | |
from form_utils.forms import BetterForm | |
from haystack import DEFAULT_ALIAS | |
from haystack.backends import SQ | |
from haystack.constants import DJANGO_CT, DJANGO_ID | |
from haystack.fields import SearchField | |
from haystack.forms import model_choices | |
from urllib3.fields import guess_content_type | |
from utils.forms import CollapsibleFieldsetFormMixin | |
__author__ = 'fguerin' | |
logger = logging.getLogger('intrautils.search') | |
# Fallback elasticsearch mapping used when a haystack field type has no
# entry in ``TYPE_MAPPINGS`` below.
DEFAULT_TYPE_MAPPINGS = {'type': 'string', 'analyzer': 'french'}
#: Type mappings: haystack ``field_type`` name -> elasticsearch mapping fragment.
#: The ``attachment`` entry configures the elasticsearch mapper-attachments
#: plugin (content stored with positions/offsets for highlighting).
TYPE_MAPPINGS = {
    'string': {'type': 'string', 'analyzer': 'french',},
    'edge_ngram': {'type': 'string', 'analyzer': 'edgengram_analyzer'},
    'ngram': {'type': 'string', 'analyzer': 'ngram_analyzer'},
    'date': {'type': 'date'},
    'datetime': {'type': 'date'},
    'location': {'type': 'geo_point'},
    'boolean': {'type': 'boolean'},
    'float': {'type': 'float'},
    'long': {'type': 'long'},
    'integer': {'type': 'long'},
    'attachment': {'type': 'attachment',
                   'fields': {
                       'content': {
                           'copy_to': 'copy',
                           'type': 'string',
                           'term_vector': 'with_positions_offsets',
                           'store': 'yes',
                           'analyzer': 'edgengram_analyzer'},
                       'title': {'analyzer': 'french'},
                       'author': {
                           'analyzer': 'edgengram_analyzer'},
                       'content_type': {
                           'analyzer': 'edgengram_analyzer'},
                       'content_length': {
                           'store': 'yes',
                           'type': 'integer'}},
                   }
}
class ExtendedElasticsearchBackend(ConfigurableElasticBackend):
    """
    Elasticsearch backend adding ***attachment*** support on top of
    :class:`elasticstack.backends.ConfigurableElasticBackend`.
    """

    def setup(self):
        """
        Defers loading until needed.

        .. note::
            This code is a copy of
            :meth:`haystack:haystack.backends.elastisearch_backend.ElasticsearchSearchBackend.setup`,
            except that the _boost parameter has been removed.
        """
        # Get the existing mapping & cache it. We'll compare it
        # during the ``update`` & if it doesn't match, we'll put the new
        # mapping.
        try:
            self.existing_mapping = self.conn.indices.get_mapping(index=self.index_name)
        except NotFoundError:
            # Index does not exist yet; it will be created below.
            pass
        except Exception:
            if not self.silently_fail:
                raise

        unified_index = haystack.connections[self.connection_alias].get_unified_index()
        self.content_field_name, field_mapping = self.build_schema(unified_index.all_searchfields())
        current_mapping = {
            'modelresult': {
                'properties': field_mapping,
            }
        }

        if current_mapping != self.existing_mapping:
            try:
                # Make sure the index is there first (ignore 400 = already exists).
                self.conn.indices.create(index=self.index_name, body=self.DEFAULT_SETTINGS, ignore=400)
                self.conn.indices.put_mapping(index=self.index_name, doc_type='modelresult', body=current_mapping)
                self.existing_mapping = current_mapping
            except Exception:
                if not self.silently_fail:
                    raise
        self.setup_complete = True

    def extract_file_contents(self, file_obj):
        """
        Read a file-like object and return its base64-encoded content
        (as the elasticsearch ``attachment`` type expects) plus metadata.

        :param file_obj: file-like object to extract
        :returns: dict with ``contents`` and ``metadata`` keys
        """
        # BUG FIX: the original called ``base64.decode(file_obj)``, which takes
        # *two* file arguments (input, output) and always raised TypeError.
        # Attachment payloads must be base64-*encoded* content.
        contents = base64.b64encode(file_obj.read())
        metadata = {'content_length': len(contents)}
        return {'contents': contents, 'metadata': metadata}

    def build_schema(self, fields):
        """
        Merge from `haystack` and `elasticstack` `elasticsearch` backend
        `build_schema` methods.

        It provides an additional feature: custom field mappings, from the
        default TYPE_MAPPINGS dict.

        :param fields: fields to map to the backend
        :returns: tuple (content_field_name, mapping)
        """
        content_field_name = ''
        final_mapping = {
            DJANGO_CT: {'type': 'string', 'index': 'not_analyzed', 'include_in_all': False},
            DJANGO_ID: {'type': 'string', 'index': 'not_analyzed', 'include_in_all': False},
        }
        type_mappings = copy(TYPE_MAPPINGS)
        default_type_mappings = copy(DEFAULT_TYPE_MAPPINGS)
        settings.DEBUG and logger.debug(u'ExtendedElasticsearchBackend::build_schema() '
                                        u'default_type_mappings = \n%s'
                                        u'\ntype_mappings = \n%s',
                                        json.dumps(default_type_mappings, indent=2),
                                        json.dumps(type_mappings, indent=2))
        for field_name, field_class in fields.items():
            field_type = field_class.field_type
            # BUG FIX: ``copy(TYPE_MAPPINGS)`` is shallow, so the dict returned
            # by ``.get()`` was shared between every field of the same type
            # (and with the module-level constant). Mutating it below leaked
            # ``boost`` / ``analyzer`` settings across fields and across
            # calls. Deep-copy a private mapping per field instead.
            _mapping_for_field = deepcopy(type_mappings.get(field_type, default_type_mappings))
            if field_class.boost != 1.0:
                _mapping_for_field['boost'] = field_class.boost
            if field_class.document is True:
                content_field_name = field_class.index_fieldname
            # Do this last to override `text` fields.
            if _mapping_for_field['type'] == 'string' and field_class.indexed:
                if not hasattr(field_class, 'facet_for') and not field_class.field_type in ('ngram', 'edge_ngram'):
                    _mapping_for_field['analyzer'] = getattr(field_class, 'analyzer', self.DEFAULT_ANALYZER)
            final_mapping[field_class.index_fieldname] = _mapping_for_field
        settings.DEBUG and logger.debug(u'ExtendedElasticsearchBackend::build_schema() '
                                        u'mapping = \n%s',
                                        json.dumps(final_mapping, indent=2))
        return content_field_name, final_mapping

    def more_like_this(self, model_instance, additional_query_string=None, result_class=None, **kwargs):
        """
        Gives "more like this" items.

        :param model_instance: model instance
        :param additional_query_string: additional query string
        :param result_class: result class
        :param kwargs: additional kwargs
        :returns: super
        """
        return super(ExtendedElasticsearchBackend, self).more_like_this(model_instance, additional_query_string,
                                                                        result_class, **kwargs)

    def update(self, index, iterable=None, commit=True):
        # NOTE(review): ``commit`` is deliberately not forwarded here (the
        # elasticsearch backend ignores it) — confirm this was intentional.
        return super(ExtendedElasticsearchBackend, self).update(index, iterable)

    def build_search_kwargs(self, query_string, sort_by=None, start_offset=0, end_offset=None, fields='',
                            highlight=False, facets=None, date_facets=None, query_facets=None, narrow_queries=None,
                            spelling_query=None, within=None, dwithin=None, distance_point=None, models=None,
                            limit_to_registered_models=None, result_class=None):
        # Plain pass-through kept for explicitness of the backend interface.
        return super(ExtendedElasticsearchBackend, self).build_search_kwargs(query_string, sort_by, start_offset,
                                                                             end_offset, fields,
                                                                             highlight, facets, date_facets,
                                                                             query_facets, narrow_queries,
                                                                             spelling_query, within, dwithin,
                                                                             distance_point, models,
                                                                             limit_to_registered_models, result_class)
class ExtendedElasticSearchEngine(ConfigurableElasticSearchEngine):
    """Search engine wiring the attachment-aware backend into haystack."""
    backend = ExtendedElasticsearchBackend
class AttachmentField(SearchField):
    """
    Mapping for an `AttachmentField` (elasticsearch ``attachment`` type).
    """
    field_type = 'attachment'
    # Historical kwarg name for the author value — see ``__init__``.
    author_field_name = 'user_author'
    author = None

    def __init__(self, **kwargs):
        """
        :param content_type_field: (optional) name of the field providing the content type
        :param author: (optional) author value attached to the indexed document
        """
        if 'content_type_field' in kwargs:
            self.content_type_field = kwargs.pop('content_type_field')
        # BUG FIX: the original tested for the 'author' key but popped
        # ``self.author_field_name`` ('user_author'), so passing ``author=...``
        # always raised KeyError. Accept either keyword.
        if 'author' in kwargs:
            self.author = kwargs.pop('author')
        elif self.author_field_name in kwargs:
            self.author = kwargs.pop(self.author_field_name)
        super(AttachmentField, self).__init__(**kwargs)

    def convert(self, value):
        """
        Convert an attachment file to serializable data.

        :param value: value to convert
        :returns: converted data (currently returned unchanged)
        """
        output = value
        return output

    @staticmethod
    def _get_file_data(field):
        """
        Build the attachment payload (base64 content + metadata) for either a
        filer ``File`` or a django ``FieldFile``.

        :param field: file wrapper to read
        :returns: dict with ``_language``, ``_content``, ``_content_type``,
            ``_name``, ``_title`` and ``_content_length`` keys
        """
        if isinstance(field, fi_File):
            field_file = field.file
            title = name = field.label
            content_type = guess_content_type(name)
            # ``read()`` when available, else assume raw bytes were given.
            try:
                content = base64.b64encode(field_file.read())
            except AttributeError:
                content = base64.b64encode(field_file)
            try:
                content_length = len(field_file)
            except TypeError:
                content_length = len(field_file.file)
        else:  # isinstance(field, dj_File)
            field_file = field
            title = name = field_file.name
            content_type = guess_content_type(name)
            try:
                content_length = len(field_file)
            except TypeError:
                content_length = len(field_file.file)
            try:
                content = base64.b64encode(field_file.read())
            except AttributeError:
                content = base64.b64encode(field_file)
        output = {'_language': 'fr',
                  '_content': content,
                  '_content_type': content_type,
                  '_name': name,
                  '_title': title,
                  '_content_length': content_length}
        return output

    def prepare(self, obj):
        """
        Prepare the attachment payload for indexing.

        :param obj: indexed object (or the file itself when ``model_attr`` is unset)
        :returns: attachment payload dict
        :raises NotImplementedError: when the value is not a supported file type
        """
        if self.model_attr:
            field = getattr(obj, self.model_attr)
        else:
            field = obj
        if not isinstance(field, (dj_File, fi_File)):
            raise NotImplementedError('AttachmentField does not implement file reading for %s file'
                                      % field.__class__.__name__)
        output = self._get_file_data(field)
        if settings.DEBUG:
            # Log a truncated copy: the base64 content can be huge.
            _output = deepcopy(output)
            _output.update({'_content': _output['_content'][:50] + '...'})
            logger.debug(u'AttachmentField::prepare() output = %s', json.dumps(_output, indent=2))
        return output
class FacetedAttachmentField(FacetField, AttachmentField):
    """
    Glue class to bind together `FacetField` and `AttachmentField`
    """
    pass
def application_model_choices(app_name, using=DEFAULT_ALIAS):
    """
    Restrict haystack's model choices to one or several applications.

    :param app_name: application label, or a list/tuple of labels
    :param using: haystack connection alias
    :returns: model choices sorted by their display label
    """
    choices = model_choices(using)
    if isinstance(app_name, (tuple, list)):
        # Recurse once per application and merge the results.
        collected = []
        for single_app in app_name:
            collected.extend(application_model_choices(single_app, using))
    else:
        collected = [choice for choice in choices if app_name in choice[0]]
    return sorted(collected, key=lambda item: item[1])
class HaystackSearchForm(CollapsibleFieldsetFormMixin, SearchForm, BetterForm):
    """
    :mod:`haystack:haystack` search form for main `searching` feature
    """
    class Meta:
        # `form_utils` fieldset layout: everything in a single "main" fieldset.
        fieldsets = (('main', {'legend': _('search'), 'fields': ('search_query', 'models', 'more_like_this')}),)

    # Name of the form field carrying the user's query string.
    search_field_name = 'search_query'
    load_all = True
    #: can be a single application or a list of applications
    search_app = None
    #: global search field
    search_query = forms.CharField(label=_('Search'), required=False, max_length=255,
                                   help_text=_('You can use the wildcard * to search for words fragments, '
                                               'by example "comm*" will search for words starting by "comm". '
                                               'You can also write more than a word, each word will be searched.'))
    # more_like_this = forms.BooleanField(label=_('More like this'), required=False)

    def get_search_apps(self):
        # Return the configured application restriction(s), or None when unset.
        if self.search_app:
            return self.search_app
        return None

    def get_models(self):
        """
        Return an alphabetical list of model classes in the index.
        """
        search_models = []
        if self.is_valid():
            for model in self.cleaned_data['models']:
                # noinspection PyUnresolvedReferences
                # NOTE(review): ``django.db.models.get_model`` was removed in
                # Django 1.9 — this code targets an older Django; confirm.
                search_models.append(dj_models.get_model(*model.split('.')))
        return search_models

    def get_filters(self, search_query):
        """
        Build filter from a search_query

        Wildcards (``*``) at the start/end of a word add ``endswith`` /
        ``startswith`` clauses OR-ed with the base ``contains`` clause.
        Multiple words are AND-ed together.

        :param search_query: search query
        :returns: built filters (haystack ``SQ`` tree)
        """
        searched = search_query.strip('*')
        if ' ' in searched:
            # Multi-word query: AND together one sub-filter per word.
            filters = SQ()
            search_list = search_query.split(' ')
            for item in search_list:
                sub_filters = SQ(text__contains=item.strip('*'))
                if item.startswith('*'):
                    sub_filters |= SQ(text__endswith=item.strip('*'))
                if item.endswith('*'):
                    sub_filters |= SQ(text__startswith=item.strip('*'))
                filters &= sub_filters
        else:
            # Single-word query.
            filters = SQ(text__contains=searched)
            if search_query.startswith('*'):
                filters |= SQ(text__endswith=searched)
            if search_query.endswith('*'):
                filters |= SQ(text__startswith=searched)
        settings.DEBUG and logger.debug(u'HaystackSearchForm::get_filters(%s) filters = %s', search_query, filters)
        return filters

    @staticmethod
    def get_fields():
        """
        Gets the fields for the search

        :returns: list of fields
        """
        fields = ['document_file.content', 'text', 'content', 'title', ]
        settings.DEBUG and logger.debug(u'HaystackSearchForm::get_fields() fields = %s', fields)
        return fields

    def search(self):
        """
        Run the haystack search for the submitted query, optionally restricted
        to the selected models or to ``search_app`` application(s).

        :returns: a haystack ``SearchQuerySet`` (or ``[]`` when no query)
        """
        if not self.is_valid():
            return self.no_query_found()
        if not self.cleaned_data.get(self.search_field_name):
            return self.no_query_found()
        search_apps = self.get_search_apps()
        search_query = self.cleaned_data.get(self.search_field_name, None)
        search_models = self.get_models()
        # NOTE(review): 'more_liks_this' is a local-only typo for
        # "more_like_this"; harmless but worth renaming.
        more_liks_this = self.cleaned_data.get('more_like_this', False)
        filters = self.get_filters(search_query)
        if search_models:
            # Restrict to the explicitly selected models (OR-ed together).
            sub_filters = None
            for model in search_models:
                model_ct = ContentType.objects.get_for_model(model)
                _filter = SQ(django_ct__iexact='%s.%s' % (model_ct.app_label, model_ct.model))
                sub_filters = (sub_filters | _filter) if sub_filters else _filter
            filters = filters & sub_filters if filters else sub_filters
        else:
            # No model selection: fall back to the application restriction.
            # NOTE(review): ``basestring`` is Python 2 only.
            if isinstance(search_apps, basestring):
                filters &= SQ(django_ct__startswith=search_apps)
            elif isinstance(search_apps, (tuple, list)):
                sub_filters = None
                for search_app in search_apps:
                    _filter = SQ(django_ct__startswith=search_app)
                    if sub_filters:
                        sub_filters |= _filter
                    else:
                        sub_filters = _filter
                if sub_filters:
                    filters &= sub_filters
        search_query_set = self.searchqueryset.filter(filters)
        settings.DEBUG and logger.debug(u'HaystackSearchForm::search() '
                                        u'search_query_set.query = %s (%d)', search_query_set.query,
                                        len(search_query_set))
        # Search for data
        # NOTE(review): this calls ``load_all()`` only when the result set is
        # *empty* — the condition looks inverted; confirm intent.
        if not search_query_set:
            search_query_set = search_query_set.load_all()
        # Search for `more_liks_this` items
        if search_query and more_liks_this:
            search_query_set = search_query_set.more_like_this(search_query).load_all()
        if self.load_all:
            search_query_set = search_query_set.load_all()
        settings.DEBUG and logger.debug(u'HaystackSearchForm::search() search_query (1) = %s ', search_query_set.query)
        settings.DEBUG and logger.debug(u'HaystackSearchForm::search() len(search_query_set) = %d '
                                        u'(after models filtering)', len(search_query_set))
        return search_query_set

    @staticmethod
    def no_query_found():
        # Sentinel result for an empty/invalid query: an empty list.
        return []
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Awesome start, thanks! I'll try it soon.