Skip to content

Instantly share code, notes, and snippets.

@eranhirs
Last active May 29, 2021 21:35
Show Gist options
  • Save eranhirs/5c9ef5de8b8731948e6ed14486058842 to your computer and use it in GitHub Desktop.
Save eranhirs/5c9ef5de8b8731948e6ed14486058842 to your computer and use it in GitHub Desktop.
Sanitize a query string for Elasticsearch / Lucene. Based on a discussion on Stack Overflow: http://stackoverflow.com/questions/16205341/symbols-in-query-string-for-elasticsearch
import re
class ElasticsearchHelpers(object):
    """
    Sanitize a query string for Elasticsearch / Lucene.

    Based on:
    http://stackoverflow.com/questions/16205341/symbols-in-query-string-for-elasticsearch
    https://gist.github.com/bcoe/6505434
    """

    @staticmethod
    def sanitize_string(text):
        """Return *text* with Lucene query syntax neutralised.

        Three passes are applied, in this order:

        1. Each of the characters ``\\ + - & | ! ( ) { } [ ] ^ ~ * ? : /``
           is prefixed with a backslash.
        2. The upper-case whole words ``AND``, ``OR`` and ``NOT`` are
           rewritten with a backslash before every letter (e.g.
           ``\\A\\N\\D``). Whitespace around the word is collapsed and the
           escaped word is always surrounded by exactly one space on each
           side, even at the start or end of the string.
        3. If the text contains an odd number of double quotes, the last
           one is prefixed with a backslash. Balanced quotes are left
           untouched.

        :param text: the raw query string.
        :return: the sanitized query string.
        """
        # Escape special characters
        # http://lucene.apache.org/core/old_versioned_docs/versions/2_9_1/queryparsersyntax.html#Escaping Special Characters
        # A raw string keeps the character list free of invalid Python
        # escape sequences; re.escape() makes every character safe to place
        # inside the regex character class.
        special_chars = r'\+-&|!(){}[]^~*?:/'
        text = re.sub(
            '([{}])'.format(re.escape(special_chars)), r'\\\1', text
        )

        # AND, OR and NOT are used by lucene as logical operators. We need
        # to escape them
        for word in ('AND', 'OR', 'NOT'):
            escaped_word = ' {} '.format(
                ''.join('\\' + letter for letter in word)
            )
            # The replacement is supplied as a callable so it is inserted
            # literally. As a template string, "\A", "\N", ... are unknown
            # escapes, which re.sub() rejects with re.error on Python 3.7+.
            text = re.sub(
                r'\s*\b({})\b\s*'.format(word),
                lambda _match, literal=escaped_word: literal,
                text,
            )

        # Escape odd quotes: only the final quote is escaped.
        # rpartition() is used instead of a '.*' regex so input containing
        # newlines is handled the same way as single-line input.
        if text.count('"') % 2 == 1:
            head, _quote, tail = text.rpartition('"')
            text = head + '\\"' + tail
        return text
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment