Last active
May 29, 2021 21:35
-
-
Save eranhirs/5c9ef5de8b8731948e6ed14486058842 to your computer and use it in GitHub Desktop.
Sanitize a query string for Elasticsearch / Lucene. Based on a discussion on Stack Overflow: http://stackoverflow.com/questions/16205341/symbols-in-query-string-for-elasticsearch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
class ElasticsearchHelpers(object):
    """Helpers for building Elasticsearch / Lucene query strings.

    Based on:
    http://stackoverflow.com/questions/16205341/symbols-in-query-string-for-elasticsearch
    https://gist.github.com/bcoe/6505434
    """

    @staticmethod
    def sanitize_string(text):
        r"""Escape Lucene query syntax in ``text`` so it is matched literally.

        Three passes are applied, in order:

        1. Each of ``\ + - & | ! ( ) { } [ ] ^ ~ * ? : /`` is prefixed with a
           backslash.
        2. The upper-case whole words ``AND``, ``OR`` and ``NOT`` get every
           letter backslash-escaped (``AND`` -> ``\A\N\D``). Any whitespace
           around the word is replaced by exactly one space on each side,
           including at the edges of the string.
        3. If the number of double quotes is odd, the last one is escaped so
           that the remaining quotes pair up.

        NOTE: ``=``, ``<`` and ``>`` are not touched by this function.

        :param text: raw query string.
        :return: the sanitized query string.
        """
        # Escape special characters
        # http://lucene.apache.org/core/old_versioned_docs/versions/2_9_1/queryparsersyntax.html#Escaping Special Characters
        text = re.sub(r'([\\+\-&|!(){}\[\]^~*?:/])', r'\\\1', text)

        # AND, OR and NOT are used by lucene as logical operators. We need
        # to escape them
        for word in ('AND', 'OR', 'NOT'):
            escaped_word = ''.join('\\' + letter for letter in word)
            replacement = ' {} '.format(escaped_word)
            # A callable replacement is inserted verbatim. Passing the string
            # itself would make re.sub parse "\A", "\N", ... as template
            # escapes, which Python 3.7+ rejects with re.error.
            text = re.sub(
                r'\s*\b{}\b\s*'.format(word),
                lambda _match, _repl=replacement: _repl,
                text
            )

        # Escape odd quotes: only the very last quote is escaped, which also
        # works when the text spans several lines.
        if text.count('"') % 2 == 1:
            last_quote = text.rfind('"')
            text = text[:last_quote] + '\\"' + text[last_quote + 1:]
        return text
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment