Created
December 11, 2022 13:51
-
-
Save premchalmeti/d2e4ab72f2bde462a113d6d8327d88b3 to your computer and use it in GitHub Desktop.
Elasticsearch: custom highlighter implementation for proximity search using a slop query
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Proximity-highlighter demo.
#
# Walks the document one space-separated token at a time. Each time every
# search word has been seen (in any order) since the previous match, one
# highlight is emitted: a full copy of the document with the span from the
# first to the last matched word wrapped in <b>...</b>.
doc = 'The quick brown fox jumps over the lazy dog. The quick brown fox over the lazy dog. The quick brown fox jumps over the lazy dog.'
search_words = ['quick', 'fox', 'over']

# NOTE(review): `distance` and `matched_pair_pos` are never read by the loop
# below, so no maximum gap between matched words is enforced. Presumably
# `distance` was meant to be the slop value -- confirm before relying on it.
distance = 2
matched_pair_pos = []

start_pos = 0       # index of the first search word of the match in progress
end_pos = 0         # one past the index of the word that completed a match
tracked_words = []  # search words seen so far for the match in progress
highlights = []

# Tokens are compared verbatim: punctuation stays attached ("dog." != "dog").
splitted_words = doc.split(' ')

for cur_pos, word in enumerate(splitted_words):
    if word in search_words and word not in tracked_words:
        if not tracked_words:
            start_pos = cur_pos
        tracked_words.append(word)

        if len(tracked_words) == len(search_words):
            end_pos = cur_pos + 1
            tracked_words = []  # start looking for the next match
            cur_highlight = " ".join(splitted_words[start_pos:end_pos])
            # Splice the tagged span back in as a single token so a match at
            # the very start or end of the document gets no stray space.
            marked_words = (
                splitted_words[:start_pos]
                + ["<b>%s</b>" % cur_highlight]
                + splitted_words[end_pos:]
            )
            highlights.append(" ".join(marked_words))

# Parenthesised form prints the same on Python 2 and Python 3.
print(highlights)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
# Skeleton for post-processing highlighter output: builds a dict with the
# same field names as `default_highlights`, each mapped to a list.
default_highlights = {
    "info.FullContent": [
        "The generated Lorem Ipsum is therefore always free from repetition, injected humour, or non-characteristic words etc."
    ]
}
search_words = ['repetition', 'humour']

# NOTE(review): both tags are empty strings and, like `search_words` and
# `distance`, are not used by the loop below.
pre_tags = ""
post_tags = ""
distance = 2

proximity_highlights = {}

# .items() instead of .iteritems() so the script runs on Python 2 and 3.
for field, highlights in default_highlights.items():
    proximity_highlights[field] = []
    for highlight in highlights:
        # Tokens keep attached punctuation (e.g. "repetition,"). Nothing is
        # appended yet, so every field's list stays empty.
        splitted_words = highlight.split(' ')

# Parenthesised form prints the same on Python 2 and Python 3.
print(proximity_highlights)