Last active
June 10, 2019 17:09
-
-
Save karolyi/02510555e45f9050c945096233458351 to your computer and use it in GitHub Desktop.
Highlighter of search terms in HTML
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
from functools import lru_cache
from typing import Optional

from bs4 import BeautifulSoup as bs
from bs4 import Tag
from bs4.element import NavigableString
from django.utils.functional import cached_property
from django.utils.html import escape
from unidecode import unidecode

from . import memoized_method
@lru_cache(maxsize=5000)
def cached_unidecode_char(char: str) -> str:
    """
    Run one character through `unidecode` and return it lowercased.

    Results are memoized with an LRU cache of up to 5000 entries, so
    repeated characters are only translated once.
    """
    translated = unidecode(string=char)
    return translated.lower()
@lru_cache(maxsize=5000)
def cached_unidecode(string: str) -> str:
    """
    Run a whole string through `unidecode` and return it lowercased.

    Results are memoized with an LRU cache of up to 5000 entries, keyed
    by the passed string.
    """
    translated = unidecode(string=string)
    return translated.lower()
class HtmlHighlighter(object):
    """
    Highlighter of HTML-based search results.

    The search term is split on whitespace, and every part that is long
    enough is looked up in the text nodes of the passed HTML. Matching
    happens on the unidecoded, lowercased form of the text, while the
    output keeps the original characters, wrapped between `hl_pre` and
    `hl_post`.
    """

    # Parts of the search term shorter than this are ignored
    term_min_len = 2
    # Default markup emitted before and after a highlighted match
    hl_pre = '<span class="search-result-highlight">'
    hl_post = '</span>'

    def __init__(
            self, term: str, hl_pre: Optional[str] = None,
            hl_post: Optional[str] = None):
        """
        Store the search term and the optional highlight markup.

        A falsy `hl_pre`/`hl_post` falls back to the class default.
        """
        self.term = term
        self.hl_pre = hl_pre or self.hl_pre
        self.hl_post = hl_post or self.hl_post

    @cached_property
    def unidecoded_terms(self) -> tuple:
        """
        Return the unique unidecoded forms of the usable term parts.

        Parts shorter than `term_min_len` are dropped. The order of the
        returned tuple is not defined, since it is built from a set.
        """
        return tuple({
            cached_unidecode(string=part)
            for part in self.term.split()
            if len(part) >= self.term_min_len})

    @memoized_method(maxsize=30)
    def _reduce_overlapping_terms(self, terms: tuple) -> tuple:
        """
        Reduce and return the found search terms that may overlap, to
        the longest overlapping ones.

        A term is dropped when it is a substring of any other, different
        term in `terms`; the order of the remaining terms is kept.
        """
        if len(terms) < 2:
            # Zero or one element, nothing can overlap
            return terms
        return tuple(
            term for term in terms
            if not any(term in other for other in terms if other != term))

    @memoized_method(maxsize=30)
    def _get_highlighted_words(
            self, word: str, found_unidecoded_terms: tuple) -> str:
        """
        Return `word` as escaped HTML, with every occurrence of the
        passed unidecoded terms wrapped into the highlight markup.

        The word is scanned from left to right. At each position the
        unidecoded characters are accumulated until they either equal a
        term (the original characters get highlighted) or stop being
        the prefix of any term. In the latter case only the first
        character is emitted as-is and the scan restarts right after
        it, so a match that begins inside a failed partial match (like
        `lo` in `hello`) is still found.
        """
        # One unidecoded piece per original character; a piece can be
        # empty or longer than one character.
        pieces = [cached_unidecode_char(char=letter) for letter in word]
        length = len(word)
        output = []
        start = 0
        while start < length:
            candidate = ''
            match_end = None
            for end in range(start, length):
                candidate += pieces[end]
                if candidate in found_unidecoded_terms:
                    # Full match, ends after the character at `end`
                    match_end = end + 1
                    break
                if not any(
                        term.startswith(candidate)
                        for term in found_unidecoded_terms):
                    # Not even a partial match, growing further is
                    # pointless
                    break
            if match_end is None:
                # Nothing starts here, emit one character and move on
                output.append(escape(text=word[start]))
                start += 1
                continue
            output.append(
                f'{self.hl_pre}{escape(text=word[start:match_end])}'
                f'{self.hl_post}')
            start = match_end
        return ''.join(output)

    def _replace_text(self, text: NavigableString, result: str):
        """
        Replace the passed `text` node with the nodes parsed from the
        `result` HTML string.

        Leading and trailing whitespace of `result` is not sent through
        the HTML parser but re-attached as plain text nodes, so that it
        cannot get lost while parsing.
        """
        core = result.strip()
        leading_len = len(result) - len(result.lstrip())
        leading = result[:leading_len]
        trailing = result[leading_len + len(core):]
        html = bs(markup=f'<body>{core}</body>', features='lxml')
        body = html.body
        # Copy the children first: moving a node into the other tree
        # removes it from this one.
        new_nodes = list(body.children) if body is not None else []
        if leading:
            new_nodes.insert(0, NavigableString(leading))
        if trailing:
            new_nodes.append(NavigableString(trailing))
        # Only positional calls are used here, they do not depend on
        # how the installed bs4 names or declares its parameters.
        for node in new_nodes:
            text.insert_before(node)
        text.extract()

    def _find_in_text(self, text: NavigableString) -> None:
        """
        Unidecode the whitespace-separated words of the text node, look
        for matches, and add the highlight HTML to the words that match.

        When at least one word matched, the node is replaced in the
        tree with the highlighted content; otherwise it is left
        untouched. The whitespace between the words is kept as it was.
        """
        chunks = []
        is_modified = False
        # The capturing group makes `re.split` return the whitespace
        # runs too, so they can be put back unchanged.
        for chunk in re.split(r'(\s+)', text):
            if not chunk or chunk.isspace():
                chunks.append(chunk)
                continue
            unidecoded_chunk = cached_unidecode(string=chunk)
            found_terms = tuple(
                term for term in self.unidecoded_terms
                if term in unidecoded_chunk)
            if not found_terms:
                chunks.append(escape(text=chunk))
                continue
            # Match!
            is_modified = True
            found_terms = self._reduce_overlapping_terms(terms=found_terms)
            chunks.append(self._get_highlighted_words(
                word=chunk, found_unidecoded_terms=found_terms))
        if is_modified:
            self._replace_text(text=text, result=''.join(chunks))

    def _find_text(self):
        """Walk the text nodes of the parsed body and highlight them."""
        for navigable_str in self.bs.body.find_all(
                text=True):  # type: NavigableString
            if type(navigable_str) is not NavigableString:
                # Exact type check on purpose: it can be a comment,
                # which must not be touched
                continue
            if navigable_str.parent.name in ['script', 'style']:
                # Skip highlighting script and style content
                continue
            self._find_in_text(text=navigable_str)

    def highlight(self, html: str) -> str:
        """
        Highlight the terms in the HTML input.

        The input is wrapped into a `<body>` element, parsed with the
        `lxml` parser, and the content of the body is returned after
        highlighting. The parsed document is kept in `self.bs`.
        """
        html = '<body>' + html + '</body>'
        self.bs = bs(markup=html, features='lxml')  # type: Tag
        self._find_text()
        return self.bs.body.decode_contents()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment