Last active
November 27, 2023 14:45
-
-
Save elvanja/202b31ae5735f297956e1d8f567c7d7e to your computer and use it in GitHub Desktop.
Boolean query lexer (not AST unfortunately)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
defmodule BooleanQuery do | |
use Ecto.Type | |
import Gettext | |
def type, do: :string | |
@operator_expressions %{ | |
"AND" => {:operator, "+"}, | |
"UND" => {:operator, "+"}, | |
"OR" => {:operator, "|"}, | |
"ODER" => {:operator, "|"}, | |
"NOT" => {:operator, "-"}, | |
"NICHT" => {:operator, "-"}, | |
"(" => {:operator, "("}, | |
")" => {:operator, ")"} | |
} | |
@doc """ | |
Converts input boolean search string into list of terms and operators, to be used for ES searching as needed | |
see https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-simple-query-string-query.html#simple-query-string-syntax | |
## Examples | |
iex> BooleanQuery.cast("Java OR Ruby") | |
{:ok, [{:term, "java"}, {:operator, "|"}, {:term, "ruby"}]} | |
""" | |
def cast(query) when is_binary(query) do | |
with {:ok, terms} <- lex(query) do | |
terms | |
|> parse() | |
|> detect_invalid_multiple_operators() | |
end | |
end | |
def cast(_), do: :error | |
def load(_data), do: :error | |
def dump(_), do: :error | |
defp lex(query) do | |
result = | |
query | |
|> String.replace("\n", " ") | |
|> String.split("") | |
|> Enum.reduce({"", [], false, 0}, &scan/2) | |
case result do | |
{term, terms, false, 0} -> | |
{:ok, Enum.reject([term | terms], &(&1 == ""))} | |
{_, _, quotes, parenthesis} -> | |
message = | |
case {quotes, parenthesis} do | |
{true, 0} -> gettext("unclosed quotes detected") | |
{false, _} -> gettext("unclosed parenthesis detected") | |
_ -> gettext("unclosed parenthesis and/or quotes detected") | |
end | |
{:error, [{:message, message}]} | |
end | |
end | |
defp parse(terms) do | |
terms | |
|> detect_operators() | |
|> concat_consecutive_terms() | |
|> nest() | |
|> consolidate_prefix_operators() | |
|> remove_suffix_operators() | |
end | |
# ==> QUOTED BEGIN | |
# does not handle nested quotes | |
defp scan("\"", {_term, terms, false, parenthesis}) do | |
{"", terms, true, parenthesis} | |
end | |
defp scan("\"", {term, terms, true, parenthesis}) do | |
{"", terms ++ [~s{"#{term}"}], false, parenthesis} | |
end | |
defp scan(char, {term, terms, true, parenthesis}) do | |
{term <> char, terms, true, parenthesis} | |
end | |
# ==> QUOTED END | |
# ==> PARENTHESIS BEGIN | |
defp scan("(", {term, terms, false, parenthesis}) do | |
{"", terms ++ [term, "("], false, parenthesis + 1} | |
end | |
defp scan(")", {term, terms, false, parenthesis}) do | |
{"", terms ++ [term, ")"], false, parenthesis - 1} | |
end | |
# ==> PARENTHESIS END | |
# ==> WHITESPACE BEGIN | |
defp scan("", {term, terms, false, parenthesis}) do | |
{"", terms ++ [term], false, parenthesis} | |
end | |
defp scan(" ", {term, terms, false, parenthesis}) do | |
{"", terms ++ [term], false, parenthesis} | |
end | |
# ==> WHITESPACE END | |
# ==> THE REST BEGIN | |
defp scan(char, {term, terms, false, parenthesis}) do | |
{term <> char, terms, false, parenthesis} | |
end | |
# ==> THE REST END | |
defp detect_operators(terms) do | |
terms | |
|> Enum.map(fn term -> | |
@operator_expressions[String.upcase(term)] || {:term, term} | |
end) | |
|> detect_and_not() | |
end | |
defp detect_and_not([{:operator, "+"} | rest]) do | |
case rest do | |
[{:operator, "-"} | after_and_not] -> [{:operator, "+-"} | detect_and_not(after_and_not)] | |
_ -> [{:operator, "+"} | detect_and_not(rest)] | |
end | |
end | |
defp detect_and_not([term_or_operator | rest]) do | |
[term_or_operator | detect_and_not(rest)] | |
end | |
defp detect_and_not([]), do: [] | |
defp concat_consecutive_terms([{:term, current} | rest]) do | |
case rest do | |
[{:term, next} | after_next] -> concat_consecutive_terms([{:term, current <> " " <> next} | after_next]) | |
_ -> [{:term, current} | concat_consecutive_terms(rest)] | |
end | |
end | |
defp concat_consecutive_terms([operator | rest]) do | |
[operator | concat_consecutive_terms(rest)] | |
end | |
defp concat_consecutive_terms([]), do: [] | |
defp nest([]), do: [] | |
defp nest([{:operator, "("} | rest]) do | |
nest([{:nested, []} | rest]) | |
end | |
defp nest([{:nested, nested} | rest]) do | |
case rest do | |
[{:operator, "("} | _] = reopened -> | |
[{:nested, _} = re_nested | remainder] = nest(reopened) | |
nest([{:nested, nested ++ [re_nested]} | remainder]) | |
[{:operator, ")"} | after_closing] -> | |
[{:nested, nested} | nest(after_closing)] | |
[term_or_operator | remainder] -> | |
nest([{:nested, nested ++ [term_or_operator]} | remainder]) | |
end | |
end | |
defp nest([term_or_operator | rest]) do | |
[term_or_operator | nest(rest)] | |
end | |
# credo:disable-for-lines:38 Credo.Check.Refactor.CyclomaticComplexity | |
defp consolidate_prefix_operators(list) do | |
{list, _, _} = | |
Enum.reduce(list, {[], [], false}, fn term_or_operator, {list, prefix_operators, consolidated} -> | |
case {term_or_operator, consolidated} do | |
{{:operator, _}, false} -> | |
{list, [term_or_operator | prefix_operators], consolidated} | |
{{:term, _}, false} -> | |
list = | |
case operator = List.first(prefix_operators) do | |
{:operator, "-"} -> [operator | list] | |
{:operator, "+-"} -> [operator | list] | |
_ -> list | |
end | |
{[term_or_operator | list], [], true} | |
{{:nested, nested}, false} -> | |
# credo:disable-for-lines:6 Credo.Check.Refactor.Nesting | |
list = | |
case operator = List.first(prefix_operators) do | |
{:operator, "-"} -> [operator | list] | |
{:operator, "+-"} -> [operator | list] | |
_ -> list | |
end | |
{[{:nested, consolidate_prefix_operators(nested)} | list], [], true} | |
{{:nested, nested}, true} -> | |
{[{:nested, consolidate_prefix_operators(nested)} | list], prefix_operators, consolidated} | |
{_, true} -> | |
{[term_or_operator | list], prefix_operators, consolidated} | |
end | |
end) | |
Enum.reverse(list) | |
end | |
defp remove_suffix_operators(list) do | |
{list, _} = | |
list | |
|> Enum.reverse() | |
|> Enum.reduce({[], false}, fn term_or_operator, {list, removed} -> | |
case {term_or_operator, removed} do | |
{{:operator, _}, false} -> | |
{list, removed} | |
{{:term, _}, false} -> | |
{[term_or_operator | list], true} | |
{{:nested, nested}, false} -> | |
{[{:nested, remove_suffix_operators(nested)} | list], true} | |
{{:nested, nested}, true} -> | |
{[{:nested, remove_suffix_operators(nested)} | list], removed} | |
{_, true} -> | |
{[term_or_operator | list], removed} | |
end | |
end) | |
list | |
end | |
defp detect_invalid_multiple_operators(list) do | |
case do_detect_invalid_multiple_operators(list) do | |
{_, nil} -> | |
{:ok, list} | |
{_, [previous, current]} -> | |
operators = | |
[previous, current] | |
|> Enum.map_join(", ", &translate/1) | |
message = gettext("consecutive boolean operators detected: %{operators}", operators: operators) | |
{:error, [{:message, message}]} | |
end | |
end | |
defp do_detect_invalid_multiple_operators(list) do | |
Enum.reduce_while(list, {nil, nil}, fn term_or_operator, {last_term_or_operator, _} -> | |
case {term_or_operator, last_term_or_operator} do | |
{{:term, _}, _} -> | |
{:cont, {nil, nil}} | |
{{:operator, _}, nil} -> | |
{:cont, {term_or_operator, nil}} | |
{{:operator, current}, {:operator, previous}} -> | |
{:halt, {nil, [previous, current]}} | |
{{:nested, nested}, _} -> | |
# credo:disable-for-lines:4 Credo.Check.Refactor.Nesting | |
case do_detect_invalid_multiple_operators(nested) do | |
{_, nil} -> {:cont, {nil, nil}} | |
{_, [previous, current]} -> {:halt, {nil, [previous, current]}} | |
end | |
end | |
end) | |
end | |
defp translate("+"), do: gettext("AND") | |
defp translate("|"), do: gettext("OR") | |
defp translate("-"), do: gettext("NOT") | |
defp translate("+-"), do: gettext("AND NOT") | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment