Boolean query string parser
"""Boolean query parser."""
import collections
import enum
import re
from typing import Generator, Iterable, List, Optional, Sequence, Tuple, Union
class BooleanOp(enum.Enum):
# Note: Enum defines __hash__ as hash(self._name_)
OR = "OR"
AND = "AND"


class Node(object):
    """Self-referential node in the query tree."""

    # Structure is similar to a node in the tree constructed from a binary
    # search, though our tree is not binary; see section 6.5 of
    # The C Programming Language (K&R) for detail.  In C, the binary tree
    # node would look like:
    #
    #     struct tnode {
    #         char *word;
    #         int count;
    #         struct tnode *left;
    #         struct tnode *right;
    #     };
    #
    # TODO: good place for attr.s/attr.ib or just namedtuple

    __slots__ = ("value", "children", "negated")

    def __init__(
        self,
        value: Union[BooleanOp, str],
        children: Optional[Iterable["Node"]] = None,
        negated: bool = False,
    ):
        self.value = value
        self.children = children
        self.negated = negated

    def __str__(self):
        if self.negated:
            return f"<{self.__class__.__name__}> NOT:{self.value}"
        return f"<{self.__class__.__name__}> {self.value}"

    __repr__ = __str__
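
# For example (illustrative): the query `a AND b` could be represented as
#     Node(BooleanOp.AND, children=[Node("a"), Node("b")])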


Token = collections.namedtuple("Token", ["type", "value"])


class BadQueryError(Exception):
    """Invalid query."""


_TOK_SPEC: Sequence[Tuple[str, str]] = (
    ("OPENP", r"\("),   # Open parenthesis
    ("CLOSEP", r"\)"),  # Close parenthesis
    ("SPACE", r"\s+"),  # [ \t\n\r\f\v]
    # A TERM is either:
    # 1. a phrase of 1 or more terms surrounded in quotes
    # 2. a single word/term, optionally surrounded in quotes
    ("TERM", r'"[^"]+"|[^()\s"]+'),
    ("ERR", r"."),      # Any other character
)
_TOK_PAT = re.compile(
    r"|".join("(?P<%s>%s)" % pair for pair in _TOK_SPEC), re.I
)
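
# For example (illustrative of the TERM alternation above):
#     re.findall(r'"[^"]+"|[^()\s"]+', 'cats AND "big dogs"')
#     # -> ['cats', 'AND', '"big dogs"']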


def tokenize(query: str, tok_pat=_TOK_PAT) -> Generator[Token, None, None]:
    '''Break string into a sequence of tokens, dropping whitespace.

    >>> query = """\
    ... (finance OR "financial theory")
    ... AND
    ... (laffer AND ("supply-side" OR "trickle-down"))"""
    >>> list(tokenize(query))
    [Token(type='OPENP', value='('),
     Token(type='TERM', value='finance'),
     Token(type='OR', value='OR'),
     Token(type='TERM', value='financial theory'),
     Token(type='CLOSEP', value=')'),
     Token(type='AND', value='AND'),
     Token(type='OPENP', value='('),
     Token(type='TERM', value='laffer'),
     Token(type='AND', value='AND'),
     Token(type='OPENP', value='('),
     Token(type='TERM', value='supply-side'),
     Token(type='OR', value='OR'),
     Token(type='TERM', value='trickle-down'),
     Token(type='CLOSEP', value=')'),
     Token(type='CLOSEP', value=')')]
    '''
    # From:
    # https://docs.python.org/3/library/re.html#writing-a-tokenizer
    for mo in tok_pat.finditer(query):
        kind = mo.lastgroup
        value = mo.group()
        if kind == "SPACE":
            continue
        if kind == "ERR":
            raise BadQueryError(value)
        elif kind == "TERM":
            uval = value.upper()
            if uval == "AND":
                yield Token("AND", "AND")
            elif uval == "OR":
                yield Token("OR", "OR")
            else:
                yield Token(kind, value.strip('"'))
        else:
            yield Token(kind, value)


TokenSequenceType = List[Token]


class TokenSequence(TokenSequenceType):
    """Ordered sequence of token objects."""

    @property
    def is_ambiguous_and_or(self):
        n_parens = n_and = n_or = 0
        i: Token
        for i in self:
            if i.type in {"OPENP", "CLOSEP"}:
                n_parens += 1
            elif i.type == "AND":
                n_and += 1
            elif i.type == "OR":
                n_or += 1
        if n_parens == 0 and (n_and and n_or):
            return True
        return False

    @property
    def first_op(self):
        if self.is_ambiguous_and_or:
            raise ValueError("Ambiguous flat and/or")
        for i in self:
            if i.type == "AND":
                return "AND"
            if i.type == "OR":
                return "OR"
        return None  # No op found

    @property
    def is_flat(self):
        """Is this a 'bottom-level' TokenSequence?

        - a single term or phrase, or
        - a non-nested chain of terms/phrases
        """
        return not any(i.type in {"OPENP", "CLOSEP"} for i in self)
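
# For example (illustrative of the properties above):
#
#     seq = TokenSequence(tokenize("a AND b OR c"))
#     seq.is_flat              # True: no parentheses anywhere
#     seq.is_ambiguous_and_or  # True: AND and OR mixed without parens
#     seq.first_op             # raises ValueError("Ambiguous flat and/or")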


def get_groups(
    query: str
) -> Generator[Tuple[int, int, TokenSequence], None, None]:
    """Yield (nest-level, n_seen, TokenSequence) triples.

    See BooleanQueryParser.__doc__ for examples of levels.
    """
    # In the query
    #
    #     (finance OR "financial theory")
    #     AND
    #     (laffer AND ("supply-side" OR "trickle-down"))
    #
    # a "stack pop" occurs at each closing parenthesis.  At the ")" that
    # closes ("supply-side" OR "trickle-down"), the stack holds 2 open
    # parens, but we then pop 1 off.
    tokens = list(tokenize(query))  # Convert to list to allow indexing
    stack = []
    n = 0  # Number of groups encountered
    for i, t in enumerate(tokens):
        if t.type == "OPENP":
            # Descend 1 level & implicitly create a child node
            stack.append(i)
        elif t.type == "CLOSEP" and stack:
            start = stack.pop()
            yield len(stack) + 1, n, TokenSequence(tokens[start + 1 : i])
            n += 1
    # Tokens are not 'bookended' in parens by default, so the full
    # (level-0) sequence is yielded last.
    yield 0, n, TokenSequence(tokens)
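
# For example (illustrative; values traced from the generator above):
#
#     >>> for level, n, seq in get_groups("(a OR b) AND (c AND (d OR e))"):
#     ...     print(level, n, [t.value for t in seq])
#     1 0 ['a', 'OR', 'b']
#     2 1 ['d', 'OR', 'e']
#     1 2 ['c', 'AND', '(', 'd', 'OR', 'e', ')']
#     0 3 ['(', 'a', 'OR', 'b', ')', 'AND', '(', 'c', 'AND', '(', 'd', 'OR', 'e', ')', ')']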


class BooleanQueryParser(object):
    """Constructs a tree of Node objects from a boolean query string.

    Algorithm
    ---------
    Input:

        (finance OR "financial theory")
        AND
        (laffer AND ("supply-side" OR "trickle-down"))

    Output:

        0              _________ AND _________
                      |                       |
        1        ____ OR ____           ____ AND ____
                |            |         |             |
        2    finance     financial   laffer      ___ OR ___
                          theory                |          |
                                          supply-side  trickle-down

    From this, the logic is to work "inside out":

        # Innermost nodes with no children come first
        n1 = Node("finance")
        n2 = Node("financial theory")
        n3 = Node("supply-side")
        n4 = Node("trickle-down")
        n5 = Node(BooleanOp.OR, children=[n1, n2])
        n6 = Node(BooleanOp.OR, children=[n3, n4])
        n7 = Node(BooleanOp.AND, children=[Node("laffer"), n6])
        root = Node(BooleanOp.AND, children=[n5, n7])

    Rules:
    - phrases must be surrounded in quotes
    - AND/OR are not case-sensitive
    - supports nesting
    - terms are case-insensitive
    - instead of one operator being more "tightly binding" than
      the other, you *must* use parentheses to disambiguate.
      That means `a OR b AND c` is not kosher.
    - terms or phrases cannot contain the special characters
      "(", ")", '"'
    """

    def __call__(self, query: str):
        raise NotImplementedError
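

# A minimal sketch of one way `BooleanQueryParser.__call__` might work.
# This is a hypothetical illustration, not the author's planned
# implementation: it recursively splits each (sub)sequence on its
# parenthesized groups instead of consuming get_groups() inside-out.
def _parse_sketch(tokens: TokenSequence) -> Node:
    parts: List[Node] = []
    op: Optional[BooleanOp] = None
    i = 0
    while i < len(tokens):
        t = tokens[i]
        if t.type == "OPENP":
            # Find the matching close paren, then recurse on the inside
            depth, j = 1, i + 1
            while depth:
                if tokens[j].type == "OPENP":
                    depth += 1
                elif tokens[j].type == "CLOSEP":
                    depth -= 1
                j += 1
            parts.append(_parse_sketch(TokenSequence(tokens[i + 1 : j - 1])))
            i = j
        elif t.type in ("AND", "OR"):
            this = BooleanOp[t.type]
            if op is not None and op is not this:
                # Mixed AND/OR at one level with no parens is ambiguous
                raise BadQueryError("ambiguous flat and/or")
            op = this
            i += 1
        else:  # TERM
            parts.append(Node(t.value))
            i += 1
    if op is None:
        return parts[0]  # A single term/phrase or a lone group
    return Node(op, children=parts)


# e.g. _parse_sketch(TokenSequence(tokenize('(a OR b) AND c'))) gives a
# Node(BooleanOp.AND) whose children are a Node(BooleanOp.OR) over a/b
# and a Node("c").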


def test():
    assert list(tokenize("a OR b")) == [
        Token(type="TERM", value="a"),
        Token(type="OR", value="OR"),
        Token(type="TERM", value="b"),
    ]
    assert list(tokenize("a or (b AND c)")) == [
        Token(type="TERM", value="a"),
        Token(type="OR", value="OR"),
        Token(type="OPENP", value="("),
        Token(type="TERM", value="b"),
        Token(type="AND", value="AND"),
        Token(type="TERM", value="c"),
        Token(type="CLOSEP", value=")"),
    ]
    assert list(tokenize("oregon")) == [Token(type="TERM", value="oregon")]
    assert (
        list(
            tokenize(
                """\
(finance OR "financial theory")
AND (laffer AND ("supply-side" OR "trickle-down"))"""
            )
        )
        == [
            Token(type="OPENP", value="("),
            Token(type="TERM", value="finance"),
            Token(type="OR", value="OR"),
            Token(type="TERM", value="financial theory"),
            Token(type="CLOSEP", value=")"),
            Token(type="AND", value="AND"),
            Token(type="OPENP", value="("),
            Token(type="TERM", value="laffer"),
            Token(type="AND", value="AND"),
            Token(type="OPENP", value="("),
            Token(type="TERM", value="supply-side"),
            Token(type="OR", value="OR"),
            Token(type="TERM", value="trickle-down"),
            Token(type="CLOSEP", value=")"),
            Token(type="CLOSEP", value=")"),
        ]
    )
    assert (
        list(
            tokenize(
                """(finance OR "financial theory" OR "financial economics")
AND
(laffer AND ("supply-side" OR "trickle-down"))"""
            )
        )
        == [
            Token(type="OPENP", value="("),
            Token(type="TERM", value="finance"),
            Token(type="OR", value="OR"),
            Token(type="TERM", value="financial theory"),
            Token(type="OR", value="OR"),
            Token(type="TERM", value="financial economics"),
            Token(type="CLOSEP", value=")"),
            Token(type="AND", value="AND"),
            Token(type="OPENP", value="("),
            Token(type="TERM", value="laffer"),
            Token(type="AND", value="AND"),
            Token(type="OPENP", value="("),
            Token(type="TERM", value="supply-side"),
            Token(type="OR", value="OR"),
            Token(type="TERM", value="trickle-down"),
            Token(type="CLOSEP", value=")"),
            Token(type="CLOSEP", value=")"),
        ]
    )
    assert list(tokenize('"do and" OR "DO OR DIE"')) == [
        Token(type="TERM", value="do and"),
        Token(type="OR", value="OR"),
        Token(type="TERM", value="DO OR DIE"),
    ]


if __name__ == "__main__":
    import sys

    sys.exit(test())