Last active
August 28, 2019 13:19
-
-
Save bsolomon1124/4585735891542fe021e12f4c11f8699b to your computer and use it in GitHub Desktop.
Boolean query string parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """Boolean query parser.""" | |
| import collections | |
| import enum | |
| import re | |
| from typing import Generator, Iterable, List, Optional, Sequence, Tuple, Union | |
class BooleanOp(enum.Enum):
    """Binary boolean operators recognized in a query string."""

    # Member values double as the canonical operator token text.
    # Note: Enum defines __hash__ as hash(self._name_).
    OR = "OR"
    AND = "AND"
class Node(object):
    """Self-referential node in the query tree.

    Structurally similar to a node in a binary search tree — see
    section 6.5 of The C Programming Language (K&R) — except that this
    tree is not binary: a node may hold any number of children.  The
    equivalent C binary-tree node would be::

        struct tnode {
            char *word;
            int count;
            struct tnode *left;
            struct tnode *right;
        }

    TODO: good place for attr.s/attr.ib or just namedtuple.
    """

    __slots__ = ("value", "children", "negated")

    def __init__(
        self,
        value: Union[BooleanOp, str],
        children: Optional[Iterable["Node"]] = None,
        negated: bool = False,
    ):
        # value is either an operator (internal node) or a term/phrase
        # (leaf); children is stored exactly as given, None for leaves.
        self.value = value
        self.children = children
        self.negated = negated

    def __str__(self):
        marker = "NOT:" if self.negated else ""
        return f"<{self.__class__.__name__}> {marker}{self.value}"

    __repr__ = __str__
| Token = collections.namedtuple("Token", ["type", "value"]) | |
class BadQueryError(Exception):
    """Raised when a query string contains an unrecognizable character."""
| _TOK_SPEC: Sequence[Tuple[str, str]] = ( | |
| ("OPENP", r"\("), # Open parenthesis | |
| ("CLOSEP", r"\)"), # Close parenthesis | |
| ("SPACE", r"\s+"), # [ \t\n\r\f\v] | |
| # A TERM is either: | |
| # 1. phrase of 1 or more terms surrounded in quotes | |
| # 2. single word/term, optionally surrounded in quotes | |
| ("TERM", r'"[^"]+"|[^()\s"]+'), | |
| ("ERR", r"."), # Any other character | |
| ) | |
| _TOK_PAT = re.compile( | |
| r"|".join("(?P<%s>%s)" % pair for pair in _TOK_SPEC), re.I | |
| ) | |
def tokenize(query: str, tok_pat=_TOK_PAT) -> Generator[Token, None, None]:
    '''Break string into a sequence of tokens, dropping whitespace.

    Bare (unquoted) AND/OR words, in any case, become operator tokens;
    every other word or quoted phrase becomes a TERM token with any
    surrounding quotes stripped.

    Raises:
        BadQueryError: if a character matches no other token rule
            (in practice, an unbalanced double quote).

    >>> query = """\\
    ... (finance OR "financial theory")
    ... AND
    ... (laffer AND ("supply-side" OR "trickle-down"))"""
    >>> list(tokenize(query))  # doctest: +NORMALIZE_WHITESPACE
    [Token(type='OPENP', value='('),
     Token(type='TERM', value='finance'),
     Token(type='OR', value='OR'),
     Token(type='TERM', value='financial theory'),
     Token(type='CLOSEP', value=')'),
     Token(type='AND', value='AND'),
     Token(type='OPENP', value='('),
     Token(type='TERM', value='laffer'),
     Token(type='AND', value='AND'),
     Token(type='OPENP', value='('),
     Token(type='TERM', value='supply-side'),
     Token(type='OR', value='OR'),
     Token(type='TERM', value='trickle-down'),
     Token(type='CLOSEP', value=')'),
     Token(type='CLOSEP', value=')')]
    '''
    # Tokenizer technique from:
    # https://docs.python.org/3/library/re.html#writing-a-tokenizer
    for mo in tok_pat.finditer(query):
        kind = mo.lastgroup
        value = mo.group()
        if kind == "SPACE":
            continue  # Whitespace separates tokens but is never emitted
        if kind == "ERR":
            raise BadQueryError(value)
        elif kind == "TERM":
            # A *quoted* operator word (e.g. '"and"') still carries its
            # quotes here, so it falls through to the TERM branch below.
            uval = value.upper()
            if uval == "AND":
                yield Token("AND", "AND")
            elif uval == "OR":
                yield Token("OR", "OR")
            else:
                yield Token(kind, value.strip('"'))
        else:
            yield Token(kind, value)
# Alias for the concrete container TokenSequence specializes.
TokenSequenceType = List[Token]


class TokenSequence(TokenSequenceType):
    """Ordered sequence of Token objects."""

    @property
    def is_ambiguous_and_or(self) -> bool:
        """True if the sequence mixes AND and OR with no parentheses.

        Such a flat mix (e.g. ``a OR b AND c``) has no well-defined
        precedence under this grammar and must be rejected.
        """
        n_parens = n_and = n_or = 0
        i: Token  # elements are Token instances, not str
        for i in self:
            if i.type in {"OPENP", "CLOSEP"}:
                n_parens += 1
            elif i.type == "AND":
                n_and += 1
            elif i.type == "OR":
                n_or += 1
        return n_parens == 0 and n_and > 0 and n_or > 0

    @property
    def first_op(self) -> Optional[str]:
        """The type ("AND" or "OR") of the first operator token, or None.

        Raises:
            ValueError: if the sequence is an ambiguous flat and/or mix.
        """
        if self.is_ambiguous_and_or:
            raise ValueError("Ambiguous flat and/or")
        for i in self:
            if i.type == "AND":
                return "AND"
            if i.type == "OR":
                return "OR"
        return None  # No op found

    @property
    def is_flat(self) -> bool:
        """Is this a 'bottom-level' TokenSequence?

        - single term or phrase, or
        - non-nested chain of terms/phrases
        """
        return not any(i.type in {"OPENP", "CLOSEP"} for i in self)
def get_groups(
    query: str
) -> Generator[Tuple[int, int, TokenSequence], None, None]:
    """Yield (nest-level, n_seen, TokenSequence) triples.

    Each parenthesized group is emitted as its closing paren is reached
    (so inner groups precede the groups that contain them); the whole
    token stream is emitted last at level 0.  See
    BooleanQueryParser.__doc__ for examples of levels.

    In the query::

        (finance OR "financial theory")
        AND
        (laffer AND ("supply-side" OR "trickle-down"))

    the first "stack pop" occurs at the paren closing "trickle-down":
    the stack then holds 2 open parens and we pop 1 off.
    """
    tokens = list(tokenize(query))  # Materialize so we can slice by index
    open_positions = []  # Stack of indices of as-yet-unmatched "("
    n_seen = 0  # Number of groups encountered (yielded) so far
    for idx, tok in enumerate(tokens):
        if tok.type == "OPENP":
            # Descend 1 level & implicitly start a child group
            open_positions.append(idx)
        elif tok.type == "CLOSEP" and open_positions:
            # Close the innermost open group and emit its contents
            start = open_positions.pop()
            yield len(open_positions) + 1, n_seen, TokenSequence(
                tokens[start + 1 : idx]
            )
            n_seen += 1
    # The token stream as a whole is not 'bookended' in parens by default
    yield 0, n_seen, TokenSequence(tokens)
class BooleanQueryParser(object):
    """Constructs a tree of Node objects from a boolean query string.

    NOTE(review): parsing is not yet implemented (``__call__`` raises
    NotImplementedError); this docstring records the intended algorithm.

    Algorithm
    ---------

    Input:

        (finance OR "financial theory")
        AND
        (laffer AND ("supply-side" OR "trickle-down"))

    Output:

        0         _________ AND ________
                 |                      |
        1     _____ OR _____       _____ AND _____
             |              |     |               |
        2  finance      financial laffer    __ OR ___________
                        theory             |                 |
                                      supply-side      trickle-down

    From this the logic is to work "inside out":

        # Innermost nodes with no children come first
        n1 = Node("finance")
        n2 = Node("financial theory")
        n3 = Node("supply-side")
        n4 = Node("trickle-down")
        n5 = Node(BooleanOp.OR, children=[n1, n2])
        n6 = Node(BooleanOp.AND, children=[n3, n4])
        root = Node(BooleanOp.AND, children=[n5, n6])

    Rules:
        - phrases must be surrounded in quotes
        - and/or are not case sensitive
        - supports nesting
        - case-insensitive terms
        - instead of one or the other being "tightly binding",
          you *must* use parentheses unambiguously.
          That means `a OR b AND c` is not kosher.
        - terms or phrases cannot contain special characters:
          "(", ")", '"'
    """

    def __call__(self, query: str):
        # Parsing entry point — intentionally unimplemented stub.
        raise NotImplementedError
def test():
    """Sanity-check tokenize() against hand-verified token streams."""

    # Small builders keep the expected sequences readable.
    def term(word):
        return Token(type="TERM", value=word)

    op_or = Token(type="OR", value="OR")
    op_and = Token(type="AND", value="AND")
    lp = Token(type="OPENP", value="(")
    rp = Token(type="CLOSEP", value=")")

    cases = [
        ("a OR b", [term("a"), op_or, term("b")]),
        (
            "a or (b AND c)",
            [term("a"), op_or, lp, term("b"), op_and, term("c"), rp],
        ),
        ("oregon", [term("oregon")]),
        (
            '(finance OR "financial theory")\n'
            'AND (laffer AND ("supply-side" OR "trickle-down"))',
            [
                lp, term("finance"), op_or, term("financial theory"), rp,
                op_and,
                lp, term("laffer"), op_and,
                lp, term("supply-side"), op_or, term("trickle-down"),
                rp, rp,
            ],
        ),
        (
            '(finance OR "financial theory" OR "financial economics")\n'
            "AND\n"
            '(laffer AND ("supply-side" OR "trickle-down"))',
            [
                lp, term("finance"), op_or, term("financial theory"),
                op_or, term("financial economics"), rp,
                op_and,
                lp, term("laffer"), op_and,
                lp, term("supply-side"), op_or, term("trickle-down"),
                rp, rp,
            ],
        ),
        (
            '"do and" OR "DO OR DIE"',
            [term("do and"), op_or, term("DO OR DIE")],
        ),
    ]
    for query, expected in cases:
        assert list(tokenize(query)) == expected
| if __name__ == "__main__": | |
| import sys | |
| sys.exit(test()) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment