Last active
August 28, 2019 13:19
-
-
Save bsolomon1124/4585735891542fe021e12f4c11f8699b to your computer and use it in GitHub Desktop.
Boolean query string parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """Boolean query parser.""" | |
| import collections | |
| import enum | |
| import re | |
| from typing import Generator, Iterable, List, Optional, Sequence, Tuple, Union | |
class BooleanOp(enum.Enum):
    """Binary boolean operators recognized in a query string."""

    # Member values double as the canonical operator token text.
    # Note: Enum defines __hash__ as hash(self._name_).
    OR = "OR"
    AND = "AND"
class Node(object):
    """Self-referential node in the query tree.

    Structurally similar to a node in a binary search tree — see
    section 6.5 of The C Programming Language (K&R) — except that this
    tree is not binary: a node may hold any number of children.  The
    equivalent C binary-tree node would be::

        struct tnode {
            char *word;
            int count;
            struct tnode *left;
            struct tnode *right;
        }

    TODO: good place for attr.s/attr.ib or just namedtuple.
    """

    __slots__ = ("value", "children", "negated")

    def __init__(
        self,
        value: Union[BooleanOp, str],
        children: Optional[Iterable["Node"]] = None,
        negated: bool = False,
    ):
        # value is either an operator (internal node) or a term/phrase
        # (leaf); children is stored exactly as given, None for leaves.
        self.value = value
        self.children = children
        self.negated = negated

    def __str__(self):
        marker = "NOT:" if self.negated else ""
        return f"<{self.__class__.__name__}> {marker}{self.value}"

    __repr__ = __str__
| Token = collections.namedtuple("Token", ["type", "value"]) | |
class BadQueryError(Exception):
    """Raised when a query string contains an unrecognizable character."""
| _TOK_SPEC: Sequence[Tuple[str, str]] = ( | |
| ("OPENP", r"\("), # Open parenthesis | |
| ("CLOSEP", r"\)"), # Close parenthesis | |
| ("SPACE", r"\s+"), # [ \t\n\r\f\v] | |
| # A TERM is either: | |
| # 1. phrase of 1 or more terms surrounded in quotes | |
| # 2. single word/term, optionally surrounded in quotes | |
| ("TERM", r'"[^"]+"|[^()\s"]+'), | |
| ("ERR", r"."), # Any other character | |
| ) | |
| _TOK_PAT = re.compile( | |
| r"|".join("(?P<%s>%s)" % pair for pair in _TOK_SPEC), re.I | |
| ) | |
def tokenize(query: str, tok_pat=_TOK_PAT) -> Generator[Token, None, None]:
    '''Break string into a sequence of tokens, dropping whitespace.

    Bare (unquoted) AND/OR words, in any case, become operator tokens;
    every other word or quoted phrase becomes a TERM token with any
    surrounding quotes stripped.

    Raises:
        BadQueryError: if a character matches no other token rule
            (in practice, an unbalanced double quote).

    >>> query = """\\
    ... (finance OR "financial theory")
    ... AND
    ... (laffer AND ("supply-side" OR "trickle-down"))"""
    >>> list(tokenize(query))  # doctest: +NORMALIZE_WHITESPACE
    [Token(type='OPENP', value='('),
     Token(type='TERM', value='finance'),
     Token(type='OR', value='OR'),
     Token(type='TERM', value='financial theory'),
     Token(type='CLOSEP', value=')'),
     Token(type='AND', value='AND'),
     Token(type='OPENP', value='('),
     Token(type='TERM', value='laffer'),
     Token(type='AND', value='AND'),
     Token(type='OPENP', value='('),
     Token(type='TERM', value='supply-side'),
     Token(type='OR', value='OR'),
     Token(type='TERM', value='trickle-down'),
     Token(type='CLOSEP', value=')'),
     Token(type='CLOSEP', value=')')]
    '''
    # Tokenizer technique from:
    # https://docs.python.org/3/library/re.html#writing-a-tokenizer
    for mo in tok_pat.finditer(query):
        kind = mo.lastgroup
        value = mo.group()
        if kind == "SPACE":
            continue  # Whitespace separates tokens but is never emitted
        if kind == "ERR":
            raise BadQueryError(value)
        elif kind == "TERM":
            # A *quoted* operator word (e.g. '"and"') still carries its
            # quotes here, so it falls through to the TERM branch below.
            uval = value.upper()
            if uval == "AND":
                yield Token("AND", "AND")
            elif uval == "OR":
                yield Token("OR", "OR")
            else:
                yield Token(kind, value.strip('"'))
        else:
            yield Token(kind, value)
# Alias for the concrete container TokenSequence specializes.
TokenSequenceType = List[Token]


class TokenSequence(TokenSequenceType):
    """Ordered sequence of Token objects."""

    @property
    def is_ambiguous_and_or(self) -> bool:
        """True if the sequence mixes AND and OR with no parentheses.

        Such a flat mix (e.g. ``a OR b AND c``) has no well-defined
        precedence under this grammar and must be rejected.
        """
        n_parens = n_and = n_or = 0
        i: Token  # elements are Token instances, not str
        for i in self:
            if i.type in {"OPENP", "CLOSEP"}:
                n_parens += 1
            elif i.type == "AND":
                n_and += 1
            elif i.type == "OR":
                n_or += 1
        return n_parens == 0 and n_and > 0 and n_or > 0

    @property
    def first_op(self) -> Optional[str]:
        """The type ("AND" or "OR") of the first operator token, or None.

        Raises:
            ValueError: if the sequence is an ambiguous flat and/or mix.
        """
        if self.is_ambiguous_and_or:
            raise ValueError("Ambiguous flat and/or")
        for i in self:
            if i.type == "AND":
                return "AND"
            if i.type == "OR":
                return "OR"
        return None  # No op found

    @property
    def is_flat(self) -> bool:
        """Is this a 'bottom-level' TokenSequence?

        - single term or phrase, or
        - non-nested chain of terms/phrases
        """
        return not any(i.type in {"OPENP", "CLOSEP"} for i in self)
def get_groups(
    query: str
) -> Generator[Tuple[int, int, TokenSequence], None, None]:
    """Yield (nest-level, n_seen, TokenSequence) triples.

    Each parenthesized group is emitted as its closing paren is reached
    (so inner groups precede the groups that contain them); the whole
    token stream is emitted last at level 0.  See
    BooleanQueryParser.__doc__ for examples of levels.

    In the query::

        (finance OR "financial theory")
        AND
        (laffer AND ("supply-side" OR "trickle-down"))

    the first "stack pop" occurs at the paren closing "trickle-down":
    the stack then holds 2 open parens and we pop 1 off.
    """
    tokens = list(tokenize(query))  # Materialize so we can slice by index
    open_positions = []  # Stack of indices of as-yet-unmatched "("
    n_seen = 0  # Number of groups encountered (yielded) so far
    for idx, tok in enumerate(tokens):
        if tok.type == "OPENP":
            # Descend 1 level & implicitly start a child group
            open_positions.append(idx)
        elif tok.type == "CLOSEP" and open_positions:
            # Close the innermost open group and emit its contents
            start = open_positions.pop()
            yield len(open_positions) + 1, n_seen, TokenSequence(
                tokens[start + 1 : idx]
            )
            n_seen += 1
    # The token stream as a whole is not 'bookended' in parens by default
    yield 0, n_seen, TokenSequence(tokens)
class BooleanQueryParser(object):
    """Constructs a tree of Node objects from a boolean query string.

    NOTE(review): parsing is not yet implemented (``__call__`` raises
    NotImplementedError); this docstring records the intended algorithm.

    Algorithm
    ---------

    Input:

        (finance OR "financial theory")
        AND
        (laffer AND ("supply-side" OR "trickle-down"))

    Output:

        0         _________ AND ________
                 |                      |
        1     _____ OR _____       _____ AND _____
             |              |     |               |
        2  finance      financial laffer    __ OR ___________
                        theory             |                 |
                                      supply-side      trickle-down

    From this the logic is to work "inside out":

        # Innermost nodes with no children come first
        n1 = Node("finance")
        n2 = Node("financial theory")
        n3 = Node("supply-side")
        n4 = Node("trickle-down")
        n5 = Node(BooleanOp.OR, children=[n1, n2])
        n6 = Node(BooleanOp.AND, children=[n3, n4])
        root = Node(BooleanOp.AND, children=[n5, n6])

    Rules:
        - phrases must be surrounded in quotes
        - and/or are not case sensitive
        - supports nesting
        - case-insensitive terms
        - instead of one or the other being "tightly binding",
          you *must* use parentheses unambiguously.
          That means `a OR b AND c` is not kosher.
        - terms or phrases cannot contain special characters:
          "(", ")", '"'
    """

    def __call__(self, query: str):
        # Parsing entry point — intentionally unimplemented stub.
        raise NotImplementedError
def test():
    """Sanity-check tokenize() against hand-verified token streams."""

    # Small builders keep the expected sequences readable.
    def term(word):
        return Token(type="TERM", value=word)

    op_or = Token(type="OR", value="OR")
    op_and = Token(type="AND", value="AND")
    lp = Token(type="OPENP", value="(")
    rp = Token(type="CLOSEP", value=")")

    cases = [
        ("a OR b", [term("a"), op_or, term("b")]),
        (
            "a or (b AND c)",
            [term("a"), op_or, lp, term("b"), op_and, term("c"), rp],
        ),
        ("oregon", [term("oregon")]),
        (
            '(finance OR "financial theory")\n'
            'AND (laffer AND ("supply-side" OR "trickle-down"))',
            [
                lp, term("finance"), op_or, term("financial theory"), rp,
                op_and,
                lp, term("laffer"), op_and,
                lp, term("supply-side"), op_or, term("trickle-down"),
                rp, rp,
            ],
        ),
        (
            '(finance OR "financial theory" OR "financial economics")\n'
            "AND\n"
            '(laffer AND ("supply-side" OR "trickle-down"))',
            [
                lp, term("finance"), op_or, term("financial theory"),
                op_or, term("financial economics"), rp,
                op_and,
                lp, term("laffer"), op_and,
                lp, term("supply-side"), op_or, term("trickle-down"),
                rp, rp,
            ],
        ),
        (
            '"do and" OR "DO OR DIE"',
            [term("do and"), op_or, term("DO OR DIE")],
        ),
    ]
    for query, expected in cases:
        assert list(tokenize(query)) == expected
| if __name__ == "__main__": | |
| import sys | |
| sys.exit(test()) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment