Skip to content

Instantly share code, notes, and snippets.

@kenballus
Created October 18, 2023 17:36
Show Gist options
  • Save kenballus/c6c3b80c05f835c25d6aad671a95ef49 to your computer and use it in GitHub Desktop.
Save kenballus/c6c3b80c05f835c25d6aad671a95ef49 to your computer and use it in GitHub Desktop.
A function for generating examples that match a given Python re for bytes
# This relies on the internal workings of the re module, so don't be surprised if it crashes or doesn't
# work on versions of Python other than 3.11.3.
# I do not want to support Unicode, because it would be way harder. This only works for bytes.
# If you want something that supports Unicode, consider using hypothesis.strategies.from_regex.
# This also does not support \A, \Z, \b, ^, and $.
import re
from re._constants import _NamedIntConstant as RegexConstant # type: ignore
from re._parser import ( # type: ignore
IN,
CATEGORY_DIGIT,
CATEGORY_NOT_DIGIT,
CATEGORY_WORD,
CATEGORY_NOT_WORD,
CATEGORY_SPACE,
CATEGORY_NOT_SPACE,
SubPattern,
LITERAL,
NOT_LITERAL,
MAX_REPEAT,
MAXREPEAT,
SUBPATTERN,
NEGATE,
RANGE,
CATEGORY,
BRANCH,
ANY,
)
import random
from typing import Iterable
WORD_CHARSET: frozenset[int] = frozenset(
b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz\xaa\xb2\xb3\xb5\xb9\xba\xbc\xbd\xbe\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff"
)
SPACE_CHARSET: frozenset[int] = frozenset(b"\t\n\x0b\x0c\r\x1c\x1d\x1e\x1f \x85\xa0")
DIGIT_CHARSET: frozenset[int] = frozenset(b"0123456789")
ALL_CHARSET: frozenset[int] = frozenset(range(256))
def category_to_charset(category: RegexConstant) -> frozenset[int]:
if category == CATEGORY_DIGIT:
return DIGIT_CHARSET
if category == CATEGORY_NOT_DIGIT:
return negate_charset(DIGIT_CHARSET)
if category == CATEGORY_WORD:
return WORD_CHARSET
if category == CATEGORY_NOT_WORD:
return negate_charset(WORD_CHARSET)
if category == CATEGORY_SPACE:
return SPACE_CHARSET
if category == CATEGORY_NOT_SPACE:
return negate_charset(SPACE_CHARSET)
raise NotImplementedError(f"I don't know how to generate examples of {category}")
def negate_charset(charset: Iterable[int]) -> frozenset[int]:
return ALL_CHARSET - set(charset)
def generate_random_match_from_pattern(pattern: bytes | str) -> bytes:
return generate_random_match_from_tree(re._parser.parse(pattern))
def generate_random_match_from_tree(parse_tree: SubPattern) -> bytes:
"""
This takes the parse tree output from re._parser.parse and
returns a bytes object that matches the parse tree.
"""
result: bytes = b""
if len(parse_tree) == 0:
return result
curr = parse_tree[0] # I don't know what type to put here.
node_type: RegexConstant = curr[0]
node_value = curr[1] # I don't know what type to put here.
if node_type == LITERAL:
code_point: int = node_value
result = bytes([code_point])
elif node_type == NOT_LITERAL:
forbidden_code_point: int = node_value
result = generate_random_match_from_tree(
[(IN, [(LITERAL, b) for b in negate_charset([forbidden_code_point])])]
)
elif node_type == MAX_REPEAT:
min_reps: int = node_value[0]
max_reps: int | RegexConstant = node_value[1]
if min_reps == 0 and (max_reps == MAXREPEAT or max_reps > min_reps):
min_reps = 1
subpattern: SubPattern = node_value[2]
for _ in range(min_reps):
result += generate_random_match_from_tree(subpattern)
elif node_type == SUBPATTERN:
result = generate_random_match_from_tree(node_value[3])
elif node_type == IN:
# This needs to handle literal, range, and category
# It also needs to handle negations for all of those
need_to_negate: bool = False
charset: set[int] = set()
for subpattern in node_value:
if subpattern[0] == NEGATE:
need_to_negate = not need_to_negate
elif subpattern[0] == LITERAL:
charset |= set([subpattern[1]])
elif subpattern[0] == RANGE:
charset |= set(range(subpattern[1][0], subpattern[1][1] + 1))
elif subpattern[0] == CATEGORY:
charset |= category_to_charset(subpattern[1])
else:
raise NotImplementedError(f"I don't know how to generate examples of {subpattern[0]}")
result = bytes([random.choice(list(negate_charset(charset) if need_to_negate else charset))])
elif node_type == BRANCH:
result = generate_random_match_from_tree(random.choice(node_value[1]))
elif node_type == ANY:
result = generate_random_match_from_tree([(IN, [(LITERAL, b) for b in ALL_CHARSET])])
else:
raise NotImplementedError(f"I don't know how to generate examples of {node_type}")
return result + generate_random_match_from_tree(parse_tree[1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment