Created
October 18, 2023 17:36
-
-
Save kenballus/c6c3b80c05f835c25d6aad671a95ef49 to your computer and use it in GitHub Desktop.
A function for generating examples that match a given Python re for bytes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This relies on the internal workings of the re module, so don't be surprised if it crashes or doesn't | |
# work on versions of Python other than 3.11.3. | |
# I do not want to support Unicode, because it would be way harder. This only works for bytes. | |
# If you want something that supports Unicode, consider using hypothesis.strategies.from_regex. | |
# This also does not support \A, \Z, \b, ^, and $. | |
import re | |
from re._constants import _NamedIntConstant as RegexConstant # type: ignore | |
from re._parser import ( # type: ignore | |
IN, | |
CATEGORY_DIGIT, | |
CATEGORY_NOT_DIGIT, | |
CATEGORY_WORD, | |
CATEGORY_NOT_WORD, | |
CATEGORY_SPACE, | |
CATEGORY_NOT_SPACE, | |
SubPattern, | |
LITERAL, | |
NOT_LITERAL, | |
MAX_REPEAT, | |
MAXREPEAT, | |
SUBPATTERN, | |
NEGATE, | |
RANGE, | |
CATEGORY, | |
BRANCH, | |
ANY, | |
) | |
import random | |
from typing import Iterable | |
WORD_CHARSET: frozenset[int] = frozenset( | |
b"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz\xaa\xb2\xb3\xb5\xb9\xba\xbc\xbd\xbe\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff" | |
) | |
SPACE_CHARSET: frozenset[int] = frozenset(b"\t\n\x0b\x0c\r\x1c\x1d\x1e\x1f \x85\xa0") | |
DIGIT_CHARSET: frozenset[int] = frozenset(b"0123456789") | |
ALL_CHARSET: frozenset[int] = frozenset(range(256)) | |
def category_to_charset(category: RegexConstant) -> frozenset[int]: | |
if category == CATEGORY_DIGIT: | |
return DIGIT_CHARSET | |
if category == CATEGORY_NOT_DIGIT: | |
return negate_charset(DIGIT_CHARSET) | |
if category == CATEGORY_WORD: | |
return WORD_CHARSET | |
if category == CATEGORY_NOT_WORD: | |
return negate_charset(WORD_CHARSET) | |
if category == CATEGORY_SPACE: | |
return SPACE_CHARSET | |
if category == CATEGORY_NOT_SPACE: | |
return negate_charset(SPACE_CHARSET) | |
raise NotImplementedError(f"I don't know how to generate examples of {category}") | |
def negate_charset(charset: Iterable[int]) -> frozenset[int]: | |
return ALL_CHARSET - set(charset) | |
def generate_random_match_from_pattern(pattern: bytes | str) -> bytes: | |
return generate_random_match_from_tree(re._parser.parse(pattern)) | |
def generate_random_match_from_tree(parse_tree: SubPattern) -> bytes: | |
""" | |
This takes the parse tree output from re._parser.parse and | |
returns a bytes object that matches the parse tree. | |
""" | |
result: bytes = b"" | |
if len(parse_tree) == 0: | |
return result | |
curr = parse_tree[0] # I don't know what type to put here. | |
node_type: RegexConstant = curr[0] | |
node_value = curr[1] # I don't know what type to put here. | |
if node_type == LITERAL: | |
code_point: int = node_value | |
result = bytes([code_point]) | |
elif node_type == NOT_LITERAL: | |
forbidden_code_point: int = node_value | |
result = generate_random_match_from_tree( | |
[(IN, [(LITERAL, b) for b in negate_charset([forbidden_code_point])])] | |
) | |
elif node_type == MAX_REPEAT: | |
min_reps: int = node_value[0] | |
max_reps: int | RegexConstant = node_value[1] | |
if min_reps == 0 and (max_reps == MAXREPEAT or max_reps > min_reps): | |
min_reps = 1 | |
subpattern: SubPattern = node_value[2] | |
for _ in range(min_reps): | |
result += generate_random_match_from_tree(subpattern) | |
elif node_type == SUBPATTERN: | |
result = generate_random_match_from_tree(node_value[3]) | |
elif node_type == IN: | |
# This needs to handle literal, range, and category | |
# It also needs to handle negations for all of those | |
need_to_negate: bool = False | |
charset: set[int] = set() | |
for subpattern in node_value: | |
if subpattern[0] == NEGATE: | |
need_to_negate = not need_to_negate | |
elif subpattern[0] == LITERAL: | |
charset |= set([subpattern[1]]) | |
elif subpattern[0] == RANGE: | |
charset |= set(range(subpattern[1][0], subpattern[1][1] + 1)) | |
elif subpattern[0] == CATEGORY: | |
charset |= category_to_charset(subpattern[1]) | |
else: | |
raise NotImplementedError(f"I don't know how to generate examples of {subpattern[0]}") | |
result = bytes([random.choice(list(negate_charset(charset) if need_to_negate else charset))]) | |
elif node_type == BRANCH: | |
result = generate_random_match_from_tree(random.choice(node_value[1])) | |
elif node_type == ANY: | |
result = generate_random_match_from_tree([(IN, [(LITERAL, b) for b in ALL_CHARSET])]) | |
else: | |
raise NotImplementedError(f"I don't know how to generate examples of {node_type}") | |
return result + generate_random_match_from_tree(parse_tree[1:]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment