-
-
Save Quacky2200/714acad06f3f80f6bdb92d7d49dea4bf to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*- | |
# | |
# Used like this: | |
# | |
# from repermute import ipermute | |
# | |
# for s in ipermute('[A-Z]\d'): | |
# print s | |
# | |
# Almost all regular expression constructs are supported except for '*' | |
# (which in the Python sre_parse implementation matches 0 to 65535 | |
# times), '+' and '^'. Non-ASCII characters doesn't work, but I think | |
# that is a limitation in the sre_parse module. It works by first | |
# parsing the regexp to string sequences so that [A-Z] becomes ['A', | |
# 'B', ... 'Z'], \d becomes ['1', ... , '9']. Then a Cartesian join is | |
# applied on all string sequences to get all possible permutations of | |
# them all. | |
import itertools | |
from sre_constants import * | |
import sre_parse | |
import string | |
category_chars = { | |
CATEGORY_DIGIT : string.digits, | |
CATEGORY_SPACE : string.whitespace, | |
CATEGORY_WORD : string.digits + string.ascii_letters + '_' | |
} | |
def unique_extend(res_list, list): | |
for item in list: | |
if item not in res_list: | |
res_list.append(item) | |
def handle_any(val): | |
""" | |
This is different from normal regexp matching. It only matches | |
printable ASCII characters. | |
""" | |
return string.printable | |
def handle_branch(_tuple): | |
(tok, val) = _tuple | |
all_opts = [] | |
for toks in val: | |
opts = permute_toks(toks) | |
unique_extend(all_opts, opts) | |
return all_opts | |
def handle_category(val): | |
return list(category_chars[val]) | |
def handle_in(val): | |
out = [] | |
for tok, val in val: | |
out += handle_tok(tok, val) | |
return out | |
def handle_literal(val): | |
return [chr(val)] | |
def handle_max_repeat(repeat): | |
(min, max, val) = repeat | |
""" | |
Handle a repeat token such as {x,y} or ?. | |
""" | |
subtok, subval = val[0] | |
if max > 5000: | |
# max is the number of cartesian join operations needed to be | |
# carried out. More than 5000 consumes way to much memory. | |
raise ValueError("To many repetitions requested (%d)" % max) | |
optlist = handle_tok(subtok, subval) | |
results = [] | |
for rep in range(min, max + 1): | |
results += [''.join(it) for it in join([optlist] * rep)] | |
return results | |
def handle_range(val): | |
lo, hi = val | |
return (chr(x) for x in range(lo, hi + 1)) | |
def handle_subpattern(val): | |
return list(permute_toks(val[3])) | |
def handle_tok(tok, val): | |
""" | |
Returns a list of strings of possible permutations for this regexp | |
token. | |
""" | |
handlers = { | |
ANY : handle_any, | |
BRANCH : handle_branch, | |
CATEGORY : handle_category, | |
LITERAL : handle_literal, | |
IN : handle_in, | |
MAX_REPEAT : handle_max_repeat, | |
RANGE : handle_range, | |
SUBPATTERN : handle_subpattern} | |
try: | |
return handlers[tok](val) | |
except KeyError as e: | |
fmt = "Unsupported regular expression construct: %s" | |
raise ValueError(fmt % tok) | |
def permute_toks(toks): | |
""" | |
Returns a generator of strings of possible permutations for this | |
regexp token list. | |
""" | |
lists = [handle_tok(tok, val) for tok, val in toks] | |
return (''.join(it) for it in join(lists)) | |
def join(iterlist): | |
""" | |
Cartesian join as an iterator of the supplied sequences. Borrowed | |
from: | |
http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/302478 | |
""" | |
def rloop(seqin, comb): | |
if seqin: | |
for item in seqin[0]: | |
newcomb = comb + [item] | |
for item in rloop(seqin[1:], newcomb): | |
yield item | |
else: | |
yield comb | |
return rloop(iterlist, []) | |
########## PUBLIC API #################### | |
def ipermute(p): | |
toks = [tok_n_val for tok_n_val in sre_parse.parse(p)] | |
return permute_toks(toks) | |
def permute(p): | |
return list(ipermute(p)) |
Change the handle_subpattern function to the following:
def handle_subpattern(val):
return list(permute_toks(val[3]))
You can see the issue when I printed from this function:
def handle_subpattern(val):
print('handle_subpattern val')
print(val)
return list(permute_toks(val[1]))
Returns:
handle_subpattern val
(1, 0, 0, [(LITERAL, 65), (LITERAL, 66)])
Changing the handle_subpattern function to val[3] results in the following list:
['CABABAB']
Did you want the following regexes instead?
(C[AB]){3}
-> ['CACACA', 'CACACB', 'CACBCA', 'CACBCB', 'CBCACA', 'CBCACB', 'CBCBCA', 'CBCBCB']
(C[AB]){2}
-> ['CACA', 'CACB', 'CBCA', 'CBCB']
C[AB]{3}
-> ['CAAA', 'CAAB', 'CABA', 'CABB', 'CBAA', 'CBAB', 'CBBA', 'CBBB']
(similar to below:)
C(A|B){3}
-> ['CAAA', 'CAAB', 'CABA', 'CABB', 'CBAA', 'CBAB', 'CBBA', 'CBBB']
That fixed it! Thanks!
Appears to only generate the OR permutations for the last capturing group in my regex...
Here's the regex I'm trying to permutate:
(abcd|AbCd)(1!|(1|!))?(foo11bar|bar11foo|asd|dsa|qwerty)(1!|(1|!))?
I'm getting only these results:
abcdfoo11bar
abcdfoo11bar1!
abcdfoo11bar1
abcdfoo11bar!
So basically, it seems to only take the first choice from each capturing group (or in the case of (1!|(1|!))?
, nothing. i.e. the "first choice", due to ?
), except for the last one, where it correctly generates each option.
Hi @OneNot,
Thanks for raising an issue with this. I think the problem lies with handle_max_repeat
. My initial thought was that the function was only returning a string (misread the iterator) but the issue seems to lie directly with how itertools.chain and the deconstruction of results from the join iterable are stored. This is the likely reason as to why non-empty lists are flattened and empty lists are ignored. Explicitly stating these and converting them into the result as strings, without any need for deconstruction or the itertools.chain function fixes this.
It's unclear if my change could potentially impact other advanced regexes. I don't have a set of tests and because I've not been able to test this extensively, I cannot provide any guarantee of further stability.
Please note I'm not currently using repermute in any projects and the previous owner of repermute seems to have abandoned it over 5 years ago.
A change from:
def handle_max_repeat(repeat):
(min, max, val) = repeat
"""
Handle a repeat token such as {x,y} or ?.
"""
subtok, subval = val[0]
if max > 5000:
# max is the number of cartesian join operations needed to be
# carried out. More than 5000 consumes way to much memory.
raise ValueError("To many repetitions requested (%d)" % max)
optlist = handle_tok(subtok, subval)
iterlist = []
for x in range(min, max + 1):
joined = join([optlist] * x)
iterlist.append(joined)
return (''.join(it) for it in itertools.chain(*iterlist))
into the following:
def handle_max_repeat(repeat):
(min, max, val) = repeat
"""
Handle a repeat token such as {x,y} or ?.
"""
subtok, subval = val[0]
if max > 5000:
# max is the number of cartesian join operations needed to be
# carried out. More than 5000 consumes way to much memory.
raise ValueError("To many repetitions requested (%d)" % max)
optlist = handle_tok(subtok, subval)
results = []
for rep in range(min, max + 1):
results += [''.join(it) for it in join([optlist] * rep)]
return results
Should provide the correct response.
Let me know if you still believe there's a problem. Otherwise, I believe this fixes your issue and will happily update the original.
Test results:
/C(AB){3}/
CABABAB
/(C[AB]){3}/
CACACA
CACACB
CACBCA
CACBCB
CBCACA
CBCACB
CBCBCA
CBCBCB
(123|456)?(abcd|zyxw)
abcd
zyxw
123abcd
123zyxw
456abcd
456zyxw
(abcd|AbCd)(1!|(1|!))?(foo11bar|bar11foo|asd|dsa|qwerty)(1!|(1|!))?
abcdfoo11bar
abcdfoo11bar1!
abcdfoo11bar1
abcdfoo11bar!
abcdbar11foo
abcdbar11foo1!
abcdbar11foo1
abcdbar11foo!
abcdasd
abcdasd1!
abcdasd1
abcdasd!
abcddsa
abcddsa1!
abcddsa1
abcddsa!
abcdqwerty
abcdqwerty1!
abcdqwerty1
abcdqwerty!
abcd1!foo11bar
abcd1!foo11bar1!
abcd1!foo11bar1
abcd1!foo11bar!
abcd1!bar11foo
abcd1!bar11foo1!
abcd1!bar11foo1
abcd1!bar11foo!
abcd1!asd
abcd1!asd1!
abcd1!asd1
abcd1!asd!
abcd1!dsa
abcd1!dsa1!
abcd1!dsa1
abcd1!dsa!
abcd1!qwerty
abcd1!qwerty1!
abcd1!qwerty1
abcd1!qwerty!
abcd1foo11bar
abcd1foo11bar1!
abcd1foo11bar1
abcd1foo11bar!
abcd1bar11foo
abcd1bar11foo1!
abcd1bar11foo1
abcd1bar11foo!
abcd1asd
abcd1asd1!
abcd1asd1
abcd1asd!
abcd1dsa
abcd1dsa1!
abcd1dsa1
abcd1dsa!
abcd1qwerty
abcd1qwerty1!
abcd1qwerty1
abcd1qwerty!
abcd!foo11bar
abcd!foo11bar1!
abcd!foo11bar1
abcd!foo11bar!
abcd!bar11foo
abcd!bar11foo1!
abcd!bar11foo1
abcd!bar11foo!
abcd!asd
abcd!asd1!
abcd!asd1
abcd!asd!
abcd!dsa
abcd!dsa1!
abcd!dsa1
abcd!dsa!
abcd!qwerty
abcd!qwerty1!
abcd!qwerty1
abcd!qwerty!
AbCdfoo11bar
AbCdfoo11bar1!
AbCdfoo11bar1
AbCdfoo11bar!
AbCdbar11foo
AbCdbar11foo1!
AbCdbar11foo1
AbCdbar11foo!
AbCdasd
AbCdasd1!
AbCdasd1
AbCdasd!
AbCddsa
AbCddsa1!
AbCddsa1
AbCddsa!
AbCdqwerty
AbCdqwerty1!
AbCdqwerty1
AbCdqwerty!
AbCd1!foo11bar
AbCd1!foo11bar1!
AbCd1!foo11bar1
AbCd1!foo11bar!
AbCd1!bar11foo
AbCd1!bar11foo1!
AbCd1!bar11foo1
AbCd1!bar11foo!
AbCd1!asd
AbCd1!asd1!
AbCd1!asd1
AbCd1!asd!
AbCd1!dsa
AbCd1!dsa1!
AbCd1!dsa1
AbCd1!dsa!
AbCd1!qwerty
AbCd1!qwerty1!
AbCd1!qwerty1
AbCd1!qwerty!
AbCd1foo11bar
AbCd1foo11bar1!
AbCd1foo11bar1
AbCd1foo11bar!
AbCd1bar11foo
AbCd1bar11foo1!
AbCd1bar11foo1
AbCd1bar11foo!
AbCd1asd
AbCd1asd1!
AbCd1asd1
AbCd1asd!
AbCd1dsa
AbCd1dsa1!
AbCd1dsa1
AbCd1dsa!
AbCd1qwerty
AbCd1qwerty1!
AbCd1qwerty1
AbCd1qwerty!
AbCd!foo11bar
AbCd!foo11bar1!
AbCd!foo11bar1
AbCd!foo11bar!
AbCd!bar11foo
AbCd!bar11foo1!
AbCd!bar11foo1
AbCd!bar11foo!
AbCd!asd
AbCd!asd1!
AbCd!asd1
AbCd!asd!
AbCd!dsa
AbCd!dsa1!
AbCd!dsa1
AbCd!dsa!
AbCd!qwerty
AbCd!qwerty1!
AbCd!qwerty1
AbCd!qwerty!
Thanks for getting back to me.
I solved the issue I originally wanted to use this for, using a different method, but it's nice to get it fixed for the next person (or future me) that might want to use it.
The fix and test results look right to me. As for the possible side effects of this fix, without extensive testing, I'm sure you'd know better than me.
I was trying to generate permutations of the regex "C(AB){3}" and it bombed with TypeError: 'int' object is not iterable. I think the parentheses confused it.