Last active
June 12, 2017 07:46
-
-
Save mokhdzanifaeq/25088117c2933b3e8ee9a3bc53d3a24d to your computer and use it in GitHub Desktop.
satisfying regular expression using smt solver
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import sys | |
import string | |
from z3 import * | |
# hackish way to generate string that satisfy simple regex | |
# what's not supported: group reference & look behind | |
# usage: python regex.py PATTERN LENGTH | |
pattern = sys.argv[1] | |
length = int(sys.argv[2]) | |
solver = Solver() | |
chars = [BitVec("%i" % i, 32) for i in xrange(length)] | |
categories = { | |
'category_digit': list(string.digits), | |
'category_not_digit': list(string.ascii_letters + string.punctuation), | |
'category_space': list(string.whitespace), | |
'category_not_space': list(string.printable.strip()), | |
'category_word': list(string.ascii_letters + string.digits + '_'), | |
'category_not_word': set(string.printable) - set(string.ascii_letters + string.digits + '_') | |
} | |
# solve recursively | |
def solve(regex, pos, fill = 1, step = 1): | |
if pos == length and regex: | |
if regex[0][1] != 'at_end': | |
raise Exception('Specified length is too short') | |
if not regex: | |
if pos != length and fill == 1: | |
return [chars[i] == 32 for i in xrange(pos, length)] | |
else: | |
return [] | |
key, value = regex[0] | |
if key == 'literal': | |
expr = [chars[pos] == value] | |
elif key == 'not_literal': | |
expr = [chars[pos] != value] | |
elif key == 'in': | |
clause = solve(value, pos, 0, 0) | |
expr = [Not(Or(clause))] if value[0][0] == 'negate' else [Or(clause)] | |
elif key == 'any': | |
expr = [And(chars[pos] >= 33, chars[pos] <= 126)] | |
elif key == 'range': | |
expr = [And(chars[pos] >= value[0], chars[pos] <= value[1])] | |
elif key == 'category': | |
expr = [chars[pos] == ord(x) for x in categories[value]] | |
elif key == 'branch': | |
clause = [] | |
for branch in value[1]: | |
inner_clause = solve(branch, pos, 0) | |
inner_clause += solve(regex[1:], pos + len(branch)) | |
clause.append(And(inner_clause)) | |
return [Or(clause)] | |
elif key == 'subpattern': | |
regex = regex[1:] | |
clause = list(value[1]) | |
while regex: | |
clause.append(regex[0]) | |
regex = regex[1:] | |
expr = solve(clause, pos) | |
pos = length | |
elif key == 'assert': | |
expr = solve(value[1], pos) | |
pos -= 1 | |
fill = 0 | |
elif key == 'assert_not': | |
expr = [Not(And(solve(value[1], pos)))] | |
pos -= 1 | |
fill = 0 | |
elif key == 'groupref': | |
# TODO | |
pass | |
elif key == 'max_repeat' or key == 'min_repeat': | |
clause = [] | |
iteration = min(length, value[1]) + 1 | |
for i in xrange(value[0], iteration): | |
tmp = list(value[2]) * i + list(regex[1:]) | |
try: clause.append(And(solve(tmp, pos))) | |
except Exception: break | |
return [Or(clause)] | |
elif key == 'negate' or key == 'at': | |
expr = [] | |
pos -= step | |
return expr + solve(regex[1:], pos + step, fill, step) | |
# all must be printable | |
for i in xrange(length): | |
solver.add(chars[i] >= 32, chars[i] <= 126) | |
regex = re.sre_parse.parse(pattern) | |
solver.add(solve(regex, 0)) | |
if str(solver.check()) == 'sat': | |
model = solver.model() | |
answer = ['' for i in xrange(len(model))] | |
for d in model.decls(): | |
answer[int(str(d.name()))]= chr(model[d].as_long()) | |
print ''.join(answer) | |
else: | |
print 'unsat' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment