Created
June 17, 2023 15:19
-
-
Save GibsonRuitiari/14c5b5199945474ca8b01004a7db711f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.util.* | |
import kotlin.random.Random | |
// for references see https://swtch.com/~rsc/regexp/regexp-bytecode.c.txt | |
// https://swtch.com/~rsc/regexp/regexp1.html | |
fun makeRange(start: Char, end: Char): List<Char> { | |
val out = mutableListOf('(') | |
for (s in start until end) { | |
out.add(s) | |
out.add('|') | |
} | |
out.add(end) | |
return out | |
} | |
/* | |
add additional symbols and transformations to the input regular | |
expression to make it compatible with the subsequent conversion to postfix notation | |
*/ | |
fun preProcess(expr: String): List<Char> { | |
val out = mutableListOf<Char>() | |
var i = 0 | |
while (i < expr.length) { | |
if (expr[i] == '[') { | |
val start = expr[i + 1] | |
val end = expr[i + 3] | |
out.addAll(makeRange(start, end)) | |
i += 4 | |
} else { | |
out.add(if (expr[i] != ']') expr[i] else ')') | |
if (expr[i] in setOf('(', '|')) { | |
i++ | |
continue | |
} else if (i < expr.length - 1) { | |
if (expr[i + 1] !in setOf('*', '?', '+', ')', '|', ']')) { | |
out.add('.') | |
} | |
} | |
i++ | |
} | |
} | |
return out | |
} | |
/* | |
used in the toPosfix function to determine the precedence of operators in the regular expression. | |
It assigns a numerical value to each operator to establish their relative precedence. | |
The keys in the Precedence map represent the operators, and the corresponding values | |
represent their precedence. | |
A lower value indicates a lower precedence, | |
while a higher value indicates a higher precedence. | |
*/ | |
val Precedence = mapOf('|' to 0, // NFA Union Operator | |
'.' to 1, //NFA Concat Operator | |
'?' to 2, // NFA zero or one Operator | |
'*' to 2, // NFA Closure Operator | |
'+' to 2 , // NFA one or more Operator | |
) | |
/* | |
* converts the regex into a postfix notation i.e operators are placed after their | |
* operands | |
* this is done to preserve the order of operations | |
* | |
*/ | |
fun toPosfix(expr: List<Char>): String { | |
val out = mutableListOf<Char>() | |
val stack = Stack<Char>() | |
for (symb in expr) { | |
when (symb) { | |
'(' -> stack.push(symb) | |
')' -> { | |
try { | |
while (stack.peek() != '(') { | |
out.add(stack.pop()) | |
} | |
} catch (e: EmptyStackException) { | |
System.err.write("Invalid regex pattern.\n".toByteArray()) | |
System.exit(64) | |
} | |
stack.pop() | |
} | |
in setOf('+', '*', '?', '.', '|') -> { | |
while (!stack.isEmpty()) { | |
if (stack.peek() == '(') { | |
break | |
} else if (Precedence[symb]!! > Precedence[stack.peek()]!!) { | |
break | |
} else { | |
out.add(stack.pop()) | |
} | |
} | |
stack.push(symb) | |
} | |
else -> out.add(symb) | |
} | |
} | |
while (!stack.isEmpty()) { | |
out.add(stack.pop()) | |
} | |
return out.joinToString("") | |
} | |
// a state must have a transitions, epsilons, name, and a flag to know | |
// whether it is an acceptance state or initial state | |
// the name is just for debugging purposes though | |
//epsilons represents the set of states that can be reached | |
// by following an epsilon transition (a transition that does not have an input) | |
// transitions represents set of states that can be reached after consuming an input | |
class State(var isEnd: Boolean) { | |
val transition = mutableMapOf<Char, State>() | |
val epsilonTransitions = mutableListOf<State>() | |
fun addEpsilonTransition(to: State) { | |
epsilonTransitions.add(to) | |
} | |
fun addTransition(to: State, symbol: Char) { | |
transition[symbol] = to | |
} | |
} | |
// an NFA must have an initial and acceptance state | |
class NFA(val start: State, val end: State) | |
// handlers that handle the symbols/literals present in the regex expression | |
//currently supported operators are *|?+ hence {m,n} /quantifiers are not supported for now | |
fun fromEpsilon(): NFA { | |
val start = State(false) | |
val end = State(true) | |
start.addEpsilonTransition(end) | |
return NFA(start, end) | |
} | |
fun fromSymbol(symbol: Char): NFA { | |
val start = State(false) | |
val end = State(true) | |
start.addTransition(end, symbol) | |
return NFA(start, end) | |
} | |
fun concatenate(first: NFA, second: NFA): NFA { | |
first.end.addEpsilonTransition(second.start) | |
first.end.isEnd = false | |
return NFA(first.start, second.end) | |
} | |
fun union(first: NFA, second: NFA): NFA { | |
val start = State(false) | |
start.addEpsilonTransition(first.start) | |
start.addEpsilonTransition(second.start) | |
val end = State(true) | |
first.end.addEpsilonTransition(end) | |
first.end.isEnd = false | |
second.end.addEpsilonTransition(end) | |
second.end.isEnd = false | |
return NFA(start, end) | |
} | |
fun closure(nfa: NFA): NFA { | |
val start = State(false) | |
val end = State(true) | |
start.addEpsilonTransition(nfa.start) | |
start.addEpsilonTransition(end) | |
nfa.end.addEpsilonTransition(nfa.start) | |
nfa.end.addEpsilonTransition(end) | |
nfa.end.isEnd = false | |
return NFA(start, end) | |
} | |
fun oneOrMore(nfa: NFA): NFA { | |
val start = State(false) | |
val end = State(true) | |
start.addEpsilonTransition(nfa.start) | |
nfa.end.addEpsilonTransition(nfa.start) | |
nfa.end.addEpsilonTransition(end) | |
nfa.end.isEnd = false | |
return NFA(start, end) | |
} | |
fun zeroOrOne(nfa: NFA): NFA { | |
val start = State(false) | |
val end = State(true) | |
start.addEpsilonTransition(nfa.start) | |
start.addEpsilonTransition(end) | |
nfa.end.addEpsilonTransition(end) | |
nfa.end.isEnd = false | |
return NFA(start, end) | |
} | |
// convert the expanded expression to an NFA | |
fun toNFA(posfixExpr: String): NFA { | |
if (posfixExpr.isEmpty()) { | |
return fromEpsilon() | |
} | |
val stack = Stack<NFA>() | |
try { | |
for (symb in posfixExpr) { | |
when (symb) { | |
'*' -> stack.push(closure(stack.pop())) | |
'+' -> stack.push(oneOrMore(stack.pop())) | |
'?' -> stack.push(zeroOrOne(stack.pop())) | |
'|' -> { | |
val right = stack.pop() | |
val left = stack.pop() | |
stack.push(union(left, right)) | |
} | |
'.' -> { | |
val right = stack.pop() | |
val left = stack.pop() | |
stack.push(concatenate(left, right)) | |
} | |
else -> stack.push(fromSymbol(symb)) | |
} | |
} | |
} catch (e: EmptyStackException) { | |
System.err.write("Invalid regex pattern.\n".toByteArray()) | |
System.exit(64) | |
} | |
return stack.pop() | |
} | |
fun setNextState(state: State, nextStates: MutableList<State>, visited: MutableList<State>) { | |
if (state !in visited) { | |
visited.add(state) | |
nextStates.add(state) | |
if (state.epsilonTransitions.isNotEmpty()) { | |
for (stt in state.epsilonTransitions) { | |
setNextState(stt, nextStates, visited) | |
} | |
} | |
} | |
} | |
/* | |
start transition based on the initial state, | |
follow the transitions based on the pattern's characters | |
check if a match exists in the final set of states | |
*/ | |
fun search(nfa: NFA, word: String): Boolean { | |
val current = mutableListOf<State>() | |
setNextState(nfa.start, current, mutableListOf()) | |
for (symbol in word) { | |
val nextStates = mutableListOf<State>() | |
for (state in current) { | |
if (symbol in state.transition) { | |
setNextState(state.transition[symbol]!!, nextStates, mutableListOf()) | |
} | |
} | |
current.clear() | |
current.addAll(nextStates) | |
} | |
for (state in current) { | |
if (state.isEnd) { | |
return true | |
} | |
} | |
return false | |
} | |
class Regex(expr: String) { | |
val nfa: NFA | |
init { | |
val postfixExpr = toPosfix(preProcess(expr)) | |
nfa = toNFA(postfixExpr) | |
} | |
fun match(word: String): Boolean { | |
return search(nfa, word) | |
} | |
} | |
fun generateRandomString(regex: Regex, requiredLength:Int): String { | |
val nfa = regex.nfa | |
val random = Random.Default | |
val sb = StringBuilder() | |
var currentLength=0 | |
var currentState = nfa.start | |
while (currentLength< requiredLength && !currentState.isEnd) { | |
val availableTransitions = currentState.transition.keys.toList() | |
if (availableTransitions.isNotEmpty()) { | |
val randomTransition = availableTransitions.random(random) | |
val nextState = currentState.transition[randomTransition]!! | |
sb.append(randomTransition) | |
currentState = nextState | |
currentLength++ | |
} else { | |
// No available transitions, backtrack using epsilon transitions | |
// uses backtracking..can it be avoided? Dunno not sure honestly | |
val epsilonTransitions = currentState.epsilonTransitions | |
if (epsilonTransitions.isNotEmpty()) { | |
val randomEpsilonTransition = epsilonTransitions.random(random) | |
currentState = randomEpsilonTransition | |
} else { | |
// No available transitions or epsilon transitions, terminate the loop | |
break | |
} | |
} | |
} | |
return sb.toString() | |
} | |
fun main() { | |
val r = Regex("(ab)*") | |
r.match("abab") | |
repeat(10){ | |
println(generateRandomString(r, 5)) | |
} | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment