Skip to content

Instantly share code, notes, and snippets.

@GibsonRuitiari
Created June 17, 2023 15:19
Show Gist options
  • Save GibsonRuitiari/14c5b5199945474ca8b01004a7db711f to your computer and use it in GitHub Desktop.
Save GibsonRuitiari/14c5b5199945474ca8b01004a7db711f to your computer and use it in GitHub Desktop.
import java.util.*
import kotlin.random.Random
// for references see https://swtch.com/~rsc/regexp/regexp-bytecode.c.txt
// https://swtch.com/~rsc/regexp/regexp1.html
fun makeRange(start: Char, end: Char): List<Char> {
val out = mutableListOf('(')
for (s in start until end) {
out.add(s)
out.add('|')
}
out.add(end)
return out
}
/*
add additional symbols and transformations to the input regular
expression to make it compatible with the subsequent conversion to postfix notation
*/
fun preProcess(expr: String): List<Char> {
val out = mutableListOf<Char>()
var i = 0
while (i < expr.length) {
if (expr[i] == '[') {
val start = expr[i + 1]
val end = expr[i + 3]
out.addAll(makeRange(start, end))
i += 4
} else {
out.add(if (expr[i] != ']') expr[i] else ')')
if (expr[i] in setOf('(', '|')) {
i++
continue
} else if (i < expr.length - 1) {
if (expr[i + 1] !in setOf('*', '?', '+', ')', '|', ']')) {
out.add('.')
}
}
i++
}
}
return out
}
/*
used in the toPosfix function to determine the precedence of operators in the regular expression.
It assigns a numerical value to each operator to establish their relative precedence.
The keys in the Precedence map represent the operators, and the corresponding values
represent their precedence.
A lower value indicates a lower precedence,
while a higher value indicates a higher precedence.
*/
val Precedence = mapOf('|' to 0, // NFA Union Operator
'.' to 1, //NFA Concat Operator
'?' to 2, // NFA zero or one Operator
'*' to 2, // NFA Closure Operator
'+' to 2 , // NFA one or more Operator
)
/*
* converts the regex into a postfix notation i.e operators are placed after their
* operands
* this is done to preserve the order of operations
*
*/
fun toPosfix(expr: List<Char>): String {
val out = mutableListOf<Char>()
val stack = Stack<Char>()
for (symb in expr) {
when (symb) {
'(' -> stack.push(symb)
')' -> {
try {
while (stack.peek() != '(') {
out.add(stack.pop())
}
} catch (e: EmptyStackException) {
System.err.write("Invalid regex pattern.\n".toByteArray())
System.exit(64)
}
stack.pop()
}
in setOf('+', '*', '?', '.', '|') -> {
while (!stack.isEmpty()) {
if (stack.peek() == '(') {
break
} else if (Precedence[symb]!! > Precedence[stack.peek()]!!) {
break
} else {
out.add(stack.pop())
}
}
stack.push(symb)
}
else -> out.add(symb)
}
}
while (!stack.isEmpty()) {
out.add(stack.pop())
}
return out.joinToString("")
}
// a state must have a transitions, epsilons, name, and a flag to know
// whether it is an acceptance state or initial state
// the name is just for debugging purposes though
//epsilons represents the set of states that can be reached
// by following an epsilon transition (a transition that does not have an input)
// transitions represents set of states that can be reached after consuming an input
class State(var isEnd: Boolean) {
val transition = mutableMapOf<Char, State>()
val epsilonTransitions = mutableListOf<State>()
fun addEpsilonTransition(to: State) {
epsilonTransitions.add(to)
}
fun addTransition(to: State, symbol: Char) {
transition[symbol] = to
}
}
// an NFA must have an initial and acceptance state
class NFA(val start: State, val end: State)
// handlers that handle the symbols/literals present in the regex expression
//currently supported operators are *|?+ hence {m,n} /quantifiers are not supported for now
fun fromEpsilon(): NFA {
val start = State(false)
val end = State(true)
start.addEpsilonTransition(end)
return NFA(start, end)
}
fun fromSymbol(symbol: Char): NFA {
val start = State(false)
val end = State(true)
start.addTransition(end, symbol)
return NFA(start, end)
}
fun concatenate(first: NFA, second: NFA): NFA {
first.end.addEpsilonTransition(second.start)
first.end.isEnd = false
return NFA(first.start, second.end)
}
fun union(first: NFA, second: NFA): NFA {
val start = State(false)
start.addEpsilonTransition(first.start)
start.addEpsilonTransition(second.start)
val end = State(true)
first.end.addEpsilonTransition(end)
first.end.isEnd = false
second.end.addEpsilonTransition(end)
second.end.isEnd = false
return NFA(start, end)
}
fun closure(nfa: NFA): NFA {
val start = State(false)
val end = State(true)
start.addEpsilonTransition(nfa.start)
start.addEpsilonTransition(end)
nfa.end.addEpsilonTransition(nfa.start)
nfa.end.addEpsilonTransition(end)
nfa.end.isEnd = false
return NFA(start, end)
}
fun oneOrMore(nfa: NFA): NFA {
val start = State(false)
val end = State(true)
start.addEpsilonTransition(nfa.start)
nfa.end.addEpsilonTransition(nfa.start)
nfa.end.addEpsilonTransition(end)
nfa.end.isEnd = false
return NFA(start, end)
}
fun zeroOrOne(nfa: NFA): NFA {
val start = State(false)
val end = State(true)
start.addEpsilonTransition(nfa.start)
start.addEpsilonTransition(end)
nfa.end.addEpsilonTransition(end)
nfa.end.isEnd = false
return NFA(start, end)
}
// convert the expanded expression to an NFA
fun toNFA(posfixExpr: String): NFA {
if (posfixExpr.isEmpty()) {
return fromEpsilon()
}
val stack = Stack<NFA>()
try {
for (symb in posfixExpr) {
when (symb) {
'*' -> stack.push(closure(stack.pop()))
'+' -> stack.push(oneOrMore(stack.pop()))
'?' -> stack.push(zeroOrOne(stack.pop()))
'|' -> {
val right = stack.pop()
val left = stack.pop()
stack.push(union(left, right))
}
'.' -> {
val right = stack.pop()
val left = stack.pop()
stack.push(concatenate(left, right))
}
else -> stack.push(fromSymbol(symb))
}
}
} catch (e: EmptyStackException) {
System.err.write("Invalid regex pattern.\n".toByteArray())
System.exit(64)
}
return stack.pop()
}
fun setNextState(state: State, nextStates: MutableList<State>, visited: MutableList<State>) {
if (state !in visited) {
visited.add(state)
nextStates.add(state)
if (state.epsilonTransitions.isNotEmpty()) {
for (stt in state.epsilonTransitions) {
setNextState(stt, nextStates, visited)
}
}
}
}
/*
start transition based on the initial state,
follow the transitions based on the pattern's characters
check if a match exists in the final set of states
*/
fun search(nfa: NFA, word: String): Boolean {
val current = mutableListOf<State>()
setNextState(nfa.start, current, mutableListOf())
for (symbol in word) {
val nextStates = mutableListOf<State>()
for (state in current) {
if (symbol in state.transition) {
setNextState(state.transition[symbol]!!, nextStates, mutableListOf())
}
}
current.clear()
current.addAll(nextStates)
}
for (state in current) {
if (state.isEnd) {
return true
}
}
return false
}
class Regex(expr: String) {
val nfa: NFA
init {
val postfixExpr = toPosfix(preProcess(expr))
nfa = toNFA(postfixExpr)
}
fun match(word: String): Boolean {
return search(nfa, word)
}
}
fun generateRandomString(regex: Regex, requiredLength:Int): String {
val nfa = regex.nfa
val random = Random.Default
val sb = StringBuilder()
var currentLength=0
var currentState = nfa.start
while (currentLength< requiredLength && !currentState.isEnd) {
val availableTransitions = currentState.transition.keys.toList()
if (availableTransitions.isNotEmpty()) {
val randomTransition = availableTransitions.random(random)
val nextState = currentState.transition[randomTransition]!!
sb.append(randomTransition)
currentState = nextState
currentLength++
} else {
// No available transitions, backtrack using epsilon transitions
// uses backtracking..can it be avoided? Dunno not sure honestly
val epsilonTransitions = currentState.epsilonTransitions
if (epsilonTransitions.isNotEmpty()) {
val randomEpsilonTransition = epsilonTransitions.random(random)
currentState = randomEpsilonTransition
} else {
// No available transitions or epsilon transitions, terminate the loop
break
}
}
}
return sb.toString()
}
fun main() {
val r = Regex("(ab)*")
r.match("abab")
repeat(10){
println(generateRandomString(r, 5))
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment