Skip to content

Instantly share code, notes, and snippets.

@zhenghaoz
Created October 24, 2016 08:19
Show Gist options
  • Save zhenghaoz/3a87bb84c8024fd4823547562cc00a6a to your computer and use it in GitHub Desktop.
Save zhenghaoz/3a87bb84c8024fd4823547562cc00a6a to your computer and use it in GitHub Desktop.
Regexp to minimal DFA
#include <iostream>
#include <cstdio>
#include <vector>
#include <string>
#include <stack>
#include <cstdlib>
#include <ctime>
#include <queue>
#include <vector>
#include <algorithm>
using namespace std;
// #define NFA_DEBUG
#define DFA_DEBUG
#define INPUT_EP -64
#define STATE_INIT 0x1
#define STATE_ACCEPT 0x2
#define DICT_SIZE 127
#define MIN(a,b) ((a)<(b)?(a):(b))
char header[] = "\\documentclass{article}\n\
\\usepackage{pgf}\n\
\\usepackage{tikz}\n\
\\usetikzlibrary{arrows,automata}\n\
\\usepackage[latin1]{inputenc}\n\
\\usepackage{geometry}\n\
\\geometry{papersize={32cm,32cm}}\n\
\\begin{document}\n\
\\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,node distance=2.8cm,semithick]\n";
char footer[] = "\\end{tikzpicture}\n\\end{document}\n";
/************
* NFA Part *
************/
int stateId = 0; // 记录当前可分配的状态ID
// 节点数据结构
struct State
{
int tag; // 访问标记(用于访问)
int attr; // 状态属性(起始?终止?)
int id; // 状态ID
vector<char> inputs; // 进入后继状态需要的输入
vector<State*> nexts; // 后继状态指针
State(): attr(0), tag(-1), id(stateId++) {}
// 增加一条状态转换
void addNextState(char input, State *next)
{
inputs.push_back(input);
nexts.push_back(next);
}
};
// NFA数据结构
struct NFA
{
State *in; // 入口
State *out; // 出口
int left; // 对应子表达式的偏移量
NFA() = default;
NFA(State *in, State *out, int left): in(in), out(out), left(left) {}
};
// 打印状态
// 参数
// out :输出流
// node :状态指针
// lastid :上个状态的ID
// 返回 :输出流
ostream &printState(ostream& out, State *node, int& lastid)
{
cout << "\\node[state";
if (node->attr & STATE_INIT) cout << ",initial";
if (node->attr & STATE_ACCEPT) cout << ",accepting";
cout << "](" << node->id << ")";
if (lastid != -1) cout << "[below right of=" << lastid << "]";
lastid = node->id;
cout << "{$" << node->id << "$};" << endl;
return out;
}
// 打印NFA
// 参数
// out :输出流
// nfa :NFA
// 返回 :输出流
ostream &operator<<(ostream& out, const NFA &nfa)
{
cout << header;
int tag = rand();
int lastid = -1;
queue<State*> q;
q.push(nfa.in);
nfa.in->tag = tag;
printState(out, nfa.in, lastid);
while (!q.empty()) {
State *node = q.front();
q.pop();
for (int i = 0; i < node->nexts.size(); i++) {
char input = node->inputs[i];
State *next = node->nexts[i];
if (next->tag != tag) {
next->tag = tag;
printState(out, next, lastid);
q.push(next);
}
cout << "\\path(" << node->id << ") edge[bend right] node{$";
if (input == INPUT_EP) cout << "\\epsilon";
else cout << input;
cout << "$}(" << next->id << ");" << endl;
}
}
cout << footer << endl;
return out;
}
// 编译字符
// 参数
// exp :正则表达式字符串
// i :字符所在位置
// 返回值 :NFA
NFA compile(const string &exp, int i)
{
State *in = new State();
State *out = new State();
#ifdef NFA_DEBUG
clog << "compile " << in->id << "-" << exp[i] << "->" << out->id << endl;
#endif
in->addNextState(exp[i], out);
return NFA(in, out, i);
}
// 串联NFA
// 参数
// lhs :在前的NFA
// rhs :在后的NFA
// 返回值 :NFA
NFA cascade(const NFA &lhs, const NFA &rhs)
{
#ifdef NFA_DEBUG
clog << "cascade " << lhs.in->id << "->" << lhs.out->id << " and " << rhs.in->id << "->" << rhs.out->id << endl;
#endif
lhs.out->inputs = rhs.in->inputs;
lhs.out->nexts = rhs.in->nexts;
delete rhs.in;
return NFA(lhs.in, rhs.out, lhs.left);
}
// 并联NFA
// 参数
// lhs :NFA1
// rhs :NFA2
// 返回值 :NFA
NFA multiple(const NFA &lhs, const NFA &rhs)
{
#ifdef NFA_DEBUG
clog << "multiple " << lhs.in->id << "->" << lhs.out->id << " and " << rhs.in->id << "->" << rhs.out->id << endl;
#endif
State *in = new State();
State *out = new State();
in->addNextState(INPUT_EP, lhs.in);
in->addNextState(INPUT_EP, rhs.in);
lhs.out->addNextState(INPUT_EP, out);
rhs.out->addNextState(INPUT_EP, out);
return NFA(in, out, MIN(lhs.left, rhs.left));
}
// 重复NFA
// 参数
// nfa :需要重复的NFA
// 返回值 :NFA
NFA repeat(const NFA &nfa)
{
State *in = new State();
State *out = new State();
in->addNextState(INPUT_EP, nfa.in);
in->addNextState(INPUT_EP, out);
nfa.out->addNextState(INPUT_EP, nfa.in);
nfa.out->addNextState(INPUT_EP, out);
return NFA(in, out, nfa.left);
}
// 自动串联NFA栈中符合条件的NFA
// 参数
// nfaStack :NFA栈
// opstack :符号栈
void autoCascade(stack<NFA> &nfaStack, stack<int> &opStack, const string &exp, int next)
{
// 下一个字符不应该是*
if (next < exp.length() && exp[next] == '*')
return;
if (nfaStack.size() >= 2 &&
(opStack.empty() || opStack.top()+1 < nfaStack.top().left)) {
#ifdef NFA_DEBUG
clog << "autoCascade ";
if (opStack.empty()) clog << "empty opstack";
else clog << opStack.top()+1 << "!=" << nfaStack.top().left;
clog << endl;
#endif
NFA nfa2 = nfaStack.top();
nfaStack.pop();
NFA nfa1 = nfaStack.top();
nfaStack.pop();
nfaStack.push(cascade(nfa1, nfa2));
}
}
// 自动并联NFA栈中符合条件的NFA
// 参数
// nfaStack :NFA栈
// opstack :符号栈
void autoMultiple(stack<NFA> &nfaStack, stack<int> &opStack, const string &exp)
{
while (!opStack.empty() && exp[opStack.top()] == '|') {
NFA nfa1 = nfaStack.top();
nfaStack.pop();
NFA nfa2 = nfaStack.top();
nfaStack.pop();
opStack.pop();
nfaStack.push(multiple(nfa1, nfa2));
}
}
// 编译正则表达式
// 参数
// exp :正则表达式字符串
// 返回值 :NFA
NFA compile(const string &exp)
{
stack<NFA> nfaStack; // NFA栈
stack<int> opStack; // 操作符栈
for (int i = 0; i < exp.length(); i++) {
switch (exp[i]) {
case '|':
case '(':
// 显显式操作符入栈
opStack.push(i);
break;
case ')':
// 应用左括号前的所有的操作符
autoMultiple(nfaStack, opStack, exp);
nfaStack.top().left = opStack.top();
opStack.pop();
break;
case '*': {
// 重复栈顶NFA
NFA nfa = nfaStack.top();
nfaStack.pop();
nfaStack.push(repeat(nfa));
break;
} default:
// 生成单字符NFA
nfaStack.push(compile(exp, i));
}
// 自动并联NFA栈中符合条件的NFA
autoCascade(nfaStack, opStack, exp, i+1);
}
// 应用所有剩余操作符
autoMultiple(nfaStack, opStack, exp);
// 添加起始状态和接受状态标记
if (!nfaStack.empty()) {
NFA nfa = nfaStack.top();
nfa.in->attr |= STATE_INIT;
nfa.out->attr |= STATE_ACCEPT;
return nfa;
}
// 正则表达式为空
State* state = new State();
state->attr |= STATE_ACCEPT | STATE_ACCEPT;
return NFA(state, state, 0);
}
/************
* DFA Part *
************/
// DFA数据结构
struct DFA
{
vector<int> attrs; // 状态属性
vector<char> inputs; // 输入字符集合
vector<vector<int>> nexts; // 状态转换函数
DFA(const vector<char> inputs,
const vector<vector<int>> &nexts,
const vector<int> &attrs):
inputs(inputs), nexts(nexts), attrs(attrs) {}
};
ostream &operator<<(ostream &out, const DFA &dfa)
{
cout << header;
// 绘制状态
for (int i = 0; i < dfa.attrs.size(); i++) {
cout << "\\node[state";
if (dfa.attrs[i] & STATE_INIT) cout << ",initial";
if (dfa.attrs[i] & STATE_ACCEPT) cout << ",accepting";
cout << "](" << i << ")";
if (i)
cout << "[below right of=" << i-1 << "]";
cout << "{$" << i << "$};" << endl;
}
// 绘制状态转换
for (int i = 0; i < dfa.attrs.size(); i++)
for (int j = 0; j < dfa.inputs.size(); j++)
if (dfa.nexts[i][j] != -1) {
cout << "\\path(" << i << ")edge[";
if (dfa.nexts[i][j] == i)
cout << "loop above";
else
cout << "bend right";
cout << "]node{" << dfa.inputs[j] << "}(" << dfa.nexts[i][j] << ");" << endl;
}
cout << footer;
return out;
}
// 简化DFA
// 参数
// dfa :DFA
// 返回值 :DFA
DFA simplex(const DFA &dfa)
{
vector<int> attrs = dfa.attrs;
vector<vector<int>> nexts = dfa.nexts;
while (true) {
// 查找等价的状态
int a = -1, b = -1;
for (int i = 0; i < attrs.size(); i++)
for (int j = i+1; j < attrs.size(); j++)
if ((attrs[i] & STATE_ACCEPT) == (attrs[j] & STATE_ACCEPT)) {
bool equal = true;
for (int k = 0; k < dfa.inputs.size(); k++)
if (nexts[i][k] != nexts[j][k] &&
(nexts[i][k] != i || nexts[j][k] != j)) {
equal = false;
break;
}
if (equal) {
a = i;
b = j;
break;
}
}
// 已经最简
if (a == -1)
break;
// 简化
#ifdef DFA_DEBUG
clog << "simplex find " << a << " equals " << b << endl;
#endif
attrs[a] |= attrs[b];
attrs.erase(attrs.begin()+b);
nexts.erase(nexts.begin()+b);
for (int i = 0; i < nexts.size(); i++)
for (int j = 0; j < dfa.inputs.size(); j++)
if (nexts[i][j] == b)
nexts[i][j] = a;
else if (nexts[i][j] > b)
nexts[i][j]--;
}
return DFA(dfa.inputs, nexts, attrs);
}
// 状态集合
struct StateSet
{
vector<State*> states;
int attr = 0;
StateSet() = default;
StateSet(const vector<State*> states, int attr): states(states), attr(attr) {}
bool empty() { return states.empty(); }
};
bool operator==(const StateSet &lhs, const StateSet &rhs)
{
return lhs.states == rhs.states && lhs.attr == rhs.attr;
}
bool operator!=(const StateSet &lhs, const StateSet &rhs)
{
return !(lhs==rhs);
}
// 合并状态集合
// 参数
// set1 :集合1
// set2 :集合2
// 返回值 :集合
StateSet unions(const StateSet &set1, const StateSet &set2)
{
vector<State*> states;
int i = 0, j = 0;
while (i < set1.states.size() && j < set2.states.size()) {
if (set1.states[i]->id < set2.states[j]->id) {
states.push_back(set1.states[i]);
i++;
} else if (set1.states[i]->id > set2.states[j]->id) {
states.push_back(set2.states[j]);
j++;
} else {
states.push_back(set1.states[i]);
i++;
j++;
}
}
while (i < set1.states.size()) {
states.push_back(set1.states[i]);
i++;
}
while (j < set2.states.size()) {
states.push_back(set2.states[j]);
j++;
}
return StateSet(states, set1.attr | set2.attr);
}
// 获取某个状态通过空转换可以到达的闭包
// 参数
// state :给定的状态
// 返回值 :闭包
StateSet closure(State *state)
{
// 集合和属性
vector<State*> states;
states.push_back(state);
int attr = state->attr;
// 广度优先搜索
queue<State*> q;
q.push(state);
int tag = rand();
state->tag = tag;
while (!q.empty()) {
State *temp = q.front();
q.pop();
for (int i = 0; i < temp->inputs.size(); i++)
if (temp->inputs[i] == INPUT_EP && temp->nexts[i]->tag != tag) {
temp->nexts[i]->tag = tag;
states.push_back(temp->nexts[i]);
attr |= temp->nexts[i]->attr;
q.push(temp->nexts[i]);
}
}
sort(states.begin(), states.end(), [](State *lhs, State *rhs)->bool{
return lhs->id < rhs->id;
});
#ifdef DFA_DEBUG
clog << "closure of " << state->id << " :";
for (State *s: states)
clog << ' ' << s->id;
clog << endl;
#endif
return StateSet(states, attr);
}
// 获取正则表达式对应的输入字符集合
// 参数
// exp :正则表达式
// 返回值 :字符集合
vector<char> getInputs(const string &exp)
{
vector<char> inputs;
vector<bool> memo(DICT_SIZE, false);
for (char c : exp)
if (c != '(' && c != ')' && c != '|' && c != '*' && !memo[c]) {
memo[c] = true;
inputs.push_back(c);
}
return inputs;
}
// 求解状态集合的状态输入某个状态能够到达的状态集合
// 参数
// set :状态集合
// c :输入的字符
// 返回值 :状态集合
StateSet nextStateSet(const StateSet &set, char c)
{
StateSet set2;
for (State *s : set.states)
for (int i = 0; i < s->inputs.size(); i++)
if (s->inputs[i] == c)
set2 = unions(set2, closure(s->nexts[i]));
#ifdef DFA_DEBUG
clog << "next of";
for (State *s: set.states)
clog << ' ' << s->id;
clog << " :";
for (State *s: set2.states)
clog << ' ' << s->id;
clog << endl;
#endif
return set2;
}
// 将NFA确定化
// 参数
// nfa :NFA
// inputs :输入字符集合
// 返回值 :DFA
DFA definitize(const NFA &nfa, vector<char> inputs)
{
// 转换函数
vector<vector<int>> nexts;
// 确定化
StateSet initSet = closure(nfa.in);
vector<StateSet> sets;
sets.push_back(initSet);
nexts.push_back(vector<int>(inputs.size(), -1));
queue<int> q;
q.push(0);
while (!q.empty()) {
int setid = q.front();
q.pop();
for (int i = 0; i < inputs.size(); i++) {
StateSet nextSet = nextStateSet(sets[setid], inputs[i]);
if (nextSet.empty())
continue;
int j = 0;
while (j < sets.size() && nextSet != sets[j])
j++;
nexts[setid][i] = j;
if (j == sets.size()) {
q.push(j);
sets.push_back(nextSet);
nexts.push_back(vector<int>(inputs.size(), -1));
}
}
}
vector<int> attrs(sets.size());
for (int i = 0; i < sets.size(); i++)
attrs[i] = sets[i].attr;
return DFA(inputs, nexts, attrs);
}
int main(int argc, char const *argv[])
{
string exp(argv[1]);
cout << simplex(definitize(compile(exp), getInputs(exp)));
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment