Created
October 24, 2016 08:19
-
-
Save zhenghaoz/3a87bb84c8024fd4823547562cc00a6a to your computer and use it in GitHub Desktop.
Regexp to minimal DFA
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <cstdio> | |
#include <vector> | |
#include <string> | |
#include <stack> | |
#include <cstdlib> | |
#include <ctime> | |
#include <queue> | |
#include <vector> | |
#include <algorithm> | |
using namespace std; | |
// #define NFA_DEBUG | |
#define DFA_DEBUG | |
#define INPUT_EP -64 | |
#define STATE_INIT 0x1 | |
#define STATE_ACCEPT 0x2 | |
#define DICT_SIZE 127 | |
#define MIN(a,b) ((a)<(b)?(a):(b)) | |
char header[] = "\\documentclass{article}\n\ | |
\\usepackage{pgf}\n\ | |
\\usepackage{tikz}\n\ | |
\\usetikzlibrary{arrows,automata}\n\ | |
\\usepackage[latin1]{inputenc}\n\ | |
\\usepackage{geometry}\n\ | |
\\geometry{papersize={32cm,32cm}}\n\ | |
\\begin{document}\n\ | |
\\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,node distance=2.8cm,semithick]\n"; | |
char footer[] = "\\end{tikzpicture}\n\\end{document}\n"; | |
/************ | |
* NFA Part * | |
************/ | |
int stateId = 0; // 记录当前可分配的状态ID | |
// 节点数据结构 | |
struct State | |
{ | |
int tag; // 访问标记(用于访问) | |
int attr; // 状态属性(起始?终止?) | |
int id; // 状态ID | |
vector<char> inputs; // 进入后继状态需要的输入 | |
vector<State*> nexts; // 后继状态指针 | |
State(): attr(0), tag(-1), id(stateId++) {} | |
// 增加一条状态转换 | |
void addNextState(char input, State *next) | |
{ | |
inputs.push_back(input); | |
nexts.push_back(next); | |
} | |
}; | |
// NFA数据结构 | |
struct NFA | |
{ | |
State *in; // 入口 | |
State *out; // 出口 | |
int left; // 对应子表达式的偏移量 | |
NFA() = default; | |
NFA(State *in, State *out, int left): in(in), out(out), left(left) {} | |
}; | |
// 打印状态 | |
// 参数 | |
// out :输出流 | |
// node :状态指针 | |
// lastid :上个状态的ID | |
// 返回 :输出流 | |
ostream &printState(ostream& out, State *node, int& lastid) | |
{ | |
cout << "\\node[state"; | |
if (node->attr & STATE_INIT) cout << ",initial"; | |
if (node->attr & STATE_ACCEPT) cout << ",accepting"; | |
cout << "](" << node->id << ")"; | |
if (lastid != -1) cout << "[below right of=" << lastid << "]"; | |
lastid = node->id; | |
cout << "{$" << node->id << "$};" << endl; | |
return out; | |
} | |
// 打印NFA | |
// 参数 | |
// out :输出流 | |
// nfa :NFA | |
// 返回 :输出流 | |
ostream &operator<<(ostream& out, const NFA &nfa) | |
{ | |
cout << header; | |
int tag = rand(); | |
int lastid = -1; | |
queue<State*> q; | |
q.push(nfa.in); | |
nfa.in->tag = tag; | |
printState(out, nfa.in, lastid); | |
while (!q.empty()) { | |
State *node = q.front(); | |
q.pop(); | |
for (int i = 0; i < node->nexts.size(); i++) { | |
char input = node->inputs[i]; | |
State *next = node->nexts[i]; | |
if (next->tag != tag) { | |
next->tag = tag; | |
printState(out, next, lastid); | |
q.push(next); | |
} | |
cout << "\\path(" << node->id << ") edge[bend right] node{$"; | |
if (input == INPUT_EP) cout << "\\epsilon"; | |
else cout << input; | |
cout << "$}(" << next->id << ");" << endl; | |
} | |
} | |
cout << footer << endl; | |
return out; | |
} | |
// 编译字符 | |
// 参数 | |
// exp :正则表达式字符串 | |
// i :字符所在位置 | |
// 返回值 :NFA | |
NFA compile(const string &exp, int i) | |
{ | |
State *in = new State(); | |
State *out = new State(); | |
#ifdef NFA_DEBUG | |
clog << "compile " << in->id << "-" << exp[i] << "->" << out->id << endl; | |
#endif | |
in->addNextState(exp[i], out); | |
return NFA(in, out, i); | |
} | |
// 串联NFA | |
// 参数 | |
// lhs :在前的NFA | |
// rhs :在后的NFA | |
// 返回值 :NFA | |
NFA cascade(const NFA &lhs, const NFA &rhs) | |
{ | |
#ifdef NFA_DEBUG | |
clog << "cascade " << lhs.in->id << "->" << lhs.out->id << " and " << rhs.in->id << "->" << rhs.out->id << endl; | |
#endif | |
lhs.out->inputs = rhs.in->inputs; | |
lhs.out->nexts = rhs.in->nexts; | |
delete rhs.in; | |
return NFA(lhs.in, rhs.out, lhs.left); | |
} | |
// 并联NFA | |
// 参数 | |
// lhs :NFA1 | |
// rhs :NFA2 | |
// 返回值 :NFA | |
NFA multiple(const NFA &lhs, const NFA &rhs) | |
{ | |
#ifdef NFA_DEBUG | |
clog << "multiple " << lhs.in->id << "->" << lhs.out->id << " and " << rhs.in->id << "->" << rhs.out->id << endl; | |
#endif | |
State *in = new State(); | |
State *out = new State(); | |
in->addNextState(INPUT_EP, lhs.in); | |
in->addNextState(INPUT_EP, rhs.in); | |
lhs.out->addNextState(INPUT_EP, out); | |
rhs.out->addNextState(INPUT_EP, out); | |
return NFA(in, out, MIN(lhs.left, rhs.left)); | |
} | |
// 重复NFA | |
// 参数 | |
// nfa :需要重复的NFA | |
// 返回值 :NFA | |
NFA repeat(const NFA &nfa) | |
{ | |
State *in = new State(); | |
State *out = new State(); | |
in->addNextState(INPUT_EP, nfa.in); | |
in->addNextState(INPUT_EP, out); | |
nfa.out->addNextState(INPUT_EP, nfa.in); | |
nfa.out->addNextState(INPUT_EP, out); | |
return NFA(in, out, nfa.left); | |
} | |
// 自动串联NFA栈中符合条件的NFA | |
// 参数 | |
// nfaStack :NFA栈 | |
// opstack :符号栈 | |
void autoCascade(stack<NFA> &nfaStack, stack<int> &opStack, const string &exp, int next) | |
{ | |
// 下一个字符不应该是* | |
if (next < exp.length() && exp[next] == '*') | |
return; | |
if (nfaStack.size() >= 2 && | |
(opStack.empty() || opStack.top()+1 < nfaStack.top().left)) { | |
#ifdef NFA_DEBUG | |
clog << "autoCascade "; | |
if (opStack.empty()) clog << "empty opstack"; | |
else clog << opStack.top()+1 << "!=" << nfaStack.top().left; | |
clog << endl; | |
#endif | |
NFA nfa2 = nfaStack.top(); | |
nfaStack.pop(); | |
NFA nfa1 = nfaStack.top(); | |
nfaStack.pop(); | |
nfaStack.push(cascade(nfa1, nfa2)); | |
} | |
} | |
// 自动并联NFA栈中符合条件的NFA | |
// 参数 | |
// nfaStack :NFA栈 | |
// opstack :符号栈 | |
void autoMultiple(stack<NFA> &nfaStack, stack<int> &opStack, const string &exp) | |
{ | |
while (!opStack.empty() && exp[opStack.top()] == '|') { | |
NFA nfa1 = nfaStack.top(); | |
nfaStack.pop(); | |
NFA nfa2 = nfaStack.top(); | |
nfaStack.pop(); | |
opStack.pop(); | |
nfaStack.push(multiple(nfa1, nfa2)); | |
} | |
} | |
// 编译正则表达式 | |
// 参数 | |
// exp :正则表达式字符串 | |
// 返回值 :NFA | |
NFA compile(const string &exp) | |
{ | |
stack<NFA> nfaStack; // NFA栈 | |
stack<int> opStack; // 操作符栈 | |
for (int i = 0; i < exp.length(); i++) { | |
switch (exp[i]) { | |
case '|': | |
case '(': | |
// 显显式操作符入栈 | |
opStack.push(i); | |
break; | |
case ')': | |
// 应用左括号前的所有的操作符 | |
autoMultiple(nfaStack, opStack, exp); | |
nfaStack.top().left = opStack.top(); | |
opStack.pop(); | |
break; | |
case '*': { | |
// 重复栈顶NFA | |
NFA nfa = nfaStack.top(); | |
nfaStack.pop(); | |
nfaStack.push(repeat(nfa)); | |
break; | |
} default: | |
// 生成单字符NFA | |
nfaStack.push(compile(exp, i)); | |
} | |
// 自动并联NFA栈中符合条件的NFA | |
autoCascade(nfaStack, opStack, exp, i+1); | |
} | |
// 应用所有剩余操作符 | |
autoMultiple(nfaStack, opStack, exp); | |
// 添加起始状态和接受状态标记 | |
if (!nfaStack.empty()) { | |
NFA nfa = nfaStack.top(); | |
nfa.in->attr |= STATE_INIT; | |
nfa.out->attr |= STATE_ACCEPT; | |
return nfa; | |
} | |
// 正则表达式为空 | |
State* state = new State(); | |
state->attr |= STATE_ACCEPT | STATE_ACCEPT; | |
return NFA(state, state, 0); | |
} | |
/************ | |
* DFA Part * | |
************/ | |
// DFA数据结构 | |
struct DFA | |
{ | |
vector<int> attrs; // 状态属性 | |
vector<char> inputs; // 输入字符集合 | |
vector<vector<int>> nexts; // 状态转换函数 | |
DFA(const vector<char> inputs, | |
const vector<vector<int>> &nexts, | |
const vector<int> &attrs): | |
inputs(inputs), nexts(nexts), attrs(attrs) {} | |
}; | |
ostream &operator<<(ostream &out, const DFA &dfa) | |
{ | |
cout << header; | |
// 绘制状态 | |
for (int i = 0; i < dfa.attrs.size(); i++) { | |
cout << "\\node[state"; | |
if (dfa.attrs[i] & STATE_INIT) cout << ",initial"; | |
if (dfa.attrs[i] & STATE_ACCEPT) cout << ",accepting"; | |
cout << "](" << i << ")"; | |
if (i) | |
cout << "[below right of=" << i-1 << "]"; | |
cout << "{$" << i << "$};" << endl; | |
} | |
// 绘制状态转换 | |
for (int i = 0; i < dfa.attrs.size(); i++) | |
for (int j = 0; j < dfa.inputs.size(); j++) | |
if (dfa.nexts[i][j] != -1) { | |
cout << "\\path(" << i << ")edge["; | |
if (dfa.nexts[i][j] == i) | |
cout << "loop above"; | |
else | |
cout << "bend right"; | |
cout << "]node{" << dfa.inputs[j] << "}(" << dfa.nexts[i][j] << ");" << endl; | |
} | |
cout << footer; | |
return out; | |
} | |
// 简化DFA | |
// 参数 | |
// dfa :DFA | |
// 返回值 :DFA | |
DFA simplex(const DFA &dfa) | |
{ | |
vector<int> attrs = dfa.attrs; | |
vector<vector<int>> nexts = dfa.nexts; | |
while (true) { | |
// 查找等价的状态 | |
int a = -1, b = -1; | |
for (int i = 0; i < attrs.size(); i++) | |
for (int j = i+1; j < attrs.size(); j++) | |
if ((attrs[i] & STATE_ACCEPT) == (attrs[j] & STATE_ACCEPT)) { | |
bool equal = true; | |
for (int k = 0; k < dfa.inputs.size(); k++) | |
if (nexts[i][k] != nexts[j][k] && | |
(nexts[i][k] != i || nexts[j][k] != j)) { | |
equal = false; | |
break; | |
} | |
if (equal) { | |
a = i; | |
b = j; | |
break; | |
} | |
} | |
// 已经最简 | |
if (a == -1) | |
break; | |
// 简化 | |
#ifdef DFA_DEBUG | |
clog << "simplex find " << a << " equals " << b << endl; | |
#endif | |
attrs[a] |= attrs[b]; | |
attrs.erase(attrs.begin()+b); | |
nexts.erase(nexts.begin()+b); | |
for (int i = 0; i < nexts.size(); i++) | |
for (int j = 0; j < dfa.inputs.size(); j++) | |
if (nexts[i][j] == b) | |
nexts[i][j] = a; | |
else if (nexts[i][j] > b) | |
nexts[i][j]--; | |
} | |
return DFA(dfa.inputs, nexts, attrs); | |
} | |
// 状态集合 | |
struct StateSet | |
{ | |
vector<State*> states; | |
int attr = 0; | |
StateSet() = default; | |
StateSet(const vector<State*> states, int attr): states(states), attr(attr) {} | |
bool empty() { return states.empty(); } | |
}; | |
bool operator==(const StateSet &lhs, const StateSet &rhs) | |
{ | |
return lhs.states == rhs.states && lhs.attr == rhs.attr; | |
} | |
bool operator!=(const StateSet &lhs, const StateSet &rhs) | |
{ | |
return !(lhs==rhs); | |
} | |
// 合并状态集合 | |
// 参数 | |
// set1 :集合1 | |
// set2 :集合2 | |
// 返回值 :集合 | |
StateSet unions(const StateSet &set1, const StateSet &set2) | |
{ | |
vector<State*> states; | |
int i = 0, j = 0; | |
while (i < set1.states.size() && j < set2.states.size()) { | |
if (set1.states[i]->id < set2.states[j]->id) { | |
states.push_back(set1.states[i]); | |
i++; | |
} else if (set1.states[i]->id > set2.states[j]->id) { | |
states.push_back(set2.states[j]); | |
j++; | |
} else { | |
states.push_back(set1.states[i]); | |
i++; | |
j++; | |
} | |
} | |
while (i < set1.states.size()) { | |
states.push_back(set1.states[i]); | |
i++; | |
} | |
while (j < set2.states.size()) { | |
states.push_back(set2.states[j]); | |
j++; | |
} | |
return StateSet(states, set1.attr | set2.attr); | |
} | |
// 获取某个状态通过空转换可以到达的闭包 | |
// 参数 | |
// state :给定的状态 | |
// 返回值 :闭包 | |
StateSet closure(State *state) | |
{ | |
// 集合和属性 | |
vector<State*> states; | |
states.push_back(state); | |
int attr = state->attr; | |
// 广度优先搜索 | |
queue<State*> q; | |
q.push(state); | |
int tag = rand(); | |
state->tag = tag; | |
while (!q.empty()) { | |
State *temp = q.front(); | |
q.pop(); | |
for (int i = 0; i < temp->inputs.size(); i++) | |
if (temp->inputs[i] == INPUT_EP && temp->nexts[i]->tag != tag) { | |
temp->nexts[i]->tag = tag; | |
states.push_back(temp->nexts[i]); | |
attr |= temp->nexts[i]->attr; | |
q.push(temp->nexts[i]); | |
} | |
} | |
sort(states.begin(), states.end(), [](State *lhs, State *rhs)->bool{ | |
return lhs->id < rhs->id; | |
}); | |
#ifdef DFA_DEBUG | |
clog << "closure of " << state->id << " :"; | |
for (State *s: states) | |
clog << ' ' << s->id; | |
clog << endl; | |
#endif | |
return StateSet(states, attr); | |
} | |
// 获取正则表达式对应的输入字符集合 | |
// 参数 | |
// exp :正则表达式 | |
// 返回值 :字符集合 | |
vector<char> getInputs(const string &exp) | |
{ | |
vector<char> inputs; | |
vector<bool> memo(DICT_SIZE, false); | |
for (char c : exp) | |
if (c != '(' && c != ')' && c != '|' && c != '*' && !memo[c]) { | |
memo[c] = true; | |
inputs.push_back(c); | |
} | |
return inputs; | |
} | |
// 求解状态集合的状态输入某个状态能够到达的状态集合 | |
// 参数 | |
// set :状态集合 | |
// c :输入的字符 | |
// 返回值 :状态集合 | |
StateSet nextStateSet(const StateSet &set, char c) | |
{ | |
StateSet set2; | |
for (State *s : set.states) | |
for (int i = 0; i < s->inputs.size(); i++) | |
if (s->inputs[i] == c) | |
set2 = unions(set2, closure(s->nexts[i])); | |
#ifdef DFA_DEBUG | |
clog << "next of"; | |
for (State *s: set.states) | |
clog << ' ' << s->id; | |
clog << " :"; | |
for (State *s: set2.states) | |
clog << ' ' << s->id; | |
clog << endl; | |
#endif | |
return set2; | |
} | |
// 将NFA确定化 | |
// 参数 | |
// nfa :NFA | |
// inputs :输入字符集合 | |
// 返回值 :DFA | |
DFA definitize(const NFA &nfa, vector<char> inputs) | |
{ | |
// 转换函数 | |
vector<vector<int>> nexts; | |
// 确定化 | |
StateSet initSet = closure(nfa.in); | |
vector<StateSet> sets; | |
sets.push_back(initSet); | |
nexts.push_back(vector<int>(inputs.size(), -1)); | |
queue<int> q; | |
q.push(0); | |
while (!q.empty()) { | |
int setid = q.front(); | |
q.pop(); | |
for (int i = 0; i < inputs.size(); i++) { | |
StateSet nextSet = nextStateSet(sets[setid], inputs[i]); | |
if (nextSet.empty()) | |
continue; | |
int j = 0; | |
while (j < sets.size() && nextSet != sets[j]) | |
j++; | |
nexts[setid][i] = j; | |
if (j == sets.size()) { | |
q.push(j); | |
sets.push_back(nextSet); | |
nexts.push_back(vector<int>(inputs.size(), -1)); | |
} | |
} | |
} | |
vector<int> attrs(sets.size()); | |
for (int i = 0; i < sets.size(); i++) | |
attrs[i] = sets[i].attr; | |
return DFA(inputs, nexts, attrs); | |
} | |
int main(int argc, char const *argv[]) | |
{ | |
string exp(argv[1]); | |
cout << simplex(definitize(compile(exp), getInputs(exp))); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment