Last active
October 24, 2016 08:18
-
-
Save zhenghaoz/435706b11c192f0358f43ef9b515ef5c to your computer and use it in GitHub Desktop.
Regexp to NFA(LaTeX)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <cstdio> | |
#include <vector> | |
#include <string> | |
#include <stack> | |
#include <cstdlib> | |
#include <ctime> | |
#include <queue> | |
using namespace std; | |
// #define NFA_DEBUG | |
#define INPUT_EP -64 | |
#define STATE_INIT 0x1 | |
#define STATE_ACCEPT 0x2 | |
#define MIN(a,b) ((a)<(b)?(a):(b)) | |
char header[] = "\\documentclass{article}\n\ | |
\\usepackage{pgf}\n\ | |
\\usepackage{tikz}\n\ | |
\\usetikzlibrary{arrows,automata}\n\ | |
\\usepackage[latin1]{inputenc}\n\ | |
\\usepackage{geometry}\n\ | |
\\geometry{papersize={32cm,32cm}}\n\ | |
\\begin{document}\n\ | |
\\begin{tikzpicture}[->,>=stealth',shorten >=1pt,auto,node distance=2.8cm,semithick]\n"; | |
char footer[] = "\\end{tikzpicture}\n\\end{document}\n"; | |
int stateId = 0; // 记录当前可分配的状态ID | |
// 节点数据结构 | |
struct State | |
{ | |
int tag; // 访问标记(用于访问) | |
int attr; // 状态属性(起始?终止?) | |
int id; // 状态ID | |
vector<char> inputs; // 进入后继状态需要的输入 | |
vector<State*> nexts; // 后继状态指针 | |
State(): attr(0), tag(-1), id(stateId++) {} | |
// 增加一条状态转换 | |
void addNextState(char input, State *next) | |
{ | |
inputs.push_back(input); | |
nexts.push_back(next); | |
} | |
}; | |
// NFA数据结构 | |
struct NFA | |
{ | |
State *in; // 入口 | |
State *out; // 出口 | |
int left; // 对应子表达式的偏移量 | |
NFA() = default; | |
NFA(State *in, State *out, int left): in(in), out(out), left(left) {} | |
}; | |
// 打印状态 | |
// 参数 | |
// out :输出流 | |
// node :状态指针 | |
// lastid :上个状态的ID | |
// 返回 :输出流 | |
ostream &printState(ostream& out, State *node, int& lastid) | |
{ | |
cout << "\\node[state"; | |
if (node->attr & STATE_INIT) cout << ",initial"; | |
if (node->attr & STATE_ACCEPT) cout << ",accepting"; | |
cout << "](" << node->id << ")"; | |
if (lastid != -1) cout << "[below right of=" << lastid << "]"; | |
lastid = node->id; | |
cout << "{$" << node->id << "$};" << endl; | |
return out; | |
} | |
// 打印NFA | |
// 参数 | |
// out :输出流 | |
// nfa :NFA | |
// 返回 :输出流 | |
ostream &operator<<(ostream& out, const NFA &nfa) | |
{ | |
cout << header; | |
int tag = rand(); | |
int lastid = -1; | |
queue<State*> q; | |
q.push(nfa.in); | |
nfa.in->tag = tag; | |
printState(out, nfa.in, lastid); | |
while (!q.empty()) { | |
State *node = q.front(); | |
q.pop(); | |
for (int i = 0; i < node->nexts.size(); i++) { | |
char input = node->inputs[i]; | |
State *next = node->nexts[i]; | |
if (next->tag != tag) { | |
next->tag = tag; | |
printState(out, next, lastid); | |
q.push(next); | |
} | |
cout << "\\path(" << node->id << ") edge[bend right] node{$"; | |
if (input == INPUT_EP) cout << "\\epsilon"; | |
else cout << input; | |
cout << "$}(" << next->id << ");" << endl; | |
} | |
} | |
cout << footer << endl; | |
return out; | |
} | |
// 编译字符 | |
// 参数 | |
// exp :正则表达式字符串 | |
// i :字符所在位置 | |
// 返回值 :NFA | |
NFA compile(const string &exp, int i) | |
{ | |
State *in = new State(); | |
State *out = new State(); | |
#ifdef NFA_DEBUG | |
clog << "compile " << in->id << "-" << exp[i] << "->" << out->id << endl; | |
#endif | |
in->addNextState(exp[i], out); | |
return NFA(in, out, i); | |
} | |
// 串联NFA | |
// 参数 | |
// lhs :在前的NFA | |
// rhs :在后的NFA | |
// 返回值 :NFA | |
NFA cascade(const NFA &lhs, const NFA &rhs) | |
{ | |
#ifdef NFA_DEBUG | |
clog << "cascade " << lhs.in->id << "->" << lhs.out->id << " and " << rhs.in->id << "->" << rhs.out->id << endl; | |
#endif | |
lhs.out->inputs = rhs.in->inputs; | |
lhs.out->nexts = rhs.in->nexts; | |
delete rhs.in; | |
return NFA(lhs.in, rhs.out, lhs.left); | |
} | |
// 并联NFA | |
// 参数 | |
// lhs :NFA1 | |
// rhs :NFA2 | |
// 返回值 :NFA | |
NFA multiple(const NFA &lhs, const NFA &rhs) | |
{ | |
#ifdef NFA_DEBUG | |
clog << "multiple " << lhs.in->id << "->" << lhs.out->id << " and " << rhs.in->id << "->" << rhs.out->id << endl; | |
#endif | |
State *in = new State(); | |
State *out = new State(); | |
in->addNextState(INPUT_EP, lhs.in); | |
in->addNextState(INPUT_EP, rhs.in); | |
lhs.out->addNextState(INPUT_EP, out); | |
rhs.out->addNextState(INPUT_EP, out); | |
return NFA(in, out, MIN(lhs.left, rhs.left)); | |
} | |
// 重复NFA | |
// 参数 | |
// nfa :需要重复的NFA | |
// 返回值 :NFA | |
NFA repeat(const NFA &nfa) | |
{ | |
State *in = new State(); | |
State *out = new State(); | |
in->addNextState(INPUT_EP, nfa.in); | |
in->addNextState(INPUT_EP, out); | |
nfa.out->addNextState(INPUT_EP, nfa.in); | |
nfa.out->addNextState(INPUT_EP, out); | |
return NFA(in, out, nfa.left); | |
} | |
// 自动串联NFA栈中符合条件的NFA | |
// 参数 | |
// nfaStack :NFA栈 | |
// opstack :符号栈 | |
void autoCascade(stack<NFA> &nfaStack, stack<int> &opStack, const string &exp, int next) | |
{ | |
// 下一个字符不应该是* | |
if (next < exp.length() && exp[next] == '*') | |
return; | |
if (nfaStack.size() >= 2 && | |
(opStack.empty() || opStack.top()+1 < nfaStack.top().left)) { | |
#ifdef NFA_DEBUG | |
clog << "autoCascade "; | |
if (opStack.empty()) clog << "empty opstack"; | |
else clog << opStack.top()+1 << "!=" << nfaStack.top().left; | |
clog << endl; | |
#endif | |
NFA nfa2 = nfaStack.top(); | |
nfaStack.pop(); | |
NFA nfa1 = nfaStack.top(); | |
nfaStack.pop(); | |
nfaStack.push(cascade(nfa1, nfa2)); | |
} | |
} | |
// 自动并联NFA栈中符合条件的NFA | |
// 参数 | |
// nfaStack :NFA栈 | |
// opstack :符号栈 | |
void autoMultiple(stack<NFA> &nfaStack, stack<int> &opStack, const string &exp) | |
{ | |
while (!opStack.empty() && exp[opStack.top()] == '|') { | |
NFA nfa1 = nfaStack.top(); | |
nfaStack.pop(); | |
NFA nfa2 = nfaStack.top(); | |
nfaStack.pop(); | |
opStack.pop(); | |
nfaStack.push(multiple(nfa1, nfa2)); | |
} | |
} | |
// 编译正则表达式 | |
// 参数 | |
// exp :正则表达式字符串 | |
// 返回值 :NFA | |
NFA compile(const string &exp) | |
{ | |
stack<NFA> nfaStack; // NFA栈 | |
stack<int> opStack; // 操作符栈 | |
for (int i = 0; i < exp.length(); i++) { | |
switch (exp[i]) { | |
case '|': | |
case '(': | |
// 显显式操作符入栈 | |
opStack.push(i); | |
break; | |
case ')': | |
// 应用左括号前的所有的操作符 | |
autoMultiple(nfaStack, opStack, exp); | |
nfaStack.top().left = opStack.top(); | |
opStack.pop(); | |
break; | |
case '*': { | |
// 重复栈顶NFA | |
NFA nfa = nfaStack.top(); | |
nfaStack.pop(); | |
nfaStack.push(repeat(nfa)); | |
break; | |
} default: | |
// 生成单字符NFA | |
nfaStack.push(compile(exp, i)); | |
} | |
// 自动并联NFA栈中符合条件的NFA | |
autoCascade(nfaStack, opStack, exp, i+1); | |
} | |
// 应用所有剩余操作符 | |
autoMultiple(nfaStack, opStack, exp); | |
// 添加起始状态和接受状态标记 | |
if (!nfaStack.empty()) { | |
NFA nfa = nfaStack.top(); | |
nfa.in->attr |= STATE_INIT; | |
nfa.out->attr |= STATE_ACCEPT; | |
return nfa; | |
} | |
// 正则表达式为空 | |
State* state = new State(); | |
state->attr |= STATE_ACCEPT | STATE_ACCEPT; | |
return NFA(state, state, 0); | |
} | |
int main(int argc, char const *argv[]) | |
{ | |
string exp(argv[1]); | |
cout << compile(exp); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment