Created
November 20, 2016 02:57
-
-
Save zhenghaoz/ce0b561b9e58818a8ca7af9aaec773db to your computer and use it in GitHub Desktop.
Experimental LL(1) Parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <iostream> | |
#include <string> | |
#include <cstdio> | |
#include <cstring> | |
#include <map> | |
#include <climits> | |
#include <cctype> | |
#include <vector> | |
#include <set> | |
#include <algorithm> | |
#include <iomanip> | |
#define IS_NON_TERMINAL(c) (isupper(c)) | |
#define IS_EMPTY(c) ((c)=='_') | |
#define IS_TERMIAL(c) (!IS_NON_TERMINAL(c)&&!IS_EMPTY(c)) | |
using namespace std; | |
class CFG | |
{ | |
map<char, vector<string>> rightSides; // 产生式 | |
map<char, int> empty; // EMPTY表 | |
map<char, set<char>> firsts; // FIRST集 | |
map<char, set<char>> follows; // FOLLOW集 | |
map<char, map<char, string>> table; // 预测分析表 | |
set<char> nonTerminals; // 非终结符 | |
char start; // 起始非终结符 | |
bool analyzed = false; // 分析完成标志 | |
// 检查产生式的右侧是否存在空串 | |
bool containEmpty(char nonTerminal) | |
{ | |
for (const string& rightSide : rightSides[nonTerminal]) | |
if (rightSide == "_") | |
return true; | |
return false; | |
} | |
// 检查产生式是否能够产生空串 | |
int productEmpty(char nonTerminal) | |
{ | |
for (const string& rightSide : rightSides[nonTerminal]) { | |
bool determined = true; | |
bool toEmpty = true; | |
for (char c : rightSide) | |
if (IS_TERMIAL(c)) { // 存在终结符,一定不能推出空串 | |
toEmpty = false; | |
break; | |
} else if (IS_NON_TERMINAL(c)) { | |
if (empty[c] == 0) { // 存在非空的非终结符,一定不能推出空串 | |
toEmpty = false; | |
break; | |
} else if (empty[c] == -1) {// 存在不确定的非终结符 | |
determined = false; | |
} | |
} | |
// 确定并且可以推出空串 | |
if (determined && toEmpty) | |
return 1; | |
// 忽略不确定因素可以推出空串 | |
if (toEmpty) | |
return -1; | |
} | |
return 0; | |
} | |
// 标记可为空的非终结符 | |
void generateEmpty() | |
{ | |
for (char c : nonTerminals) | |
empty[c] = -1; | |
// 初始化,将能够直接产生空串的非终结符标记为1 | |
for (char c : nonTerminals) | |
if (containEmpty(c)) | |
empty[c] = 1; | |
while (true) { | |
// 检查产生式是否有/不可能产生空串 | |
for (char c : nonTerminals) | |
if (empty[c] == -1) | |
empty[c] = productEmpty(c); | |
// 检查是否求解完成 | |
bool finished = true; | |
for (auto it : empty) | |
if (it.second == -1) | |
finished = false; | |
if (finished) | |
break; | |
} | |
} | |
// 生成某个非终结符的FIRST集 | |
set<char> first(char nonTerminal) | |
{ | |
if (!firsts[nonTerminal].empty()) // 已经存在 | |
return firsts[nonTerminal]; | |
set<char> ret; | |
for (const string& rightSide : rightSides[nonTerminal]) | |
if (rightSide == "_") // 产生空串 | |
ret.insert('_'); | |
else { | |
auto it = rightSide.begin(); | |
// 处理可以推出空串的非终结符 | |
while (it != rightSide.end() && IS_NON_TERMINAL(*it) && empty[*it]) { | |
for (char terminal : first(*it)) | |
if (!IS_EMPTY(terminal)) | |
ret.insert(terminal); | |
it++; | |
} | |
if (it != rightSide.end()) { // 处理非终结符后首个终结符/非空非终结符 | |
if (IS_NON_TERMINAL(*it)) { | |
const set<char>& firstSet = first(*it); | |
ret.insert(firstSet.begin(), firstSet.end()); | |
} else { | |
ret.insert(*it); | |
} | |
} else { // 产生式可以为空 | |
ret.insert('_'); | |
} | |
} | |
firsts[nonTerminal] = ret; | |
return ret; | |
} | |
// 生成FIRST集 | |
void generateFirst() | |
{ | |
generateEmpty(); | |
for (char c : nonTerminals) | |
firsts[c] = first(c); | |
} | |
// 生成FOLLOW集 | |
void generateFollow() | |
{ | |
// 生成FIRST集 | |
generateFirst(); | |
// 初始化FOLLOW集 | |
follows[start].insert('#'); | |
for (char nonTerminal : nonTerminals) | |
for (const string& rightSide : rightSides[nonTerminal]) { // 遍历产生式 | |
auto nonTerminalIt = rightSide.begin(); | |
while (nonTerminalIt+1 != rightSide.end()) { // 更新非终结符的FOLLOW集 | |
if (IS_NON_TERMINAL(*nonTerminalIt)) { | |
auto followIt = nonTerminalIt+1; | |
// 处理可以推出空串的非终结符 | |
while (followIt != rightSide.end() && IS_NON_TERMINAL(*followIt) && empty[*followIt]) { | |
for (char c : first(*followIt)) | |
if (!IS_EMPTY(c)) | |
follows[*nonTerminalIt].insert(c); | |
followIt++; | |
} | |
// 处理非终结符后首个终结符/非空非终结符 | |
if (followIt != rightSide.end()) { | |
if (IS_TERMIAL(*followIt)) { | |
follows[*nonTerminalIt].insert(*followIt); | |
} else if (IS_NON_TERMINAL(*followIt)) { | |
for (char c : first(*followIt)) | |
follows[*nonTerminalIt].insert(c); | |
} | |
} | |
} | |
nonTerminalIt++; | |
} | |
} | |
// 迭代更新FOLLOW集 | |
while (true) { | |
// 统计FOLLOW集总大小 | |
int breforeCount = 0; | |
for (char nonTerminal : nonTerminals) | |
breforeCount += follows[nonTerminal].size(); | |
for (char nonTerminal : nonTerminals) | |
for (const string& rightSide : rightSides[nonTerminal]) { // 遍历产生式 | |
auto nonTerminalIt = rightSide.begin(); | |
while (nonTerminalIt != rightSide.end()) { // 更新非终结符的FOLLOW集 | |
auto followIt = nonTerminalIt+1; | |
while (followIt != rightSide.end() && | |
((IS_NON_TERMINAL(*followIt) && empty[*followIt] == 1)|| IS_EMPTY(*followIt))) | |
followIt++; | |
if (followIt == rightSide.end()) | |
follows[*nonTerminalIt].insert(follows[nonTerminal].begin(), follows[nonTerminal].end()); | |
nonTerminalIt++; | |
} | |
} | |
// 统计FOLLOW集总大小 | |
int afterCount = 0; | |
for (char nonTerminal : nonTerminals) | |
afterCount += follows[nonTerminal].size(); | |
// FOLLOW集没有变化,则结束迭代 | |
if (breforeCount == afterCount) | |
break; | |
} | |
} | |
void analyze() | |
{ | |
if (analyzed) | |
return; | |
analyzed = true; | |
firsts.clear(); | |
follows.clear(); | |
// 生成FOLLOW集 | |
generateFollow(); | |
for (char nonTerminal : nonTerminals) | |
for (const string& rightSide : rightSides[nonTerminal]) { | |
// 生成SELECT集,根据SELECT集构造预测分析表 | |
set<char> select; | |
auto it = rightSide.begin(); | |
// 处理可为空的非终结符 | |
while (it != rightSide.end() && | |
((IS_NON_TERMINAL(*it) && empty[*it] == 1) || IS_EMPTY(*it))) { | |
for (char c : firsts[*it]) | |
if (IS_EMPTY(c)) | |
select.insert(c); | |
it++; | |
} | |
if (it != rightSide.end()) { // 存在非空的非终结符/终结符 | |
if (IS_NON_TERMINAL(*it)) | |
select.insert(firsts[*it].begin(), firsts[*it].end()); | |
else if (IS_TERMIAL(*it)) | |
select.insert(*it); | |
} else { // 产生式可能为空 | |
select.insert(follows[nonTerminal].begin(), follows[nonTerminal].end()); | |
} | |
for (char c : select) | |
table[nonTerminal][c] = rightSide; | |
} | |
} | |
public: | |
CFG(char c): start(c) {} | |
// 向CFG添加一条产生式,成功返回0,失败返回-1 | |
int addProduction(const string& productions) | |
{ | |
// 添加产生式后需要重新分析 | |
analyzed = false; | |
// 获取左侧非终结符 | |
char leftSide; | |
int ptr = 0; | |
if (ptr < productions.length() && IS_NON_TERMINAL(productions[ptr])) { | |
leftSide = productions[ptr]; | |
nonTerminals.insert(leftSide); | |
} else return -1; | |
ptr++; | |
// 检查冒号':' | |
if (ptr >= productions.length() || productions[ptr] != ':') | |
return -1; | |
ptr++; | |
// 获取右侧产生体 | |
int end; | |
while ((end = productions.find('|', ptr)) != string::npos) { | |
rightSides[leftSide].emplace_back(productions.begin()+ptr, productions.begin()+end); | |
ptr = end+1; | |
} | |
if (ptr < productions.length()) { | |
rightSides[leftSide].emplace_back(productions.begin()+ptr, productions.end()); | |
} else return -1; | |
return 0; | |
} | |
// 打印文法分析结果 | |
void print() | |
{ | |
// 分析文法 | |
analyze(); | |
// 输出起始非终结符 | |
cout << "Start nonTerminal: " << start << endl; | |
// 输出产生式 | |
cout << "Productions:" << endl; | |
for (char nonTerminal : nonTerminals) { | |
cout << nonTerminal << ":"; | |
const vector<string>& rightSide = rightSides[nonTerminal]; | |
for (int i = 0; i < rightSide.size(); i++) { | |
if (i) putchar('|'); | |
cout << rightSide[i]; | |
} | |
cout << endl; | |
} | |
// 输出产生空串的非终结符表 | |
cout << "Nonterminals product empty string:" << endl; | |
for (char nonTerminal : nonTerminals) | |
cout << nonTerminal << ' '; | |
cout << endl; | |
for (char nonTerminal : nonTerminals) | |
cout << empty[nonTerminal] << ' '; | |
cout << endl; | |
// 输出FIRST集 | |
cout << "First sets:" << endl; | |
for (char nonTerminal : nonTerminals) { | |
cout << "FIRST(" << nonTerminal << "):"; | |
for (char terminal : firsts[nonTerminal]) | |
cout << ' ' << terminal; | |
cout << endl; | |
} | |
// 输出FOLLOW集 | |
cout << "Follow sets:" << endl; | |
for (char nonTerminal : nonTerminals) { | |
cout << "FOLLOW(" << nonTerminal << "):"; | |
for (char terminal : follows[nonTerminal]) | |
cout << ' ' << terminal; | |
cout << endl; | |
} | |
// 输出预测分析表 | |
cout << "Predict parsing table:" << endl; | |
for (char nonTerminal : nonTerminals) { | |
cout << nonTerminal; | |
for (auto it : table[nonTerminal]) | |
cout << "\t" << it.first << ":" << it.second << endl; | |
} | |
} | |
// 分析句子,成功返回0,失败返回-1 | |
int parse(const string& str) | |
{ | |
// 分析文法 | |
analyze(); | |
// 输出表头 | |
cout << left << setw(str.length()+4) << "Stack" | |
<< left << setw(str.length()+4) << "String" << "Description" << endl; | |
// 初始化分析栈 | |
string stack = {'#', start}; | |
auto it = str.begin(); | |
while (it != str.end()) { | |
// 显示分析栈内容 | |
string content = string(stack.rbegin(), stack.rend()); | |
cout << left << setw(str.length()+4) << content | |
<< left << setw(str.length()+4) << string(it, str.end()); | |
char top = stack.back(); | |
stack.pop_back(); | |
if (IS_TERMIAL(top) && top == *it) { // 匹配成功 | |
cout << "match \'" << *it << '\''; | |
it++; | |
} else if (IS_NON_TERMINAL(top) && !table[top][*it].empty()) { // 推导 | |
const string& rightSide = table[top][*it]; | |
for (int i = rightSide.size()-1; i >= 0; i--) | |
if (!IS_EMPTY(rightSide[i])) | |
stack.push_back(rightSide[i]); | |
cout << top << "->" << rightSide; | |
} else { // 匹配失败 | |
cout << "parse failed" << endl; | |
return -1; | |
} | |
cout << endl; | |
} | |
return 0; | |
} | |
}; | |
int main(int argc, char const *argv[]) | |
{ | |
// 程序描述 | |
cout << "Experimental LL(1) Parser v0.1" << endl; | |
cout << "This is an experimental LL(1) parser with lots of limits. Grammers are represented by following rules:" << endl; | |
cout << "Upper case letter : represents nonterminal" << endl; | |
cout << "_ : represents empty string" << endl; | |
cout << ": : represents ->" << endl; | |
cout << "| : represents alternative" << endl; | |
cout << "Other characters : represents terminal" << endl; | |
cout << "For example, 'A:a|B|aB|_' represents A->a|B|aB|$\\epsilon$'." << endl << endl; | |
// 输入文法 | |
cout << "STEP1: Input grammers" << endl; | |
cout << "Input the start non-terminal:"; | |
char start; | |
cin >> start; | |
CFG cfg(start); | |
cout << "Input the number of productions:"; | |
int productionNum; | |
cin >> productionNum; | |
cout << "Input productions:" << endl; | |
for (int i = 0; i < productionNum; i++) { | |
string productions; | |
cin >> productions; | |
cfg.addProduction(productions); | |
} | |
// 分析文法 | |
cout << endl << "STEP2: Analyze grammers" << endl; | |
cfg.print(); | |
// 分析字符串 | |
cout << endl << "STEP3: Parse string" << endl; | |
cout << "Input the number of strings:" << endl; | |
int strungNum; | |
cin >> strungNum; | |
for (int i = 0; i < strungNum; i++) { | |
cout << "Input string:"; | |
string str; | |
cin >> str; | |
cfg.parse(str); | |
} | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment