Skip to content

Instantly share code, notes, and snippets.

@zhenghaoz
Created November 20, 2016 02:57
Show Gist options
  • Save zhenghaoz/ce0b561b9e58818a8ca7af9aaec773db to your computer and use it in GitHub Desktop.
Save zhenghaoz/ce0b561b9e58818a8ca7af9aaec773db to your computer and use it in GitHub Desktop.
Experimental LL(1) Parser
#include <iostream>
#include <string>
#include <cstdio>
#include <cstring>
#include <map>
#include <climits>
#include <cctype>
#include <vector>
#include <set>
#include <algorithm>
#include <iomanip>
#define IS_NON_TERMINAL(c) (isupper(c))
#define IS_EMPTY(c) ((c)=='_')
#define IS_TERMIAL(c) (!IS_NON_TERMINAL(c)&&!IS_EMPTY(c))
using namespace std;
class CFG
{
map<char, vector<string>> rightSides; // 产生式
map<char, int> empty; // EMPTY表
map<char, set<char>> firsts; // FIRST集
map<char, set<char>> follows; // FOLLOW集
map<char, map<char, string>> table; // 预测分析表
set<char> nonTerminals; // 非终结符
char start; // 起始非终结符
bool analyzed = false; // 分析完成标志
// 检查产生式的右侧是否存在空串
bool containEmpty(char nonTerminal)
{
for (const string& rightSide : rightSides[nonTerminal])
if (rightSide == "_")
return true;
return false;
}
// 检查产生式是否能够产生空串
int productEmpty(char nonTerminal)
{
for (const string& rightSide : rightSides[nonTerminal]) {
bool determined = true;
bool toEmpty = true;
for (char c : rightSide)
if (IS_TERMIAL(c)) { // 存在终结符,一定不能推出空串
toEmpty = false;
break;
} else if (IS_NON_TERMINAL(c)) {
if (empty[c] == 0) { // 存在非空的非终结符,一定不能推出空串
toEmpty = false;
break;
} else if (empty[c] == -1) {// 存在不确定的非终结符
determined = false;
}
}
// 确定并且可以推出空串
if (determined && toEmpty)
return 1;
// 忽略不确定因素可以推出空串
if (toEmpty)
return -1;
}
return 0;
}
// 标记可为空的非终结符
void generateEmpty()
{
for (char c : nonTerminals)
empty[c] = -1;
// 初始化,将能够直接产生空串的非终结符标记为1
for (char c : nonTerminals)
if (containEmpty(c))
empty[c] = 1;
while (true) {
// 检查产生式是否有/不可能产生空串
for (char c : nonTerminals)
if (empty[c] == -1)
empty[c] = productEmpty(c);
// 检查是否求解完成
bool finished = true;
for (auto it : empty)
if (it.second == -1)
finished = false;
if (finished)
break;
}
}
// 生成某个非终结符的FIRST集
set<char> first(char nonTerminal)
{
if (!firsts[nonTerminal].empty()) // 已经存在
return firsts[nonTerminal];
set<char> ret;
for (const string& rightSide : rightSides[nonTerminal])
if (rightSide == "_") // 产生空串
ret.insert('_');
else {
auto it = rightSide.begin();
// 处理可以推出空串的非终结符
while (it != rightSide.end() && IS_NON_TERMINAL(*it) && empty[*it]) {
for (char terminal : first(*it))
if (!IS_EMPTY(terminal))
ret.insert(terminal);
it++;
}
if (it != rightSide.end()) { // 处理非终结符后首个终结符/非空非终结符
if (IS_NON_TERMINAL(*it)) {
const set<char>& firstSet = first(*it);
ret.insert(firstSet.begin(), firstSet.end());
} else {
ret.insert(*it);
}
} else { // 产生式可以为空
ret.insert('_');
}
}
firsts[nonTerminal] = ret;
return ret;
}
// 生成FIRST集
void generateFirst()
{
generateEmpty();
for (char c : nonTerminals)
firsts[c] = first(c);
}
// 生成FOLLOW集
void generateFollow()
{
// 生成FIRST集
generateFirst();
// 初始化FOLLOW集
follows[start].insert('#');
for (char nonTerminal : nonTerminals)
for (const string& rightSide : rightSides[nonTerminal]) { // 遍历产生式
auto nonTerminalIt = rightSide.begin();
while (nonTerminalIt+1 != rightSide.end()) { // 更新非终结符的FOLLOW集
if (IS_NON_TERMINAL(*nonTerminalIt)) {
auto followIt = nonTerminalIt+1;
// 处理可以推出空串的非终结符
while (followIt != rightSide.end() && IS_NON_TERMINAL(*followIt) && empty[*followIt]) {
for (char c : first(*followIt))
if (!IS_EMPTY(c))
follows[*nonTerminalIt].insert(c);
followIt++;
}
// 处理非终结符后首个终结符/非空非终结符
if (followIt != rightSide.end()) {
if (IS_TERMIAL(*followIt)) {
follows[*nonTerminalIt].insert(*followIt);
} else if (IS_NON_TERMINAL(*followIt)) {
for (char c : first(*followIt))
follows[*nonTerminalIt].insert(c);
}
}
}
nonTerminalIt++;
}
}
// 迭代更新FOLLOW集
while (true) {
// 统计FOLLOW集总大小
int breforeCount = 0;
for (char nonTerminal : nonTerminals)
breforeCount += follows[nonTerminal].size();
for (char nonTerminal : nonTerminals)
for (const string& rightSide : rightSides[nonTerminal]) { // 遍历产生式
auto nonTerminalIt = rightSide.begin();
while (nonTerminalIt != rightSide.end()) { // 更新非终结符的FOLLOW集
auto followIt = nonTerminalIt+1;
while (followIt != rightSide.end() &&
((IS_NON_TERMINAL(*followIt) && empty[*followIt] == 1)|| IS_EMPTY(*followIt)))
followIt++;
if (followIt == rightSide.end())
follows[*nonTerminalIt].insert(follows[nonTerminal].begin(), follows[nonTerminal].end());
nonTerminalIt++;
}
}
// 统计FOLLOW集总大小
int afterCount = 0;
for (char nonTerminal : nonTerminals)
afterCount += follows[nonTerminal].size();
// FOLLOW集没有变化,则结束迭代
if (breforeCount == afterCount)
break;
}
}
void analyze()
{
if (analyzed)
return;
analyzed = true;
firsts.clear();
follows.clear();
// 生成FOLLOW集
generateFollow();
for (char nonTerminal : nonTerminals)
for (const string& rightSide : rightSides[nonTerminal]) {
// 生成SELECT集,根据SELECT集构造预测分析表
set<char> select;
auto it = rightSide.begin();
// 处理可为空的非终结符
while (it != rightSide.end() &&
((IS_NON_TERMINAL(*it) && empty[*it] == 1) || IS_EMPTY(*it))) {
for (char c : firsts[*it])
if (IS_EMPTY(c))
select.insert(c);
it++;
}
if (it != rightSide.end()) { // 存在非空的非终结符/终结符
if (IS_NON_TERMINAL(*it))
select.insert(firsts[*it].begin(), firsts[*it].end());
else if (IS_TERMIAL(*it))
select.insert(*it);
} else { // 产生式可能为空
select.insert(follows[nonTerminal].begin(), follows[nonTerminal].end());
}
for (char c : select)
table[nonTerminal][c] = rightSide;
}
}
public:
CFG(char c): start(c) {}
// 向CFG添加一条产生式,成功返回0,失败返回-1
int addProduction(const string& productions)
{
// 添加产生式后需要重新分析
analyzed = false;
// 获取左侧非终结符
char leftSide;
int ptr = 0;
if (ptr < productions.length() && IS_NON_TERMINAL(productions[ptr])) {
leftSide = productions[ptr];
nonTerminals.insert(leftSide);
} else return -1;
ptr++;
// 检查冒号':'
if (ptr >= productions.length() || productions[ptr] != ':')
return -1;
ptr++;
// 获取右侧产生体
int end;
while ((end = productions.find('|', ptr)) != string::npos) {
rightSides[leftSide].emplace_back(productions.begin()+ptr, productions.begin()+end);
ptr = end+1;
}
if (ptr < productions.length()) {
rightSides[leftSide].emplace_back(productions.begin()+ptr, productions.end());
} else return -1;
return 0;
}
// 打印文法分析结果
void print()
{
// 分析文法
analyze();
// 输出起始非终结符
cout << "Start nonTerminal: " << start << endl;
// 输出产生式
cout << "Productions:" << endl;
for (char nonTerminal : nonTerminals) {
cout << nonTerminal << ":";
const vector<string>& rightSide = rightSides[nonTerminal];
for (int i = 0; i < rightSide.size(); i++) {
if (i) putchar('|');
cout << rightSide[i];
}
cout << endl;
}
// 输出产生空串的非终结符表
cout << "Nonterminals product empty string:" << endl;
for (char nonTerminal : nonTerminals)
cout << nonTerminal << ' ';
cout << endl;
for (char nonTerminal : nonTerminals)
cout << empty[nonTerminal] << ' ';
cout << endl;
// 输出FIRST集
cout << "First sets:" << endl;
for (char nonTerminal : nonTerminals) {
cout << "FIRST(" << nonTerminal << "):";
for (char terminal : firsts[nonTerminal])
cout << ' ' << terminal;
cout << endl;
}
// 输出FOLLOW集
cout << "Follow sets:" << endl;
for (char nonTerminal : nonTerminals) {
cout << "FOLLOW(" << nonTerminal << "):";
for (char terminal : follows[nonTerminal])
cout << ' ' << terminal;
cout << endl;
}
// 输出预测分析表
cout << "Predict parsing table:" << endl;
for (char nonTerminal : nonTerminals) {
cout << nonTerminal;
for (auto it : table[nonTerminal])
cout << "\t" << it.first << ":" << it.second << endl;
}
}
// 分析句子,成功返回0,失败返回-1
int parse(const string& str)
{
// 分析文法
analyze();
// 输出表头
cout << left << setw(str.length()+4) << "Stack"
<< left << setw(str.length()+4) << "String" << "Description" << endl;
// 初始化分析栈
string stack = {'#', start};
auto it = str.begin();
while (it != str.end()) {
// 显示分析栈内容
string content = string(stack.rbegin(), stack.rend());
cout << left << setw(str.length()+4) << content
<< left << setw(str.length()+4) << string(it, str.end());
char top = stack.back();
stack.pop_back();
if (IS_TERMIAL(top) && top == *it) { // 匹配成功
cout << "match \'" << *it << '\'';
it++;
} else if (IS_NON_TERMINAL(top) && !table[top][*it].empty()) { // 推导
const string& rightSide = table[top][*it];
for (int i = rightSide.size()-1; i >= 0; i--)
if (!IS_EMPTY(rightSide[i]))
stack.push_back(rightSide[i]);
cout << top << "->" << rightSide;
} else { // 匹配失败
cout << "parse failed" << endl;
return -1;
}
cout << endl;
}
return 0;
}
};
int main(int argc, char const *argv[])
{
// 程序描述
cout << "Experimental LL(1) Parser v0.1" << endl;
cout << "This is an experimental LL(1) parser with lots of limits. Grammers are represented by following rules:" << endl;
cout << "Upper case letter : represents nonterminal" << endl;
cout << "_ : represents empty string" << endl;
cout << ": : represents ->" << endl;
cout << "| : represents alternative" << endl;
cout << "Other characters : represents terminal" << endl;
cout << "For example, 'A:a|B|aB|_' represents A->a|B|aB|$\\epsilon$'." << endl << endl;
// 输入文法
cout << "STEP1: Input grammers" << endl;
cout << "Input the start non-terminal:";
char start;
cin >> start;
CFG cfg(start);
cout << "Input the number of productions:";
int productionNum;
cin >> productionNum;
cout << "Input productions:" << endl;
for (int i = 0; i < productionNum; i++) {
string productions;
cin >> productions;
cfg.addProduction(productions);
}
// 分析文法
cout << endl << "STEP2: Analyze grammers" << endl;
cfg.print();
// 分析字符串
cout << endl << "STEP3: Parse string" << endl;
cout << "Input the number of strings:" << endl;
int strungNum;
cin >> strungNum;
for (int i = 0; i < strungNum; i++) {
cout << "Input string:";
string str;
cin >> str;
cfg.parse(str);
}
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment