Created
April 10, 2021 04:30
-
-
Save ShawSumma/e1fa4dcdf01b2916f58c54c90e677af0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module source.app; | |
import std.algorithm; | |
import std.stdio; | |
import std.string; | |
import std.array; | |
import std.file; | |
import std.conv : to; | |
import core.stdc.ctype; | |
class Grammar | |
{ | |
string start; | |
string[][][string] productions; | |
this(string start_ = "start:") | |
{ | |
start = start_; | |
} | |
void add(string left, string[] right) | |
{ | |
if (left !in productions) | |
{ | |
productions[left] = [right]; | |
} | |
else | |
{ | |
productions[left] ~= right; | |
} | |
} | |
bool isTerm(string sym) | |
{ | |
return !isNonTerm(sym); | |
} | |
bool isNonTerm(string sym) | |
{ | |
return !(sym !in productions); | |
} | |
override string toString() | |
{ | |
string ret; | |
foreach (key, rules; productions) | |
{ | |
ret ~= key[1 .. $]; | |
ret ~= ": "; | |
foreach (index1, rule; rules) | |
{ | |
if (index1 != 0) | |
{ | |
ret ~= " | "; | |
} | |
foreach (index2, value; rule) | |
{ | |
if (index2 != 0) | |
{ | |
ret ~= ", "; | |
} | |
if (isTerm(value)) | |
{ | |
if (value == "\n") | |
{ | |
ret ~= "'\\n'"; | |
} | |
else if (value == "\t") | |
{ | |
ret ~= "'\\t'"; | |
} | |
else if (value == "'") | |
{ | |
ret ~= "'\\'"; | |
} | |
else if (value == "\r") | |
{ | |
ret ~= "'\r'"; | |
} | |
else | |
{ | |
ret ~= "'" ~ value ~ "'"; | |
} | |
} | |
else | |
{ | |
ret ~= value[1 .. $]; | |
} | |
} | |
} | |
ret ~= '\n'; | |
} | |
return ret; | |
} | |
} | |
final class Row | |
{ | |
size_t dot; | |
string left; | |
string[] right; | |
size_t start; | |
size_t end; | |
Row[] completes; | |
this(size_t dot_, string left_, string[] right_, size_t start_, size_t end_, | |
Row[] completes_ = null) | |
{ | |
dot = dot_; | |
left = left_; | |
right = right_; | |
completes = completes_; | |
start = start_; | |
end = end_; | |
} | |
string next() | |
{ | |
if (dot < right.length) | |
{ | |
return right[dot]; | |
} | |
return ""; | |
} | |
bool isComplete() | |
{ | |
return dot == right.length; | |
} | |
override bool opEquals(Object obj) | |
{ | |
Row row = cast(Row) obj; | |
if (row is null) | |
{ | |
return false; | |
} | |
return left == row.left && right == row.right && dot == row.dot | |
&& start == row.start && end == row.end; | |
} | |
} | |
final class Table | |
{ | |
size_t k; | |
Row[] rows; | |
this(size_t k_) | |
{ | |
k = k_; | |
} | |
void addRow(Row row, Row complete = null) | |
{ | |
if (!rows.canFind(row)) | |
{ | |
rows ~= row; | |
} | |
if (complete !is null && !row.completes.canFind(complete)) | |
{ | |
row.completes ~= complete; | |
} | |
} | |
size_t length() | |
{ | |
return rows.length; | |
} | |
} | |
class Parser | |
{ | |
Grammar grammar; | |
Table[] tables; | |
string[][] words; | |
this(Grammar grammar_) | |
{ | |
grammar = grammar_; | |
} | |
Node parse(string str) | |
{ | |
string[] nextWords; | |
foreach (chr; str) | |
{ | |
nextWords ~= [chr]; | |
} | |
run(nextWords); | |
Row[] comp = completes; | |
if (comp.length != 1) | |
{ | |
throw new Exception("parse error: unknown"); | |
} | |
Node ret = makeNode(comp[0]); | |
return ret.children[0]; | |
} | |
void run(string[] words_) | |
{ | |
grammar.productions["GAMMA"] = [[grammar.start]]; | |
scope (exit) | |
{ | |
grammar.productions.remove("GAMMA"); | |
} | |
words = null; | |
foreach (word; words_) | |
{ | |
words ~= [word]; | |
} | |
tables = null; | |
foreach (i; 0 .. words.length + 1) | |
{ | |
addTable(i); | |
} | |
tables[0].addRow(new Row(0, "", ["GAMMA"], 0, 0)); | |
size_t[] last; | |
foreach (i; 0 .. words.length + 1) | |
{ | |
size_t[][] cur = tables[i].rows.map!(x => [x.start, x.end]).array; | |
if (cur.length == 0) | |
{ | |
throw new Exception("parse error: char #" ~ to!string(last[0] + 1)); | |
} | |
last = cur[0]; | |
size_t index = 0; | |
while (index < tables[i].rows.length) | |
{ | |
Row row = tables[i].rows[index]; | |
if (!row.isComplete) | |
{ | |
if (grammar.isNonTerm(row.next)) | |
{ | |
predict(row); | |
} | |
else | |
{ | |
scan(row); | |
} | |
} | |
else | |
{ | |
complete(row); | |
} | |
index++; | |
} | |
} | |
} | |
void addTable(size_t k) | |
{ | |
tables ~= new Table(k); | |
} | |
void predict(Row row) | |
{ | |
string nextRow = row.next; | |
if (string[][]* rules = nextRow in grammar.productions) | |
{ | |
foreach (rule; *rules) | |
{ | |
Row newRow = new Row(0, nextRow, rule, row.end, row.end); | |
tables[row.end].addRow(newRow); | |
} | |
} | |
} | |
void scan(Row row) | |
{ | |
string nextSymbol = row.next; | |
if (row.end < words.length) | |
{ | |
string actual = words[row.end][0]; | |
if (nextSymbol == actual) | |
{ | |
Row newRow = new Row(1, nextSymbol, [actual], row.end, row.end + 1); | |
tables[row.end + 1].addRow(newRow); | |
} | |
} | |
} | |
void complete(Row row) | |
{ | |
size_t index = 0; | |
while (index < tables[row.start].rows.length) | |
{ | |
Row oldRow = tables[row.start].rows[index]; | |
if (!oldRow.isComplete && oldRow.right[oldRow.dot] == row.left) | |
{ | |
Row newRow = new Row(oldRow.dot + 1, oldRow.left, oldRow.right, | |
oldRow.start, row.end, oldRow.completes.dup); | |
tables[row.end].addRow(newRow, row); | |
} | |
index++; | |
} | |
} | |
Node makeNode(Row row, Row[] relatives = null) | |
{ | |
return new Node(this, row, relatives); | |
} | |
Row[] completes() | |
{ | |
Row[] completes = null; | |
foreach (row; tables[$ - 1].rows) | |
{ | |
if (row.left == "GAMMA") | |
{ | |
completes ~= row; | |
} | |
} | |
return completes; | |
} | |
} | |
class Node | |
{ | |
bool term; | |
string type; | |
string data; | |
Node[] children; | |
this(string type_, string data_) | |
{ | |
type = type_; | |
data = data_; | |
term = true; | |
} | |
this(string type_, Node[] children_) | |
{ | |
type = type_; | |
children = children_; | |
term = false; | |
} | |
bool isTerm() | |
{ | |
return term; | |
} | |
this(Parser parser, Row row, Row[] relatives) | |
{ | |
if (row.left.length == 1) | |
{ | |
type = row.left; | |
data = row.left; | |
term = true; | |
} | |
else | |
{ | |
type = row.left; | |
term = false; | |
foreach (subRow; row.completes) | |
{ | |
children ~= parser.makeNode(subRow); | |
} | |
if (row.completes.length != 0) | |
{ | |
relatives ~= row; | |
} | |
if (row.left == "GAMMA") | |
{ | |
foreach (relative; relatives) | |
{ | |
if (relative.start < parser.words.length) | |
{ | |
string word = parser.words[relative.start][0]; | |
children ~= new Node(word, word); | |
} | |
} | |
} | |
} | |
} | |
} | |
void pprint(Node node, size_t level = 0) | |
{ | |
foreach (i; 0 .. level) | |
{ | |
write(" "); | |
} | |
if (node.isTerm) | |
{ | |
writeln("term: ", node.data); | |
} | |
else | |
{ | |
writeln("node: ", node.type[1 .. $]); | |
} | |
foreach (child; node.children) | |
{ | |
pprint(child, level + 1); | |
} | |
} | |
string nameToString(Node name) | |
{ | |
string ret; | |
foreach (child; name.children) | |
{ | |
if (child.isTerm) | |
{ | |
ret ~= child.type; | |
} | |
else | |
{ | |
ret ~= child.nameToString; | |
} | |
} | |
return ret; | |
} | |
class Walker | |
{ | |
size_t symno; | |
Grammar grammar; | |
string[] ignores; | |
string[string] names; | |
string[][string] nameFlags; | |
this() | |
{ | |
names["start"] = ":start"; | |
grammar = new Grammar(":start"); | |
} | |
string genSym(Args...)(Args args) | |
{ | |
symno++; | |
string parens; | |
string[] flags; | |
static foreach (arg; args) | |
{ | |
flags ~= arg; | |
} | |
string name = ":tmp#" ~ symno.to!string; | |
nameFlags[name] = flags; | |
// writeln(name, ": ", flags); | |
return name; | |
} | |
string[] getFlags(Node node) | |
{ | |
if (string[]* pflags = node.type in nameFlags) | |
{ | |
return *pflags; | |
} | |
return null; | |
} | |
Node clean(Node node) | |
{ | |
if (node.isTerm) | |
{ | |
return node; | |
} | |
string[] baseFlags = getFlags(node); | |
Node[] children; | |
foreach (child; node.children) | |
{ | |
Node cchild = clean(child); | |
if (cchild.isTerm) | |
{ | |
children ~= cchild; | |
continue; | |
} | |
string[] flags = getFlags(cchild); | |
if (flags.canFind("discard")) | |
{ | |
continue; | |
} | |
if (flags.canFind("inline")) | |
{ | |
children ~= cchild.children; | |
continue; | |
} | |
children ~= cchild; | |
} | |
if (baseFlags.canFind("norec")) | |
{ | |
Node[] nextChildren; | |
foreach (child; children) | |
{ | |
if (!child.isTerm && child.type == node.type) | |
{ | |
nextChildren ~= child.children; | |
} | |
else | |
{ | |
nextChildren ~= child; | |
} | |
} | |
children = nextChildren; | |
} | |
if (baseFlags.canFind("term")) | |
{ | |
return new Node(node.type, node.nameToString); | |
} | |
else | |
{ | |
return new Node(node.type, children); | |
} | |
} | |
string[] walk(Node node) | |
{ | |
if (ignores.length == 0) | |
{ | |
return walk2(node); | |
} | |
string name = genSym("inline"); | |
string[] after = walk2(node); | |
grammar.add(name, after); | |
foreach (ignore; ignores) | |
{ | |
grammar.add(name, [ignore, name]); | |
} | |
return [name]; | |
} | |
string[] walk2(Node node) | |
{ | |
switch (node.type) | |
{ | |
default: | |
throw new Exception("err: " ~ node.type); | |
case ":ignore": | |
string name = genSym("discard"); | |
grammar.add(name, walk(node.children[$ - 3])); | |
ignores ~= name; | |
walk(node.children[$ - 1]); | |
ignores.length--; | |
return null; | |
case ":block": | |
walk(node.children[2]); | |
return null; | |
case ":rules": | |
foreach (child; node.children) | |
{ | |
if (child.type != ":ws") | |
{ | |
walk(child); | |
} | |
} | |
return null; | |
case ":rule": | |
string varName = node.children[0].nameToString; | |
string name1 = ':' ~ varName; | |
string[] flags; | |
if (node.children[2].children.length == 6) | |
{ | |
Node cur = node.children[2].children[2]; | |
while (cur.children.length == 5) | |
{ | |
flags ~= cur.children[0].nameToString; | |
cur = cur.children[$ - 1]; | |
} | |
flags ~= cur.children[0].nameToString; | |
} | |
nameFlags[name1] = flags; | |
// writeln(name1, ": ", flags); | |
if (flags.length != 0) | |
{ | |
string name2 = genSym("inline"); | |
grammar.add(name1, [name2]); | |
grammar.add(name2, walk(node.children[$ - 1])); | |
} | |
else | |
{ | |
grammar.add(name1, walk(node.children[$ - 1])); | |
} | |
return null; | |
case ":xor": | |
if (node.children.length != 1) | |
{ | |
string name = genSym("inline"); | |
grammar.add(name, walk(node.children[0])); | |
grammar.add(name, walk(node.children[$ - 1])); | |
return [name]; | |
} | |
else | |
{ | |
return walk(node.children[0]); | |
} | |
case ":and": | |
string[] rets; | |
if (node.children.length != 1) | |
{ | |
// string name = genSym("inline"); | |
// grammar.add(name, walk(node.children[0]) ~ walk(node.children[$ - 1])); | |
// return [name]; | |
return walk(node.children[0]) ~ walk(node.children[$ - 1]); | |
} | |
else | |
{ | |
return walk(node.children[0]); | |
} | |
case ":postfix": | |
if (node.children.length != 1) | |
{ | |
char op = node.children[$ - 1].data[0]; | |
string name = genSym("inline"); | |
final switch (op) | |
{ | |
case '+': | |
string[] rules = walk(node.children[0]); | |
grammar.add(name, rules ~ name); | |
grammar.add(name, rules); | |
break; | |
case '*': | |
grammar.add(name, walk(node.children[0]) ~ name); | |
grammar.add(name, []); | |
break; | |
case '!': | |
nameFlags[name] = ["discard"]; | |
grammar.add(name, walk(node.children[0])); | |
break; | |
case '?': | |
grammar.add(name, walk(node.children[0])); | |
grammar.add(name, []); | |
break; | |
} | |
return [name]; | |
} | |
else | |
{ | |
return walk(node.children[0]); | |
} | |
case ":single": | |
return walk(node.children[0]); | |
case ":wrap": | |
return walk(node.children[2]); | |
case ":name": | |
return [':' ~ node.nameToString]; | |
case ":char_range": | |
char first = node.children[0].children[1].children[0].type[0]; | |
char last = node.children[$ - 1].children[1].children[0].type[0]; | |
string name = genSym("inline"); | |
foreach (char c; first .. last + 1) | |
{ | |
grammar.add(name, [[c]]); | |
} | |
return [name]; | |
case ":char": | |
return [node.children[1].children[0].type]; | |
// return walk2(node.children[1]); | |
case ":char_body": | |
if (node.children.length == 1) | |
{ | |
return [node.children[0].children[0].type]; | |
} | |
final switch (node.children[1].data[0]) | |
{ | |
case 'n': | |
return ["\n"]; | |
case 'r': | |
return ["\n"]; | |
case 't': | |
return ["\t"]; | |
} | |
} | |
} | |
} | |
Walker grammar(string of) | |
{ | |
Grammar grammar = new Grammar(":rules"); | |
grammar.add(":rules", []); | |
grammar.add(":rules", [":rules", ":rules"]); | |
grammar.add(":rules", [":ws", ":rule"]); | |
grammar.add(":rules", [":ws", ":ignore"]); | |
grammar.add(":rule", [":name", ":ws", ":rule_args", ":ws", ":xor"]); | |
grammar.add(":ignore", [ | |
"%", ":ws", "i", "g", "n", "o", "r", "e", ":ws", ":xor", ":ws", | |
":block" | |
]); | |
grammar.add(":rule_args", [":"]); | |
grammar.add(":rule_args", ["(", ":ws", ")", ":ws", ":"]); | |
grammar.add(":rule_args", ["(", ":ws", ":rule_args_body", ")", ":ws", ":"]); | |
grammar.add(":rule_args_body", [ | |
":name", ":ws", ",", ":ws", ":rule_args_body" | |
]); | |
grammar.add(":rule_args_body", [":name"]); | |
grammar.add(":block", ["{", ":ws", ":rules", ":ws", "}"]); | |
grammar.add(":xor", [":xor", ":ws", "|", ":ws", ":and"]); | |
grammar.add(":xor", [":and"]); | |
grammar.add(":and", [":and", ":ws", ",", ":ws", ":postfix"]); | |
grammar.add(":and", [":postfix"]); | |
grammar.add(":postfix", [":postfix", "!"]); | |
grammar.add(":postfix", [":single", "*"]); | |
grammar.add(":postfix", [":single", "?"]); | |
grammar.add(":postfix", [":single", "+"]); | |
grammar.add(":postfix", [":single"]); | |
grammar.add(":single", [":name"]); | |
grammar.add(":single", [":char"]); | |
grammar.add(":single", [":char_range"]); | |
grammar.add(":single", [":wrap"]); | |
grammar.add(":wrap", ["(", ":ws", ":xor", ":ws", ")"]); | |
grammar.add(":char_range", [":char", ":ws", ".", ".", ":ws", ":char"]); | |
grammar.add(":raw_char", ["'", ":any", "'"]); | |
grammar.add(":char", ["'", ":any", "'"]); | |
grammar.add(":char_body", [":any"]); | |
grammar.add(":char_body", ["\\", "n"]); | |
grammar.add(":char_body", ["\\", "r"]); | |
grammar.add(":char_body", ["\\", "t"]); | |
grammar.add(":char_body", ["\\", "'"]); | |
grammar.add(":name", ["_", ":name"]); | |
grammar.add(":name", [":letter", ":name"]); | |
grammar.add(":name", [":letter"]); | |
grammar.add(":name", ["_"]); | |
foreach (char i; 'a' .. 'z' + 1) | |
{ | |
grammar.add(":letter", [[i]]); | |
} | |
foreach (char i; 'A' .. 'Z' + 1) | |
{ | |
grammar.add(":letter", [[i]]); | |
} | |
foreach (char i; 0 .. char.max) | |
{ | |
if (isprint(i)) | |
{ | |
grammar.add(":any", [[i]]); | |
} | |
} | |
grammar.add(":ws", []); | |
grammar.add(":ws", [" ", ":ws"]); | |
grammar.add(":ws", ["\t", ":ws"]); | |
grammar.add(":ws", ["\r", ":ws"]); | |
grammar.add(":ws", ["\n", ":ws"]); | |
// grammar.add(":ws", [":ws", ":ws"]); | |
Parser parser = new Parser(grammar); | |
Node ast = parser.parse(of); | |
Walker walker = new Walker; | |
walker.walk(ast); | |
return walker; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment