Created
June 3, 2023 15:26
-
-
Save jahav/fe8da165571b439ba7ee3417fb5f494e to your computer and use it in GitHub Desktop.
A proof-of-concept converter that translates an ANTLR lexer grammar into a Rolex lexer specification.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System.Text; | |
using Antlr4.Runtime; | |
internal class Program | |
{ | |
/// <summary>
/// Parses an ANTLR v4 grammar file and writes the converted lexer specification to standard output.
/// </summary>
/// <param name="args">
/// Optional: <c>args[0]</c> is the path of the <c>.g4</c> grammar to convert. When omitted, the
/// original hard-coded development path is used.
/// </param>
private static void Main(string[] args)
{
    // Only the generated specification is written to stdout, so the output can be redirected
    // straight into a file without any leading noise.
    var grammarPath = args.Length > 0
        ? args[0]
        : @"c:\Users\havli\source\repos\Antlr2Rolex\FormulaLexer.g4";

    // Lines are re-joined with "\n", exactly as before.
    var f = string.Join("\n", File.ReadAllLines(grammarPath));
    var inputStream = new AntlrInputStream(f);
    var lexer = new ANTLRv4Lexer(inputStream);
    var input = new CommonTokenStream(lexer);
    var parser = new ANTLRv4Parser(input);
    var ctx = parser.grammarSpec();
    var visitor = new Visitor();
    var rolexLexer = visitor.Visit(ctx);
    Console.WriteLine(rolexLexer);
}
private class Visitor : ANTLRv4ParserBaseVisitor<string> | |
{ | |
// Characters of a string literal that get a backslash prefix when the literal is emitted as a
// regular expression. '|' is included because the generated patterns use it as the alternation
// operator; the single quote is included because each emitted pattern is wrapped in single quotes.
private static readonly HashSet<char> _regexEscape = new("{}()[]?*.+-$^'|");

// Names of the non-fragment lexer rules, in the order they are collected from the grammar.
private readonly List<string> _tokenNames = new();

// Generated pattern for every converted lexer rule (tokens and fragments), keyed by rule name.
private readonly Dictionary<string, string> _patterns = new();
/// <summary>
/// Converts every lexer rule of the grammar to a pattern and renders the non-fragment rules as
/// <c>NAME = 'pattern'</c> lines, one rule per line.
/// </summary>
/// <param name="context">Root of the parsed grammar.</param>
/// <returns>The generated specification text.</returns>
/// <exception cref="InvalidOperationException">
/// A full pass over the unresolved rules converted none of them, i.e. the remaining rules reference
/// rules that are missing or that depend on each other cyclically.
/// </exception>
public override string VisitGrammarSpec(ANTLRv4Parser.GrammarSpecContext context)
{
    var lexerRules = context.rules().ruleSpec()
        .Select(rs => rs.lexerRuleSpec())
        .Where(rs => rs is not null)
        .ToList();
    _tokenNames.AddRange(lexerRules
        .Where(lr => lr.FRAGMENT() is null)
        .Select(lr => lr.TOKEN_REF().GetText()));

    // A rule can only be converted once every rule it references already has a pattern
    // (VisitTerminal throws MissingDependencyPatternException otherwise). Keep making passes
    // over the still-unresolved rules until one pass completes without a miss.
    while (true)
    {
        var missingNames = new List<string>();
        var resolvedInPass = 0;

        foreach (var lexerRule in lexerRules)
        {
            var name = lexerRule.TOKEN_REF().GetText();
            if (_patterns.ContainsKey(name))
            {
                continue;
            }

            try
            {
                var pattern = Visit(lexerRule);
                _patterns.Add(name, pattern);
                resolvedInPass++;
            }
            catch (MissingDependencyPatternException e)
            {
                // Retry this rule on the next pass; remember what it was waiting for.
                missingNames.Add(e.Name);
            }
        }

        if (missingNames.Count == 0)
        {
            break;
        }

        // No progress means the next pass would fail in exactly the same way, so stop now
        // and report the names that could not be resolved.
        if (resolvedInPass == 0)
        {
            throw new InvalidOperationException(
                "Cannot resolve lexer rules; missing or cyclic references: "
                + string.Join(", ", missingNames.Distinct()));
        }
    }

    var sb = new StringBuilder();
    foreach (var tokenName in _tokenNames)
    {
        sb.Append(tokenName).Append(" = '").Append(_patterns[tokenName]).Append("'\n");
    }

    return sb.ToString();
}
// lexerRuleSpec
//     : FRAGMENT? TOKEN_REF optionsSpec? COLON lexerRuleBlock SEMI
//     ;
//
// Returns the aggregated result of visiting all children of the rule; the rule name and the
// fragment flag are not needed here.
public override string VisitLexerRuleSpec(ANTLRv4Parser.LexerRuleSpecContext context)
{
    return VisitChildren(context);
}
// The block contributes nothing of its own: its result is the aggregate of its children.
public override string VisitLexerRuleBlock(ANTLRv4Parser.LexerRuleBlockContext context)
    => VisitChildren(context);
// lexerElement
//     : lexerAtom ebnfSuffix?
//     | lexerBlock ebnfSuffix?
//     | actionBlock QUESTION?
//     ;
//
// Converts the first child; when a second child (the suffix) is present, its text is appended
// to the parenthesised element and the whole result is parenthesised again.
public override string VisitLexerElement(ANTLRv4Parser.LexerElementContext context)
{
    var elementPattern = Visit(context.children[0]);

    if (context.children.Count <= 1)
    {
        return elementPattern;
    }

    var suffixText = context.children[1].GetText();
    var repeated = MakeBlock(elementPattern) + suffixText;
    return MakeBlock(repeated);
}
// lexerBlock
//     : LPAREN lexerAltList RPAREN
//     ;
//
// Child 1 is the element between the parentheses; its pattern is wrapped in a group.
public override string VisitLexerBlock(ANTLRv4Parser.LexerBlockContext context)
    => MakeBlock(Visit(context.children[1]));
// lexerAltList
//     : lexerAlt (OR lexerAlt)*
//     ;
//
// Every even-indexed child is an alternative (odd indexes hold the OR tokens). Each alternative
// is parenthesised, the alternatives are joined with '|', and the result is parenthesised again.
public override string VisitLexerAltList(ANTLRv4Parser.LexerAltListContext context)
{
    var alternation = new StringBuilder();
    alternation.Append('(').Append(Visit(context.children[0])).Append(')');

    var index = 2;
    while (index < context.ChildCount)
    {
        alternation.Append("|(").Append(Visit(context.children[index])).Append(')');
        index += 2;
    }

    return "(" + alternation + ")";
}
// lexerAtom
//     : characterRange
//     | terminal
//     | notSet
//     | LEXER_CHAR_SET
//     | DOT elementOptions?
//     ;
//
// Converts character ranges, terminals and character sets. A character set is emitted with its
// grammar text unchanged. The remaining alternatives (notSet, DOT) are not converted and raise
// NotImplementedException naming the offending atom.
public override string VisitLexerAtom(ANTLRv4Parser.LexerAtomContext context)
{
    if (context.characterRange() is not null)
    {
        return Visit(context.characterRange());
    }

    if (context.terminal() is not null)
    {
        return Visit(context.terminal());
    }

    if (context.LEXER_CHAR_SET() is not null)
    {
        return context.LEXER_CHAR_SET().GetText();
    }

    // Say which construct could not be converted so the grammar author can locate it.
    throw new NotImplementedException(
        $"Lexer atom '{context.GetText()}' is not supported; only character ranges, terminals and character sets are converted.");
}
// Converts a range written as two quoted literals into a character class "[start-end]".
// The surrounding quotes of both literals are stripped before conversion.
public override string VisitCharacterRange(ANTLRv4Parser.CharacterRangeContext context)
{
    var startChar = ConvertBound(context.STRING_LITERAL(0).GetText()[1..^1]);
    var endChar = ConvertBound(context.STRING_LITERAL(1).GetText()[1..^1]);
    return $"[{startChar}-{endChar}]";

    // Rewrites one range bound so it is safe inside a character class.
    static string ConvertBound(string bound)
    {
        if (bound.StartsWith("\\u{", StringComparison.Ordinal) && bound.EndsWith('}'))
        {
            // "\u{41}" -> "\u0041": drop the braces and normalise to at least four hex digits,
            // since "\u" followed by fewer than four digits is not a complete escape.
            // NOTE(review): values with more than four significant digits (above U+FFFF) are
            // still emitted as "\u" + all digits - confirm what the target dialect expects.
            var hexDigits = bound[3..^1].TrimStart('0').PadLeft(4, '0');
            return "\\u" + hexDigits;
        }

        // A bare ']', '[', '^' or '-' would change the meaning of the class, so escape it.
        // Bounds that are already escape sequences (length > 1) are passed through untouched.
        if (bound.Length == 1 && "[]^-".Contains(bound[0]))
        {
            return "\\" + bound;
        }

        return bound;
    }
}
// A terminal is either a quoted string literal or a reference to another lexer rule.
// Literals are unquoted and regex-escaped; references are replaced by the already generated
// pattern of the referenced rule, or raise MissingDependencyPatternException when that
// pattern does not exist yet.
public override string VisitTerminal(ANTLRv4Parser.TerminalContext context)
{
    if (context.STRING_LITERAL()?.GetText() is { } quotedLiteral)
    {
        var unquoted = quotedLiteral[1..^1];
        return EscapeLiteralForRegEx(unquoted);
    }

    var referencedRule = context.TOKEN_REF().GetText();
    return _patterns.TryGetValue(referencedRule, out var resolvedPattern)
        ? resolvedPattern
        : throw new MissingDependencyPatternException(referencedRule);
}
// Child results are combined by plain string concatenation, in visiting order.
protected override string AggregateResult(string aggregate, string nextResult)
    => aggregate + nextResult;
// Turns the content of a grammar string literal (quotes already removed) into a pattern that
// matches it literally: every character listed in _regexEscape gets a backslash prefix.
// A backslash that is already present starts an escape sequence of the grammar (for example
// \' or \n), so it is copied together with the character that follows it; escaping that
// following character again would turn \' into \\' (an escaped backslash plus a bare quote).
private string EscapeLiteralForRegEx(string literal)
{
    var sb = new StringBuilder(literal.Length);
    for (var i = 0; i < literal.Length; i++)
    {
        var c = literal[i];

        if (c == '\\' && i + 1 < literal.Length)
        {
            // Existing escape sequence: keep both characters exactly as written.
            sb.Append(c).Append(literal[++i]);
            continue;
        }

        if (_regexEscape.Contains(c))
        {
            sb.Append('\\');
        }

        sb.Append(c);
    }

    return sb.ToString();
}
// Wraps a pattern in parentheses so it behaves as a single unit when combined with others.
private string MakeBlock(string pattern) => $"({pattern})";
} | |
/// <summary>
/// Thrown when a lexer rule references another rule whose pattern has not been generated yet.
/// </summary>
internal class MissingDependencyPatternException : Exception
{
    /// <summary>Creates the exception for the given referenced rule.</summary>
    /// <param name="name">Name of the referenced rule that has no pattern yet.</param>
    public MissingDependencyPatternException(string name)
        : base($"No pattern has been generated yet for the referenced lexer rule '{name}'.")
    {
        Name = name;
    }

    /// <summary>Name of the referenced rule that has no pattern yet.</summary>
    public string Name { get; }
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment