Last active
February 16, 2019 03:44
-
-
Save trcio/6a9276c7891ba83d6a834f9f2269c340 to your computer and use it in GitHub Desktop.
SE 3377 CLI Parser and Tokenizer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <algorithm>
#include <iomanip>
#include <iostream>
#include <list>
#include <stack>
#include <string>

using namespace std;
// Categories a piece of command-line input can be classified as.
// The numeric values are the ones the compiler would assign implicitly.
enum class TokenType
{
    Command = 0,          // name of a program to run
    Option = 1,           // word that begins with '-'
    Argument = 2,         // any other word
    SpecialCharacter = 3, // quote, pipe, redirect, bracket, etc.
    Comment = 4           // '#' and everything after it
};
class Token | |
{ | |
public: | |
string Value; | |
TokenType Type; | |
Token(string value, TokenType type) | |
{ | |
Value = value; | |
Type = type; | |
}; | |
string TypeToString() | |
{ | |
switch (Type) | |
{ | |
case TokenType::Command: | |
return "Command"; | |
case TokenType::Option: | |
return "Option"; | |
case TokenType::Argument: | |
return "Argument"; | |
case TokenType::SpecialCharacter: | |
return "Special Character"; | |
case TokenType::Comment: | |
return "Comment"; | |
} | |
return ""; | |
}; | |
void Print() | |
{ | |
// max() ensures there is always space between the value and type of token | |
cout << left << setw(max(50, (int) Value.length() + 5)) << setfill(' ') << Value << TypeToString() << endl; | |
}; | |
}; | |
class Parser | |
{ | |
private: | |
stack<char> CharStack; | |
string Input, Buffer; | |
TokenType CurrentType; | |
bool IsInsideOfPair(char c) | |
{ | |
return !CharStack.empty() && CharStack.top() == c; | |
}; | |
bool IsInsideOfQuotes() | |
{ | |
return IsInsideOfPair('\'') || IsInsideOfPair('\"'); | |
}; | |
bool IsEscaped(int i) | |
{ | |
// true if there are an odd # of consecutive \'s behind the 'i' index | |
int count = 0; | |
for (int j = i - 1; j >= 0; j++) | |
{ | |
if (Input[j] == '\\') | |
count++; | |
else | |
break; | |
} | |
return count % 2 > 0; | |
}; | |
bool HandleSpecialCharacter(int i) | |
{ | |
char c = Input[i]; | |
// escape all characters with a backslash behind them, outside of single quotes | |
if (IsEscaped(i) && !IsInsideOfPair('\'')) | |
return false; | |
// escape all characters inside of single quotes | |
if (c != '\'' && IsInsideOfPair('\'')) | |
return false; | |
if (c == '-' && Buffer.length() < 1) | |
{ | |
CurrentType = TokenType::Option; | |
return false; | |
} | |
if (c == '\'') | |
{ | |
if (IsInsideOfPair('\'')) | |
CharStack.pop(); | |
else | |
CharStack.push('\''); | |
AddSpecialToken("\'"); | |
} | |
else if (c == '\"') | |
{ | |
if (IsInsideOfPair('\"')) | |
CharStack.pop(); | |
else | |
CharStack.push('\"'); | |
AddSpecialToken("\""); | |
} | |
else if (c == '|') | |
{ | |
AddSpecialToken("|"); | |
CurrentType = TokenType::Command; | |
} | |
else if (c == ';') | |
{ | |
AddSpecialToken(";"); | |
CurrentType = TokenType::Command; | |
} | |
else if (c == '\\') | |
AddSpecialToken("\\"); | |
else if (c == '$') | |
AddSpecialToken("$"); | |
else if (c == '!') | |
AddSpecialToken("!"); | |
else if (c == '>') | |
AddSpecialToken(">"); | |
else if (c == '<') | |
AddSpecialToken("<"); | |
else if (c == '~') | |
AddSpecialToken("~"); | |
else if (c == '(') | |
AddSpecialToken("("); | |
else if (c == ')') | |
AddSpecialToken(")"); | |
else if (c == '{') | |
AddSpecialToken("{"); | |
else if (c == '}') | |
AddSpecialToken("}"); | |
else if (c == '[') | |
AddSpecialToken("["); | |
else if (c == ']') | |
AddSpecialToken("]"); | |
else if (c == ' ' && IsInsideOfQuotes()) | |
return false; | |
else if (c != ' ') | |
return false; | |
return true; | |
}; | |
void AddCurrentToken() | |
{ | |
Tokens.push_back(Token(Buffer, CurrentType)); | |
CurrentType = TokenType::Argument; | |
Buffer = ""; | |
}; | |
void AddSpecialToken(string v) | |
{ | |
if (Buffer.length() > 0) | |
AddCurrentToken(); | |
Tokens.push_back(Token(v, TokenType::SpecialCharacter)); | |
}; | |
void ParseInput() | |
{ | |
// loop through each character of the input | |
for (int i = 0; i < Input.length(); i++) | |
{ | |
char c = Input[i]; | |
// handle comments | |
if (c == '#') | |
{ | |
// add the current token if it exists | |
if (Buffer.length() > 0) | |
AddCurrentToken(); | |
// add the comment token that contains the rest of the input string | |
Tokens.push_back(Token(Input.substr(i), TokenType::Comment)); | |
// break out of the loop because we're done here | |
break; | |
} | |
// a space means the token has ended, unless we're inside of quotes | |
if (c == ' ' && Buffer.length() > 0 && !IsInsideOfQuotes()) | |
AddCurrentToken(); | |
// if the character wasnt handled, add it to the buffer | |
if (!HandleSpecialCharacter(i)) | |
Buffer += c; | |
} | |
// make sure we add the token that ended with the loop | |
if (Buffer.length() > 0) | |
AddCurrentToken(); | |
}; | |
public: | |
list<Token> Tokens; | |
Parser(string input) | |
{ | |
Tokens = list<Token>(); | |
CharStack = stack<char>(); | |
Input = input; | |
Buffer = ""; | |
CurrentType = TokenType::Command; | |
ParseInput(); | |
}; | |
}; | |
int main() | |
{ | |
cout << endl; | |
while (true) | |
{ | |
string input; | |
getline(cin, input); | |
if (input == "QUIT") | |
break; | |
Parser p(input); | |
cout << string(80, '-') << endl; | |
cout << "Number of tokens: " << p.Tokens.size() << endl; | |
cout << "Command name: " << p.Tokens.front().Value << endl; | |
cout << string(30, '-') << endl; | |
for (Token t : p.Tokens) | |
{ | |
t.Print(); | |
} | |
cout << string(80, '-') << endl << endl; | |
} | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System.Collections.Generic; | |
namespace CLITokenizer | |
{ | |
/// <summary>
/// Splits one line of shell-like input into tokens. Parsing runs in the
/// constructor; the result is exposed through <see cref="Tokens"/>.
/// </summary>
/// <remarks>
/// Words are separated by spaces. The first word (and the first word after
/// '|' or ';') is a Command, a word starting with '-' is an Option, and any
/// other word is an Argument. Quotes, pipes, redirects, brackets and similar
/// characters become SpecialCharacter tokens. An unescaped, unquoted '#'
/// starts a Comment token that runs to the end of the input.
/// </remarks>
public class Parser
{
    /// <summary>Tokens in the order they were found in the input.</summary>
    public List<Token> Tokens { get; }

    // Currently open quote characters, innermost on top.
    private Stack<char> CharStack { get; }

    // The full line being parsed.
    private string Input { get; }

    // Text of the word currently being collected.
    private string Buffer { get; set; }

    // Category the word in Buffer will receive when it is emitted.
    private TokenType CurrentType { get; set; }

    /// <summary>Parses <paramref name="input"/> and fills <see cref="Tokens"/>.</summary>
    /// <param name="input">The command line to tokenize.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="input"/> is null.</exception>
    public Parser(string input)
    {
        if (input == null)
        {
            throw new System.ArgumentNullException(nameof(input));
        }

        Tokens = new List<Token>();
        CharStack = new Stack<char>();
        Buffer = string.Empty;
        CurrentType = TokenType.Command;
        Input = input;
        Parse();
    }

    // Walks the input once, character by character, and fills Tokens.
    private void Parse()
    {
        for (var i = 0; i < Input.Length; i++)
        {
            var c = Input[i];

            // '#' starts a comment unless it is escaped or single-quoted.
            if (c == '#' && !IsLiteral(i))
            {
                if (Buffer.Length > 0)
                {
                    AddCurrentToken();
                }

                // The comment token holds the rest of the input, '#' included.
                Tokens.Add(new Token { Value = Input.Substring(i), Type = TokenType.Comment });
                break;
            }

            // A space ends the current word, unless we are inside quotes.
            if (c == ' ' && Buffer.Length > 0 && !IsInsideOfQuotes())
            {
                AddCurrentToken();
            }

            // Anything not consumed as a special character belongs to the word.
            if (!HandleSpecialCharacter(i))
            {
                Buffer += c;
            }
        }

        // Emit the word that was still being collected when the input ended.
        if (Buffer.Length > 0)
        {
            AddCurrentToken();
        }
    }

    // Processes the character at index i. Returns true when the character was
    // consumed here (emitted as a special token or dropped as a separator),
    // false when the caller should append it to Buffer.
    private bool HandleSpecialCharacter(int i)
    {
        var c = Input[i];

        if (IsLiteral(i))
        {
            return false;
        }

        // A dash at the very start of a word marks the word as an option.
        if (c == '-' && Buffer.Length < 1)
        {
            CurrentType = TokenType.Option;
            return false;
        }

        switch (c)
        {
            case '\'':
            case '"':
                // Close the quote if it is the innermost open one, otherwise open it.
                if (IsInsideOfPair(c))
                {
                    CharStack.Pop();
                }
                else
                {
                    CharStack.Push(c);
                }

                AddSpecialToken(c.ToString());
                return true;

            case '|':
            case ';':
                AddSpecialToken(c.ToString());
                // The next word starts a new command.
                CurrentType = TokenType.Command;
                return true;

            case '\\':
            case '$':
            case '!':
            case '>':
            case '<':
            case '~':
            case '(':
            case ')':
            case '{':
            case '}':
            case '[':
            case ']':
                AddSpecialToken(c.ToString());
                return true;

            case ' ':
                // A space is kept inside quotes and swallowed as a separator elsewhere.
                return !IsInsideOfQuotes();

            default:
                return false;
        }
    }

    // True when the character at index i must be taken literally: it is
    // backslash-escaped outside of single quotes, or it sits inside single
    // quotes and is not the single quote itself.
    private bool IsLiteral(int i)
    {
        var inSingleQuotes = IsInsideOfPair('\'');
        if (IsEscaped(i) && !inSingleQuotes)
        {
            return true;
        }

        return Input[i] != '\'' && inSingleQuotes;
    }

    // True when the innermost open quote is c.
    private bool IsInsideOfPair(char c)
    {
        return CharStack.Count > 0 && CharStack.Peek() == c;
    }

    // True when the innermost open quote is a single or a double quote.
    private bool IsInsideOfQuotes()
    {
        return IsInsideOfPair('\'') || IsInsideOfPair('"');
    }

    // Flushes any pending word, then emits t as a SpecialCharacter token.
    private void AddSpecialToken(string t)
    {
        if (Buffer.Length > 0)
        {
            AddCurrentToken();
        }

        Tokens.Add(new Token { Value = t, Type = TokenType.SpecialCharacter });
    }

    // True when an odd number of consecutive backslashes directly precede
    // index i, i.e. the character at i is backslash-escaped.
    private bool IsEscaped(int i)
    {
        var count = 0;
        for (var j = i - 1; j >= 0 && Input[j] == '\\'; j--)
        {
            count++;
        }

        return count % 2 != 0;
    }

    // Emits Buffer as a token of the current category, then resets the
    // buffer and makes Argument the category of the next word.
    private void AddCurrentToken()
    {
        Tokens.Add(new Token { Value = Buffer, Type = CurrentType });
        CurrentType = TokenType.Argument;
        Buffer = string.Empty;
    }

    /// <summary>A single token: its text and the category assigned to it.</summary>
    public class Token
    {
        /// <summary>Text of the token as collected from the input.</summary>
        public string Value { get; set; }

        /// <summary>Category of the token.</summary>
        public TokenType Type { get; set; }
    }

    /// <summary>Categories a token can be classified as.</summary>
    public enum TokenType
    {
        Command,
        Option,
        Argument,
        SpecialCharacter,
        Comment
    }
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment