Created
August 31, 2014 16:50
-
-
Save matutter/dca37447baf4cdd3b10f to your computer and use it in GitHub Desktop.
Reusable lexical tokenizer
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // Calc_line_interpreter.cpp : Defines the entry point for the console application. | |
| // | |
| /// TODO: Typed lambdas | |
| #include "stdafx.h" | |
| #include <iostream> | |
| #include <string> | |
| #include "LangDefine.h" | |
| #include <regex> | |
| #include <fstream> | |
| #include <streambuf> | |
| using namespace std; | |
| void define(LangDef *l); | |
| int main(int argc, char* argv[]) | |
| { | |
| string buffer = ""; | |
| ifstream file(argv[1] ); | |
| if (file.is_open()) { | |
| file.seekg(0, std::ios::end); | |
| buffer.reserve(file.tellg()); | |
| file.seekg(0, std::ios::beg); | |
| buffer.assign( istreambuf_iterator<char>(file), istreambuf_iterator<char>() ); | |
| LangDef lang = LangDef("Calculator Language", false); | |
| define(&lang); | |
| lang.Compile(); | |
| TokenProvider tp( buffer, lang); | |
| } | |
| else | |
| cout << "no such file" << endl; | |
| return 0; | |
| } | |
| void define(LangDef *lang) { | |
| lang->SetCharset(regex("[a-zA-Z0-9]|[.]|[+]|[-]|[*]|[/]|[=]|[(]|[)]")); | |
| Identity Real; | |
| Real.Name = "Real"; | |
| Real.RegX = regex("[0-9]+[.][0-9]+"); | |
| Real.Meta = _VALUE_TYPE; | |
| Real.Nick = "REAL"; | |
| Identity Integer; | |
| Integer.Name = "Integer"; | |
| Integer.RegX = regex("[0-9]+"); | |
| Integer.Meta = _VALUE_TYPE; | |
| Integer.Nick = "INTEGER"; | |
| Identity EOC; | |
| EOC.Name = "End of command"; | |
| EOC.RegX = regex("[.]"); | |
| EOC.Meta = _EOC; | |
| EOC.Nick = "EOC"; | |
| Identity Division_op; | |
| Division_op.Name = "Division"; | |
| Division_op.RegX = regex("[/]"); | |
| Division_op.Meta = _OPERATOR_DIVI; | |
| Division_op.Nick = "OPER"; | |
| Identity Addition_op; | |
| Addition_op.Name = "Addition"; | |
| Addition_op.RegX = regex("[+]"); | |
| Addition_op.Meta = _OPERATOR_PLUS; | |
| Addition_op.Nick = "OPER"; | |
| Identity Subtraction_op; | |
| Subtraction_op.Name = "Subtraction"; | |
| Subtraction_op.RegX = regex("[-]"); | |
| Subtraction_op.Meta = _OPERATOR_MINU; | |
| Subtraction_op.Nick = "OPER"; | |
| Identity Multiplication_op; | |
| Multiplication_op.Name = "Multiplication"; | |
| Multiplication_op.RegX = regex("[*]"); | |
| Multiplication_op.Meta = _OPERATOR_MULT; | |
| Multiplication_op.Nick = "OPER"; | |
| Identity Assignment_op; | |
| Assignment_op.Name = "Assignment"; | |
| Assignment_op.RegX = regex("[\=]"); | |
| Assignment_op.Meta = _OPERATOR_EQUA; | |
| Assignment_op.Nick = "OPER"; | |
| Identity IDENT; | |
| IDENT.Name = "Ident"; | |
| IDENT.RegX = regex("[a-zA-Z][a-zA-Z0-9]+"); | |
| IDENT.Meta = _IDENT; | |
| IDENT.Nick = "IDENT"; | |
| Identity Space; | |
| Space.Name = "Space"; | |
| Space.RegX = regex("[[:s:]]"); | |
| Space.Meta = _WHITESPACE; | |
| Space.Mute = true; | |
| Identity Comment; | |
| Comment.Name = "Comment"; | |
| Comment.RegX = regex("[/][/].+"); | |
| Comment.Meta = _COMMENT; | |
| lang->AddDef(Space); | |
| lang->AddDef(IDENT); | |
| lang->AddDef(Real); | |
| lang->AddDef(Integer); | |
| lang->AddDef(Comment); | |
| lang->AddDef(Space); | |
| lang->AddDef(Assignment_op); | |
| lang->AddDef(Multiplication_op); | |
| lang->AddDef(Subtraction_op); | |
| lang->AddDef(Addition_op); | |
| lang->AddDef(Division_op); | |
| lang->AddDef(EOC); | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #include <string> | |
| #include <vector> | |
| #include <regex> | |
| #include <iostream> | |
/*
 * Meta codes stored in Identity::Meta.
 * Original note: these are affinity levels, i.e. entries on the same level
 * have the same precedence.
 * NOTE(review): names starting with an underscore followed by an upper-case
 * letter are reserved for the implementation in C++; kept unchanged because
 * other code refers to them.
 */

/* General token categories (0x0 - 0x8). */
#define _OPERATOR       0x0
#define _INTEGER        0x1
#define _REAL           0x2
#define _EOC            0x3
#define _IDENT          0x4
#define _RESERVED       0x5
#define _WHITESPACE     0x6
#define _COMMENT        0x7
#define _VALUE_TYPE     0x8

/* One code per concrete operator (0xf0 - 0xf4). */
#define _OPERATOR_PLUS  0xf0
#define _OPERATOR_MINU  0xf1
#define _OPERATOR_MULT  0xf2
#define _OPERATOR_DIVI  0xf3
#define _OPERATOR_EQUA  0xf4
| #pragma once | |
| using namespace std; | |
/// One token category of a language: a display name, the regular expression
/// that recognises it, and bookkeeping flags used by the tokenizer.
class Identity {
public:
    std::string Name = "";                 ///< human-readable name
    std::regex RegX;                       ///< pattern recognising this identity
    std::vector<std::string> tests;        ///< sample strings tried by SelfTest()
    std::vector<Identity> AffinityPairs;   ///< not used by the code in this class
    bool SharesAffinity = false;           ///< not used by the code in this class
    bool Mute = false;                     ///< muted identities are matched but not reported
    unsigned short Meta = 0;               ///< numeric category code
    std::string Nick = "";                 ///< short label printed next to a token

    Identity() = default;

    /// Creates a non-muted identity.
    /// @param n   display name
    /// @param reg pattern recognising the identity
    /// @param t   sample strings for SelfTest()
    /// @param p   meta code; stored as unsigned short
    Identity(const std::string& n, const std::regex& reg,
             const std::vector<std::string>& t, int p)
        : Identity(n, reg, t, p, false) {}

    /// Creates an identity with an explicit mute flag.
    /// @param n   display name
    /// @param reg pattern recognising the identity
    /// @param t   sample strings for SelfTest()
    /// @param p   meta code; stored as unsigned short
    /// @param i   true to mute the identity
    Identity(const std::string& n, const std::regex& reg,
             const std::vector<std::string>& t, int p, bool i)
        : Name(n), RegX(reg), tests(t), Mute(i),
          Meta(static_cast<unsigned short>(p)) {}

    /// Prints the name followed by every sample in `tests` that the pattern
    /// matches in full; non-matching samples produce no output.
    void SelfTest() const {
        std::cout << Name << ":::" << std::endl;
        for (const std::string& sample : tests) {
            if (std::regex_match(sample, RegX))
                std::cout << "matches " << sample << "\n";
        }
    }

    /// @return true when the whole of `s` matches the pattern.
    bool match(const std::string& s) const {
        return std::regex_match(s, RegX);
    }

    /// @return true when `n` equals this identity's meta code.
    bool match(unsigned short n) const {
        return n == Meta;
    }
};
| class LangDef | |
| { | |
| public: | |
| /* todo: typeInfo here? */ | |
| //class Typedef {}; | |
| /* volatile */ | |
| string Charset = ""; | |
| vector<Identity> Identities = vector<Identity>(); | |
| /* non-vol */ | |
| std::string Language_Name = ""; | |
| bool Verbose = false; | |
| LangDef(); | |
| LangDef(string name, bool verbose) { | |
| this->Language_Name = name; | |
| this->Verbose = verbose; | |
| } | |
| /* add def */ | |
| bool Define(string name, regex reg, vector<string> tests, int pre) { | |
| this->Identities.push_back(Identity(name, reg, tests, pre)); | |
| if ( this->Verbose ) | |
| this->Identities.back().SelfTest(); | |
| return true; | |
| } | |
| bool AddDef(Identity i) { | |
| this->Identities.push_back(i); | |
| return true; | |
| } | |
| /* adds a charset for the language to use */ | |
| void SetCharset(regex reg) { | |
| string s; | |
| for (int i = 0; i < 255; i++, s = (char)i) | |
| if ( regex_match( s, reg) ) | |
| this->Charset += s; | |
| if (this->Verbose) | |
| cout << "CHARSET" << endl << this->Charset << endl; | |
| } | |
| /* do some futures */ | |
| void Compile() { | |
| //TODO: abstract type stage | |
| } | |
| }; | |
/// A piece of source text together with its position.
class Partial {
public:
    std::string symbol = "";   ///< the text itself
    int index = 0;             ///< offset at which the text starts
    int length = 0;            ///< start offset plus text size, i.e. the end offset

    /// @param s the text
    /// @param i offset of the first character of `s`
    Partial( std::string s, int i )
        : symbol(s),
          index(i),
          length(static_cast<int>(i + s.length())) {}
};
| class TokenProvider { | |
| public: | |
| TokenProvider(); | |
| TokenProvider( string s, LangDef l ) { | |
| Tokenize(s, l); | |
| } | |
| void Tokenize(string source, LangDef lang) { | |
| unsigned int cursor = 0; | |
| unsigned int offset = 0; | |
| unsigned int safety = 0; | |
| vector<Partial> fragments; | |
| while ( cursor + offset < source.length()/* && ++safety < source.length()*/ ) { | |
| vector<Identity>::iterator idit = lang.Identities.begin(); | |
| while (idit != lang.Identities.end()) { | |
| smatch fragment; | |
| string sub = source.substr(cursor, source.length() - cursor ); | |
| if (sub[0] == '=') { | |
| cout << ""; | |
| } | |
| if (regex_search(sub, fragment, idit->RegX)) | |
| { | |
| string frag = fragment.str(); | |
| int loc = sub.find(frag); | |
| if (loc != 0) { | |
| idit++; | |
| continue; | |
| } | |
| if (idit->Mute) { | |
| if (loc == 0) | |
| cursor++; | |
| break; | |
| } | |
| fragments.push_back(Partial(frag, cursor + loc)); | |
| cursor += loc + frag.length(); | |
| cout << idit->Nick << pad(20, idit->Nick.length()) << frag << endl; | |
| //cout << "[" << loc << " - " << loc + frag.length() << "] " << fragments.back().symbol << "\t" << sub << endl; | |
| } | |
| else | |
| idit++; | |
| } // end identity loop | |
| } // end cursor loop | |
| } | |
| string pad(int size, int off) { | |
| string s = ""; | |
| for (; off != size; off++) | |
| s += " "; | |
| return s; | |
| } | |
| }; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment