Skip to content

Instantly share code, notes, and snippets.

@matutter
Created August 31, 2014 16:50
Show Gist options
  • Select an option

  • Save matutter/dca37447baf4cdd3b10f to your computer and use it in GitHub Desktop.

Select an option

Save matutter/dca37447baf4cdd3b10f to your computer and use it in GitHub Desktop.
Reusable lexical tokenizer
// Calc_line_interpreter.cpp : Defines the entry point for the console application.
//
/// TODO: Typed lambdas
#include "stdafx.h"
#include <iostream>
#include <string>
#include "LangDefine.h"
#include <regex>
#include <fstream>
#include <streambuf>
using namespace std;
void define(LangDef *l);
// Entry point: loads the file named by the first command-line argument into
// memory, builds the calculator language definition and hands both to a
// TokenProvider, whose constructor runs the tokenizer.
//
// argc/argv: standard program arguments; argv[1] is the path of the file to
//            tokenize.
// Returns 0 in every case; problems are only reported on stdout.
int main(int argc, char* argv[])
{
    // Without this guard argv[1] is a null pointer when no path is supplied,
    // and constructing an ifstream from it is undefined behaviour.
    if (argc < 2) {
        std::cout << "usage: " << (argc > 0 ? argv[0] : "program")
                  << " <source-file>" << std::endl;
        return 0;
    }

    std::ifstream file(argv[1]);
    if (!file.is_open()) {
        std::cout << "no such file" << std::endl;
        return 0;
    }

    // Pre-size the buffer from the file length. tellg() reports -1 on failure,
    // which must not reach reserve() as a huge unsigned value.
    std::string buffer;
    file.seekg(0, std::ios::end);
    const std::streamoff size = file.tellg();
    file.seekg(0, std::ios::beg);
    if (size > 0)
        buffer.reserve(static_cast<std::string::size_type>(size));

    buffer.assign(std::istreambuf_iterator<char>(file),
                  std::istreambuf_iterator<char>());

    LangDef lang("Calculator Language", false);
    define(&lang);
    lang.Compile();

    // Constructing the provider tokenizes the buffer immediately.
    TokenProvider tp(buffer, lang);
    return 0;
}
// Fills `lang` with the calculator language: its character set and one
// Identity per token class (whitespace, identifiers, numbers, comments,
// operators and the end-of-command marker).
//
// lang: language definition to populate; a null pointer is ignored.
void define(LangDef *lang) {
    if (lang == nullptr)
        return;

    lang->SetCharset(std::regex("[a-zA-Z0-9]|[.]|[+]|[-]|[*]|[/]|[=]|[(]|[)]"));

    // Builds one token class from its display name, pattern, meta code,
    // printed nickname and mute flag.
    const auto make = [](const char *name, const char *pattern,
                         unsigned short meta, const char *nick, bool mute) {
        Identity id;
        id.Name = name;
        id.RegX = std::regex(pattern);
        id.Meta = meta;
        id.Nick = nick;
        id.Mute = mute;
        return id;
    };

    // Registration order is significant: the tokenizer walks the identities
    // in the order they were added. Real therefore precedes Integer, and
    // Comment ("//") precedes Division ("/").
    lang->AddDef(make("Space", "[[:s:]]", _WHITESPACE, "", true));
    // '*' rather than '+': a single letter such as "x" is a valid identifier.
    lang->AddDef(make("Ident", "[a-zA-Z][a-zA-Z0-9]*", _IDENT, "IDENT", false));
    lang->AddDef(make("Real", "[0-9]+[.][0-9]+", _VALUE_TYPE, "REAL", false));
    lang->AddDef(make("Integer", "[0-9]+", _VALUE_TYPE, "INTEGER", false));
    lang->AddDef(make("Comment", "[/][/].+", _COMMENT, "", false));
    // "[=]" instead of "[\=]": "\=" is not a valid C++ escape sequence.
    lang->AddDef(make("Assignment", "[=]", _OPERATOR_EQUA, "OPER", false));
    lang->AddDef(make("Multiplication", "[*]", _OPERATOR_MULT, "OPER", false));
    lang->AddDef(make("Subtraction", "[-]", _OPERATOR_MINU, "OPER", false));
    lang->AddDef(make("Addition", "[+]", _OPERATOR_PLUS, "OPER", false));
    lang->AddDef(make("Division", "[/]", _OPERATOR_DIVI, "OPER", false));
    lang->AddDef(make("End of command", "[.]", _EOC, "EOC", false));
}
#include <string>
#include <vector>
#include <regex>
#include <iostream>
/* affinity levels, ergo the same level of precedence */
#define _OPERATOR_PLUS 0xf0
#define _OPERATOR_MINU 0xf1
#define _OPERATOR_MULT 0xf2
#define _OPERATOR_DIVI 0xf3
#define _OPERATOR_EQUA 0xf4
#define _OPERATOR 0x0
#define _INTEGER 0x1
#define _REAL 0x2
#define _EOC 0x3
#define _IDENT 0x4
#define _RESERVED 0x5
#define _WHITESPACE 0x6
#define _COMMENT 0x7
#define _VALUE_TYPE 0x8
#pragma once
using namespace std;
// One token class of a language definition: a display name, the regular
// expression that recognises it, and the metadata attached to it.
class Identity {
public:
    std::string Name;                     // display name, printed by SelfTest
    std::regex RegX;                      // pattern recognising this token class
    std::vector<std::string> tests;       // sample strings checked by SelfTest
    std::vector<Identity> AffinityPairs;  // not read or written inside this class
    bool SharesAffinity = false;          // not read or written inside this class
    bool Mute = false;                    // flag set by the five-argument constructor form
    unsigned short Meta = 0;              // numeric code compared by match(unsigned short)
    std::string Nick;                     // short label; not used inside this class

    Identity() = default;

    // n:   display name
    // reg: pattern for this token class
    // t:   sample strings for SelfTest
    // p:   meta code, stored as unsigned short
    // i:   mute flag (defaults to false, so the four-argument form still works)
    Identity(const std::string &n, const std::regex &reg,
             const std::vector<std::string> &t, int p, bool i = false)
        : Name(n), RegX(reg), tests(t), Mute(i),
          Meta(static_cast<unsigned short>(p)) {}

    // Prints the name followed by one "matches <sample>" line for every
    // sample in `tests` that the pattern matches in full.
    void SelfTest() const {
        std::cout << Name << ":::" << std::endl;
        for (const std::string &sample : tests) {
            if (std::regex_match(sample, RegX))
                std::cout << "matches " << sample << "\n";
        }
    }

    // True when the whole of `s` matches this identity's pattern.
    bool match(const std::string &s) const {
        return std::regex_match(s, RegX);
    }

    // True when `n` equals this identity's meta code.
    bool match(unsigned short n) const {
        return n == Meta;
    }
};
class LangDef
{
public:
/* todo: typeInfo here? */
//class Typedef {};
/* volatile */
string Charset = "";
vector<Identity> Identities = vector<Identity>();
/* non-vol */
std::string Language_Name = "";
bool Verbose = false;
LangDef();
LangDef(string name, bool verbose) {
this->Language_Name = name;
this->Verbose = verbose;
}
/* add def */
bool Define(string name, regex reg, vector<string> tests, int pre) {
this->Identities.push_back(Identity(name, reg, tests, pre));
if ( this->Verbose )
this->Identities.back().SelfTest();
return true;
}
bool AddDef(Identity i) {
this->Identities.push_back(i);
return true;
}
/* adds a charset for the language to use */
void SetCharset(regex reg) {
string s;
for (int i = 0; i < 255; i++, s = (char)i)
if ( regex_match( s, reg) )
this->Charset += s;
if (this->Verbose)
cout << "CHARSET" << endl << this->Charset << endl;
}
/* do some futures */
void Compile() {
//TODO: abstract type stage
}
};
// A piece of matched source text together with where it was found.
class Partial {
public:
    std::string symbol;  // the matched text
    int index = 0;       // value passed as `i` (start position of the text)
    // NOTE(review): despite the name, this holds index + symbol length,
    // i.e. the offset just past the text -- confirm that is intended.
    int length = 0;

    // s: matched text
    // i: position associated with the text
    Partial(std::string s, int i)
        : symbol(s),
          index(i),
          length(static_cast<int>(i + s.length())) {}
};
class TokenProvider {
public:
TokenProvider();
TokenProvider( string s, LangDef l ) {
Tokenize(s, l);
}
void Tokenize(string source, LangDef lang) {
unsigned int cursor = 0;
unsigned int offset = 0;
unsigned int safety = 0;
vector<Partial> fragments;
while ( cursor + offset < source.length()/* && ++safety < source.length()*/ ) {
vector<Identity>::iterator idit = lang.Identities.begin();
while (idit != lang.Identities.end()) {
smatch fragment;
string sub = source.substr(cursor, source.length() - cursor );
if (sub[0] == '=') {
cout << "";
}
if (regex_search(sub, fragment, idit->RegX))
{
string frag = fragment.str();
int loc = sub.find(frag);
if (loc != 0) {
idit++;
continue;
}
if (idit->Mute) {
if (loc == 0)
cursor++;
break;
}
fragments.push_back(Partial(frag, cursor + loc));
cursor += loc + frag.length();
cout << idit->Nick << pad(20, idit->Nick.length()) << frag << endl;
//cout << "[" << loc << " - " << loc + frag.length() << "] " << fragments.back().symbol << "\t" << sub << endl;
}
else
idit++;
} // end identity loop
} // end cursor loop
}
string pad(int size, int off) {
string s = "";
for (; off != size; off++)
s += " ";
return s;
}
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment