Created
September 29, 2012 03:31
-
-
Save JarrettBillingsley/3803064 to your computer and use it in GitHub Desktop.
Language sketch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// ===================================================================================================================== | |
// MemPool interface (???) | |
// "Interfaces" could be fat pointers.. pointer to a "vtbl" and the "this" object. | |
// Of course if the type is statically known, you don't have to pass the vtbl around. | |
struct MemPoolMark { ... } | |
interface MemPool | |
{ | |
function mark(): MemPoolMark | |
function freeToMark(m: &MemPoolMark) | |
} | |
// ===================================================================================================================== | |
// Error Type | |
// Generics indicated with brackets | |
enum Error[T] | |
{ | |
Error(string) // string is a built-in type, probably UTF-8 | |
OK(T) | |
} | |
// Macros? Need something to replace the preprocessor. Also everything is an expression, so we can use the "match" | |
// (switch replacement) in here as an expression. | |
macro checkError[T](val: Error[T]): T = | |
match(val) | |
{ | |
case Error(m): return Error(m) // "return" is of polymorphic type so it unifies with anything else. | |
case OK(v): v | |
} | |
// ===================================================================================================================== | |
// JSON Parser | |
// public is visible outside this module, private is module-local.. I suppose some kind of package protection would | |
// be useful too (though more sanely-defined than in D) | |
public: | |
// enum is how you define sum types. | |
enum JsonValue | |
{ | |
Null // Commas optional | |
Bool(bool) | |
Number(double) | |
String(string), | |
Object(Hash[string, JsonValue]) // again, square brackets for generics | |
Array(Array[JsonValue]) | |
} | |
function parseJson(s: string, pool: MemPool): Error[JsonValue] | |
{ | |
let l: Lexer // variables declared with 'let'; default-inited unless otherwise specified (like D) | |
#checkError(l.begin(s, pool)) | |
let ret = | |
if(l.tok.type == Tok.LBrace) | |
#checkError(l.parseObject()) // macros are instantiated with #name.. maybe. conflicts with length operator | |
else if(l.type == Token.LBracket) | |
#checkError(l.parseArray()) | |
else | |
l.error("JSON must have an object or an array as its top-level value") | |
// tag() would be a builtin, kinda like sizeof() in C, that gets you the tag of an enum type (for when a match | |
// would be too cumbersome). | |
if(tag(ret) == Error.Error) | |
return ret | |
#checkError(l.expect(Tok.EOF)) | |
return Error.OK(ret) | |
} | |
private: | |
// This enum "derives" from an integer and therefore can only have nullary constructors. Works much like an enum | |
// in D, though more complete (Tok is an actual type, can't implicitly convert between Tok and uint, must explicitly | |
// cast from Tok to uint, auto-generated functions for it -- Tok_toString, Tok_parse, Tok_fromInteger) | |
enum Tok : uint | |
{ | |
String | |
Number | |
True | |
False | |
Null | |
Comma | |
Colon | |
LBrace | |
RBrace | |
LBracket | |
RBracket | |
EOF | |
} | |
// But here we make our OWN damn strings :P | |
// Designated initializer syntax borrowed from D | |
// (Both Tok and TokStrings could be generated with a macro I suppose) | |
const TokStrings = | |
[ | |
Tok.String: "string" | |
Tok.Number: "number" | |
Tok.True: "true" | |
Tok.False: "false" | |
Tok.Null: "null" | |
Tok.Comma: "," | |
Tok.Colon: ":" | |
Tok.LBrace: "{" | |
Tok.RBrace: "}" | |
Tok.LBracket: "[" | |
Tok.RBracket: "]" | |
Tok.EOF: "<EOF>" | |
] | |
// Plain old struct! | |
// You can also have structs in enums but I didn't use them here | |
struct Token | |
{ | |
type: Tok | |
line: uint | |
col: uint | |
strval: string | |
numval: float | |
} | |
struct Lexer | |
{ | |
line: uint | |
col: uint | |
source: string | |
position: uint | |
character: char | |
tok: Token | |
pool: MemPool | |
poolMark: MemPoolMark | |
} | |
// Okay, syntactic sugar time. | |
// When you declare "function Type.name(params..)", it's sugar for "function Type_name(this: &Type, params..)". | |
// Thus this function's name is actually Lexer_error, and its first parameter is a Lexer reference (whose name inside | |
// the function is 'this'). | |
function Lexer.error[T](msg: string, vararg): Error[T] | |
{ | |
.pool.freeToMark(.poolMark) // Furthermore, since accessing members of 'this' is very common, .memb is a shortcut | |
return Error.Error(msg.vformat(.pool, msg, vararg)) // Fully-qualified enum tags.. hmm | |
// Also note that this function has a polymorphic return type, meaning it can be used in any context where any | |
// Error[T] is expected and the proper type will be instantiated. | |
} | |
// Now lemme just talk a little about the error handling. There are no exceptions (probably?), but ADTs let us signal | |
// errors in-band while still allowing for the full range of possible return values AND forcing code to check for | |
// errors. Sweet. Unfortunately this makes stuff a little awkward, which is where that checkError macro from above | |
// comes in. #checkError(exp) will evaluate exp, and if it evaluates to an error value, returns the error value; | |
// otherwise it evaluates to the OK value of exp. Simple, though doesn't leave much room for cleaning up. That'd be | |
// the domain of a more complex error-handling mechanism.. | |
function Lexer.jsonError(msg: string, vararg): Error[JsonValue] = | |
.error(msg, vararg) | |
function Lexer.expected(msg: string): Error[Token] = | |
.error("({}:{}): '{}' expected; found '{}' instead", .tok.line, .tok.col, msg, TokStrings[.tok.type]) | |
function Lexer.begin(source: string, pool: MemPool) : Error[Token] | |
{ | |
.line = 1 | |
.col = 0 | |
.source = source | |
.position = 0 | |
.pool = pool | |
// all allocations after this point will be kept track of so that we can free them in a single chunk. That's what | |
// the ".pool.freeToMark" call is for up in Lexer.error. | |
.poolMark = pool.mark() | |
.nextChar() | |
return #checkError(.nextToken()) | |
} | |
function Lexer.expect(Tok type): Error[Token] | |
{ | |
if(.tok.type != type) | |
return Error.Error(.expected(TokStrings[type])) | |
let ret = .tok | |
if(type != Tok.EOF) | |
#checkError(.nextToken()) | |
return Error.OK(ret) | |
} | |
function Lexer.isEOF() = .character == meta(char, max) | |
function Lexer.isWhitespace() = .character in " \t\r\n" | |
function Lexer.isNewline() = .character in "\r\n" | |
function Lexer.isHexDigit() = (.character in '0' ... '9') || (.character in 'a' ... 'f') || (.character in 'A' ... 'F') | |
function Lexer.isDecimalDigit() = .character in '0' ... '9' | |
// char is a Unicode character. | |
function Lexer.hexDigitToInt(c: char): uint | |
{ | |
// Three things: ranges (X .. Y is inclusive X, exclusive Y; X ... Y is inclusive X and Y, just like Ruby). Ranges | |
// are not a type, just a mechanism. Second, you can put them on the RHS of 'in' and it acts as shortcut for a | |
// double-comparison. So this is the same as "c >= '0' && c <= '9'". Third, cast[Type]exp is the cast syntax, square | |
// brackets for consistency with generics. | |
if(c in '0' ... '9') | |
return cast[uint](c - '0') // result of char - char is int, not char! | |
else if(c in 'A' .. 'F') | |
return cast[uint](c - 'A') + 10 | |
else | |
return cast[uint](c - 'a') + 10 | |
} | |
function Lexer.readChar(): char | |
{ | |
if(.position >= #.source) // # is the size-of operator, not in the C sense, more like D's .length | |
return meta(char, max) // meta() is another built-in construct, used for querying info about types. | |
else | |
return .source.decode(.position) // decode would be of type function(idx: &uint):char and would decode one code | |
// unit from the UTF-8 string and advance idx. | |
} | |
function Lexer.nextChar() | |
{ | |
.col++ | |
.character = .readChar() | |
} | |
function Lexer.nextLine() | |
{ | |
while(.isNewline() && !.isEOF()) | |
{ | |
let old = .character | |
.nextChar() | |
if(.isNewline() && .character != old) | |
.nextChar() | |
.line++ | |
.col = 1 | |
} | |
} | |
function Lexer.readNumLiteral(): Error[double] | |
{ | |
let neg = false | |
let ret: double = 0 | |
// sign | |
if(.character == '-') | |
{ | |
neg = true | |
.nextChar() | |
if(!.isDecimalDigit()) | |
return .error("({}:{}): incomplete number token", .line, .col) | |
} | |
// integral part | |
if(.character == '0') | |
.nextChar() | |
else | |
{ | |
while(.isDecimalDigit()) | |
{ | |
ret = (ret * 10) + (.character - '0') | |
.nextChar() | |
} | |
} | |
if(.isEOF() || .character !in ".eE") // in/!in can also be used on constant arrays/strings. can be optimized. | |
return Error.OK(neg ? -ret : ret) // ternary operator.. do we need it since if() is an expression? | |
// fraction | |
if(.character == '.') | |
{ | |
.nextChar() | |
if(!.isDecimalDigit()) | |
return .error("({}:{}): incomplete number token", .line, .col) | |
let frac = 0.0 | |
let mag = 10.0 | |
while(.isDecimalDigit()) | |
{ | |
frac += (.character - '0') / mag | |
mag *= 10 | |
.nextChar() | |
} | |
ret += frac | |
} | |
// exponent | |
if(.character in "eE") | |
{ | |
.nextChar() | |
if(!.isDecimalDigit() && .character !in "+-") | |
return .error("({}:{}): incomplete number token", .line, .col) | |
let negExp = false | |
if(.character == '+') | |
.nextChar() | |
else if(.character == '-') | |
{ | |
negExp = true | |
.nextChar() | |
} | |
if(!.isDecimalDigit()) | |
return .error("({}:{}): incomplete number token", .line, .col) | |
let exp = 0.0 | |
while(.isDecimalDigit()) | |
{ | |
exp = (exp * 10) + (.character - '0') | |
.nextChar() | |
} | |
ret *= 10.pow(negExp? -exp : exp) | |
} | |
return Error.OK(neg ? -ret : ret) | |
} | |
function Lexer.readHexDigits(num: uint): Error[uint] | |
{ | |
let ret = 0 | |
for(i: 0 .. num) | |
{ | |
if(!.isHexDigit()) | |
return .error("({}:{}): Hexadecimal escape digits expected", .line, .col) | |
ret <<= 4 | |
ret |= .hexDigitToInt(.character) | |
.nextChar() | |
} | |
return Error.OK(ret) | |
} | |
function Lexer.readEscapeSequence(beginningLine: uint, beginningCol: uint): Error[char] | |
{ | |
.nextChar() | |
if(.isEOF()) | |
return .error("({}:{}): Unterminated string literal", beginningLine, beginningCol) | |
match(.character) | |
{ | |
case 'b': .nextChar(); return Error.OK('\b') // semicolons exist, they're useful for putting multiple statements | |
case 'f': .nextChar(); return Error.OK('\f') // on one line. | |
case 'n': .nextChar(); return Error.OK('\n') | |
case 'r': .nextChar(); return Error.OK('\r') | |
case 't': .nextChar(); return Error.OK('\t') | |
case '\\': .nextChar(); return Error.OK('\\') | |
case '\"': .nextChar(); return Error.OK('\"') | |
case '/': .nextChar(); return Error.OK('/') | |
case 'u': | |
.nextChar() | |
let x = #checkError(.readHexDigits(4)) | |
if(x in 0xD800 .. 0xDC00) | |
{ | |
if(.character != '\\') | |
return .error("({}:{}): second surrogate pair character expected", .line, .col) | |
.nextChar() | |
if(.character != 'u') | |
return .error("({}:{}): second surrogate pair character expected", .line, .col) | |
.nextChar() | |
let x2 = #checkError(.readHexDigits(4)) | |
if(x2 !in 0xDC00 .. 0xE000) | |
return .error("({}:{}): invalid surrogate pair sequence", .line, .col) | |
x = (x & ~0xD800) << 10 | |
x2 &= ~0xDC00 | |
return Error.OK(cast[char](0x10000 + (x << 10 | x2))) | |
} | |
else if(x in 0xDC00 .. 0xE000) | |
return .error("({}:{}): invalid surrogate pair sequence", .line, .col) | |
else | |
return Error.OK(cast[char]x) | |
case _: // this replaces "default:" | |
return .error("({}:{}): Invalid string escape sequence '\\{}'", .line, .col, .character) | |
} | |
} | |
function Lexer.readStringLiteral(): Error[string] | |
{ | |
let beginningLine = .line | |
let beginningCol = .col | |
let buf = StringBuilder(.pool) // using the memory pool to allocate the string as we build it | |
// Skip opening quote | |
.nextChar() | |
while(true) | |
{ | |
if(.isEOF()) | |
return .error("({}:{}): Unterminated string literal", beginningLine, beginningCol) | |
if(.character == '\\') | |
buf.addChar(#checkError(.readEscapeSequence(beginningLine, beginningCol))) | |
else if(.character == '"') | |
{ | |
.nextChar() | |
break | |
} | |
else if(.character < '\x20') | |
return .error("({}:{}): Invalid character in string token", .line, .col) | |
else | |
{ | |
buf.addChar(.character) | |
.nextChar() | |
} | |
} | |
return buf.finish() | |
} | |
void Lexer.nextToken(): Error[Token] | |
{ | |
while(true) | |
{ | |
.tok.line = .line | |
.tok.col = .col | |
match(.character) | |
{ | |
case '\r', '\n': | |
.nextLine() | |
continue | |
case '\"': | |
.tok.strval = #checkError(.readStringLiteral()) | |
.tok.type = Tok.String | |
return Error.OK(.tok) | |
case '{': | |
.nextChar() | |
.tok.type = Tok.LBrace | |
return | |
case '}': | |
.nextChar() | |
.tok.type = Tok.RBrace | |
return | |
case '[': | |
.nextChar() | |
.tok.type = Tok.LBracket | |
return | |
case ']': | |
.nextChar() | |
.tok.type = Tok.RBracket | |
return | |
case ',': | |
.nextChar() | |
.tok.type = Tok.Comma | |
return | |
case ':': | |
.nextChar() | |
.tok.type = Tok.Colon | |
return | |
case meta(char, max): | |
.tok.type = Tok.EOF | |
return | |
case 't': | |
if(!.source[mPosition .. $].startsWith("rue")) // slicing syntax?? why not? | |
return .error("({}:{}): true expected", :line, :col) | |
.nextChar() | |
.nextChar() | |
.nextChar() | |
.nextChar() | |
.tok.type = Tok.True | |
return | |
case 'f': | |
if(!.source[.position .. $].startsWith("alse")) | |
return .error("({}:{}): false expected", :line, :col) | |
.nextChar() | |
.nextChar() | |
.nextChar() | |
.nextChar() | |
.nextChar() | |
.tok.type = Tok.False | |
return | |
case 'n': | |
if(!.source[.position .. $].startsWith("ull")) | |
return .error("({}:{}): null expected", :line, :col) | |
.nextChar() | |
.nextChar() | |
.nextChar() | |
.nextChar() | |
.tok.type = Tok.Null | |
return | |
case '-', '1' ... '9': // ranges work in cases too | |
.tok.numval = #checkError(.readNumLiteral()) | |
.tok.type = Tok.Number | |
return | |
case _: | |
if(.isWhitespace()) | |
{ | |
.nextChar() | |
continue | |
} | |
return .error("({}:{}): Invalid character '{}'", :line, :col, .character) | |
} | |
} | |
} | |
function Lexer.parseValue(): Error[JsonValue] | |
{ | |
match(.tok.type) | |
{ | |
case String: #checkError(.nextToken()); return Error.OK(JsonValue.String(.tok.strval)) | |
case Number: #checkError(.nextToken()); return Error.OK(JsonValue.Number(.tok.numval)) | |
case True: #checkError(.nextToken()); return Error.OK(JsonValue.Bool(true)) | |
case False: #checkError(.nextToken()); return Error.OK(JsonValue.Bool(false)) | |
case Null: #checkError(.nextToken()); return Error.OK(JsonValue.Null) | |
case LBrace: return .parseObject() | |
case LBracket: return .parseArray() | |
case _: return .error("({}:{}): value expected", .line, .col) | |
} | |
} | |
function Lexer.parseArray(): Error[JsonValue] | |
{ | |
let ret = ArrayBuilder[JsonValue](.pool) // Again, like the StringBuilder | |
#checkError(.expect(Tok.LBracket)) | |
if(.tok.type != Token.RBracket) | |
{ | |
ret.append(#checkError(.parseValue())) | |
while(.tok.type == Tok.Comma) | |
{ | |
#checkError(.nextToken()) | |
ret.append(#checkError(.parseValue())) | |
} | |
} | |
#checkError(.expect(Tok.RBracket)) | |
return Error.OK(JsonValue.Array(ret.finish())) | |
} | |
function Lexer.parsePair(): Error[(string, JsonValue)] // Tuple types?? maybe | |
{ | |
let key = #checkError(.expect(Tok.String)).strval | |
#checkError(.expect(Tok.Colon)) | |
let val = #checkError(.parseValue()) | |
return (key, val) | |
} | |
function Lexer.parseObject(): Error[JsonValue] | |
{ | |
let h = Hash[string, JsonValue](.pool) | |
#checkError(.expect(Tok.LBrace)) | |
if(.tok.type != Tok.RBrace) | |
{ | |
let (key, val) = #checkError(.parsePair()) // deconstructing assignment is always fun | |
h.insert(key, val) | |
while(.tok.type == Tok.Comma) | |
{ | |
#checkError(.nextToken()) | |
(key, val) = #checkError(.parsePair()) | |
h.insert(key, val) | |
} | |
} | |
#checkError(.expect(Tok.RBrace)) | |
return Error.OK(JsonValue.Object(h)) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment