Skip to content

Instantly share code, notes, and snippets.

@JarrettBillingsley
Created September 29, 2012 03:31
Show Gist options
  • Save JarrettBillingsley/3803064 to your computer and use it in GitHub Desktop.
Save JarrettBillingsley/3803064 to your computer and use it in GitHub Desktop.
Language sketch
// =====================================================================================================================
// MemPool interface (???)
// "Interfaces" could be fat pointers.. pointer to a "vtbl" and the "this" object.
// Of course if the type is statically known, you don't have to pass the vtbl around.
struct MemPoolMark { ... }
interface MemPool
{
function mark(): MemPoolMark
function freeToMark(m: &MemPoolMark)
}
// =====================================================================================================================
// Error Type
// Generics indicated with brackets
enum Error[T]
{
Error(string) // string is a built-in type, probably UTF-8
OK(T)
}
// Macros? Need something to replace the preprocessor. Also everything is an expression, so we can use the "match"
// (switch replacement) in here as an expression.
macro checkError[T](val: Error[T]): T =
match(val)
{
case Error(m): return Error(m) // "return" is of polymorphic type so it unifies with anything else.
case OK(v): v
}
// =====================================================================================================================
// JSON Parser
// public is visible outside this module, private is module-local.. I suppose some kind of package protection would
// be useful too (though more sanely-defined than in D)
public:
// enum is how you define sum types.
enum JsonValue
{
Null // Commas optional
Bool(bool)
Number(double)
String(string),
Object(Hash[string, JsonValue]) // again, square brackets for generics
Array(Array[JsonValue])
}
function parseJson(s: string, pool: MemPool): Error[JsonValue]
{
let l: Lexer // variables declared with 'let'; default-inited unless otherwise specified (like D)
#checkError(l.begin(s, pool))
let ret =
if(l.tok.type == Tok.LBrace)
#checkError(l.parseObject()) // macros are instantiated with #name.. maybe. conflicts with length operator
else if(l.type == Token.LBracket)
#checkError(l.parseArray())
else
l.error("JSON must have an object or an array as its top-level value")
// tag() would be a builtin, kinda like sizeof() in C, that gets you the tag of an enum type (for when a match
// would be too cumbersome).
if(tag(ret) == Error.Error)
return ret
#checkError(l.expect(Tok.EOF))
return Error.OK(ret)
}
private:
// This enum "derives" from an integer and therefore can only have nullary constructors. Works much like an enum
// in D, though more complete (Tok is an actual type, can't implicitly convert between Tok and uint, must explicitly
// cast from Tok to uint, auto-generated functions for it -- Tok_toString, Tok_parse, Tok_fromInteger)
enum Tok : uint
{
String
Number
True
False
Null
Comma
Colon
LBrace
RBrace
LBracket
RBracket
EOF
}
// But here we make our OWN damn strings :P
// Designated initializer syntax borrowed from D
// (Both Tok and TokStrings could be generated with a macro I suppose)
const TokStrings =
[
Tok.String: "string"
Tok.Number: "number"
Tok.True: "true"
Tok.False: "false"
Tok.Null: "null"
Tok.Comma: ","
Tok.Colon: ":"
Tok.LBrace: "{"
Tok.RBrace: "}"
Tok.LBracket: "["
Tok.RBracket: "]"
Tok.EOF: "<EOF>"
]
// Plain old struct!
// You can also have structs in enums but I didn't use them here
struct Token
{
type: Tok
line: uint
col: uint
strval: string
numval: float
}
struct Lexer
{
line: uint
col: uint
source: string
position: uint
character: char
tok: Token
pool: MemPool
poolMark: MemPoolMark
}
// Okay, syntactic sugar time.
// When you declare "function Type.name(params..)", it's sugar for "function Type_name(this: &Type, params..)".
// Thus this function's name is actually Lexer_error, and its first parameter is a Lexer reference (whose name inside
// the function is 'this').
function Lexer.error[T](msg: string, vararg): Error[T]
{
.pool.freeToMark(.poolMark) // Furthermore, since accessing members of 'this' is very common, .memb is a shortcut
return Error.Error(msg.vformat(.pool, msg, vararg)) // Fully-qualified enum tags.. hmm
// Also note that this function has a polymorphic return type, meaning it can be used in any context where any
// Error[T] is expected and the proper type will be instantiated.
}
// Now lemme just talk a little about the error handling. There are no exceptions (probably?), but ADTs let us signal
// errors in-band while still allowing for the full range of possible return values AND forcing code to check for
// errors. Sweet. Unfortunately this makes stuff a little awkward, which is where that checkError macro from above
// comes in. #checkError(exp) will evaluate exp, and if it evaluates to an error value, returns the error value;
// otherwise it evaluates to the OK value of exp. Simple, though doesn't leave much room for cleaning up. That'd be
// the domain of a more complex error-handling mechanism..
function Lexer.jsonError(msg: string, vararg): Error[JsonValue] =
.error(msg, vararg)
function Lexer.expected(msg: string): Error[Token] =
.error("({}:{}): '{}' expected; found '{}' instead", .tok.line, .tok.col, msg, TokStrings[.tok.type])
function Lexer.begin(source: string, pool: MemPool) : Error[Token]
{
.line = 1
.col = 0
.source = source
.position = 0
.pool = pool
// all allocations after this point will be kept track of so that we can free them in a single chunk. That's what
// the ".pool.freeToMark" call is for up in Lexer.error.
.poolMark = pool.mark()
.nextChar()
return #checkError(.nextToken())
}
function Lexer.expect(Tok type): Error[Token]
{
if(.tok.type != type)
return Error.Error(.expected(TokStrings[type]))
let ret = .tok
if(type != Tok.EOF)
#checkError(.nextToken())
return Error.OK(ret)
}
function Lexer.isEOF() = .character == meta(char, max)
function Lexer.isWhitespace() = .character in " \t\r\n"
function Lexer.isNewline() = .character in "\r\n"
function Lexer.isHexDigit() = (.character in '0' ... '9') || (.character in 'a' ... 'f') || (.character in 'A' ... 'F')
function Lexer.isDecimalDigit() = .character in '0' ... '9'
// char is a Unicode character.
function Lexer.hexDigitToInt(c: char): uint
{
// Three things: ranges (X .. Y is inclusive X, exclusive Y; X ... Y is inclusive X and Y, just like Ruby). Ranges
// are not a type, just a mechanism. Second, you can put them on the RHS of 'in' and it acts as shortcut for a
// double-comparison. So this is the same as "c >= '0' && c <= '9'". Third, cast[Type]exp is the cast syntax, square
// brackets for consistency with generics.
if(c in '0' ... '9')
return cast[uint](c - '0') // result of char - char is int, not char!
else if(c in 'A' .. 'F')
return cast[uint](c - 'A') + 10
else
return cast[uint](c - 'a') + 10
}
function Lexer.readChar(): char
{
if(.position >= #.source) // # is the size-of operator, not in the C sense, more like D's .length
return meta(char, max) // meta() is another built-in construct, used for querying info about types.
else
return .source.decode(.position) // decode would be of type function(idx: &uint):char and would decode one code
// unit from the UTF-8 string and advance idx.
}
function Lexer.nextChar()
{
.col++
.character = .readChar()
}
function Lexer.nextLine()
{
while(.isNewline() && !.isEOF())
{
let old = .character
.nextChar()
if(.isNewline() && .character != old)
.nextChar()
.line++
.col = 1
}
}
function Lexer.readNumLiteral(): Error[double]
{
let neg = false
let ret: double = 0
// sign
if(.character == '-')
{
neg = true
.nextChar()
if(!.isDecimalDigit())
return .error("({}:{}): incomplete number token", .line, .col)
}
// integral part
if(.character == '0')
.nextChar()
else
{
while(.isDecimalDigit())
{
ret = (ret * 10) + (.character - '0')
.nextChar()
}
}
if(.isEOF() || .character !in ".eE") // in/!in can also be used on constant arrays/strings. can be optimized.
return Error.OK(neg ? -ret : ret) // ternary operator.. do we need it since if() is an expression?
// fraction
if(.character == '.')
{
.nextChar()
if(!.isDecimalDigit())
return .error("({}:{}): incomplete number token", .line, .col)
let frac = 0.0
let mag = 10.0
while(.isDecimalDigit())
{
frac += (.character - '0') / mag
mag *= 10
.nextChar()
}
ret += frac
}
// exponent
if(.character in "eE")
{
.nextChar()
if(!.isDecimalDigit() && .character !in "+-")
return .error("({}:{}): incomplete number token", .line, .col)
let negExp = false
if(.character == '+')
.nextChar()
else if(.character == '-')
{
negExp = true
.nextChar()
}
if(!.isDecimalDigit())
return .error("({}:{}): incomplete number token", .line, .col)
let exp = 0.0
while(.isDecimalDigit())
{
exp = (exp * 10) + (.character - '0')
.nextChar()
}
ret *= 10.pow(negExp? -exp : exp)
}
return Error.OK(neg ? -ret : ret)
}
function Lexer.readHexDigits(num: uint): Error[uint]
{
let ret = 0
for(i: 0 .. num)
{
if(!.isHexDigit())
return .error("({}:{}): Hexadecimal escape digits expected", .line, .col)
ret <<= 4
ret |= .hexDigitToInt(.character)
.nextChar()
}
return Error.OK(ret)
}
function Lexer.readEscapeSequence(beginningLine: uint, beginningCol: uint): Error[char]
{
.nextChar()
if(.isEOF())
return .error("({}:{}): Unterminated string literal", beginningLine, beginningCol)
match(.character)
{
case 'b': .nextChar(); return Error.OK('\b') // semicolons exist, they're useful for putting multiple statements
case 'f': .nextChar(); return Error.OK('\f') // on one line.
case 'n': .nextChar(); return Error.OK('\n')
case 'r': .nextChar(); return Error.OK('\r')
case 't': .nextChar(); return Error.OK('\t')
case '\\': .nextChar(); return Error.OK('\\')
case '\"': .nextChar(); return Error.OK('\"')
case '/': .nextChar(); return Error.OK('/')
case 'u':
.nextChar()
let x = #checkError(.readHexDigits(4))
if(x in 0xD800 .. 0xDC00)
{
if(.character != '\\')
return .error("({}:{}): second surrogate pair character expected", .line, .col)
.nextChar()
if(.character != 'u')
return .error("({}:{}): second surrogate pair character expected", .line, .col)
.nextChar()
let x2 = #checkError(.readHexDigits(4))
if(x2 !in 0xDC00 .. 0xE000)
return .error("({}:{}): invalid surrogate pair sequence", .line, .col)
x = (x & ~0xD800) << 10
x2 &= ~0xDC00
return Error.OK(cast[char](0x10000 + (x << 10 | x2)))
}
else if(x in 0xDC00 .. 0xE000)
return .error("({}:{}): invalid surrogate pair sequence", .line, .col)
else
return Error.OK(cast[char]x)
case _: // this replaces "default:"
return .error("({}:{}): Invalid string escape sequence '\\{}'", .line, .col, .character)
}
}
function Lexer.readStringLiteral(): Error[string]
{
let beginningLine = .line
let beginningCol = .col
let buf = StringBuilder(.pool) // using the memory pool to allocate the string as we build it
// Skip opening quote
.nextChar()
while(true)
{
if(.isEOF())
return .error("({}:{}): Unterminated string literal", beginningLine, beginningCol)
if(.character == '\\')
buf.addChar(#checkError(.readEscapeSequence(beginningLine, beginningCol)))
else if(.character == '"')
{
.nextChar()
break
}
else if(.character < '\x20')
return .error("({}:{}): Invalid character in string token", .line, .col)
else
{
buf.addChar(.character)
.nextChar()
}
}
return buf.finish()
}
void Lexer.nextToken(): Error[Token]
{
while(true)
{
.tok.line = .line
.tok.col = .col
match(.character)
{
case '\r', '\n':
.nextLine()
continue
case '\"':
.tok.strval = #checkError(.readStringLiteral())
.tok.type = Tok.String
return Error.OK(.tok)
case '{':
.nextChar()
.tok.type = Tok.LBrace
return
case '}':
.nextChar()
.tok.type = Tok.RBrace
return
case '[':
.nextChar()
.tok.type = Tok.LBracket
return
case ']':
.nextChar()
.tok.type = Tok.RBracket
return
case ',':
.nextChar()
.tok.type = Tok.Comma
return
case ':':
.nextChar()
.tok.type = Tok.Colon
return
case meta(char, max):
.tok.type = Tok.EOF
return
case 't':
if(!.source[mPosition .. $].startsWith("rue")) // slicing syntax?? why not?
return .error("({}:{}): true expected", :line, :col)
.nextChar()
.nextChar()
.nextChar()
.nextChar()
.tok.type = Tok.True
return
case 'f':
if(!.source[.position .. $].startsWith("alse"))
return .error("({}:{}): false expected", :line, :col)
.nextChar()
.nextChar()
.nextChar()
.nextChar()
.nextChar()
.tok.type = Tok.False
return
case 'n':
if(!.source[.position .. $].startsWith("ull"))
return .error("({}:{}): null expected", :line, :col)
.nextChar()
.nextChar()
.nextChar()
.nextChar()
.tok.type = Tok.Null
return
case '-', '1' ... '9': // ranges work in cases too
.tok.numval = #checkError(.readNumLiteral())
.tok.type = Tok.Number
return
case _:
if(.isWhitespace())
{
.nextChar()
continue
}
return .error("({}:{}): Invalid character '{}'", :line, :col, .character)
}
}
}
function Lexer.parseValue(): Error[JsonValue]
{
match(.tok.type)
{
case String: #checkError(.nextToken()); return Error.OK(JsonValue.String(.tok.strval))
case Number: #checkError(.nextToken()); return Error.OK(JsonValue.Number(.tok.numval))
case True: #checkError(.nextToken()); return Error.OK(JsonValue.Bool(true))
case False: #checkError(.nextToken()); return Error.OK(JsonValue.Bool(false))
case Null: #checkError(.nextToken()); return Error.OK(JsonValue.Null)
case LBrace: return .parseObject()
case LBracket: return .parseArray()
case _: return .error("({}:{}): value expected", .line, .col)
}
}
function Lexer.parseArray(): Error[JsonValue]
{
let ret = ArrayBuilder[JsonValue](.pool) // Again, like the StringBuilder
#checkError(.expect(Tok.LBracket))
if(.tok.type != Token.RBracket)
{
ret.append(#checkError(.parseValue()))
while(.tok.type == Tok.Comma)
{
#checkError(.nextToken())
ret.append(#checkError(.parseValue()))
}
}
#checkError(.expect(Tok.RBracket))
return Error.OK(JsonValue.Array(ret.finish()))
}
function Lexer.parsePair(): Error[(string, JsonValue)] // Tuple types?? maybe
{
let key = #checkError(.expect(Tok.String)).strval
#checkError(.expect(Tok.Colon))
let val = #checkError(.parseValue())
return (key, val)
}
function Lexer.parseObject(): Error[JsonValue]
{
let h = Hash[string, JsonValue](.pool)
#checkError(.expect(Tok.LBrace))
if(.tok.type != Tok.RBrace)
{
let (key, val) = #checkError(.parsePair()) // deconstructing assignment is always fun
h.insert(key, val)
while(.tok.type == Tok.Comma)
{
#checkError(.nextToken())
(key, val) = #checkError(.parsePair())
h.insert(key, val)
}
}
#checkError(.expect(Tok.RBrace))
return Error.OK(JsonValue.Object(h))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment