Last active
November 30, 2024 15:38
-
-
Save axelbdt/1e0d02156e2d2b568c4578f7213d8ea8 to your computer and use it in GitHub Desktop.
OCaml scanner adapted from the Crafting Interpreters book
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(* A straightforward adaptation of the Java code for the Scanner in the book Crafting Interpreters, by Robert Nystrom | |
Original Java code can be found here : https://github.com/munificent/craftinginterpreters/blob/master/java/com/craftinginterpreters/lox/Scanner.java *) | |
type token = | |
| LeftParen | |
| RightParen | |
| LeftBrace | |
| RightBrace | |
| Comma | |
| Dot | |
| Minus | |
| Plus | |
| Semicolon | |
| Star | |
| Equal | |
| EqualEqual | |
| Bang | |
| BangEqual | |
| Less | |
| LessEqual | |
| Greater | |
| GreaterEqual | |
| Slash | |
| String of string | |
| Number of (string * float) | |
| Identifier of string | |
| And | |
| Class | |
| Else | |
| False | |
| For | |
| Fun | |
| If | |
| Nil | |
| Or | |
| Return | |
| Super | |
| This | |
| True | |
| Var | |
| While | |
| EOF | |
let number_token_of_string string = | |
try Number (string, float_of_string string) | |
with Failure _ -> | |
print_endline ("Failed to parse number: " ^ string); | |
raise (Failure "Failed to parse number") | |
let keywords = | |
[ | |
("and", And); | |
("class", Class); | |
("else", Else); | |
("false", False); | |
("for", For); | |
("fun", Fun); | |
("if", If); | |
("nil", Nil); | |
("or", Or); | |
("print", Print); | |
("return", Return); | |
("super", Super); | |
("this", This); | |
("true", True); | |
("var", Var); | |
("while", While); | |
] | |
type error = UnknownCharacter of char | UnterminatedString | |
type error_log = { on_line : int; error : error } | |
let message_of_error error = | |
match error with | |
| UnknownCharacter c -> Printf.sprintf "Unexpected character: %c" c | |
| UnterminatedString -> "Unterminated string." | |
let string_of_error_log log = | |
Printf.sprintf "(%d: %s)" log.on_line (message_of_error log.error) | |
let string_of_errors errors = | |
List.map string_of_error_log errors | |
|> String.concat "; " |> Printf.sprintf "[%s]" | |
type scanner_state = { | |
source : string; | |
tokens : token list; | |
errors : error_log list; | |
index : int; | |
start : int; | |
line : int; | |
} | |
let string_of_state state = | |
Printf.sprintf | |
"{ tokens = %s; errors = %s; current_index = %d; current_line = %d }" | |
(string_of_tokens state.tokens) | |
(string_of_errors state.errors) | |
state.index state.line | |
let initial_state source = | |
{ source; tokens = []; errors = []; index = 0; start = 0; line = 1 } | |
type char_lookup_result = Found of char | OutOfBounds | |
let current_char state = | |
if state.index < String.length state.source then | |
Found state.source.[state.index] | |
else OutOfBounds | |
let start_new_lexeme state = { state with start = state.index } | |
let incr_index state = { state with index = state.index + 1 } | |
let incr_line state = { state with line = state.line + 1 } | |
let advance state = (current_char state, incr_index state) | |
let peek state = current_char state | |
let peek_next state = peek { state with index = state.index + 1 } | |
let add_token token state = { state with tokens = token :: state.tokens } | |
let add_number_token state = | |
let token = | |
number_token_of_string | |
(String.sub state.source state.start (state.index - state.start)) | |
in | |
add_token token state | |
let add_identifier_or_keyword_token state = | |
let string = | |
String.sub state.source state.start (state.index - state.start) | |
in | |
let token = | |
match List.assoc_opt string keywords with | |
| Some token -> token | |
| None -> Identifier string | |
in | |
add_token token state | |
let add_error error state = { state with errors = error :: state.errors } | |
let rec next_line state = | |
let next, state = advance state in | |
match next with | |
| OutOfBounds -> state | |
| Found '\n' -> state |> incr_line | |
| _ -> next_line state | |
let rec scan_string state = | |
let next, state = advance state in | |
match next with | |
| OutOfBounds -> | |
state |> add_error { on_line = state.line; error = UnterminatedString } | |
| Found '"' -> | |
let string = | |
let start_string = state.start + 1 in | |
let end_string = state.index - 1 in | |
String.sub state.source start_string (end_string - start_string) | |
in | |
state |> add_token (String string) | |
| Found _ -> scan_string state | |
let rec scan_number state = | |
match peek state with | |
| Found '0' .. '9' -> | |
let _, state = advance state in | |
scan_number state | |
| Found '.' -> ( | |
match peek_next state with | |
| Found '0' .. '9' -> | |
let _, state = advance state in | |
scan_number state | |
| _ -> state |> add_number_token) | |
| _ -> state |> add_number_token | |
let rec scan_identifier state = | |
match peek state with | |
| Found c -> ( | |
match c with | |
| 'a' .. 'z' | 'A' .. 'Z' | '_' | '0' .. '9' -> | |
let _, state = advance state in | |
scan_identifier state | |
| _ -> state |> add_identifier_or_keyword_token) | |
| OutOfBounds -> state |> add_identifier_or_keyword_token | |
let rec scan state = | |
let state = start_new_lexeme state in | |
let next, state = advance state in | |
match next with | |
| OutOfBounds -> state |> add_token EOF | |
| Found c -> | |
let new_state = | |
match c with | |
| '\t' | ' ' -> state | |
| '\n' -> state |> incr_line | |
| '(' -> state |> add_token LeftParen | |
| ')' -> state |> add_token RightParen | |
| '{' -> state |> add_token LeftBrace | |
| '}' -> state |> add_token RightBrace | |
| ',' -> state |> add_token Comma | |
| '.' -> state |> add_token Dot | |
| '-' -> state |> add_token Minus | |
| '+' -> state |> add_token Plus | |
| ';' -> state |> add_token Semicolon | |
| '*' -> state |> add_token Star | |
| '=' -> ( | |
match peek state with | |
| Found '=' -> state |> incr_index |> add_token EqualEqual | |
| _ -> state |> add_token Equal) | |
| '!' -> ( | |
match peek state with | |
| Found '=' -> state |> incr_index |> add_token BangEqual | |
| _ -> state |> add_token Bang) | |
| '<' -> ( | |
match peek state with | |
| Found '=' -> state |> incr_index |> add_token LessEqual | |
| _ -> state |> add_token Less) | |
| '>' -> ( | |
match peek state with | |
| Found '=' -> state |> incr_index |> add_token GreaterEqual | |
| _ -> state |> add_token Greater) | |
| '/' -> ( | |
match peek state with | |
| Found '/' -> state |> next_line | |
| _ -> state |> add_token Slash) | |
| '"' -> scan_string state | |
| '0' .. '9' -> scan_number state | |
| 'a' .. 'z' | 'A' .. 'Z' | '_' -> scan_identifier state | |
| _ -> | |
state | |
|> add_error { on_line = state.line; error = UnknownCharacter c } | |
in | |
scan new_state | |
let full_scan source = | |
let state = scan (initial_state source) in | |
(List.rev state.tokens, List.rev state.errors) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment