axelbdt · November 30, 2024 15:38
diff --git a/scanner.ml b/scanner.ml
 (* A straightforward adaptation of the Java code for the Scanner in the book Crafting Interpreters, by Robert Nystrom
 Original Java code can be found here : https://github.com/munificent/craftinginterpreters/blob/master/java/com/craftinginterpreters/lox/Scanner.java *)

 type token =
  | LeftParen
  | RightParen
  | LeftBrace
  | RightBrace
  | Comma
  | Dot
  | Minus
  | Plus
  | Semicolon
  | Star
  | Equal
  | EqualEqual
  | Bang
  | BangEqual
  | Less
  | LessEqual
  | Greater
  | GreaterEqual
  | Slash
  | String of string
  | Number of (string * float)
  | Identifier of string
  | And
  | Class
  | Else
  | False
  | For
  | Fun
  | If
  | Nil
  | Or
  | Print
  | Return
  | Super
  | This
  | True
  | Var
  | While
  | EOF

 let number_token_of_string string =
  try Number (string, float_of_string string)
  with Failure _ ->
    print_endline ("Failed to parse number: " ^ string);
    raise (Failure "Failed to parse number")

 let keywords =
  [
    ("and", And);
    ("class", Class);
    ("else", Else);
    ("false", False);
    ("for", For);
    ("fun", Fun);
    ("if", If);
    ("nil", Nil);
    ("or", Or);
    ("print", Print);
    ("return", Return);
    ("super", Super);
    ("this", This);
    ("true", True);
    ("var", Var);
    ("while", While);
  ]

 type error = UnknownCharacter of char | UnterminatedString
 type error_log = { on_line : int; error : error }

 let message_of_error error =
  match error with
  | UnknownCharacter c -> Printf.sprintf "Unexpected character: %c" c
  | UnterminatedString -> "Unterminated string."

 let string_of_error_log log =
  Printf.sprintf "(%d: %s)" log.on_line (message_of_error log.error)

 let string_of_errors errors =
  List.map string_of_error_log errors
  |> String.concat "; " |> Printf.sprintf "[%s]"

 type scanner_state = {
  source : string;
  tokens : token list;
  errors : error_log list;
  index : int;
  start : int;
  line : int;
 }

 let string_of_state state =
  Printf.sprintf
    "{ tokens = %s; errors = %s; current_index = %d; current_line = %d }"
    (string_of_tokens state.tokens)
    (string_of_errors state.errors)
    state.index state.line

 let initial_state source =
  { source; tokens = []; errors = []; index = 0; start = 0; line = 1 }

 type char_lookup_result = Found of char | OutOfBounds

 let current_char state =
  if state.index < String.length state.source then
    Found state.source.[state.index]
  else OutOfBounds

 let start_new_lexeme state = { state with start = state.index }
 let incr_index state = { state with index = state.index + 1 }
 let incr_line state = { state with line = state.line + 1 }
 let advance state = (current_char state, incr_index state)
 let peek state = current_char state
 let peek_next state = peek { state with index = state.index + 1 }
 let add_token token state = { state with tokens = token :: state.tokens }

 let add_number_token state =
  let token =
    number_token_of_string
      (String.sub state.source state.start (state.index - state.start))
  in
  add_token token state

 let add_identifier_or_keyword_token state =
  let string =
    String.sub state.source state.start (state.index - state.start)
  in
  let token =
    match List.assoc_opt string keywords with
    | Some token -> token
    | None -> Identifier string
  in
  add_token token state

 let add_error error state = { state with errors = error :: state.errors }

 let rec next_line state =
  let next, state = advance state in
  match next with
  | OutOfBounds -> state
  | Found '\n' -> state |> incr_line
  | _ -> next_line state

 let rec scan_string state =
  let next, state = advance state in
  match next with
  | OutOfBounds ->
      state |> add_error { on_line = state.line; error = UnterminatedString }
  | Found '"' ->
      let string =
        let start_string = state.start + 1 in
        let end_string = state.index - 1 in
        String.sub state.source start_string (end_string - start_string)
      in
      state |> add_token (String string)
  | Found _ -> scan_string state

 let rec scan_number state =
  match peek state with
  | Found '0' .. '9' ->
      let _, state = advance state in
      scan_number state
  | Found '.' -> (
      match peek_next state with
      | Found '0' .. '9' ->
          let _, state = advance state in
          scan_number state
      | _ -> state |> add_number_token)
  | _ -> state |> add_number_token

 let rec scan_identifier state =
  match peek state with
  | Found c -> (
      match c with
      | 'a' .. 'z' | 'A' .. 'Z' | '_' | '0' .. '9' ->
          let _, state = advance state in
          scan_identifier state
      | _ -> state |> add_identifier_or_keyword_token)
  | OutOfBounds -> state |> add_identifier_or_keyword_token

 let rec scan state =
  let state = start_new_lexeme state in
  let next, state = advance state in
  match next with
  | OutOfBounds -> state |> add_token EOF
  | Found c ->
      let new_state =
        match c with
        | '\t' | ' ' -> state
        | '\n' -> state |> incr_line
        | '(' -> state |> add_token LeftParen
        | ')' -> state |> add_token RightParen
        | '{' -> state |> add_token LeftBrace
        | '}' -> state |> add_token RightBrace
        | ',' -> state |> add_token Comma
        | '.' -> state |> add_token Dot
        | '-' -> state |> add_token Minus
        | '+' -> state |> add_token Plus
        | ';' -> state |> add_token Semicolon
        | '*' -> state |> add_token Star
        | '=' -> (
            match peek state with
            | Found '=' -> state |> incr_index |> add_token EqualEqual
            | _ -> state |> add_token Equal)
        | '!' -> (
            match peek state with
            | Found '=' -> state |> incr_index |> add_token BangEqual
            | _ -> state |> add_token Bang)
        | '<' -> (
            match peek state with
            | Found '=' -> state |> incr_index |> add_token LessEqual
            | _ -> state |> add_token Less)
        | '>' -> (
            match peek state with
            | Found '=' -> state |> incr_index |> add_token GreaterEqual
            | _ -> state |> add_token Greater)
        | '/' -> (
            match peek state with
            | Found '/' -> state |> next_line
            | _ -> state |> add_token Slash)
        | '"' -> scan_string state
        | '0' .. '9' -> scan_number state
        | 'a' .. 'z' | 'A' .. 'Z' | '_' -> scan_identifier state
        | _ ->
            state
            |> add_error { on_line = state.line; error = UnknownCharacter c }
      in
      scan new_state

 let full_scan source =
  let state = scan (initial_state source) in
  (List.rev state.tokens, List.rev state.errors)
	(* A straightforward adaptation of the Java code for the Scanner in the book Crafting Interpreters, by Robert Nystrom
	Original Java code can be found here : https://github.com/munificent/craftinginterpreters/blob/master/java/com/craftinginterpreters/lox/Scanner.java *)

	type token =
	\| LeftParen
	\| RightParen
	\| LeftBrace
	\| RightBrace
	\| Comma
	\| Dot
	\| Minus
	\| Plus
	\| Semicolon
	\| Star
	\| Equal
	\| EqualEqual
	\| Bang
	\| BangEqual
	\| Less
	\| LessEqual
	\| Greater
	\| GreaterEqual
	\| Slash
	\| String of string
	\| Number of (string * float)
	\| Identifier of string
	\| And
	\| Class
	\| Else
	\| False
	\| For
	\| Fun
	\| If
	\| Nil
	\| Or
	\| Print
	\| Return
	\| Super
	\| This
	\| True
	\| Var
	\| While
	\| EOF

	let number_token_of_string string =
	try Number (string, float_of_string string)
	with Failure _ ->
	print_endline ("Failed to parse number: " ^ string);
	raise (Failure "Failed to parse number")

	let keywords =
	[
	("and", And);
	("class", Class);
	("else", Else);
	("false", False);
	("for", For);
	("fun", Fun);
	("if", If);
	("nil", Nil);
	("or", Or);
	("print", Print);
	("return", Return);
	("super", Super);
	("this", This);
	("true", True);
	("var", Var);
	("while", While);
	]

	type error = UnknownCharacter of char \| UnterminatedString
	type error_log = { on_line : int; error : error }

	let message_of_error error =
	match error with
	\| UnknownCharacter c -> Printf.sprintf "Unexpected character: %c" c
	\| UnterminatedString -> "Unterminated string."

	let string_of_error_log log =
	Printf.sprintf "(%d: %s)" log.on_line (message_of_error log.error)

	let string_of_errors errors =
	List.map string_of_error_log errors
	\|> String.concat "; " \|> Printf.sprintf "[%s]"

	type scanner_state = {
	source : string;
	tokens : token list;
	errors : error_log list;
	index : int;
	start : int;
	line : int;
	}

	let string_of_state state =
	Printf.sprintf
	"{ tokens = %s; errors = %s; current_index = %d; current_line = %d }"
	(string_of_tokens state.tokens)
	(string_of_errors state.errors)
	state.index state.line

	let initial_state source =
	{ source; tokens = []; errors = []; index = 0; start = 0; line = 1 }

	type char_lookup_result = Found of char \| OutOfBounds

	let current_char state =
	if state.index < String.length state.source then
	Found state.source.[state.index]
	else OutOfBounds

	let start_new_lexeme state = { state with start = state.index }
	let incr_index state = { state with index = state.index + 1 }
	let incr_line state = { state with line = state.line + 1 }
	let advance state = (current_char state, incr_index state)
	let peek state = current_char state
	let peek_next state = peek { state with index = state.index + 1 }
	let add_token token state = { state with tokens = token :: state.tokens }

	let add_number_token state =
	let token =
	number_token_of_string
	(String.sub state.source state.start (state.index - state.start))
	in
	add_token token state

	let add_identifier_or_keyword_token state =
	let string =
	String.sub state.source state.start (state.index - state.start)
	in
	let token =
	match List.assoc_opt string keywords with
	\| Some token -> token
	\| None -> Identifier string
	in
	add_token token state

	let add_error error state = { state with errors = error :: state.errors }

	let rec next_line state =
	let next, state = advance state in
	match next with
	\| OutOfBounds -> state
	\| Found '\n' -> state \|> incr_line
	\| _ -> next_line state

	let rec scan_string state =
	let next, state = advance state in
	match next with
	\| OutOfBounds ->
	state \|> add_error { on_line = state.line; error = UnterminatedString }
	\| Found '"' ->
	let string =
	let start_string = state.start + 1 in
	let end_string = state.index - 1 in
	String.sub state.source start_string (end_string - start_string)
	in
	state \|> add_token (String string)
	\| Found _ -> scan_string state

	let rec scan_number state =
	match peek state with
	\| Found '0' .. '9' ->
	let _, state = advance state in
	scan_number state
	\| Found '.' -> (
	match peek_next state with
	\| Found '0' .. '9' ->
	let _, state = advance state in
	scan_number state
	\| _ -> state \|> add_number_token)
	\| _ -> state \|> add_number_token

	let rec scan_identifier state =
	match peek state with
	\| Found c -> (
	match c with
	\| 'a' .. 'z' \| 'A' .. 'Z' \| '_' \| '0' .. '9' ->
	let _, state = advance state in
	scan_identifier state
	\| _ -> state \|> add_identifier_or_keyword_token)
	\| OutOfBounds -> state \|> add_identifier_or_keyword_token

	let rec scan state =
	let state = start_new_lexeme state in
	let next, state = advance state in
	match next with
	\| OutOfBounds -> state \|> add_token EOF
	\| Found c ->
	let new_state =
	match c with
	\| '\t' \| ' ' -> state
	\| '\n' -> state \|> incr_line
	\| '(' -> state \|> add_token LeftParen
	\| ')' -> state \|> add_token RightParen
	\| '{' -> state \|> add_token LeftBrace
	\| '}' -> state \|> add_token RightBrace
	\| ',' -> state \|> add_token Comma
	\| '.' -> state \|> add_token Dot
	\| '-' -> state \|> add_token Minus
	\| '+' -> state \|> add_token Plus
	\| ';' -> state \|> add_token Semicolon
	\| '*' -> state \|> add_token Star
	\| '=' -> (
	match peek state with
	\| Found '=' -> state \|> incr_index \|> add_token EqualEqual
	\| _ -> state \|> add_token Equal)
	\| '!' -> (
	match peek state with
	\| Found '=' -> state \|> incr_index \|> add_token BangEqual
	\| _ -> state \|> add_token Bang)
	\| '<' -> (
	match peek state with
	\| Found '=' -> state \|> incr_index \|> add_token LessEqual
	\| _ -> state \|> add_token Less)
	\| '>' -> (
	match peek state with
	\| Found '=' -> state \|> incr_index \|> add_token GreaterEqual
	\| _ -> state \|> add_token Greater)
	\| '/' -> (
	match peek state with
	\| Found '/' -> state \|> next_line
	\| _ -> state \|> add_token Slash)
	\| '"' -> scan_string state
	\| '0' .. '9' -> scan_number state
	\| 'a' .. 'z' \| 'A' .. 'Z' \| '_' -> scan_identifier state
	\| _ ->
	state
	\|> add_error { on_line = state.line; error = UnknownCharacter c }
	in
	scan new_state

	let full_scan source =
	let state = scan (initial_state source) in
	(List.rev state.tokens, List.rev state.errors)