Last active
August 29, 2015 14:01
-
-
Save vkobel/7a6b9e1edae6e5c4592f to your computer and use it in GitHub Desktop.
Simple lexer for a minimalist asm-like language
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module Lexer | |
open System | |
open System.IO | |
open System.Text.RegularExpressions | |
/// Registers the lexer recognises as operands.
type Register =
    /// 8 bits.
    | A
    /// 8 bits.
    | B
    /// 16 bits (A + B).
    | D
    /// 16 bits.
    | X
    /// 16 bits.
    | Y
/// Argument of an operator. The string payloads hold the operand text
/// with its sigil (`#`, `#$`, `:`) already stripped by the lexer.
type Operand =
    /// Standard value.
    | Val of string
    /// Hexadecimal value.
    | Hex of string
    /// Register pointer.
    | Reg of Register
    /// Label pointer.
    | Ptr of string
/// Instruction mnemonics known to the lexer.
type Operator =
    /// LOAD register A with the operand.
    | LDA
    /// LOAD register X with the operand.
    | LDX
    /// STORE the value of register A in the destination given by the operand.
    | STA
    /// BEGINWITH pointer.
    | BGW
    /// END of program (when absent from the code it is emitted at EOF).
    | END
/// A single token produced by the lexer.
type Operation =
    /// An operator argument (value, hex value, register or label pointer).
    | Operand of Operand
    /// An instruction mnemonic.
    | Operator of Operator
    /// Text of a `;` comment, without the leading semicolon(s).
    | Comment of string
    /// Name of a label definition, without the trailing colon(s).
    | Label of string
    /// Any token that matched none of the other categories.
    | Unknown of string
/// Reads the next token of `input`, starting the scan at `index`.
///
/// Returns the recognised `Operation` together with the index at which the
/// next call should resume. Tokens that consist only of whitespace (for
/// example blank lines) are skipped. Once the end of `input` is reached and
/// no text is pending, `Operator END` is returned.
///
/// Operators (`LDA`, `LDX`, `STA`, `BGW`) are only recognised when preceded by
/// whitespace; a token that starts at column 0 and ends with `:` is a label.
let rec getNextToken (input: string) (index: int) : (Operation * int) =

    // Accumulates characters from `i` onwards into `token` and returns the raw
    // token text together with the index to resume from.
    let rec getStrToken (data: string) (i: int) (token: string) =
        if i >= data.Length then
            // EOF: hand back any pending text first so the last token of a file
            // without a trailing newline is not lost; the following call then
            // starts at EOF with an empty token and yields "END".
            if String.IsNullOrWhiteSpace token then ("END", i) else (token, i)
        else
            match data.[i] with
            | '\r' ->
                // Eat "\r\n" as a pair, but only one char for a lone '\r'.
                let width = if i + 1 < data.Length && data.[i + 1] = '\n' then 2 else 1
                (token, i + width)
            | '\n' -> (token, i + 1)
            // A space after some text ends the token, but the space itself is
            // not consumed: it becomes the leading whitespace of the next token,
            // which is what distinguishes operators/operands from labels.
            | ' ' when token.Trim() <> "" -> (token, i)
            // Text directly followed by ';' is emitted first; the comment is
            // picked up by the next call instead of overwriting that text.
            | ';' when token.Trim() <> "" -> (token, i)
            | ';' ->
                // Comment: runs from ';' to the end of the line, or to the end
                // of the input when no newline follows (IndexOf returns -1).
                let newline = data.IndexOf('\n', i)
                let endIndex = if newline < 0 then data.Length else newline
                (data.Substring(i, endIndex - i).Trim(), min (endIndex + 1) data.Length)
            | c -> getStrToken data (i + 1) (token + c.ToString())

    let token, i = getStrToken input index ""

    // Convert the raw text into a typed token. Runs of whitespace (spaces,
    // tabs) are collapsed to a single space before matching.
    match Regex.Replace(token, @"\s+", " ") with
    | t when String.IsNullOrWhiteSpace t -> getNextToken input i // skip blank tokens
    | " LDA" -> (Operator LDA, i)
    | " LDX" -> (Operator LDX, i)
    | " STA" -> (Operator STA, i)
    | " BGW" -> (Operator BGW, i)
    | "END" -> (Operator END, i)
    | t when t.[0] = ';' -> (Comment(t.TrimStart ';'), i)
    | t when not (Char.IsWhiteSpace t.[0]) && t.EndsWith(":") -> (Label(t.TrimEnd ':'), i)
    | t ->
        // Strip the sigil from the trimmed text rather than from a fixed
        // offset, so the payload is right whether or not the token carries
        // leading/trailing whitespace, and a bare "#" or ":" cannot throw.
        match t.Trim() with
        | s when s.StartsWith("#$") -> (Operand(Hex(s.Substring 2)), i)
        | s when s.StartsWith("#") -> (Operand(Val(s.Substring 1)), i)
        | s when s.StartsWith(":") -> (Operand(Ptr(s.Substring 1)), i)
        | "A" -> (Operand(Reg A), i)
        | "B" -> (Operand(Reg B), i)
        | "D" -> (Operand(Reg D), i)
        | "X" -> (Operand(Reg X), i)
        | "Y" -> (Operand(Reg Y), i)
        | o -> (Unknown o, i)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
module main | |
open System.IO | |
open Lexer | |
/// Repeatedly calls `lexer`, starting at `index`, and prints every token it
/// returns with `%A` formatting. Each call resumes at the index returned by
/// the previous one. Stops, without printing, at the first `Operator END`.
let rec lexcall lexer index =
    let operation, next = lexer index
    match operation with
    | Operator END -> ()
    | _ ->
        printfn "%A" operation
        lexcall lexer next
/// Program entry point: reads an assembly source file, prints every token the
/// lexer produces for it, then waits for a line on stdin before returning 0.
///
/// The file path is taken from the first command-line argument when one is
/// given; otherwise the original default `C:\B32\Test.asm` is used.
[<EntryPoint>]
let main (argv: string[]) =
    let path = if Array.isEmpty argv then @"C:\B32\Test.asm" else argv.[0]
    // File.ReadAllText opens, reads and closes the file, so no reader is
    // left undisposed as with File.OpenText(...).ReadToEnd().
    let input = File.ReadAllText path
    let lexer = Lexer.getNextToken input
    lexcall lexer 0
    System.Console.ReadLine() |> ignore
    0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment