Last active
January 23, 2020 22:42
-
-
Save milesrout/2ff3049ab3dc325fb49a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
;;; as | |
;;; | |
;;; an assembler for the DCPU-16 assembly language | |
;;; | |
;;; accepts a file of no more than 512 words indicating the sectors on disk that hold a particular file. | |
;;; then reads those sectors in the order given, treating them as assembler source | |
;;; then assembles said source to produce a DCPU-16 binary. | |
;;; the supported binary format is an image that can be loaded directly from a disk by a bootloader: | |
;;; it contains an 'initial sector' which is written to the first part of the disk and that loads the rest | |
;;; of the binary. | |
;;; | |
;;; Copyright (c) 2014 Miles Rout | |
;;; SEE THE LICENSE FILE IN THE ROOT DIRECTORY | |
;;; I suppose I will end up hand-assembling this for the proper 'bootstrapping' feel. eugh. | |
;;; LINKED LIST OF TOKENS | |
;;; | |
;;; structure: | |
;;; +0: tok_type | |
;;; +1: next | |
;;; +2: prev | |
;;; +3: tok_data - string for TOK_STRING and TOK_IDENTIFIER, number for TOK_NUMBER, lstring for TOK_LSTRING | |
;;; | |
;;; initial: prev=0 | |
;;; final: next=0 | |
;;; hardcoded into a particular place in memory at this stage | |
;;; first item is always empty (for code clarity and laziness, TODO: fix this) | |
;;; list_create: reset the single, fixed-address token list to just its empty head node.
;;; in:  nothing
;;; out: A = address of the list (0x4000)
;;; meant to be called with JSR.
list_create:
    SET [0x4000], 0         ; head.tok_type = 0 (the head is an empty placeholder)
    SET [0x4001], 0         ; head.next = NULL
    SET [0x4002], 0         ; head.prev = NULL
    SET [0x4003], 0         ; head.tok_data = 0
    SET A, 0x4000           ; hand the list address back to the caller
    SET PC, POP             ; return - without this, execution ran on into list_append
;;; list_append: add a token node to the end of the token list.
;;; A: address of list (ignored for now: the only list lives at 0x4000)
;;; [SP]: <return address>
;;; [SP+1]: tok_type
;;; [SP+2]: tok_data
;;; returns: A = address of the new node
;;; clobbers: B
;;; the caller pushes tok_data, then tok_type, and drops both words after the call.
;;; nodes are four words each. the new node is placed in the four words directly
;;; after the current tail, so this only works while nodes are never removed and
;;; nothing else stores data after the list.
list_append:
    SET A, 0x4000 ; TEMPORARY, until I can be bothered having more than one list
list_append_loop:
    IFE [A+1], 0            ; IF (node->next == NULL) we are at the tail
        SET PC, list_append_found
    SET A, [A+1]            ; node = node->next
    SET PC, list_append_loop
list_append_found:
    SET B, A                ; B = old tail
    ADD A, 4                ; A = new node, in the slot just past the old tail
    SET [B+1], A            ; old->next = new
    SET [A], PICK 1         ; new->tok_type = tok_type argument
    SET [A+1], 0            ; new->next = NULL
    SET [A+2], B            ; new->prev = old
    SET [A+3], PICK 2       ; new->tok_data = tok_data argument
    SET PC, POP
;;; OPCODES
;;; check_basic_opcode: look a mnemonic up in the basic_opcodes table.
;;; in:  A = address of the string (one character per word)
;;; out: A = opcode number stored in the matching entry, or 0 if nothing matches
;;; clobbers: B
;;; only the first three words of the string are compared.
;;; NOTE(review): a longer string that merely starts with a mnemonic therefore
;;; matches too - confirm that callers only pass three-character strings.
check_basic_opcode:
    SET B, basic_opcodes                    ; B steps through the table, 4 words per entry
check_basic_opcode_loop:
    IFN [A], [B]                            ; first character differs: try the next entry
        SET PC, check_basic_opcode_next
    IFN [A+1], [B+1]                        ; second character differs
        SET PC, check_basic_opcode_next
    IFN [A+2], [B+2]                        ; third character differs
        SET PC, check_basic_opcode_next
check_basic_opcode_found:
    SET A, [B+3]                            ; fourth word of the entry is the opcode number
    SET PC, POP
check_basic_opcode_next:
    ADD B, 4
    IFL B, basic_opcodes_end                ; more entries left?
        SET PC, check_basic_opcode_loop
    SET A, 0                                ; ran off the end: not a basic opcode
    SET PC, POP
;;; check_special_opcode: look a mnemonic up in the special_opcodes table.
;;; in:  A = address of the string (one character per word)
;;; out: A = opcode number stored in the matching entry, or 0 if nothing matches
;;; clobbers: B
;;; only the first three words of the string are compared.
;;; NOTE(review): a longer string that merely starts with a mnemonic therefore
;;; matches too - confirm that callers only pass three-character strings.
check_special_opcode:
    SET B, special_opcodes                  ; B steps through the table, 4 words per entry
check_special_opcode_loop:
    IFN [A], [B]                            ; first character differs: try the next entry
        SET PC, check_special_opcode_next
    IFN [A+1], [B+1]                        ; second character differs
        SET PC, check_special_opcode_next
    IFN [A+2], [B+2]                        ; third character differs
        SET PC, check_special_opcode_next
check_special_opcode_found:
    SET A, [B+3]                            ; fourth word of the entry is the opcode number
    SET PC, POP
check_special_opcode_next:
    ADD B, 4
    IFL B, special_opcodes_end              ; more entries left?
        SET PC, check_special_opcode_loop
    SET A, 0                                ; ran off the end: not a special opcode
    SET PC, POP
;;; lookahead: fetch the character under the stream pointer without advancing it.
;;; in:  Z = stream pointer
;;; out: A = [Z]
lookahead:
    SET A, [Z]              ; peek only: Z is left untouched
    SET PC, POP
;;; STATE MACHINE FOR LEXICAL ANALYSIS
;;; TOKEN TYPES:
;;; 0: invalid token
;;; 1: identifier
;;; 2: lstring (length-prefixed string)
;;; 3: cstring (null-terminated C-style string)
;;; 4: colon
;;; 5: lbrack
;;; 6: rbrack
;;; 7: bin_number
;;; 8: oct_number
;;; 9: dec_number
;;; 10: hex_number
;;; 11: pstring (packed string) - no transition below produces this yet
;;;
;;; REGISTERS:
;;; Z: stream pointer - address of the next unread character
;;; J: string pointer - next free word of string storage
;;; Y: start of the string currently being built
;;; A: the lookahead character (list_append overwrites A when a token is produced)
;;; strings are appended to with STI, which increments I as well as J.
;;;
;;; HOW IT FITS TOGETHER:
;;; each state_N peeks at the next character (without consuming it) and jumps to
;;; a transition lex_tr_N_to_M, which does the work and then jumps to state_M.
;;; a transition that produces a token because it ran into a character that cannot
;;; be part of that token (2_to_0, 4_to_0, 5_to_0 ... 9_to_0) is non-consuming: it
;;; leaves Z alone so that state_0 gets to look at that character itself.
;;; number tokens carry their digits as a null-terminated string, including any
;;; 0x / 0b prefix; they are not converted to a value here.
;;;
;;; lex_start is the entry point: it initialises Z and J and drops into state_0.
;;; NOTE(review): the token list head at 0x4000 is not initialised here - confirm
;;; that list_create has been run before lexing starts.
lex_start:
    SET Z, 0 ; Z: stream pointer
    SET J, 0x8000 ; J: string pointer
state_0:
    JSR lookahead
    ;; a NUL is taken to be the end of the input.
    ;; NOTE(review): assumes the source text is NUL-terminated - confirm against the loader.
    IFE A, 0
        SET PC, lex_done
    ;; single-character tokens: stay in same state
    IFE A, 0x5B ; IF A == '['
        SET PC, lex_tr_0_to_0
    IFE A, 0x5D ; OR A == ']'
        SET PC, lex_tr_0_to_0
    IFE A, 0x3A ; OR A == ':'
        SET PC, lex_tr_0_to_0
    ;; go to string recognition states
    IFE A, 0x22 ; "
        SET PC, lex_tr_0_to_1
    IFE A, 0x4C ; L
        SET PC, lex_tr_0_to_2
    ;; go to identifier recognition states
    ;; note that this doesn't actually include L despite
    ;; appearances, because L is dealt with above.
    IFG A, 0x40 ; IF (A >= 'A' AND A <= 'Z')
        IFL A, 0x5B
            SET PC, lex_tr_0_to_4
    IFG A, 0x60 ; OR (A >= 'a' AND A <= 'z')
        IFL A, 0x7B
            SET PC, lex_tr_0_to_4
    IFE A, 0x5F ; OR A == '_'
        SET PC, lex_tr_0_to_4
    ;; go to non-decimal number recognition states
    IFE A, 0x30 ; IF A == '0'
        SET PC, lex_tr_0_to_5
    ;; go to decimal number recognition states
    IFG A, 0x30 ; IF (A >= '1' AND A <= '9')
        IFL A, 0x3A
            SET PC, lex_tr_0_to_9
    ;; anything else (whitespace, commas, ...) does not start a token: skip it.
    ;; this used to fall through into state_1 and get lexed as string contents.
    ADD Z, 1 ; move to next character
    SET PC, state_0
state_1:
    JSR lookahead
    ;; input ran out inside a string: stop rather than read on past the end.
    ;; the unterminated string is not turned into a token.
    IFE A, 0
        SET PC, lex_done
    ;; if this is a ", end our string recognition
    IFE A, 0x22 ; "
        SET PC, lex_tr_1_to_0
    ;; otherwise continue recognising the string
    SET PC, lex_tr_1_to_1
state_2:
    ;; we have seen an L: it is either the prefix of an lstring (L"...") or the
    ;; first character of an identifier.
    ;; NOTE(review): whitespace is skipped here even when an identifier character
    ;; follows, so "L foo" is lexed as the single identifier "Lfoo".
    JSR lookahead
    ;; stay in same state
    IFE A, 0x20 ; space
        SET PC, lex_tr_2_to_2
    IFE A, 0x09 ; horizontal tab
        SET PC, lex_tr_2_to_2
    IFE A, 0x0A ; line feed (new line)
        SET PC, lex_tr_2_to_2
    IFE A, 0x0B ; vertical tab
        SET PC, lex_tr_2_to_2
    IFE A, 0x0C ; form feed (new page)
        SET PC, lex_tr_2_to_2
    IFE A, 0x0D ; carriage return
        SET PC, lex_tr_2_to_2
    ;; quotation mark: start reading a string
    IFE A, 0x22 ; "
        SET PC, lex_tr_2_to_3
    ;; oops, that L is actually the start of an
    ;; identifier, not an lstring. better head back over
    ;; to identifier recognition
    IFG A, 0x40 ; IF (A >= 'A' AND A <= 'Z')
        IFL A, 0x5B
            SET PC, lex_tr_2_to_4
    IFG A, 0x60 ; OR (A >= 'a' AND A <= 'z')
        IFL A, 0x7B
            SET PC, lex_tr_2_to_4
    IFG A, 0x2F ; OR (A >= '0' AND A <= '9')
        IFL A, 0x3A
            SET PC, lex_tr_2_to_4
    IFE A, 0x5F ; OR A == '_'
        SET PC, lex_tr_2_to_4
    ;; anything else ends the token: the L was a one-letter identifier.
    ;; this used to fall through into state_3 and get lexed as lstring contents.
    SET PC, lex_tr_2_to_0
state_3:
    JSR lookahead
    ;; input ran out inside a string: stop rather than read on past the end.
    ;; the unterminated string is not turned into a token.
    IFE A, 0
        SET PC, lex_done
    ;; if this is a ", end our string recognition
    IFE A, 0x22 ; "
        SET PC, lex_tr_3_to_0
    ;; otherwise continue recognising the string
    SET PC, lex_tr_3_to_3
state_4:
    JSR lookahead
    ;; continue reading identifier
    IFG A, 0x40 ; IF (A >= 'A' AND A <= 'Z')
        IFL A, 0x5B
            SET PC, lex_tr_4_to_4
    IFG A, 0x60 ; OR (A >= 'a' AND A <= 'z')
        IFL A, 0x7B
            SET PC, lex_tr_4_to_4
    IFG A, 0x2F ; OR (A >= '0' AND A <= '9')
        IFL A, 0x3A
            SET PC, lex_tr_4_to_4
    IFE A, 0x5F ; OR A == '_'
        SET PC, lex_tr_4_to_4
    ;; otherwise do a non-consuming production transition back to zero
    SET PC, lex_tr_4_to_0
state_5:
    ;; we have seen a leading 0: the next character decides the base.
    JSR lookahead
    ;; octal number
    IFG A, 0x2F ; IF (A >= '0' AND A <= '7')
        IFL A, 0x38
            SET PC, lex_tr_5_to_6
    ;; hexadecimal number
    IFE A, 0x78 ; 'x'
        SET PC, lex_tr_5_to_7
    ;; binary number
    IFE A, 0x62 ; 'b'
        SET PC, lex_tr_5_to_8
    ;; otherwise this is just a single zero, so we should produce and go back to zero.
    SET PC, lex_tr_5_to_0
state_6:
    JSR lookahead
    ;; continue reading number
    IFG A, 0x2F ; IF (A >= '0' AND A <= '7')
        IFL A, 0x38
            SET PC, lex_tr_6_to_6
    ;; otherwise do a non-consuming production transition back to zero
    SET PC, lex_tr_6_to_0
state_7:
    JSR lookahead
    ;; continue reading number
    IFG A, 0x40 ; IF (A >= 'A' AND A <= 'F')
        IFL A, 0x47 ; was decimal 46, which no letter could ever satisfy
            SET PC, lex_tr_7_to_7
    IFG A, 0x60 ; OR (A >= 'a' AND A <= 'f')
        IFL A, 0x67 ; was 0x66, which left out 'f' itself
            SET PC, lex_tr_7_to_7
    IFG A, 0x2F ; OR (A >= '0' AND A <= '9')
        IFL A, 0x3A
            SET PC, lex_tr_7_to_7
    ;; otherwise do a non-consuming production transition back to zero
    SET PC, lex_tr_7_to_0 ; was lex_tr_6_to_0, which tagged hex numbers as octal
state_8:
    JSR lookahead
    ;; continue reading number
    IFE A, 0x30 ; '0'
        SET PC, lex_tr_8_to_8
    IFE A, 0x31 ; '1'
        SET PC, lex_tr_8_to_8
    ;; otherwise do a non-consuming production transition back to zero
    SET PC, lex_tr_8_to_0
state_9:
    JSR lookahead
    ;; continue reading number
    IFG A, 0x2F ; IF (A >= '0' AND A <= '9')
        IFL A, 0x3A
            SET PC, lex_tr_9_to_9
    ;; otherwise do a non-consuming production transition back to zero
    SET PC, lex_tr_9_to_0
lex_done:
    ;; end of input. nothing in this part of the file goes on to use the token
    ;; list, so park here instead of running on into the transitions below.
    ;; TODO: continue with whatever consumes the tokens once that exists.
    SET PC, lex_done
;;; Strings and other such data will be stored in memory at a fixed location for now, until I can come up
;;; with some more compelling memory allocation code.
;;; lex_start points J at 0x8000, which is nowhere near 0x4000 (where our token list is) and also
;;; won't interfere with the stack. J is used because of the *very* useful STI instruction, which
;;; stores a word and bumps J in one go. Y always points at the start of the most recent string.
;;; TRANSITIONS
lex_tr_0_to_0:
    SET PUSH, A ; the token data is the char literal
    IFE A, 0x3A ; :
        SET PUSH, 4
    IFE A, 0x5B ; [
        SET PUSH, 5
    IFE A, 0x5D ; ]
        SET PUSH, 6
    JSR list_append ; append pushed token
    ADD SP, 2 ; drop both pushed words (this used to drop only one)
    ADD Z, 1 ; move to next character
    SET PC, state_0 ; move to relevant state
lex_tr_0_to_1:
    SET Y, J ; save the start of the string being built
    ;; note that we don't want to push the first character yet, because we don't yet have it: we only have a quotation mark.
    ADD Z, 1 ; move to next character
    SET PC, state_1 ; move to relevant state
lex_tr_0_to_2:
    STI Y, J ; Y = J, then J moves on by one: this reserves one word of storage at Y
    ;; note that Y points to this space (which has a garbage value), not to the first character
    ;; also note that we don't want to push the first character yet, because we don't have it: we only have
    ;; L, which could be the start of an lstring or the first character of an identifier.
    ADD Z, 1 ; move to next character
    SET PC, state_2 ; move to relevant state
lex_tr_0_to_4:
    SET Y, J ; save the start of the string being built
    STI [J], A ; append the first character to the string
    ADD Z, 1 ; move to next character
    SET PC, state_4 ; move to relevant state
lex_tr_0_to_5:
    SET Y, J ; save the start of the string being built
    STI [J], A ; append the first character to the string
    ADD Z, 1 ; move to next character
    SET PC, state_5 ; move to relevant state
lex_tr_0_to_9:
    ;; build a string of digit characters. these will be turned into the actual NUMBER later.
    SET Y, J ; save the start of the string being built
    STI [J], A ; append the first character to the string
    ADD Z, 1 ; move to next character
    SET PC, state_9 ; move to relevant state
lex_tr_1_to_0:
    ;; currently Y is pointing at the start of the string.
    ;; J is pointing one past the end of the string.
    ;; the string hasn't been null-terminated.
    STI [J], 0 ; null-terminate the string
    SET PUSH, Y ; push tok_data
    SET PUSH, 3 ; push tok_type: 3 (cstring)
    JSR list_append
    ADD SP, 2 ; clear rubbish values from stack
    ADD Z, 1 ; consume the closing quotation mark
    SET PC, state_0 ; move to relevant state
lex_tr_1_to_1:
    STI [J], A ; append the next character to the string
    ADD Z, 1 ; move to next character
    SET PC, state_1 ; move to relevant state
lex_tr_2_to_0:
    ;; the L turned out to be a complete one-letter identifier.
    ;; Y is pointing at the word reserved for it, J is pointing one past that word.
    SET [Y], 0x4C ; 'L' - put it where we would otherwise have put the lstring's length
    STI [J], 0 ; null-terminate the string
    SET PUSH, Y ; push tok_data
    SET PUSH, 1 ; push tok_type: 1 (identifier)
    JSR list_append
    ADD SP, 2 ; clear rubbish values from stack
    ;; non-consuming: state_0 still has to look at the character that ended the identifier
    SET PC, state_0 ; move to relevant state
lex_tr_2_to_2:
    ;; ignore whitespace
    ADD Z, 1 ; move to next character
    SET PC, state_2 ; move to relevant state
lex_tr_2_to_3:
    ;; start reading the string as usual
    SET [Y], 0 ; initialise the string length to zero.
    ;; we still don't want to add a character, but this definitely is an lstring.
    ADD Z, 1 ; move to next character
    SET PC, state_3 ; move to relevant state
lex_tr_2_to_4:
    ;; oops, this is actually an identifier. we're a bit behind.
    ;; we need to add the L character in and the new character.
    SET [Y], 0x4C ; 'L' - put it where we would otherwise have put the lstring's length
    STI [J], A ; append the next character to the string
    ADD Z, 1 ; move to next character
    SET PC, state_4 ; move to relevant state
lex_tr_3_to_0:
    ;; currently Y is pointing at the length of the string, J is pointing one past the end.
    ;; the string hasn't been and doesn't need to be null-terminated.
    SET PUSH, Y ; push tok_data
    SET PUSH, 2 ; push tok_type: 2 (lstring)
    JSR list_append
    ADD SP, 2 ; clear rubbish values from stack
    ADD Z, 1 ; consume the closing quotation mark
    SET PC, state_0 ; move to relevant state
lex_tr_3_to_3:
    ADD [Y], 1 ; increment string length
    STI [J], A ; append character to string
    ADD Z, 1 ; move to next character
    SET PC, state_3 ; move to relevant state
lex_tr_4_to_0:
    ;; currently Y is pointing at the first character of the string.
    ;; J is pointing one past the end.
    ;; the string needs to be null-terminated.
    STI [J], 0 ; null-terminate the string.
    SET PUSH, Y ; push tok_data
    SET PUSH, 1 ; push tok_type: 1 (identifier)
    JSR list_append
    ADD SP, 2 ; clear rubbish values from the stack
    ;; non-consuming: state_0 still has to look at the character that ended the identifier
    ;; (advancing Z here used to swallow it, e.g. the ':' after a label)
    SET PC, state_0 ; move to relevant state
lex_tr_4_to_4:
    STI [J], A ; append the next character to the string
    ADD Z, 1 ; move to next character
    SET PC, state_4 ; move to relevant state
lex_tr_5_to_0:
    ;; a lone 0. currently Y is pointing at the first character of the string.
    ;; J is pointing one past the end of the string.
    ;; the string needs to be null-terminated.
    STI [J], 0 ; null-terminate the string.
    SET PUSH, Y ; push tok_data
    SET PUSH, 8 ; push tok_type: 8 (oct_number)
    JSR list_append
    ADD SP, 2 ; clear rubbish values from stack
    ;; non-consuming: state_0 still has to look at the character that ended the number
    SET PC, state_0 ; move to relevant state
lex_tr_5_to_6:
    STI [J], A ; append the character to the string
    ADD Z, 1 ; move to next character
    SET PC, state_6 ; move to relevant state
lex_tr_5_to_7:
    STI [J], A ; append the character to the string
    ADD Z, 1 ; move to next character
    SET PC, state_7 ; move to relevant state
lex_tr_5_to_8:
    STI [J], A ; append the character to the string
    ADD Z, 1 ; move to next character
    SET PC, state_8 ; move to relevant state
lex_tr_6_to_0:
    ;; currently Y is pointing at the first character of the string.
    ;; the string needs to be null-terminated.
    STI [J], 0 ; null-terminate the string
    SET PUSH, Y ; push tok_data
    SET PUSH, 8 ; push tok_type: 8 (oct_number)
    JSR list_append
    ADD SP, 2 ; clear rubbish values from stack
    ;; non-consuming: state_0 still has to look at the character that ended the number
    SET PC, state_0 ; move to relevant state
lex_tr_6_to_6:
    STI [J], A ; append the character to the string
    ADD Z, 1 ; move to next character
    SET PC, state_6 ; move to relevant state
lex_tr_7_to_0:
    ;; currently Y is pointing at the first character of the string.
    ;; the string needs to be null-terminated.
    STI [J], 0 ; null-terminate the string
    SET PUSH, Y ; push tok_data
    SET PUSH, 10 ; push tok_type: 10 (hex_number)
    JSR list_append
    ADD SP, 2 ; clear rubbish values from stack
    ;; non-consuming: state_0 still has to look at the character that ended the number
    SET PC, state_0 ; move to relevant state
lex_tr_7_to_7:
    STI [J], A ; append the character to the string
    ADD Z, 1 ; move to next character
    SET PC, state_7 ; move to relevant state
lex_tr_8_to_0:
    ;; currently Y is pointing at the first character of the string
    ;; the string needs to be null-terminated.
    STI [J], 0 ; null-terminate the string
    SET PUSH, Y ; push tok_data
    SET PUSH, 7 ; push tok_type: 7 (bin_number)
    JSR list_append
    ADD SP, 2 ; clear rubbish values from stack
    ;; non-consuming: state_0 still has to look at the character that ended the number
    SET PC, state_0 ; move to relevant state
lex_tr_8_to_8:
    STI [J], A ; append the character to the string
    ADD Z, 1 ; move to next character
    SET PC, state_8 ; move to relevant state
lex_tr_9_to_0:
    ;; currently Y is pointing at the first character of the string
    ;; the string needs to be null-terminated.
    STI [J], 0 ; null-terminate the string
    SET PUSH, Y ; push tok_data
    SET PUSH, 9 ; push tok_type: 9 (dec_number)
    JSR list_append
    ADD SP, 2 ; clear rubbish values from stack
    ;; non-consuming: state_0 still has to look at the character that ended the number
    SET PC, state_0 ; move to relevant state
lex_tr_9_to_9:
    STI [J], A ; append the character to the string
    ADD Z, 1 ; move to next character
    SET PC, state_9 ; move to relevant state
;;; BASIC OPCODES
;;; lookup table used by check_basic_opcode.
;;; one 4-word entry per mnemonic: three ASCII characters, then the opcode number.
;;; entries run from basic_opcodes up to (not including) basic_opcodes_end.
basic_opcodes:
    DAT 0x53, 0x45, 0x54, 0x01 ;SET 1h
    DAT 0x41, 0x44, 0x44, 0x02 ;ADD 2h
    DAT 0x53, 0x55, 0x42, 0x03 ;SUB 3h
    DAT 0x4D, 0x55, 0x4C, 0x04 ;MUL 4h
    DAT 0x4D, 0x4C, 0x49, 0x05 ;MLI 5h
    DAT 0x44, 0x49, 0x56, 0x06 ;DIV 6h
    DAT 0x44, 0x56, 0x49, 0x07 ;DVI 7h
    DAT 0x4D, 0x4F, 0x44, 0x08 ;MOD 8h
    DAT 0x4D, 0x44, 0x49, 0x09 ;MDI 9h
    DAT 0x41, 0x4E, 0x44, 0x0A ;AND Ah
    DAT 0x42, 0x4F, 0x52, 0x0B ;BOR Bh
    DAT 0x58, 0x4F, 0x52, 0x0C ;XOR Ch
    DAT 0x53, 0x48, 0x52, 0x0D ;SHR Dh
    DAT 0x41, 0x53, 0x52, 0x0E ;ASR Eh
    DAT 0x53, 0x48, 0x4C, 0x0F ;SHL Fh
    DAT 0x49, 0x46, 0x42, 0x10 ;IFB 10h
    DAT 0x49, 0x46, 0x43, 0x11 ;IFC 11h
    DAT 0x49, 0x46, 0x45, 0x12 ;IFE 12h
    DAT 0x49, 0x46, 0x4E, 0x13 ;IFN 13h
    DAT 0x49, 0x46, 0x47, 0x14 ;IFG 14h
    DAT 0x49, 0x46, 0x41, 0x15 ;IFA 15h
    DAT 0x49, 0x46, 0x4C, 0x16 ;IFL 16h
    DAT 0x49, 0x46, 0x55, 0x17 ;IFU 17h
    ;; 18h: empty
    ;; 19h: empty
    DAT 0x41, 0x44, 0x58, 0x1A ;ADX 1Ah (third character was 0x52 'R', so "ADX" never matched)
    DAT 0x53, 0x42, 0x58, 0x1B ;SBX 1Bh (third character was 0x52 'R', so "SBX" never matched)
    ;; 1Ch: empty
    ;; 1Dh: empty
    DAT 0x53, 0x54, 0x49, 0x1E ;STI 1Eh
    DAT 0x53, 0x54, 0x44, 0x1F ;STD 1Fh
basic_opcodes_end:
    DAT 0x00, 0x00, 0x00, 0x00 ;sentinel
;;; SPECIAL OPCODES
;;; lookup table used by check_special_opcode.
;;; one 4-word entry per mnemonic: three ASCII characters, then the opcode number.
;;; entries run from special_opcodes up to (not including) special_opcodes_end.
special_opcodes:
    DAT 0x4A, 0x53, 0x52, 0x01      ; "JSR" -> 01h
    ;; 02h to 07h: empty
    DAT 0x49, 0x4E, 0x54, 0x08      ; "INT" -> 08h
    DAT 0x49, 0x41, 0x47, 0x09      ; "IAG" -> 09h
    DAT 0x49, 0x41, 0x53, 0x0A      ; "IAS" -> 0Ah
    DAT 0x52, 0x46, 0x49, 0x0B      ; "RFI" -> 0Bh
    DAT 0x49, 0x41, 0x51, 0x0C      ; "IAQ" -> 0Ch
    ;; 0Dh to 0Fh: empty
    DAT 0x48, 0x57, 0x4E, 0x10      ; "HWN" -> 10h
    DAT 0x48, 0x57, 0x51, 0x11      ; "HWQ" -> 11h
    DAT 0x48, 0x57, 0x49, 0x12      ; "HWI" -> 12h
special_opcodes_end:
    DAT 0x00, 0x00, 0x00, 0x00      ; sentinel entry, sits just past the end of the table
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment