Created
May 5, 2010 23:29
-
-
Save jpbougie/391594 to your computer and use it in GitHub Desktop.
a rough scheme lexer in LLVM IR
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; FILE type for i/o using stdio | |
%struct.FILE = type { i32, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, i8*, %struct._IO_marker*, %struct.FILE*, i32, i32, i64, i16, i8, [1 x i8], i8*, i64, i8*, i8*, i8*, i8*, i64, i32, [20 x i8] } | |
%struct._IO_marker = type { %struct._IO_marker*, %struct.FILE*, i32 } | |
@.read_mode = private constant [3 x i8] c"rb\00", align 1 | |
declare i32 @fgetc(%struct.FILE* nocapture) nounwind | |
declare i32 @feof(%struct.FILE* nocapture) nounwind | |
declare i32 @fseek(%struct.FILE* nocapture, i64, i32) nounwind | |
declare i32 @ungetc(i32, %struct.FILE * nocapture) nounwind | |
declare i8 * @malloc(i64) nounwind | |
declare void @free(i8* nocapture) nounwind | |
declare i32 @putchar(i32) nounwind | |
declare noalias %struct.FILE* @fopen(i8* noalias nocapture, i8* noalias nocapture) nounwind | |
declare i8 @fclose(%struct.FILE*) nounwind | |
; stdin | |
;@stdin = external global %struct.FILE* ; <%struct.FILE**> | |
%struct.tok = type { i4, [1024 x i8] , i32 } ; type, value, value length | |
%struct.tok_stream = type { i64, %struct.tok *} ; length, array | |
; token types | |
;tok.open_paren = i4 0 | |
;tok.close_paren = i4 1 | |
;tok.integer = i4 2 | |
;tok.string = i4 3 | |
;tok.identifier = i4 4 | |
;tok.quote = i4 5 | |
define i8 @get_next_character(%struct.FILE* %fhandle, i1 %ignore_whitespace) { | |
entry: | |
br label %loop | |
loop: | |
%0 = tail call i32 @fgetc(%struct.FILE* %fhandle) nounwind ; <i32> | |
%char = trunc i32 %0 to i8 | |
br i1 %ignore_whitespace, label %check_whitespace, label %result | |
check_whitespace: | |
switch i8 %char, label %result [ i8 9, label %loop ; tab | |
i8 10, label %loop ; \n | |
i8 13, label %loop ; \r | |
i8 32, label %loop ]; space | |
result: | |
ret i8 %char | |
} | |
define void @rewind_character(%struct.FILE* %fhandle, i8 %char) { | |
%1 = zext i8 %char to i32 | |
tail call i32 @ungetc(i32 %1, %struct.FILE* %fhandle) | |
ret void | |
} | |
define void @ignore_whitespace(%struct.FILE* %fhandle) { | |
entry: | |
br label %begin | |
begin: | |
%0 = tail call i32 @fgetc(%struct.FILE* %fhandle) nounwind ; <i32> | |
%1 = trunc i32 %0 to i8 | |
switch i8 %1, label %exit [ i8 9, label %begin ; tab | |
i8 10, label %begin ; \n | |
i8 13, label %begin ; \r | |
i8 32, label %begin ; space | |
] | |
exit: | |
%ret = tail call i32 @fseek(%struct.FILE* %fhandle, i64 -1, i32 1) ; go back one char (SEEK_CUR is 1) | |
ret void | |
} | |
define void @ignore_until_end_of_line(%struct.FILE * %fhandle) { | |
entry: | |
br label %begin | |
begin: | |
%0 = tail call i32 @fgetc(%struct.FILE* %fhandle) nounwind ; <i32> | |
%1 = trunc i32 %0 to i8 | |
switch i8 %1, label %begin [ | |
i8 10, label %exit ; \n | |
i8 -1, label %exit ; end of file | |
] | |
exit: | |
%ret = tail call i32 @fseek(%struct.FILE* %fhandle, i64 -1, i32 1) ; go back one char (SEEK_CUR is 1) | |
ret void | |
} | |
define i1 @is_alpha(i8 %input) { | |
; upper case | |
%1 = icmp uge i8 %input, 65 | |
%2 = icmp ule i8 %input, 90 | |
%3 = and i1 %1, %2 | |
;lower case | |
%4 = icmp uge i8 %input, 97 | |
%5 = icmp ule i8 %input, 122 | |
%6 = and i1 %4, %5 | |
%7 = or i1 %3, %6 | |
ret i1 %7 | |
} | |
define i1 @is_whitespace(i8 %input) { | |
switch i8 %input, label %ret_false [ i8 9, label %ret_true ; tab | |
i8 10, label %ret_true ; \n | |
i8 13, label %ret_true ; \r | |
i8 32, label %ret_true ] ; space | |
ret_true: | |
ret i1 true | |
ret_false: | |
ret i1 false | |
} | |
define i1 @is_digit(i8 %input) { | |
%1 = icmp uge i8 %input, 48 | |
%2 = icmp ule i8 %input, 57 | |
%3 = and i1 %1, %2 | |
ret i1 %3 | |
} | |
define %struct.tok* @get_next_token(%struct.FILE* %fhandle) { | |
entry: | |
;get the size of the structure | |
%size = getelementptr %struct.tok* null, i32 1 | |
%sz = ptrtoint %struct.tok* %size to i64 | |
%0 = tail call i8* @malloc(i64 %sz); allocate the resulting token | |
%tok.ptr = bitcast i8* %0 to %struct.tok * | |
;check for eof | |
%eofflag = tail call i32 @feof(%struct.FILE* %fhandle) | |
%isnoteof = icmp eq i32 %eofflag, 0 | |
br i1 %isnoteof, label %not_eof, label %eof | |
not_eof: | |
; read a character | |
%1 = call i8 @get_next_character(%struct.FILE * %fhandle, i1 true) | |
switch i8 %1, label %other [ i8 -1, label %eof | |
i8 40, label %open_paren | |
i8 41, label %close_paren | |
i8 34, label %double_quote | |
i8 39, label %single_quote | |
] | |
;;;;;; simple tokens ;;;;;; | |
eof: | |
%2 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 0 | |
store i4 -1, i4 * %2 | |
br label %exit | |
open_paren: | |
%3 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 0 | |
store i4 0, i4 * %3 | |
br label %exit | |
close_paren: | |
%4 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 0 | |
store i4 1, i4 * %4 | |
br label %exit | |
single_quote: | |
%5 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 0 | |
store i4 5, i4 * %5 | |
br label %exit | |
;;; complex tokens ;;; | |
double_quote: | |
%6 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 0 | |
store i4 3, i4 * %6 | |
; set the current length to 0 | |
%7 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 2 | |
store i32 0, i32 * %7 | |
br label %read_string | |
read_string: | |
;read another char | |
%ch = call i8 @get_next_character(%struct.FILE * %fhandle, i1 false) | |
%tok.value.length = getelementptr %struct.tok * %tok.ptr, i32 0, i32 2 | |
switch i8 %ch, label %add_to_string [ i8 -1, label %error | |
i8 34, label %exit | |
i8 92, label %string_escape | |
] | |
add_to_string: | |
%8 = load i32 * %tok.value.length ; get the current length | |
%9 = add i32 %8, 1 ; increment the length by one | |
; store the new character | |
%10 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 1, i32 %8 ; get the new position for the character | |
store i8 %ch, i8 * %10 | |
store i32 %9, i32 * %tok.value.length ; store the new length | |
br label %read_string | |
string_escape: | |
; store the \ | |
%11 = load i32 * %tok.value.length ; get the current length | |
%12 = add i32 %11, 1 ; increment the length by one | |
; store the new character | |
%13 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 1, i32 %11 | |
store i8 %ch, i8 * %13 | |
store i32 %12, i32 * %tok.value.length ; store the new length | |
; get a new character and store it right away, unless it's the end of file | |
%14 = call i8 @get_next_character(%struct.FILE* %fhandle, i1 false) | |
%15 = icmp eq i8 %14, -1 | |
br i1 %15, label %error, label %string_add_next | |
string_add_next: | |
%16 = add i32 %12, 1 | |
%17 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 1, i32 %12 | |
store i8 %14, i8 * %17 | |
store i32 %16, i32 * %tok.value.length | |
br label %read_string | |
other: | |
;check if it's a number | |
%18 = icmp uge i8 %1, 48 | |
%19 = icmp ule i8 %1, 57 | |
%20 = and i1 %18, %19 | |
br i1 %20, label %number, label %not_a_number | |
not_a_number: | |
;check if it's a valid first letter for an identifier | |
%21 = call i1 @is_alpha(i8 %1) | |
; valid initials | |
; <letter> | ! | $ | % | & | * | / | : | < | = | > | ? | ~ | _ | ^ | |
%22 = icmp eq i8 %1, 33 ; ! | |
%23 = icmp eq i8 %1, 36 ; $ | |
%24 = icmp eq i8 %1, 37 ; % | |
%25 = icmp eq i8 %1, 38 ; & | |
%26 = icmp eq i8 %1, 42 ; * | |
%27 = icmp eq i8 %1, 47 ; / | |
%28 = icmp eq i8 %1, 58 ; : | |
%29 = icmp eq i8 %1, 60 ; < | |
%30 = icmp eq i8 %1, 61 ; = | |
%31 = icmp eq i8 %1, 62 ; > | |
%32 = icmp eq i8 %1, 63 ; ? | |
%33 = icmp eq i8 %1, 126 ; ~ | |
%34 = icmp eq i8 %1, 95 ; _ | |
%35 = icmp eq i8 %1, 94 ; ^ | |
%36 = or i1 %21, %22 | |
%37 = or i1 %36, %23 | |
%38 = or i1 %37, %24 | |
%39 = or i1 %38, %25 | |
%40 = or i1 %39, %26 | |
%41 = or i1 %40, %27 | |
%42 = or i1 %41, %28 | |
%43 = or i1 %42, %29 | |
%44 = or i1 %43, %30 | |
%45 = or i1 %44, %31 | |
%46 = or i1 %45, %32 | |
%47 = or i1 %46, %33 | |
%48 = or i1 %47, %34 | |
%49 = or i1 %48, %35 | |
br i1 %49, label %read_identifier, label %error | |
read_identifier: | |
%50 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 0 | |
store i4 4, i4 * %50 | |
; set the current length to 1 | |
%51 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 2 | |
store i32 1, i32 * %51 | |
; add the first character | |
%id.array.pos = getelementptr %struct.tok * %tok.ptr, i32 0, i32 1, i32 0 | |
store i8 %1, i8 * %id.array.pos | |
; add next | |
br label %add_to_identifier | |
add_to_identifier: | |
%52 = call i8 @get_next_character(%struct.FILE* %fhandle, i1 false) | |
;check if it's a valid subsequent | |
%53 = call i1 @is_alpha(i8 %52) | |
%54 = call i1 @is_digit(i8 %52) | |
;special subsequents : + | - | . | @ | |
%55 = icmp eq i8 %52, 43 ; + | |
%56 = icmp eq i8 %52, 45 ; - | |
%57 = icmp eq i8 %52, 46 ; . | |
%58 = icmp eq i8 %52, 64 ; @ | |
%59 = or i1 %53, %54 | |
%60 = or i1 %59, %55 | |
%61 = or i1 %60, %56 | |
%62 = or i1 %61, %57 | |
%63 = or i1 %62, %58 | |
br i1 %63, label %valid_subsequent, label %end_of_identifier | |
valid_subsequent: | |
; add the character to the value | |
; get the current length | |
%64 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 2 | |
%65 = load i32 * %64 | |
%66 = add i32 %65, 1 | |
; store the new character | |
%67 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 1, i32 %65 | |
store i8 %52, i8 * %67 | |
; store the increased length | |
store i32 %66, i32 * %64 | |
br label %add_to_identifier | |
end_of_identifier: | |
call void @rewind_character(%struct.FILE* %fhandle, i8 %52) | |
br label %exit | |
number: | |
; set the type to integer (2) | |
%68 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 0 | |
store i4 2, i4 * %68 | |
; set the current length to 1 | |
%69 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 2 | |
store i32 1, i32 * %69 | |
; add the first number | |
%70 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 1, i32 0 | |
store i8 %1, i8 * %70 | |
br label %add_number | |
add_number: | |
%71 = call i8 @get_next_character(%struct.FILE* %fhandle, i1 false) | |
%72 = call i1 @is_digit(i8 %71) | |
br i1 %72, label %valid_digit, label %end_of_number | |
valid_digit: | |
; get the current length | |
%73 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 2 | |
%74 = load i32 * %73 | |
; increment the length | |
%75 = add i32 %74, 1 | |
; store the new character | |
%76 = getelementptr %struct.tok * %tok.ptr, i32 0, i32 1, i32 %74 | |
store i8 %71, i8 * %76 | |
; store the new length | |
store i32 %75, i32 * %73 | |
br label %add_number | |
end_of_number: | |
call void @rewind_character(%struct.FILE* %fhandle, i8 %71) | |
br label %exit | |
error: | |
br label %exit | |
exit: | |
ret %struct.tok* %tok.ptr | |
} | |
; an expression can be a list, an integer, a symbol or a string | |
; the first element is the type, the second the length (or the number in the case of the integer, and the third is a pointer, either to the list of sub elements or to the string itself) | |
%struct.expr = type { i4, i64, i8*} | |
define %struct.expr * @build_ast() { | |
ret %struct.expr * undef | |
} | |
; data will be stored as tagged integers | |
; 00 = integer, the highest 62 bits contain the number | |
; 01 = string/symbol, pointer to the string in memory | |
; 10 = cons-cell/vector | |
; 11 = function | |
define i32 @main(i32 %argc, i8** nocapture %argv) nounwind { | |
entry: | |
%0 = getelementptr inbounds i8** %argv, i64 1 | |
%1 = load i8** %0, align 8 | |
%file = tail call %struct.FILE* @fopen(i8* noalias %1, i8* noalias getelementptr inbounds ([3 x i8]* @.read_mode, i64 0, i64 0)) nounwind | |
%fileptr = ptrtoint %struct.FILE * %file to i32 | |
%notvalid = icmp eq i32 %fileptr, 0 | |
br i1 %notvalid, label %error, label %loop | |
loop: | |
%tok = call %struct.tok* @get_next_token(%struct.FILE* %file) | |
%type.ptr = getelementptr %struct.tok* %tok, i32 0, i32 0 | |
%type = load i4* %type.ptr | |
%2 = zext i4 %type to i32 | |
%3 = add i32 %2, 48 | |
tail call i32 @putchar(i32 %3); | |
%eof = icmp eq i4 %type, -1 | |
%ptr = bitcast %struct.tok* %tok to i8* | |
tail call void @free(i8* %ptr) | |
br i1 %eof, label %exit, label %loop | |
; br label %exit | |
exit: | |
tail call i8 @fclose(%struct.FILE * %file) | |
ret i32 0 | |
error: | |
ret i32 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment