Last active
May 15, 2018 21:26
-
-
Save McMartin/0c2283f3dc4c4e22b55b6296f4b099b4 to your computer and use it in GitHub Desktop.
A CMake lexer written in CMake
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Opt in to the policy behaviour of the CMake version that is running this
# script, so all policies known to that version are set to NEW.
cmake_policy(VERSION "${CMAKE_VERSION}")
# lexer(<filename> <out_namespace>)
#
# Reads the file <filename> and splits its content into tokens.
#
# Results are written to the caller's scope:
#   <out_namespace>_tokens_count   number of tokens emitted (0 for an empty file)
#   <out_namespace>_<i>_text       exact text of token i (1-based)
#   <out_namespace>_<i>_type       token type, one of: Token_Newline,
#                                  Token_LeftParen, Token_RightParen,
#                                  Token_Identifier, Token_Space,
#                                  Token_QuotedArgument, Token_BadQuoted,
#                                  Token_LineComment, Token_BracketComment,
#                                  Token_BracketArgument, Token_BadBracket,
#                                  Token_UnquotedArgument, Token_BadCharacter
#   <out_namespace>_<i>_line       1-based line on which token i starts
#   <out_namespace>_<i>_column     1-based column at which token i starts
#                                  (counted in string(LENGTH) units)
#
# The file content is consumed through a buffer that is refilled in chunks of
# 512, so a token may be assembled over several loop iterations. The state
# machine uses these states:
#   0 State_Initial       1 State_Identifier   2 State_Space
#   3 State_Quoted        4 State_LineComment  5 State_Bracket
#   6 State_BracketBegin  7 State_Unquoted
function(lexer filename out_namespace)
  set(tokens_count 0)
  set(line 1)
  set(column 1)

  # Start from a defined, empty buffer. Without this an empty input file left
  # `buffer` undefined (so `if(buffer MATCHES ...)` tested the literal word
  # "buffer"), and a `buffer` variable of the caller would be inherited.
  set(buffer "")

  # One or more characters of an unquoted argument: either a backslash escape
  # (backslash followed by any character) or any character that is not a
  # newline, space, tab, parenthesis, '#', '"' or backslash.
  set(unquoted_regex "^([\\].|[^\n \t()#\"\\])+")

  # Appends the next chunk of the file content to `buffer`, if any is left.
  macro(fill_buffer_from_file_content)
    if(file_offset LESS file_content_length)
      string(SUBSTRING "${file_content}" ${file_offset} 512 new_buffer)
      math(EXPR file_offset "${file_offset} + 512")
      string(APPEND buffer "${new_buffer}")
    endif()
  endmacro()

  # Publishes the token accumulated in `text` with the given type, advances
  # `line`/`column` past it and returns to State_Initial.
  macro(emit_token type)
    math(EXPR tokens_count "${tokens_count} + 1")
    # `text` is quoted so that empty list elements (";;" or a trailing ";")
    # are not dropped from the stored token text.
    set(${out_namespace}_${tokens_count}_text "${text}" PARENT_SCOPE)
    set(${out_namespace}_${tokens_count}_type ${type} PARENT_SCOPE)
    set(${out_namespace}_${tokens_count}_line ${line} PARENT_SCOPE)
    set(${out_namespace}_${tokens_count}_column ${column} PARENT_SCOPE)

    string(LENGTH "${text}" length)
    string(FIND "${text}" "\n" last_newline_index REVERSE)
    if(last_newline_index EQUAL -1)
      math(EXPR column "${column} + ${length}")
    else()
      # The next token starts right after whatever follows the last newline.
      math(EXPR column "${length} - ${last_newline_index}")
      # Number of newlines = length lost when all newlines are removed.
      string(REPLACE "\n" "" text_without_newlines "${text}")
      string(LENGTH "${text_without_newlines}" length_without_newlines)
      math(EXPR line "${line} + ${length} - ${length_without_newlines}")
    endif()

    set(current_state 0)
  endmacro()

  # Starts a new token from the last regex match and removes it from `buffer`.
  macro(set_and_advance)
    string(LENGTH "${CMAKE_MATCH_0}" match_length)
    set(text "${CMAKE_MATCH_0}")
    string(SUBSTRING "${buffer}" ${match_length} -1 buffer)
  endmacro()

  # Appends the last regex match to the current token and removes it from
  # `buffer`.
  macro(append_and_advance)
    string(LENGTH "${CMAKE_MATCH_0}" match_length)
    string(APPEND text "${CMAKE_MATCH_0}")
    string(SUBSTRING "${buffer}" ${match_length} -1 buffer)
  endmacro()

  file(READ "${filename}" file_content)
  string(LENGTH "${file_content}" file_content_length)
  set(file_offset 0)
  fill_buffer_from_file_content()

  set(current_state 0) # State_Initial

  while(1)
    # Keep at least 3 characters available so that the longest fixed opening
    # sequence tested below ("#[[") can be recognised in one piece.
    string(LENGTH "${buffer}" buffer_length)
    if(buffer_length LESS 3)
      fill_buffer_from_file_content()
    endif()

    if(current_state EQUAL 0) # State_Initial
      if(buffer MATCHES "^\n")
        set(text "\n")
        string(SUBSTRING "${buffer}" 1 -1 buffer)
        emit_token(Token_Newline)
      elseif(buffer MATCHES "^#?\\[\\[")
        set_and_advance()
        set(closing_bracket "]]")
        set(current_state 5) # State_Bracket
      elseif(buffer MATCHES "^#?\\[=+")
        set_and_advance()
        set(current_state 6) # State_BracketBegin
      elseif(buffer MATCHES "^#[^\n]*")
        set_and_advance()
        set(current_state 4) # State_LineComment
      elseif(buffer MATCHES "^\\(")
        set(text "(")
        string(SUBSTRING "${buffer}" 1 -1 buffer)
        emit_token(Token_LeftParen)
      elseif(buffer MATCHES "^\\)")
        set(text ")")
        string(SUBSTRING "${buffer}" 1 -1 buffer)
        emit_token(Token_RightParen)
      elseif(buffer MATCHES "^[A-Za-z_][A-Za-z0-9_]*")
        if(CMAKE_MATCH_0 STREQUAL "PARENT_SCOPE")
          # NOTE(review): presumably string(CONCAT) is used here because
          # set(text "PARENT_SCOPE") would have its last argument taken as the
          # PARENT_SCOPE keyword -- confirm before simplifying.
          string(CONCAT text "PARENT_SCOPE")
          string(SUBSTRING "${buffer}" 12 -1 buffer)
        else()
          set_and_advance()
        endif()
        set(current_state 1) # State_Identifier
      elseif(buffer MATCHES "${unquoted_regex}")
        set_and_advance()
        set(current_state 7) # State_Unquoted
      elseif(buffer MATCHES "^\"([\\].|[^\"\\])*")
        # Only the opening quote is consumed here; State_Quoted takes the rest.
        set(text "\"")
        string(SUBSTRING "${buffer}" 1 -1 buffer)
        set(current_state 3) # State_Quoted
      elseif(buffer MATCHES "^[ \t]+")
        set_and_advance()
        set(current_state 2) # State_Space
      elseif(buffer MATCHES "^.")
        set(text "${CMAKE_MATCH_0}")
        string(SUBSTRING "${buffer}" 1 -1 buffer)
        emit_token(Token_BadCharacter)
      else()
        # Nothing left to match: the whole input has been consumed.
        break()
      endif()

    elseif(current_state EQUAL 1) # State_Identifier
      if(buffer MATCHES "^[A-Za-z0-9_]+")
        append_and_advance()
      elseif(buffer MATCHES "${unquoted_regex}")
        # Not a pure identifier after all: continue as an unquoted argument.
        append_and_advance()
        set(current_state 7) # State_Unquoted
      else()
        emit_token(Token_Identifier)
      endif()

    elseif(current_state EQUAL 2) # State_Space
      if(buffer MATCHES "^[ \t]+")
        append_and_advance()
      else()
        emit_token(Token_Space)
      endif()

    elseif(current_state EQUAL 3) # State_Quoted
      if(buffer MATCHES "^([\\].|[^\"\\])+")
        append_and_advance()
      elseif(buffer MATCHES "^\"")
        append_and_advance()
        emit_token(Token_QuotedArgument)
      else()
        emit_token(Token_BadQuoted)
      endif()

    elseif(current_state EQUAL 4) # State_LineComment
      if(buffer MATCHES "^[^\n]+")
        append_and_advance()
      else()
        emit_token(Token_LineComment)
      endif()

    elseif(current_state EQUAL 5) # State_Bracket
      # Make sure a complete closing bracket can be seen in one piece.
      string(LENGTH "${closing_bracket}" closing_bracket_length)
      if(buffer_length LESS closing_bracket_length)
        fill_buffer_from_file_content()
      endif()
      if(buffer MATCHES "^${closing_bracket}")
        append_and_advance()
        if(text MATCHES "^#")
          emit_token(Token_BracketComment)
        else()
          emit_token(Token_BracketArgument)
        endif()
      elseif(buffer MATCHES "^[^]]+")
        append_and_advance()
      elseif(buffer MATCHES "^]")
        # A `]` that does not begin the closing bracket is ordinary content.
        append_and_advance()
      else()
        # Input ended before the closing bracket was found.
        emit_token(Token_BadBracket)
      endif()

    elseif(current_state EQUAL 6) # State_BracketBegin
      if(buffer MATCHES "^=+")
        append_and_advance()
      elseif(buffer MATCHES "^\\[")
        append_and_advance()
        # The closing bracket carries as many '=' as the opening one.
        string(REGEX REPLACE "[^=]" "" equal_signs "${text}")
        set(closing_bracket "]${equal_signs}]")
        set(current_state 5) # State_Bracket
      elseif(text MATCHES "^#")
        # "#[=" not followed by "[" is not a bracket comment: it is a line
        # comment.
        set(current_state 4) # State_LineComment
      else()
        # "[=" not followed by "[" is not a bracket argument: continue as an
        # unquoted argument.
        set(current_state 7) # State_Unquoted
      endif()

    elseif(current_state EQUAL 7) # State_Unquoted
      if(buffer MATCHES "${unquoted_regex}")
        append_and_advance()
      else()
        emit_token(Token_UnquotedArgument)
      endif()
    endif()
  endwhile()

  set(${out_namespace}_tokens_count ${tokens_count} PARENT_SCOPE)
endfunction()
# print_token(<type> <text> <line> <column>)
#
# Prints one token as a STATUS message of the form
#   <line>,<column>: <type padded to 25 characters>'<text>'
# Columns below 10 are prefixed with "0" and newlines in <text> are shown as
# the two characters "\n".
function(print_token type text line column)
  if(column LESS 10)
    set(column "0${column}")
  endif()

  # Show newlines as a literal backslash followed by 'n'.
  string(REPLACE "\n" "\\n" text "${text}")

  string(LENGTH "${type}" type_len)
  set(padded_type "${type}")
  if(type_len LESS 25)
    # Pad the type name with spaces up to a width of 25 characters.
    foreach(i RANGE ${type_len} 24)
      string(APPEND padded_type " ")
    endforeach()
  else()
    # foreach(RANGE) requires stop >= start, so longer names skip the padding
    # loop and get a single separating space instead.
    string(APPEND padded_type " ")
  endif()

  message(STATUS "${line},${column}: ${padded_type}'${text}'")
endfunction()
# print_tokens(<namespace>)
#
# Prints every token stored under <namespace>, i.e. the variables
# <namespace>_<i>_type/_text/_line/_column for i = 1..<namespace>_tokens_count,
# by calling print_token() for each of them. Prints nothing when the count is
# missing, not a number, or zero.
function(print_tokens namespace)
  set(count "${${namespace}_tokens_count}")
  # foreach(RANGE 1 <count>) is only valid for count >= 1; an empty count
  # would turn it into "RANGE 1" and iterate over 0..1.
  if(NOT count MATCHES "^[0-9]+$" OR count LESS 1)
    return()
  endif()

  foreach(i RANGE 1 ${count})
    print_token(
      "${${namespace}_${i}_type}"
      "${${namespace}_${i}_text}"
      "${${namespace}_${i}_line}"
      "${${namespace}_${i}_column}"
    )
  endforeach()
endfunction()
# Script entry point: lexes the file named by CMAKE_ARGV3 (the argument after
# the script name in `cmake -P lexer.cmake filename`) and prints its tokens.
# Stops with a usage message when that argument is missing.
function(main)
  if(DEFINED CMAKE_ARGV3)
    lexer("${CMAKE_ARGV3}" my_tokens)
    print_tokens(my_tokens)
  else()
    message(FATAL_ERROR "usage: cmake -P lexer.cmake filename")
  endif()
endfunction()
# Run main() only when this file is the script passed to `cmake -P`, not when
# it is included from another file.
if("${CMAKE_SCRIPT_MODE_FILE}" STREQUAL "${CMAKE_CURRENT_LIST_FILE}")
  main()
endif()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment