@McMartin
Last active May 15, 2018 21:26
A CMake lexer written in CMake
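# lexer.cmake - tokenizes a CMake source file using nothing but CMake script commands.
# lexer(<filename> <namespace>) stores the result as <namespace>_tokens_count and
# <namespace>_<n>_{text,type,line,column} variables in the caller's scope.
# Run in script mode to dump the tokens of a file:
#
#   cmake -P lexer.cmake <filename>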
cmake_policy(VERSION ${CMAKE_VERSION})

function(lexer filename out_namespace)
  set(tokens_count 0)
  set(line 1)
  set(column 1)
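
  # Pull the next chunk (up to 512 characters) of the file content into the working buffer.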
  macro(fill_buffer_from_file_content)
    #[[ For development
    message("Filling.")
    #]]
    if(file_offset LESS file_content_length)
      string(SUBSTRING "${file_content}" ${file_offset} 512 new_buffer)
      math(EXPR file_offset "${file_offset} + 512")
      string(APPEND buffer "${new_buffer}")
    endif()
    #[[ For development
    string(LENGTH "${buffer}" buffer_length)
    if(buffer_length LESS 512)
      message("Almost at EOF.")
    endif()
    #]]
  endmacro()
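
  # Record the token as ${out_namespace}_<n>_{text,type,line,column} in the caller's
  # scope, advance the line/column counters past its text, and reset the state.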
  macro(emit_token type)
    #[[ For development
    if(NOT DEFINED text)
      message(FATAL_ERROR "(${line},${column}) Missing text variable.")
    endif()
    string(LENGTH "${text}" text_length)
    if(text_length EQUAL 0)
      message(FATAL_ERROR "No text?")
    endif()
    print_token("${type}" "${text}" "${line}" "${column}")
    #]]
    math(EXPR tokens_count "${tokens_count} + 1")
    set(${out_namespace}_${tokens_count}_text "${text}" PARENT_SCOPE)
    set(${out_namespace}_${tokens_count}_type ${type} PARENT_SCOPE)
    set(${out_namespace}_${tokens_count}_line ${line} PARENT_SCOPE)
    set(${out_namespace}_${tokens_count}_column ${column} PARENT_SCOPE)
    # Update the source position: count the newlines in the token and compute
    # the column after the last one.
    string(LENGTH "${text}" length)
    string(FIND "${text}" "\n" last_newline_index REVERSE)
    if(last_newline_index EQUAL -1)
      math(EXPR column "${column} + ${length}")
    else()
      math(EXPR column "${length} - ${last_newline_index}")
      set(newline_count 0)
      set(text_remainder "${text}")
      while(NOT last_newline_index EQUAL -1)
        math(EXPR newline_count "${newline_count} + 1")
        string(SUBSTRING "${text_remainder}" 0 ${last_newline_index} text_remainder)
        string(FIND "${text_remainder}" "\n" last_newline_index REVERSE)
      endwhile()
      math(EXPR line "${line} + ${newline_count}")
    endif()
    set(current_state 0)
    #[[ For development
    unset(text)
    #]]
  endmacro()
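
  # Start a new token from the last regex match and drop it from the buffer.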
  macro(set_and_advance)
    string(LENGTH "${CMAKE_MATCH_0}" match_length)
    set(text "${CMAKE_MATCH_0}")
    string(SUBSTRING "${buffer}" ${match_length} -1 buffer)
  endmacro()
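
  # Extend the current token with the last regex match and drop it from the buffer.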
  macro(append_and_advance)
    string(LENGTH "${CMAKE_MATCH_0}" match_length)
    string(APPEND text "${CMAKE_MATCH_0}")
    string(SUBSTRING "${buffer}" ${match_length} -1 buffer)
  endmacro()
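
  # Read the whole file once, then feed it to the state machine in 512-character chunks.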
  file(READ "${filename}" file_content)
  string(LENGTH "${file_content}" file_content_length)
  set(file_offset 0)
  fill_buffer_from_file_content()
  set(current_state 0) # State_Initial
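
  # Each iteration matches a regex against the head of the buffer, consumes the
  # match, and either switches state or emits a token. The loop exits when the
  # buffer is empty and nothing more can be matched.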
  while(1)
    string(LENGTH "${buffer}" buffer_length)
    if(buffer_length LESS 3)
      fill_buffer_from_file_content()
    endif()

    if(current_state EQUAL 0) # State_Initial
      if(buffer MATCHES "^\n")
        set(text "\n")
        string(SUBSTRING "${buffer}" 1 -1 buffer)
        emit_token(Token_Newline)
      elseif(buffer MATCHES "^#?\\[\\[")
        set_and_advance()
        set(closing_bracket "]]")
        set(current_state 5) # State_Bracket
      elseif(buffer MATCHES "^#?\\[=+")
        set_and_advance()
        set(current_state 6) # State_BracketBegin
      elseif(buffer MATCHES "^#[^\n]*")
        set_and_advance()
        set(current_state 4) # State_LineComment
      elseif(buffer MATCHES "^\\(")
        set(text "(")
        string(SUBSTRING "${buffer}" 1 -1 buffer)
        emit_token(Token_LeftParen)
      elseif(buffer MATCHES "^\\)")
        set(text ")")
        string(SUBSTRING "${buffer}" 1 -1 buffer)
        emit_token(Token_RightParen)
      elseif(buffer MATCHES "^[A-Za-z_][A-Za-z0-9_]*")
        if(CMAKE_MATCH_0 STREQUAL "PARENT_SCOPE")
          string(CONCAT text "PARENT_SCOPE")
          string(SUBSTRING "${buffer}" 12 -1 buffer)
        else()
          set_and_advance()
        endif()
        set(current_state 1) # State_Identifier
      elseif(buffer MATCHES "^([\\].|[^\n \t()#\"\\])+")
        set_and_advance()
        set(current_state 7) # State_Unquoted
      elseif(buffer MATCHES "^\"([\\].|[^\"\\])*")
        set(text "\"")
        string(SUBSTRING "${buffer}" 1 -1 buffer)
        set(current_state 3) # State_Quoted
      elseif(buffer MATCHES "^[ \t]+")
        set_and_advance()
        set(current_state 2) # State_Space
      elseif(buffer MATCHES "^.")
        set(text "${CMAKE_MATCH_0}")
        string(SUBSTRING "${buffer}" 1 -1 buffer)
        emit_token(Token_BadCharacter)
      else()
        break() # empty buffer: end of input
      endif()
    elseif(current_state EQUAL 1) # State_Identifier
      if(buffer MATCHES "^[A-Za-z0-9_]+")
        append_and_advance()
      elseif(buffer MATCHES "^([\\].|[^\n \t()#\"\\])+")
        # The identifier continues with characters only valid in an unquoted argument.
        append_and_advance()
        set(current_state 7) # State_Unquoted
      else()
        emit_token(Token_Identifier)
      endif()
    elseif(current_state EQUAL 2) # State_Space
      if(buffer MATCHES "^[ \t]+")
        append_and_advance()
      else()
        emit_token(Token_Space)
      endif()
    elseif(current_state EQUAL 3) # State_Quoted
      if(buffer MATCHES "^([\\].|[^\"\\])+")
        append_and_advance()
      elseif(buffer MATCHES "^\"")
        append_and_advance()
        emit_token(Token_QuotedArgument)
      else()
        emit_token(Token_BadQuoted)
      endif()
    elseif(current_state EQUAL 4) # State_LineComment
      if(buffer MATCHES "^[^\n]+")
        append_and_advance()
      else()
        emit_token(Token_LineComment)
      endif()
    elseif(current_state EQUAL 5) # State_Bracket
      string(LENGTH "${closing_bracket}" closing_bracket_length)
      if(buffer_length LESS closing_bracket_length)
        fill_buffer_from_file_content()
      endif()
      if(buffer MATCHES "^${closing_bracket}")
        append_and_advance()
        if(text MATCHES "^#")
          emit_token(Token_BracketComment)
        else()
          emit_token(Token_BracketArgument)
        endif()
      elseif(buffer MATCHES "^[^]]+")
        append_and_advance()
      elseif(buffer MATCHES "^]")
        # A ']' that does not start the closing bracket is ordinary content.
        append_and_advance()
      else()
        emit_token(Token_BadBracket) # unterminated bracket at end of input
      endif()
    elseif(current_state EQUAL 6) # State_BracketBegin
      if(buffer MATCHES "^=+")
        append_and_advance()
      elseif(buffer MATCHES "^\\[")
        append_and_advance()
        # The number of '=' between the opening brackets determines the closing bracket.
        string(REGEX REPLACE "[^=]" "" equal_signs "${text}")
        set(closing_bracket "]${equal_signs}]")
        set(current_state 5) # State_Bracket
      else()
        message(FATAL_ERROR "TODO")
      endif()
    elseif(current_state EQUAL 7) # State_Unquoted
      if(buffer MATCHES "^([\\].|[^\n \t()#\"\\])+")
        append_and_advance()
      else()
        emit_token(Token_UnquotedArgument)
      endif()
    #[[ For development
    else()
      message(FATAL_ERROR "No state?")
    #]]
    endif()
  endwhile()

  set(${out_namespace}_tokens_count ${tokens_count} PARENT_SCOPE)
endfunction()
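
# Print one token as "line,column: <type> '<text>'", zero-padding single-digit
# columns, padding the type name to a fixed width, and escaping newlines in the text.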
function(print_token type text line column)
  if(column LESS 10)
    set(column "0${column}")
  endif()
  string(REGEX REPLACE "\n" "\\\\n" text "${text}")
  string(LENGTH "${type}" type_len)
  set(padded_type "${type}")
  foreach(i RANGE ${type_len} 24)
    string(APPEND padded_type " ")
  endforeach()
  message(STATUS "${line},${column}: ${padded_type}'${text}'")
endfunction()
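
# Print every token stored under the given namespace by lexer().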
function(print_tokens namespace)
  foreach(i RANGE 1 ${${namespace}_tokens_count})
    print_token(
      "${${namespace}_${i}_type}"
      "${${namespace}_${i}_text}"
      "${${namespace}_${i}_line}"
      "${${namespace}_${i}_column}"
    )
  endforeach()
endfunction()
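
# Script-mode entry point: lex the file passed on the command line and print its tokens.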
function(main)
  if(NOT DEFINED CMAKE_ARGV3)
    message(FATAL_ERROR "usage: cmake -P lexer.cmake filename")
  endif()
  set(filename "${CMAKE_ARGV3}")
  lexer("${filename}" my_tokens)
  print_tokens(my_tokens)
endfunction()

if(CMAKE_SCRIPT_MODE_FILE STREQUAL CMAKE_CURRENT_LIST_FILE)
  main()
endif()