|
/* This file is Copyright (C) 2019 Jason Pepas. */ |
|
/* This file is released under the terms of the MIT License. */ |
|
/* See https://opensource.org/licenses/MIT */ |
|
|
|
#include "read.h" |
|
#include "errcode.h" |
|
#include <stdlib.h> |
|
#include <sys/errno.h> |
|
#include <assert.h> |
|
#include <stdbool.h> |
|
#include <ctype.h> |
|
#include <string.h> |
|
|
|
|
|
/* Is ch considered whitespace? |
|
Note: commas are considered whitespace. */ |
|
static bool is_ch_ws(char ch) { |
|
return isspace(ch) || ch == ','; |
|
} |
|
|
|
|
|
/* Is ch the start of a collection literal? */ |
|
static bool is_ch_opener(char ch) { |
|
return ch == '(' || ch == '[' || ch == '{'; |
|
} |
|
|
|
|
|
/* Is ch the end of a collection literal? */ |
|
static bool is_ch_closer(char ch) { |
|
return ch == ')' || ch == ']' || ch == '}'; |
|
} |
|
|
|
|
|
/* Does ch indicate the end of a token? */ |
|
static bool is_ch_delim(char ch) { |
|
return is_ch_ws(ch) || is_ch_opener(ch) || is_ch_closer(ch); |
|
} |
|
|
|
|
|
/* Does ch constitute a token on its own? */ |
|
static bool is_ch_single_character_token(char ch) { |
|
char* p = strchr("()[]{}'`@", (int)ch); |
|
return (p != NULL); |
|
} |
|
|
|
|
|
/* Places a char from fp into chp. |
|
Returns 0, EOF, or errno. */ |
|
static int file_get_ch(FILE* fp, char* chp) { |
|
int i = fgetc(fp); |
|
if (ferror(fp)) { |
|
return errno; |
|
} else if (i == EOF) { |
|
return EOF; |
|
} else { |
|
*chp = (char)i; |
|
return 0; |
|
} |
|
} |
|
|
|
|
|
/* Pushes ch back into fp. |
|
Returns 0 or EOF. */ |
|
static int file_unget_ch(FILE* fp, char ch) { |
|
int err = ungetc((int)ch, fp); |
|
if (err == EOF) { |
|
return err; |
|
} else { |
|
return 0; |
|
} |
|
} |
|
|
|
|
|
/* Peeks a char from fp into chp without advancing the stream. |
|
Returns 0, EOF, or errno. */ |
|
static int file_peek_ch(FILE* fp, char* chp) { |
|
int err = file_get_ch(fp, chp); |
|
if (err) { |
|
return err; |
|
} |
|
|
|
err = file_unget_ch(fp, *chp); |
|
if (err) { |
|
return err; |
|
} |
|
|
|
return 0; |
|
} |
|
|
|
|
|
/* Advance fp by one char (which is discarded). |
|
Returns 0, EOF, or errno. */ |
|
static int file_consume_ch(FILE* fp) { |
|
int i = fgetc(fp); |
|
if (ferror(fp)) { |
|
return errno; |
|
} else if (i == EOF) { |
|
return EOF; |
|
} else { |
|
return 0; |
|
} |
|
} |
|
|
|
|
|
/* Advance fp past any leading whitespace. |
|
Note: commas are considered whitespace. |
|
Returns 0, EOF, or errno. */ |
|
static int file_discard_ws(FILE* fp, int* countp) { |
|
int err; |
|
char ch; |
|
int count = 0; |
|
|
|
while (true) { |
|
err = file_get_ch(fp, &ch); |
|
if (err) { |
|
return err; |
|
} else if (is_ch_ws(ch)) { |
|
count++; |
|
continue; |
|
} else { |
|
err = file_unget_ch(fp, ch); |
|
if (err) { |
|
return err; |
|
} |
|
if (countp != NULL) { |
|
*countp = count; |
|
} |
|
break; |
|
} |
|
} |
|
|
|
return 0; |
|
} |
|
|
|
|
|
/* Advance fp far enough to read one token (which is a string literal). |
|
Points buffpp to a malloc'ed buffer containing the string. |
|
Returns 0, EOF, errno, or an error code. */ |
|
static int file_get_token_str(FILE* fp, char** buffpp) { |
|
int err; |
|
char ch; |
|
|
|
/* allocate the initial buffer. */ |
|
size_t buffsize = 100; |
|
size_t bufflen = buffsize - 1; |
|
char* buffp = malloc(buffsize); |
|
if (buffp == NULL) { |
|
return errno; |
|
} |
|
char* cursor = buffp; |
|
|
|
/* the first char must be the opening quote. */ |
|
err = file_get_ch(fp, &ch); |
|
if (err) { |
|
free(buffp); |
|
return err; |
|
} else if (ch != '"') { |
|
free(buffp); |
|
return E_file_get_token_str_invalid_string_literal; |
|
} else { |
|
*cursor = ch; |
|
cursor++; |
|
} |
|
|
|
while (true) { |
|
size_t len = cursor - buffp; |
|
|
|
/* time to grow the buffer. */ |
|
if (len == bufflen) { |
|
buffsize *= 2; |
|
bufflen = buffsize - 1; |
|
char* newbuffp = realloc(buffp, buffsize); |
|
if (newbuffp == NULL) { |
|
err = errno; |
|
free(buffp); |
|
return err; |
|
} else { |
|
buffp = newbuffp; |
|
} |
|
} |
|
|
|
err = file_get_ch(fp, &ch); |
|
if (err) { |
|
free(buffp); |
|
return err; |
|
|
|
/* this is the end of the string. */ |
|
} else if (ch == '"') { |
|
*cursor = ch; |
|
cursor++; |
|
*cursor = '\0'; |
|
// shrink buffp to fit the size of the string. |
|
size_t finalbuffsize = buffp - cursor + 1; |
|
if (finalbuffsize < buffsize) { |
|
char* finalbuffp = realloc(buffp, finalbuffsize); |
|
if (finalbuffp == NULL) { |
|
err = errno; |
|
free(buffp); |
|
return err; |
|
} else { |
|
buffp = finalbuffp; |
|
} |
|
} |
|
break; |
|
|
|
/* this is an escape sequence. */ |
|
} else if (ch == '\\') { |
|
assert(false); // TODO |
|
|
|
/* this is a regular char. */ |
|
} else { |
|
*cursor = ch; |
|
cursor++; |
|
} |
|
} |
|
|
|
*buffpp = buffp; |
|
return 0; |
|
} |
|
|
|
|
|
/* Advances fp past the current comment. |
|
Returns 0, EOF, or errno. */ |
|
static int file_discard_comment(FILE* fp) { |
|
int err; |
|
char ch; |
|
|
|
/* the first char must be ';'. */ |
|
err = file_get_ch(fp, &ch); |
|
if (err) { |
|
return err; |
|
} else { |
|
assert(ch == ';'); |
|
} |
|
|
|
/* discard the rest of the current line. */ |
|
while (true) { |
|
err = file_get_ch(fp, &ch); |
|
if (err) { |
|
return err; |
|
} else if (ch == '\n') { |
|
break; |
|
} else { |
|
continue; |
|
} |
|
} |
|
return 0; |
|
} |
|
|
|
|
|
/* Advance fp far enough to read one token of input. |
|
Writes the token contents to *buffpp. |
|
In the case of a string literal, buffpp is pointed at a new malloc'ed buffer. |
|
Returns 0, EOF, errno, or an error code. */ |
|
static int file_get_token(FILE* fp, char** buffpp, size_t buffsize) { |
|
int err; |
|
char ch; |
|
size_t bufflen = buffsize - 1; |
|
char* cursor = *buffpp; |
|
|
|
/* discard any leading whitespace. */ |
|
err = file_discard_ws(fp, NULL); |
|
if (err) { |
|
return err; |
|
} |
|
|
|
/* first char. */ |
|
err = file_get_ch(fp, &ch); |
|
if (err) { |
|
return err; |
|
|
|
/* this is a comment. */ |
|
} else if (ch == ';') { |
|
err = file_discard_comment(fp); |
|
if (err) { |
|
return err; |
|
} |
|
return file_get_token(fp, buffpp, buffsize); |
|
|
|
/* this is a string literal. */ |
|
} else if (ch == '"') { |
|
err = file_unget_ch(fp, ch); |
|
if (err) { |
|
return err; |
|
} |
|
return file_get_token_str(fp, buffpp); |
|
|
|
} else { |
|
*cursor = ch; |
|
cursor++; |
|
} |
|
|
|
/* this is a single-character token. */ |
|
if (is_ch_single_character_token(ch)) { |
|
*cursor = '\0'; |
|
|
|
/* this is a multi-character token. */ |
|
} else { |
|
/* the rest of the chars. */ |
|
while (true) { |
|
size_t len = cursor - *buffpp; |
|
|
|
/* we have run out of room. */ |
|
if (len == bufflen) { |
|
return E_file_get_token_buff_overflow; |
|
} |
|
|
|
err = file_get_ch(fp, &ch); |
|
if (err) { |
|
return err; |
|
|
|
/* we've reached the end of this token. */ |
|
} else if (is_ch_delim(ch)) { |
|
err = file_unget_ch(fp, ch); |
|
if (err) { |
|
return err; |
|
} else { |
|
*cursor = '\0'; |
|
break; |
|
} |
|
|
|
} else { |
|
*cursor = ch; |
|
cursor++; |
|
} |
|
} |
|
} |
|
|
|
#ifdef TRACE |
|
printf("TRACE: token: '%s'\n", *buffpp); |
|
#endif |
|
|
|
return 0; |
|
} |
|
|
|
|
|
/* Tries to parse a long from buffp into lp. |
|
Returns true or false. */ |
|
static bool try_parse_long(const char* buffp, long* lp) { |
|
char* endptr; |
|
long l = strtol(buffp, &endptr, 10); |
|
if (errno != 0 || endptr == buffp || *endptr != '\0') { |
|
return false; |
|
} else { |
|
*lp = l; |
|
return true; |
|
} |
|
} |
|
|
|
|
|
/* Trie to parse a double from buffp intp dp. |
|
Returns true or false. */ |
|
static bool try_parse_double(const char* buffp, double* dp) { |
|
char* endptr; |
|
double d = strtod(buffp, &endptr); |
|
if (errno != 0 || endptr == buffp || *endptr != '\0') { |
|
return false; |
|
} else { |
|
*dp = d; |
|
return true; |
|
} |
|
} |
|
|
|
|
|
static int read_list(FILE* fp, Form** formpp) { |
|
/* note: the leading '(' has already been consumed. */ |
|
|
|
int err; |
|
int i = 0; |
|
int ws_count; |
|
char ch1; |
|
List* headp = g_emptylist; |
|
List* tailp = g_emptylist; |
|
|
|
while (true) { |
|
err = file_discard_ws(fp, &ws_count); |
|
if (err) { |
|
free_form((Form*)headp); |
|
return err; |
|
} |
|
|
|
err = file_peek_ch(fp, &ch1); |
|
if (err) { |
|
free_form((Form*)headp); |
|
/* reaching EOF before ')' is an error. */ |
|
if (err == EOF) { |
|
return E_read_list_eof; |
|
} else { |
|
return err; |
|
} |
|
|
|
/* we've reached the end of the list. */ |
|
} else if (ch1 == ')') { |
|
err = file_consume_ch(fp); |
|
if (err) { |
|
free_form((Form*)headp); |
|
return err; |
|
} |
|
*formpp = (Form*)headp; |
|
return 0; |
|
|
|
} else { |
|
|
|
/* no space between atoms is an error. */ |
|
if (i > 0 && ws_count == 0 && is_ch_delim(ch1) == false) { |
|
free_form((Form*)headp); |
|
return E_read_list_missing_ws_delimiter; |
|
} |
|
|
|
/* read the next form in the list. */ |
|
Form* formp; |
|
err = read(fp, &formp); |
|
if (err) { |
|
free_form((Form*)headp); |
|
return err; |
|
|
|
/* append the form onto the list. */ |
|
} else { |
|
List* newp; |
|
int err = new_list(&newp, formp); |
|
if (err) { |
|
free_form((Form*)headp); |
|
return err; |
|
} |
|
|
|
if (headp == g_emptylist) { |
|
headp = newp; |
|
tailp = newp; |
|
} else { |
|
tailp->nextp = newp; |
|
tailp = newp; |
|
} |
|
} |
|
} |
|
|
|
i++; |
|
} |
|
} |
|
|
|
|
|
static int read_vector(FILE* fp, Form** formpp) { |
|
assert(false); // TODO |
|
} |
|
|
|
|
|
static int read_map(FILE* fp, Form** formpp) { |
|
assert(false); // TODO |
|
} |
|
|
|
|
|
/* Read one Lisp form from fp intp formpp. |
|
Returns 0 or EOF or errno or error code. |
|
FIXME this can leak on error. */ |
|
int read(FILE* fp, Form** formpp) { |
|
int err; |
|
int buffsize = 100; |
|
char buff[buffsize]; |
|
char* buffp = buff; |
|
char ch1; |
|
|
|
/* read a token. */ |
|
err = file_get_token(fp, &buffp, buffsize); |
|
if (err) { |
|
return err; |
|
} |
|
ch1 = *buffp; |
|
|
|
/* we've reached the end of input. */ |
|
if (ch1 == '\0') { |
|
return EOF; |
|
|
|
/* this is a collection. */ |
|
} else if (ch1 == '(') { |
|
return read_list(fp, formpp); |
|
} else if (ch1 == '[') { |
|
return read_vector(fp, formpp); |
|
} else if (ch1 == '{') { |
|
return read_map(fp, formpp); |
|
|
|
/* nil literal. */ |
|
} else if (ch1 == 'n' && strcmp(buffp, "nil") == 0) { |
|
*formpp = (Form*)g_nil; |
|
return 0; |
|
|
|
/* boolean literals. */ |
|
} else if (ch1 == 't' && strcmp(buffp, "true") == 0) { |
|
*formpp = (Form*)g_true; |
|
return 0; |
|
} else if (ch1 == 'f' && strcmp(buffp, "false") == 0) { |
|
*formpp = (Form*)g_false; |
|
return 0; |
|
|
|
/* character literal. */ |
|
} else if (ch1 == '\\') { |
|
// FIXME implement char parsing. |
|
assert(false); |
|
|
|
/* string literal. */ |
|
} else if (ch1 == '"') { |
|
CString* csp; |
|
err = new_cstring(&csp, buffp); |
|
if (err) { |
|
return err; |
|
} else { |
|
*formpp = (Form*)csp; |
|
return 0; |
|
} |
|
|
|
/* a keyword. */ |
|
} else if (ch1 == ':') { |
|
Keyword* kwp; |
|
// TODO implement interning. |
|
err = new_keyword(&kwp, buffp); |
|
if (err) { |
|
return err; |
|
} else { |
|
*formpp = (Form*)kwp; |
|
return 0; |
|
} |
|
|
|
/* a collection closer at this point is a syntax error. */ |
|
} else if (ch1 == ')' || ch1 == ']' || ch1 == '}') { |
|
return E_read_unexpected_closer; |
|
|
|
/* an anonymous function argument. */ |
|
} else if (ch1 == '%') { |
|
// TODO |
|
assert(false); |
|
|
|
/* a deref. */ |
|
} else if (ch1 == '@') { |
|
// TODO |
|
assert(false); |
|
|
|
/* metadata. */ |
|
} else if (ch1 == '^') { |
|
// TODO |
|
assert(false); |
|
|
|
/* quote. */ |
|
} else if (ch1 == '\'') { |
|
// TODO |
|
assert(false); |
|
|
|
/* syntax quote. */ |
|
} else if (ch1 == '`') { |
|
// TODO |
|
assert(false); |
|
|
|
/* syntax quote. */ |
|
} else if (ch1 == '~') { |
|
// TODO |
|
assert(false); |
|
|
|
/* the dispatch character. */ |
|
} else if (ch1 == '#') { |
|
char ch2 = *(buffp+1); |
|
|
|
/* an EOF at this point is an error. */ |
|
if (ch2 == '\0') { |
|
return E_read_incomplete_dispatch; |
|
|
|
/* the "discard" form. */ |
|
} else if (ch2 == '_') { |
|
// TODO |
|
assert(false); |
|
|
|
/* a set. */ |
|
} else if (ch2 == '{') { |
|
// TODO |
|
assert(false); |
|
|
|
/* a regex. */ |
|
} else if (ch2 == '"') { |
|
// TODO |
|
assert(false); |
|
|
|
/* an anonymous function. */ |
|
} else if (ch2 == '(') { |
|
// TODO |
|
assert(false); |
|
|
|
/* a var quote. */ |
|
} else if (ch2 == '\'') { |
|
// TODO |
|
assert(false); |
|
|
|
/* a symbolic value. */ |
|
} else if (ch2 == '#') { |
|
// TODO |
|
assert(false); |
|
|
|
/* tagged literals. */ |
|
} else if (ch2 == 'i' && strcmp(buffp, "#inst") == 0) { |
|
// TODO |
|
assert(false); |
|
} else if (ch2 == 'u' && strcmp(buffp, "#uuid") == 0) { |
|
// TODO |
|
assert(false); |
|
} else if (ch2 == 'j' && strcmp(buffp, "#js") == 0) { |
|
// TODO |
|
assert(false); |
|
|
|
} else { |
|
// TODO |
|
assert(false); |
|
} |
|
|
|
/* forms which can't be determined from ch1 alone. */ |
|
} else { |
|
bool success; |
|
|
|
/* an integer literal. */ |
|
long l; |
|
success = try_parse_long(buffp, &l); |
|
if (success) { |
|
CLong* clp; |
|
err = new_clong(&clp, l); |
|
if (err) { |
|
return err; |
|
} else { |
|
*formpp = (Form*)clp; |
|
return 0; |
|
} |
|
} |
|
|
|
/* a floating-point literal. */ |
|
double d; |
|
success = try_parse_double(buffp, &d); |
|
if (success) { |
|
CDouble* cdp; |
|
err = new_cdouble(&cdp, d); |
|
if (err) { |
|
return err; |
|
} else { |
|
*formpp = (Form*)cdp; |
|
return 0; |
|
} |
|
} |
|
|
|
/* assume anything else to be a symbol. */ |
|
Symbol* symp; |
|
err = new_symbol(&symp, buffp); |
|
if (err) { |
|
return err; |
|
} else { |
|
*formpp = (Form*)symp; |
|
return 0; |
|
} |
|
} |
|
} |