|
/* This file is Copyright (C) 2019 Jason Pepas. */ |
|
/* This file is released under the terms of the MIT License. */ |
|
/* See https://opensource.org/licenses/MIT */ |
|
|
|
#include "reader.h" |
|
#include "errors.h" |
|
#include <stdlib.h> |
|
#include <sys/errno.h> |
|
#include <assert.h> |
|
#include <stdbool.h> |
|
#include <ctype.h> |
|
#include <string.h> |
|
#include <stdio.h> |
|
|
|
|
|
/* Creates a new FBuff. |
|
Returns 0 or errno. */ |
|
int new_fbuff(FBuff** fbpp, FILE* fp) { |
|
FBuff* fbp = malloc(sizeof(FBuff)); |
|
if (fbp == NULL) { |
|
int err = errno; |
|
errno = 0; |
|
return err; |
|
} |
|
|
|
fbp->fp = fp; |
|
fbp->buffp = NULL; |
|
fbp->nextp = NULL; |
|
fbp->size = 0; |
|
fbp->len = 0; |
|
|
|
*fbpp = fbp; |
|
return 0; |
|
} |
|
|
|
|
|
/* Frees fbp. */ |
|
void free_fbuff(FBuff* fbp) { |
|
free(fbp->buffp); |
|
free(fbp); |
|
} |
|
|
|
|
|
/* Reads the next line into fbp. |
|
Returns 0, EOF, or errno. */ |
|
static int fbuff_getline(FBuff* fbp) { |
|
ssize_t result = getline(&(fbp->buffp), &(fbp->size), fbp->fp); |
|
if (result == -1) { |
|
if (feof(fbp->fp)) { |
|
return EOF; |
|
} else { |
|
result = errno; |
|
errno = 0; |
|
return result; |
|
} |
|
} else { |
|
fbp->len = result; |
|
fbp->nextp = fbp->buffp; |
|
return 0; |
|
} |
|
} |
|
|
|
|
|
/* Is fbp at the end of the current line? */ |
|
bool is_fbuff_eol(FBuff* fbp) { |
|
return fbp->len == 0 || fbp->nextp == fbp->buffp + fbp->len; |
|
} |
|
|
|
|
|
/* Reads and consumes the next character into chp from fbp. |
|
Returns 0, EOF, or errno. */ |
|
static int fbuff_getch(FBuff* fbp, char* chp) { |
|
if (is_fbuff_eol(fbp)) { |
|
int err = fbuff_getline(fbp); |
|
if (err) { |
|
return err; |
|
} |
|
} |
|
char ch = *(fbp->nextp); |
|
(fbp->nextp)++; |
|
*chp = ch; |
|
return 0; |
|
} |
|
|
|
|
|
/* Pushes ch back into fbp. |
|
Asserts if used incorrectly. */ |
|
static void fbuff_ungetch(FBuff* fbp, char ch) { |
|
assert(fbp->nextp > fbp->buffp); |
|
fbp->nextp--; |
|
*(fbp->nextp) = ch; |
|
} |
|
|
|
|
|
/* Is ch considered whitespace? |
|
Note: commas are considered whitespace. */ |
|
static bool is_ch_ws(char ch) { |
|
return isspace(ch) || ch == ','; |
|
} |
|
|
|
|
|
/* Does ch indicate the end of a token? */ |
|
static bool is_ch_delim(char ch) { |
|
return is_ch_ws(ch) || ch == '(' || ch == ')' || ch == ';'; |
|
} |
|
|
|
|
|
/* Does ch constitute a token on its own? */ |
|
static bool is_ch_single_character_token(char ch) { |
|
char* p = strchr("()", (int)ch); |
|
return (p != NULL); |
|
} |
|
|
|
|
|
/* Advances fbp past any leading whitespace. |
|
Note: commas are considered whitespace. |
|
Returns 0, EOF, or errno. */ |
|
static int fbuff_discard_ws(FBuff* fbp, int* countp) { |
|
int err; |
|
char ch; |
|
int count = 0; |
|
|
|
while (true) { |
|
err = fbuff_getch(fbp, &ch); |
|
if (err) { |
|
return err; |
|
} else if (is_ch_ws(ch)) { |
|
count++; |
|
continue; |
|
} else { |
|
fbuff_ungetch(fbp, ch); |
|
if (countp != NULL) { |
|
*countp = count; |
|
} |
|
break; |
|
} |
|
} |
|
return 0; |
|
} |
|
|
|
|
|
/* Advances fbp past any whitespace in the current line. */ |
|
void fbuff_skip_buffered_ws(FBuff* fbp) { |
|
while (!is_fbuff_eol(fbp) && is_ch_ws(*(fbp->nextp))) { |
|
fbp->nextp++; |
|
} |
|
} |
|
|
|
|
|
/* Is u even? */ |
|
static bool is_even(unsigned int u) { |
|
/* if the LSB isn't set, u is even. */ |
|
return !(u & 0x1); |
|
} |
|
|
|
|
|
/* Advances fbp far enough to read one token (which is a string literal). |
|
Points buffpp to a malloc'ed buffer containing the string. |
|
Returns 0, EOF, errno, or an error code. */ |
|
static int fbuff_get_token_str(FBuff* fbp, char** buffpp) { |
|
int err; |
|
char ch; |
|
|
|
/* allocate the initial buffer. */ |
|
size_t buffsize = 1000; |
|
size_t bufflen = buffsize - 1; |
|
char* buffp = malloc(buffsize); |
|
if (buffp == NULL) { |
|
err = errno; |
|
errno = 0; |
|
return err; |
|
} |
|
char* cursor = buffp; |
|
|
|
/* the first char must be the opening quote. */ |
|
err = fbuff_getch(fbp, &ch); |
|
if (err) { |
|
free(buffp); |
|
return err; |
|
} else { |
|
assert(ch == '"'); |
|
*cursor = ch; |
|
cursor++; |
|
} |
|
|
|
unsigned int backslash_count = 0; |
|
while (true) { |
|
size_t len = cursor - buffp; |
|
|
|
/* time to grow the buffer. */ |
|
if (len == bufflen) { |
|
buffsize *= 2; |
|
bufflen = buffsize - 1; |
|
char* newbuffp = realloc(buffp, buffsize); |
|
if (newbuffp == NULL) { |
|
free(buffp); |
|
err = errno; |
|
errno = 0; |
|
return err; |
|
} else { |
|
buffp = newbuffp; |
|
} |
|
} |
|
|
|
err = fbuff_getch(fbp, &ch); |
|
if (err) { |
|
free(buffp); |
|
return err; |
|
|
|
/* this is the end of the string. */ |
|
} else if (ch == '"' && is_even(backslash_count)) { |
|
*cursor = ch; |
|
cursor++; |
|
*cursor = '\0'; |
|
/* shrink buffp to fit the size of the string. */ |
|
size_t finalbuffsize = buffp - cursor + 1; |
|
if (finalbuffsize < buffsize) { |
|
char* finalbuffp = realloc(buffp, finalbuffsize); |
|
if (finalbuffp == NULL) { |
|
free(buffp); |
|
err = errno; |
|
errno = 0; |
|
return err; |
|
} else { |
|
buffp = finalbuffp; |
|
} |
|
} |
|
break; |
|
|
|
/* this is a regular char. */ |
|
} else { |
|
/* track the number of consecutive backslashes so we can |
|
disambiguate the closing quote. */ |
|
if (ch == '\\') { |
|
backslash_count++; |
|
} else { |
|
backslash_count = 0; |
|
} |
|
|
|
*cursor = ch; |
|
cursor++; |
|
} |
|
} |
|
|
|
*buffpp = buffp; |
|
return 0; |
|
} |
|
|
|
|
|
/* Advances fbp far enough to read one token of input. |
|
Writes the token contents to *buffpp. |
|
In the case of a string literal, points buffpp to a malloc'ed string buffer. |
|
Returns 0, EOF, errno, or an error code. */ |
|
static int fbuff_get_token(FBuff* fbp, char** buffpp, size_t buffsize) { |
|
int err; |
|
char ch; |
|
size_t bufflen = buffsize - 1; |
|
char* cursor = *buffpp; |
|
|
|
/* discard any leading whitespace. */ |
|
err = fbuff_discard_ws(fbp, NULL); |
|
if (err) { |
|
return err; |
|
} |
|
|
|
/* a token must be at least one char in length. */ |
|
err = fbuff_getch(fbp, &ch); |
|
if (err) { |
|
return err; |
|
|
|
/* this is a string literal. */ |
|
} else if (ch == '"') { |
|
fbuff_ungetch(fbp, ch); |
|
return fbuff_get_token_str(fbp, buffpp); |
|
|
|
} else { |
|
*cursor = ch; |
|
cursor++; |
|
} |
|
|
|
/* this is a single-character token. */ |
|
if (is_ch_single_character_token(ch)) { |
|
*cursor = '\0'; |
|
|
|
/* this is a multi-character token. read the rest of the chars. */ |
|
} else { |
|
bool is_comment = (ch == ';'); |
|
|
|
while (true) { |
|
size_t len = cursor - *buffpp; |
|
|
|
/* we have run out of room. */ |
|
if (len == bufflen) { |
|
return E_file_get_token__buff_overflow; |
|
} |
|
|
|
err = fbuff_getch(fbp, &ch); |
|
/* we've reached EOF. return what we have so far. */ |
|
if (err == EOF) { |
|
*cursor = '\0'; |
|
break; |
|
|
|
/* there was an error reading from fp. */ |
|
} else if (err != 0) { |
|
return err; |
|
|
|
/* we've reached the end of this regular token. */ |
|
} else if (!is_comment && is_ch_delim(ch)) { |
|
fbuff_ungetch(fbp, ch); |
|
*cursor = '\0'; |
|
break; |
|
|
|
/* we've reached the end of this comment token. */ |
|
} else if (is_comment && ch == '\n') { |
|
/* we leave the trailing '\n' to ensure the list args are |
|
still whitespace-separated. */ |
|
fbuff_ungetch(fbp, ch); |
|
*cursor = '\0'; |
|
break; |
|
|
|
/* this char is part of the token. */ |
|
} else { |
|
*cursor = ch; |
|
cursor++; |
|
continue; |
|
} |
|
} |
|
} |
|
return 0; |
|
} |
|
|
|
|
|
/* Tries to parse a long from buffp into lp. |
|
Returns true or false. */ |
|
static bool try_parse_long(const char* buffp, long* lp) { |
|
char* endptr; |
|
long l = strtol(buffp, &endptr, 10); |
|
if (errno != 0) { |
|
errno = 0; |
|
return false; |
|
} else if (endptr == buffp || *endptr != '\0') { |
|
return false; |
|
} else { |
|
*lp = l; |
|
return true; |
|
} |
|
} |
|
|
|
|
|
/* Tries to parse a double from buffp intp dp. |
|
Returns true or false. */ |
|
static bool try_parse_double(const char* buffp, double* dp) { |
|
char* endptr; |
|
double d = strtod(buffp, &endptr); |
|
if (errno != 0) { |
|
errno = 0; |
|
return false; |
|
} else if (endptr == buffp || *endptr != '\0') { |
|
return false; |
|
} else { |
|
*dp = d; |
|
return true; |
|
} |
|
} |
|
|
|
|
|
/* Is ch in the list of escapable chars? */ |
|
static bool is_escapechar(char ch) { |
|
char* found = strchr("abefnrtv?'\"\\", (int)ch); |
|
return found != NULL; |
|
} |
|
|
|
|
|
/* Returns the "unescaped" char corresponding to the given escape char. |
|
E.g. if esc is 'n', a newline character is returned. |
|
Asserts false if esc is not a valid escape char. */ |
|
static char unescape_char(char esc) { |
|
if (esc == 'a') { |
|
return '\a'; |
|
} else if (esc == 'b') { |
|
return '\b'; |
|
} else if (esc == 'e') { |
|
return '\e'; |
|
} else if (esc == 'f') { |
|
return '\f'; |
|
} else if (esc == 'n') { |
|
return '\n'; |
|
} else if (esc == 'r') { |
|
return '\r'; |
|
} else if (esc == 't') { |
|
return '\t'; |
|
} else if (esc == 'v') { |
|
return '\v'; |
|
} else if (esc == '\\') { |
|
return '\\'; |
|
} else if (esc == '\'') { |
|
return '\''; |
|
} else if (esc == '"') { |
|
return '"'; |
|
} else if (esc == '?') { |
|
return '?'; |
|
} else { |
|
assert(false); |
|
} |
|
} |
|
|
|
|
|
/* Parses a string from buffp. |
|
*spp is malloc'ed with a copy of the parsed string. |
|
Returns 0 or errno or error. */ |
|
static int parse_string(const char* buffp, char** spp) { |
|
size_t src_len = strlen(buffp); |
|
size_t src_size = src_len + 1; |
|
|
|
/* minimum buffp is an opening and closing quote, so we know len >= 2. */ |
|
assert(src_len >= 2); |
|
/* first and last char must be '"'. */ |
|
if (*buffp != '"') { |
|
return E_parse_string__invalid_string_1; |
|
} |
|
if (*(buffp + src_len - 1) != '"') { |
|
return E_parse_string__invalid_string_2; |
|
} |
|
|
|
size_t dst_size = src_size - 2; |
|
char* dst = malloc(dst_size); |
|
if (dst == NULL) { |
|
int err = errno; |
|
errno = 0; |
|
return err; |
|
} |
|
|
|
/* skip the opening quote. */ |
|
const char* src_first = buffp + 1; |
|
/* skip the closing quote. */ |
|
const char* src_last = buffp + src_len - 2; |
|
|
|
const char* src_cursor = src_first; |
|
char* dst_cursor = dst; |
|
|
|
while (src_cursor <= src_last) { |
|
/* this is possibly an escape sequence. */ |
|
if (*src_cursor == '\\') { |
|
src_cursor++; |
|
|
|
/* the last char of the string is a backslash, which is invalid. */ |
|
if (src_cursor > src_last) { |
|
free(dst); |
|
return E_parse_string__invalid_string_3; |
|
|
|
/* this is an escape sequence. */ |
|
} else if (is_escapechar(*src_cursor)) { |
|
*dst_cursor = unescape_char(*src_cursor); |
|
src_cursor++; |
|
dst_cursor++; |
|
continue; |
|
|
|
/* this is an invalid escape sequence. */ |
|
} else { |
|
free(dst); |
|
return E_parse_string__invalid_string_4; |
|
} |
|
|
|
/* this is just a regular char. */ |
|
} else { |
|
*dst_cursor = *src_cursor; |
|
src_cursor++; |
|
dst_cursor++; |
|
continue; |
|
} |
|
} |
|
*dst_cursor = '\0'; |
|
*spp = dst; |
|
return 0; |
|
} |
|
|
|
|
|
/* Read all of the forms in this Lisp list. |
|
Returns 0 or error. */ |
|
static int read_list(FBuff* fbp, Form** formpp) { |
|
/* note: the leading '(' has already been consumed. */ |
|
|
|
int err; |
|
int i = 0; |
|
int ws_count; |
|
char ch1; |
|
List* headp = g_emptylist; |
|
List* tailp = g_emptylist; |
|
|
|
while (true) { |
|
err = fbuff_discard_ws(fbp, &ws_count); |
|
if (err) { |
|
/* reaching EOF before ')' is an error. */ |
|
if (err == EOF) { |
|
return E_read_list__premature_eof_1; |
|
} else { |
|
return err; |
|
} |
|
} |
|
|
|
err = fbuff_getch(fbp, &ch1); |
|
if (err) { |
|
/* reaching EOF before ')' is an error. */ |
|
if (err == EOF) { |
|
return E_read_list__premature_eof_2; |
|
} else { |
|
return err; |
|
} |
|
|
|
/* we've reached the end of the list. */ |
|
} else if (ch1 == ')') { |
|
*formpp = (Form*)headp; |
|
return 0; |
|
|
|
} else { |
|
fbuff_ungetch(fbp, ch1); |
|
|
|
/* no space between atoms is an error. */ |
|
if (i > 0 && ws_count == 0 && is_ch_delim(ch1) == false) { |
|
return E_read_list__missing_ws; |
|
} |
|
|
|
/* read the next form in the list. */ |
|
Form* formp; |
|
err = read_form(fbp, &formp); |
|
if (err) { |
|
if (err == E_read_form__comment) { |
|
continue; |
|
} else { |
|
/* reaching EOF before ')' is an error. */ |
|
if (err == EOF) { |
|
return E_read_list__premature_eof_3; |
|
} else { |
|
return err; |
|
} |
|
} |
|
|
|
/* append the form onto the list. */ |
|
} else { |
|
List* newp; |
|
int err = new_list(&newp, formp); |
|
if (err) { |
|
return err; |
|
} |
|
|
|
if (headp == g_emptylist) { |
|
headp = newp; |
|
tailp = newp; |
|
} else { |
|
tailp->nextp = newp; |
|
tailp = newp; |
|
} |
|
} |
|
} |
|
|
|
i++; |
|
} |
|
} |
|
|
|
|
|
/* Read one Lisp form from fp intp formpp. |
|
Returns 0 or E_read_form__comment or EOF or errno or error. */ |
|
int read_form(FBuff* fbp, Form** formpp) { |
|
int err; |
|
int buffsize = 100; |
|
char buff[buffsize]; |
|
char* buffp = buff; |
|
char ch1; |
|
|
|
/* read a token. */ |
|
err = fbuff_get_token(fbp, &buffp, buffsize); |
|
if (err) { |
|
return err; |
|
} |
|
ch1 = *buffp; |
|
|
|
/* we've reached the end of input. */ |
|
if (ch1 == '\0') { |
|
return EOF; |
|
|
|
/* this is a comment. */ |
|
} else if (ch1 == ';') { |
|
return E_read_form__comment; |
|
|
|
/* a list. */ |
|
} else if (ch1 == '(') { |
|
return read_list(fbp, formpp); |
|
|
|
/* a list closer at this point is a syntax error. */ |
|
} else if (ch1 == ')') { |
|
return E_read_form__unexpected_list_closer; |
|
|
|
/* string literal. */ |
|
} else if (ch1 == '"') { |
|
assert(buffp != buff); |
|
char* sp; |
|
err = parse_string(buffp, &sp); |
|
if (err) { |
|
free(buffp); |
|
return err; |
|
} |
|
|
|
CString* csp; |
|
err = new_cstring(&csp, sp); |
|
if (err) { |
|
free(buffp); |
|
return err; |
|
} else { |
|
*formpp = (Form*)csp; |
|
return 0; |
|
} |
|
|
|
/* the form type can't be determined from ch1 alone. */ |
|
} else { |
|
bool success; |
|
|
|
/* an integer literal. */ |
|
long l; |
|
success = try_parse_long(buffp, &l); |
|
if (success) { |
|
CLong* clp; |
|
err = new_clong(&clp, l); |
|
if (err) { |
|
return err; |
|
} else { |
|
*formpp = (Form*)clp; |
|
return 0; |
|
} |
|
} |
|
|
|
/* a floating-point literal. */ |
|
double d; |
|
success = try_parse_double(buffp, &d); |
|
if (success) { |
|
CDouble* cdp; |
|
err = new_cdouble(&cdp, d); |
|
if (err) { |
|
return err; |
|
} else { |
|
*formpp = (Form*)cdp; |
|
return 0; |
|
} |
|
} |
|
|
|
/* assume anything else is a symbol. */ |
|
Symbol* symp; |
|
err = new_symbol(&symp, buffp); |
|
if (err) { |
|
return err; |
|
} else { |
|
*formpp = (Form*)symp; |
|
return 0; |
|
} |
|
} |
|
} |