Skip to content

Instantly share code, notes, and snippets.

@etscrivner
Created February 4, 2020 05:25
Show Gist options
  • Save etscrivner/b277cc4160942d1afe19d7eff0f5638d to your computer and use it in GitHub Desktop.
Save etscrivner/b277cc4160942d1afe19d7eff0f5638d to your computer and use it in GitHub Desktop.
Very basic markdown parser in C
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
// Short aliases for the fixed-width integer types from <stdint.h>.

// Unsigned.
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;

// Signed.
typedef int8_t i8;
typedef int16_t i16;
typedef int32_t i32;

// Pointer-sized unsigned integer; used in this file for byte counts.
typedef uintptr_t umm;
// Debug tracing. With DEBUG defined, DPRINTF forwards its printf-style
// arguments to stderr; otherwise it expands to a no-op expression and its
// arguments are not evaluated.
//
// The macro is fully variadic so that a call with only a format string
// (e.g. DPRINTF("NL: Header\n")) is valid: a separate FMT parameter would
// expand to `fprintf(stderr, FMT, )`, which does not compile.
#ifdef DEBUG
#define DPRINTF(...) fprintf(stderr, __VA_ARGS__)
#else
#define DPRINTF(...) ((void)0)
#endif

// Number of slots in md_parser.TokenStack.
#define MAX_TOKEN_STACK_SIZE 256
// Contents of a file loaded into memory (filled in by ReadEntireFile,
// released by DestroyEntireFile).
typedef struct platform_file_t {
    u8* Data;  // heap buffer holding the file bytes; NULL when nothing was loaded
    umm Size;  // number of bytes in Data
} platform_file;
// A byte range [At, Stop) inside a UTF-8 buffer. The range is considered
// valid (non-empty) while At < Stop; see IsValid.
typedef struct utf8_iterator_t {
    u8* At;    // first byte of the range / current position
    u8* Stop;  // one past the last byte of the range
} utf8_iterator;
// Kind of construct a token-stack slot currently holds.
typedef enum md_token_type_t {
    MD_TOKEN_none = 0,    // empty slot (also the value of a zero-initialized token)
    MD_TOKEN_header = 1,  // run of '#' characters
    MD_TOKEN_link = 2,    // [title](href)
    MD_TOKEN_text = 3,    // plain text run
    MD_TOKEN_MAX = 4      // number of token types; not a real token
} md_token_type;
// One slot of the parser's token stack. Type selects which member of the
// anonymous union is meaningful; the members share storage, so switching
// Type invalidates whatever the previous member held.
typedef struct md_token_t {
    md_token_type Type;
    union {
        // MD_TOKEN_header
        struct {
            u32 Level;  // number of '#' characters seen so far
        } Header;

        // MD_TOKEN_link
        struct {
            utf8_iterator Href;   // bytes between '(' and ')'
            utf8_iterator Title;  // bytes between '[' and ']'
        } Link;

        // MD_TOKEN_text
        struct {
            utf8_iterator Text;   // bytes of the text run
        } Text;
    };
} md_token;
// Parser state: a fixed-capacity token stack plus the read position in the
// input.
typedef struct md_parser_t {
    md_token TokenStack[MAX_TOKEN_STACK_SIZE];
    // Index of the current top slot in TokenStack (0 = base slot). Despite the
    // name this is not a count: PushTokenStack increments it and then returns
    // the slot at the new index.
    i32 TokenStackSize;
    // Remaining input; At advances as characters are consumed.
    utf8_iterator TextIter;
} md_parser;
///////////////////////////////////////////////////////////////////////////////
// platform_file
///////////////////////////////////////////////////////////////////////////////
platform_file ReadEntireFile(const char* FilePath) {
platform_file Result = {};
FILE* File = fopen(FilePath, "rb");
if (File == NULL) {
return(Result);
}
fseek(File, 0, SEEK_END);
Result.Size = ftell(File);
fseek(File, 0, SEEK_SET);
Result.Data = (u8*)malloc(Result.Size);
fread(Result.Data, Result.Size, 1, File);
fclose(File);
return(Result);
}
// Releases the buffer owned by File. File is passed by value, so the
// caller's copy still holds the now-dangling Data pointer afterwards.
void DestroyEntireFile(platform_file File) {
    u8* Buffer = File.Data;
    free(Buffer);
}
///////////////////////////////////////////////////////////////////////////////
// utf8_iterator
///////////////////////////////////////////////////////////////////////////////
// Builds an iterator over the byte range [Start, Stop).
utf8_iterator ParseUTF8(u8* Start, u8* Stop) {
    utf8_iterator Range;
    Range.At = Start;
    Range.Stop = Stop;
    return Range;
}
// True while the iterator still covers at least one byte (At is before Stop).
bool IsValid(utf8_iterator Iter) {
    return Iter.At < Iter.Stop;
}
// Returns the encoded length in bytes (1-4) of the UTF-8 sequence whose
// lead byte is at Iter.At. Only the lead byte is inspected; Iter.Stop is not
// consulted, and Iter.At must be dereferenceable.
//
//   0xxxxxxx -> 1      110xxxxx -> 2
//   1110xxxx -> 3      11110xxx -> 4
//
// A byte that cannot start a sequence (a stray continuation byte 10xxxxxx or
// an invalid lead byte 11111xxx) is reported as length 1 so the caller skips
// just that byte and resynchronizes, instead of swallowing up to three
// following bytes that may be real characters.
u16 UTF8CodepointLengthBytes(utf8_iterator Iter) {
    // Read as an unsigned byte: plain char may be signed, which makes the
    // result of shifting high-bit bytes implementation-defined.
    u8 Lead = *Iter.At;

    if ((Lead & 0x80) == 0x00) {
        return(1);
    }
    if ((Lead & 0xE0) == 0xC0) {
        return(2);
    }
    if ((Lead & 0xF0) == 0xE0) {
        return(3);
    }
    if ((Lead & 0xF8) == 0xF0) {
        return(4);
    }
    return(1);
}
// Returns an iterator covering the single code point that starts at Iter.At:
// Result.At is Iter.At and Result.Stop is the first byte after the code point.
//
// The result never extends past Iter.Stop, so a multi-byte sequence that is
// truncated by the end of the input yields only the bytes that exist. If Iter
// is already exhausted (At >= Stop) nothing is read and an empty range
// (At == Stop) is returned, for which IsValid is false.
utf8_iterator NextChar(utf8_iterator Iter) {
    utf8_iterator Result = {};
    Result.At = Iter.At;
    Result.Stop = Iter.At;

    if (Iter.At < Iter.Stop) {
        umm Remaining = (umm)(Iter.Stop - Iter.At);
        umm Length = UTF8CodepointLengthBytes(Iter);
        if (Length > Remaining) {
            Length = Remaining;
        }
        Result.Stop = Iter.At + Length;
    }
    return(Result);
}
///////////////////////////////////////////////////////////////////////////////
// md_parser
///////////////////////////////////////////////////////////////////////////////
// Creates a parser positioned at the start of MarkdownFile. Every other
// field is zero: the stack index is 0 and the base slot has type
// MD_TOKEN_none.
md_parser MarkdownParserInit(utf8_iterator MarkdownFile) {
    md_parser Parser = {};
    Parser.TextIter = MarkdownFile;
    return Parser;
}
// Advances the stack to the next slot and returns it. The returned slot is
// not cleared; the caller is expected to set its Type.
//
// TokenStackSize is the index of the top slot, so the highest usable index is
// MAX_TOKEN_STACK_SIZE - 1. When the stack is already at that index the push
// is refused: a warning is written to stderr and the current top slot is
// returned instead of a pointer past the end of the array. The result is
// therefore always a valid slot, at the cost of the caller overwriting the
// existing top token in the overflow case.
md_token* PushTokenStack(md_parser* Parser) {
    if (Parser->TokenStackSize + 1 < MAX_TOKEN_STACK_SIZE) {
        Parser->TokenStackSize++;
    } else {
        fprintf(stderr, "warning: token stack full (%d slots); reusing top slot\n",
                MAX_TOKEN_STACK_SIZE);
    }
    return Parser->TokenStack + Parser->TokenStackSize;
}
// Clears the top slot (Type becomes MD_TOKEN_none) and steps the stack down
// by one. Returns the new top slot. At the base of the stack (index 0) there
// is nothing to step down to, so the cleared base slot itself is returned.
md_token* PopTokenStack(md_parser* Parser) {
    md_token* Top = &Parser->TokenStack[Parser->TokenStackSize];
    Top->Type = MD_TOKEN_none;

    if (Parser->TokenStackSize <= 0) {
        return Top;
    }

    Parser->TokenStackSize -= 1;
    return &Parser->TokenStack[Parser->TokenStackSize];
}
// Writes the opening HTML for Token to stdout:
//   header -> "\n<hN>"  where N is Header.Level
//   link   -> "<a href=\"...\">" using the bytes of Link.Href
// Other token types produce no output. Always returns 0.
//
// The href bytes are written verbatim (no HTML escaping).
int EnterBlock(md_token* Token) {
    if (Token->Type == MD_TOKEN_header) {
        // Level is a u32, so print it as unsigned rather than with %d.
        printf("\n<h%u>", (unsigned)Token->Header.Level);
    } else if (Token->Type == MD_TOKEN_link) {
        const u8* Begin = Token->Link.Href.At;
        const u8* End = Token->Link.Href.Stop;
        // A negative precision makes printf ignore it and read until a NUL,
        // and a NULL pointer must never reach %s. Treat an unset or inverted
        // range as an empty href.
        int Length = 0;
        if (Begin != NULL && End != NULL && End > Begin) {
            Length = (int)(End - Begin);
        }
        printf("<a href=\"%.*s\">", Length, Length > 0 ? (const char*)Begin : "");
    }
    return(0);
}
// Writes the closing HTML for Token to stdout:
//   header -> "</hN>\n" where N is Header.Level
//   link   -> the bytes of Link.Title followed by "</a> "
// Other token types produce no output. Always returns 0.
//
// The title bytes are written verbatim (no HTML escaping).
int LeaveBlock(md_token* Token) {
    if (Token->Type == MD_TOKEN_header) {
        // Level is a u32, so print it as unsigned rather than with %d.
        printf("</h%u>\n", (unsigned)Token->Header.Level);
    } else if (Token->Type == MD_TOKEN_link) {
        const u8* Begin = Token->Link.Title.At;
        const u8* End = Token->Link.Title.Stop;
        // A negative precision makes printf ignore it and read until a NUL,
        // and a NULL pointer must never reach %s. Treat an unset or inverted
        // range as an empty title.
        int Length = 0;
        if (Begin != NULL && End != NULL && End > Begin) {
            Length = (int)(End - Begin);
        }
        printf("%.*s</a> ", Length, Length > 0 ? (const char*)Begin : "");
    }
    return(0);
}
// Writes the bytes of Token's text range [At, Stop) to stdout unchanged.
// Nothing is written when the range is empty or inverted. Always returns 0.
int Text(md_token* Token) {
    const u8* First = Token->Text.Text.At;
    const u8* Last = Token->Text.Text.Stop;

    if (First < Last) {
        fwrite(First, 1, (size_t)(Last - First), stdout);
    }
    return(0);
}
///////////////////////////////////////////////////////////////////////////////
int main() {
const char* MarkdownFile = "test.md";
// Process the markdown file
platform_file File = ReadEntireFile(MarkdownFile);
{
utf8_iterator Iter = ParseUTF8(File.Data, File.Data + File.Size - 1);
md_parser Parser = MarkdownParserInit(Iter);
while (IsValid(Parser.TextIter)) {
utf8_iterator Ch = NextChar(Parser.TextIter);
if (IsValid(Ch)) {
md_token* Token = Parser.TokenStack + Parser.TokenStackSize;
if (*Ch.At == '#') {
if (Token->Type == MD_TOKEN_none) {
DPRINTF("#: None -> Header\n");
Token->Type = MD_TOKEN_header;
Token->Header.Level = 1;
} else if (Token->Type == MD_TOKEN_header) {
DPRINTF("#: Header -> Header + 1\n");
++Token->Header.Level;
}
} else if (*Ch.At == '[') {
if (Token->Type == MD_TOKEN_none || Token->Type == MD_TOKEN_text) {
if (Token->Type == MD_TOKEN_text) {
Text(Token);
}
Token->Type = MD_TOKEN_link;
Token->Link.Title.At = Ch.Stop;
}
} else if (*Ch.At == ']') {
if (Token->Type == MD_TOKEN_link) {
Token->Link.Title.Stop = Ch.At;
}
} else if (*Ch.At == '(') {
if (Token->Type == MD_TOKEN_link) {
Token->Link.Href.At = Ch.Stop;
}
} else if (*Ch.At == ')') {
if (Token->Type == MD_TOKEN_link) {
Token->Link.Href.Stop = Ch.At;
EnterBlock(Token);
LeaveBlock(Token);
Token->Type = MD_TOKEN_none;
}
} else if (*Ch.At == ' ' || *Ch.At == '\t') {
if (Token->Type == MD_TOKEN_header) {
DPRINTF("SPC: Header\n");
EnterBlock(Token);
Token = PushTokenStack(&Parser);
Token->Type = MD_TOKEN_none;
} else if (Token->Type == MD_TOKEN_text) {
Token->Text.Text.Stop = Ch.Stop;
}
} else if (*Ch.At == '\n') {
if (Token->Type == MD_TOKEN_header) {
DPRINTF("NL: Header\n");
LeaveBlock(Token);
PopTokenStack(&Parser);
} else if (Token->Type == MD_TOKEN_text) {
DPRINTF("NL: Text\n");
Token->Text.Text.Stop = Ch.At;
Text(Token);
Token = PopTokenStack(&Parser);
if (Token->Type == MD_TOKEN_header) {
LeaveBlock(Token);
PopTokenStack(&Parser);
}
}
} else {
if (Token->Type == MD_TOKEN_none) {
DPRINTF("None -> Text\n");
Token->Type = MD_TOKEN_text;
Token->Text.Text.At = Ch.At;
Token->Text.Text.Stop = Ch.Stop;
} else if (Token->Type == MD_TOKEN_text) {
Token->Text.Text.Stop = Ch.Stop;
}
}
}
Parser.TextIter.At = Ch.Stop;
}
md_token* Token = Parser.TokenStack + Parser.TokenStackSize;
if (Token->Type == MD_TOKEN_text) {
Text(Token);
}
}
DestroyEntireFile(File);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment