/*
 * Very basic markdown parser in C
 *
 * Gist etscrivner/b277cc4160942d1afe19d7eff0f5638d, created February 4, 2020 05:25.
 *
 * Note: GitHub flagged this file as containing bidirectional Unicode text
 * that may be interpreted or compiled differently than it appears. To review,
 * open the file in an editor that reveals hidden Unicode characters.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
/* Short aliases for the <stdint.h> fixed-width integer types. */
typedef uint32_t u32;
typedef int32_t i32;
typedef uint16_t u16;
typedef int16_t i16;
typedef uint8_t u8;
typedef int8_t i8;
/* Pointer-sized unsigned integer; platform_file uses it for byte counts. */
typedef uintptr_t umm;
/*
 * DPRINTF: printf-style diagnostics written to stderr when DEBUG is defined,
 * and a no-op expression otherwise.
 *
 * The format string is taken as part of the variadic list so that calls with
 * no extra arguments, such as DPRINTF("message\n"), expand to valid code
 * (a separate FMT parameter would leave a dangling comma in the fprintf call).
 */
#ifdef DEBUG
#define DPRINTF(...) fprintf(stderr, __VA_ARGS__)
#else
#define DPRINTF(...) ((void)0)
#endif
/* Number of md_token slots in md_parser.TokenStack. */
#define MAX_TOKEN_STACK_SIZE 256
typedef struct platform_file_t { | |
u8* Data; | |
umm Size; | |
} platform_file; | |
typedef struct utf8_iterator_t { | |
u8* At; | |
u8* Stop; | |
} utf8_iterator; | |
/*
 * Kind of markdown construct a token currently represents.
 * MD_TOKEN_none is 0, so a zero-initialised md_token starts out as "none".
 */
typedef enum md_token_type_t {
    MD_TOKEN_none,
    MD_TOKEN_header,
    MD_TOKEN_link,
    MD_TOKEN_text,
    MD_TOKEN_MAX    /* number of token kinds; not a token kind itself */
} md_token_type;
typedef struct md_token_t { | |
md_token_type Type; | |
union { | |
struct { | |
u32 Level; | |
} Header; | |
struct { | |
utf8_iterator Href; | |
utf8_iterator Title; | |
} Link; | |
struct { | |
utf8_iterator Text; | |
} Text; | |
}; | |
} md_token; | |
typedef struct md_parser_t { | |
md_token TokenStack[MAX_TOKEN_STACK_SIZE]; | |
i32 TokenStackSize; | |
utf8_iterator TextIter; | |
} md_parser; | |
/////////////////////////////////////////////////////////////////////////////// | |
// platform_file | |
/////////////////////////////////////////////////////////////////////////////// | |
platform_file ReadEntireFile(const char* FilePath) { | |
platform_file Result = {}; | |
FILE* File = fopen(FilePath, "rb"); | |
if (File == NULL) { | |
return(Result); | |
} | |
fseek(File, 0, SEEK_END); | |
Result.Size = ftell(File); | |
fseek(File, 0, SEEK_SET); | |
Result.Data = (u8*)malloc(Result.Size); | |
fread(Result.Data, Result.Size, 1, File); | |
fclose(File); | |
return(Result); | |
} | |
/*
 * Frees the buffer owned by File.
 * File is passed by value, so the caller's copy still holds the old Data
 * pointer afterwards and must not use it again.
 */
void DestroyEntireFile(platform_file File) {
    free(File.Data);
}
/////////////////////////////////////////////////////////////////////////////// | |
// utf8_iterator | |
/////////////////////////////////////////////////////////////////////////////// | |
/* Builds an iterator over the byte range [Start, Stop). */
utf8_iterator ParseUTF8(u8* Start, u8* Stop) {
    utf8_iterator Result = {0};
    Result.At = Start;
    Result.Stop = Stop;
    return(Result);
}
/*
 * Returns true while the iterator still has bytes left, i.e. At < Stop.
 * Needs <stdbool.h> for `bool` when the file is compiled as C.
 */
bool IsValid(utf8_iterator Iter) {
    return(Iter.At < Iter.Stop);
}
/*
 * Returns the encoded length in bytes (1-4) of the UTF-8 sequence that
 * starts at Iter.At, judged from its lead byte only.
 *
 * A byte that cannot start a sequence (a continuation byte 10xxxxxx or an
 * invalid lead 11111xxx) is reported as length 1, so the caller steps over
 * just that byte and resynchronises instead of swallowing the three bytes
 * that follow it. Returns 0 when the iterator has no bytes left.
 */
u16 UTF8CodepointLengthBytes(utf8_iterator Iter) {
    if (Iter.At == NULL || Iter.At >= Iter.Stop) {
        return(0);
    }

    /* Read as an unsigned byte: plain char may be signed, which would make
       the bit tests depend on implementation-defined shifts. */
    u8 Lead = *Iter.At;

    if (Lead < 0x80) {
        return(1);                      /* 0xxxxxxx */
    }
    if ((Lead & 0xE0) == 0xC0) {
        return(2);                      /* 110xxxxx */
    }
    if ((Lead & 0xF0) == 0xE0) {
        return(3);                      /* 1110xxxx */
    }
    if ((Lead & 0xF8) == 0xF0) {
        return(4);                      /* 11110xxx */
    }
    return(1);                          /* stray continuation / invalid lead */
}
/*
 * Returns the byte range of the code point that starts at Iter.At.
 *
 * The result never extends past Iter.Stop: a sequence truncated by the end
 * of the input is clamped to the bytes that are actually there. When Iter
 * has no bytes left, the result is the empty range [Iter.At, Iter.At).
 */
utf8_iterator NextChar(utf8_iterator Iter) {
    utf8_iterator Result = {0};
    Result.At = Iter.At;
    Result.Stop = Iter.At;
    if (Iter.At == NULL || Iter.At >= Iter.Stop) {
        return(Result);
    }

    umm Remaining = (umm)(Iter.Stop - Iter.At);
    umm Length = UTF8CodepointLengthBytes(Iter);
    if (Length == 0) {
        Length = 1;             /* always make progress on a non-empty range */
    }
    if (Length > Remaining) {
        Length = Remaining;     /* truncated sequence at the end of the input */
    }
    Result.Stop = Iter.At + Length;
    return(Result);
}
/////////////////////////////////////////////////////////////////////////////// | |
// md_parser | |
/////////////////////////////////////////////////////////////////////////////// | |
/*
 * Creates a parser positioned at the start of MarkdownFile.
 * The whole token stack is zero-initialised, so every slot starts out as
 * MD_TOKEN_none and slot 0 is the current token.
 */
md_parser MarkdownParserInit(utf8_iterator MarkdownFile) {
    md_parser Result = {0};
    Result.TokenStackSize = 0;
    Result.TextIter = MarkdownFile;
    return(Result);
}
/*
 * Makes the next slot of the token stack the current token and returns it.
 *
 * When the stack is already at its last slot the index is left unchanged and
 * the current top is returned, so deeply nested input can never index past
 * TokenStack.
 */
md_token* PushTokenStack(md_parser* Parser) {
    if (Parser->TokenStackSize + 1 < MAX_TOKEN_STACK_SIZE) {
        Parser->TokenStackSize++;
    }
    return Parser->TokenStack + Parser->TokenStackSize;
}
/*
 * Resets the current token to MD_TOKEN_none and steps back to the token
 * below it, which is returned. At the bottom of the stack there is nothing
 * to step back to, so the (now reset) bottom token itself is returned.
 */
md_token* PopTokenStack(md_parser* Parser) {
    md_token* Top = Parser->TokenStack + Parser->TokenStackSize;
    Top->Type = MD_TOKEN_none;
    if (Parser->TokenStackSize > 0) {
        Parser->TokenStackSize--;
        Top = Parser->TokenStack + Parser->TokenStackSize;
    }
    return Top;
}
/*
 * Writes the opening HTML for Token to stdout: "\n<hN>" for a header of
 * level N, or "<a href=\"...\">" for a link. Other token types print
 * nothing. Always returns 0.
 *
 * NOTE(review): the href bytes are written verbatim, without HTML escaping.
 */
int EnterBlock(md_token* Token) {
    if (Token->Type == MD_TOKEN_header) {
        /* Level is unsigned, so it is printed with %u rather than %d. */
        printf("\n<h%u>", (unsigned)Token->Header.Level);
    } else if (Token->Type == MD_TOKEN_link) {
        u8* Begin = Token->Link.Href.At;
        u8* End = Token->Link.Href.Stop;
        /* An unset or inverted range must not reach printf: a negative
           precision makes %.*s read up to a NUL terminator the buffer does
           not have, and a NULL string pointer is undefined. */
        int Length = 0;
        if (Begin != NULL && End != NULL && End > Begin) {
            Length = (int)(End - Begin);
        }
        printf("<a href=\"%.*s\">", Length, Length > 0 ? (const char*)Begin : "");
    }
    return(0);
}
/*
 * Writes the closing HTML for Token to stdout: "</hN>\n" for a header of
 * level N, or the link title followed by "</a> " for a link. Other token
 * types print nothing. Always returns 0.
 *
 * NOTE(review): the title bytes are written verbatim, without HTML escaping.
 */
int LeaveBlock(md_token* Token) {
    if (Token->Type == MD_TOKEN_header) {
        /* Level is unsigned, so it is printed with %u rather than %d. */
        printf("</h%u>\n", (unsigned)Token->Header.Level);
    } else if (Token->Type == MD_TOKEN_link) {
        u8* Begin = Token->Link.Title.At;
        u8* End = Token->Link.Title.Stop;
        /* An unset or inverted range must not reach printf: a negative
           precision makes %.*s read up to a NUL terminator the buffer does
           not have, and a NULL string pointer is undefined. */
        int Length = 0;
        if (Begin != NULL && End != NULL && End > Begin) {
            Length = (int)(End - Begin);
        }
        printf("%.*s</a> ", Length, Length > 0 ? (const char*)Begin : "");
    }
    return(0);
}
/*
 * Writes the bytes of Token's text range [At, Stop) to stdout unchanged.
 * An unset or empty range writes nothing. Always returns 0.
 */
int Text(md_token* Token) {
    u8* Begin = Token->Text.Text.At;
    u8* End = Token->Text.Text.Stop;
    if (Begin != NULL && End != NULL && End > Begin) {
        /* One bulk write instead of one printf("%c") call per byte. */
        fwrite(Begin, 1, (size_t)(End - Begin), stdout);
    }
    return(0);
}
/////////////////////////////////////////////////////////////////////////////// | |
int main() { | |
const char* MarkdownFile = "test.md"; | |
// Process the markdown file | |
platform_file File = ReadEntireFile(MarkdownFile); | |
{ | |
utf8_iterator Iter = ParseUTF8(File.Data, File.Data + File.Size - 1); | |
md_parser Parser = MarkdownParserInit(Iter); | |
while (IsValid(Parser.TextIter)) { | |
utf8_iterator Ch = NextChar(Parser.TextIter); | |
if (IsValid(Ch)) { | |
md_token* Token = Parser.TokenStack + Parser.TokenStackSize; | |
if (*Ch.At == '#') { | |
if (Token->Type == MD_TOKEN_none) { | |
DPRINTF("#: None -> Header\n"); | |
Token->Type = MD_TOKEN_header; | |
Token->Header.Level = 1; | |
} else if (Token->Type == MD_TOKEN_header) { | |
DPRINTF("#: Header -> Header + 1\n"); | |
++Token->Header.Level; | |
} | |
} else if (*Ch.At == '[') { | |
if (Token->Type == MD_TOKEN_none || Token->Type == MD_TOKEN_text) { | |
if (Token->Type == MD_TOKEN_text) { | |
Text(Token); | |
} | |
Token->Type = MD_TOKEN_link; | |
Token->Link.Title.At = Ch.Stop; | |
} | |
} else if (*Ch.At == ']') { | |
if (Token->Type == MD_TOKEN_link) { | |
Token->Link.Title.Stop = Ch.At; | |
} | |
} else if (*Ch.At == '(') { | |
if (Token->Type == MD_TOKEN_link) { | |
Token->Link.Href.At = Ch.Stop; | |
} | |
} else if (*Ch.At == ')') { | |
if (Token->Type == MD_TOKEN_link) { | |
Token->Link.Href.Stop = Ch.At; | |
EnterBlock(Token); | |
LeaveBlock(Token); | |
Token->Type = MD_TOKEN_none; | |
} | |
} else if (*Ch.At == ' ' || *Ch.At == '\t') { | |
if (Token->Type == MD_TOKEN_header) { | |
DPRINTF("SPC: Header\n"); | |
EnterBlock(Token); | |
Token = PushTokenStack(&Parser); | |
Token->Type = MD_TOKEN_none; | |
} else if (Token->Type == MD_TOKEN_text) { | |
Token->Text.Text.Stop = Ch.Stop; | |
} | |
} else if (*Ch.At == '\n') { | |
if (Token->Type == MD_TOKEN_header) { | |
DPRINTF("NL: Header\n"); | |
LeaveBlock(Token); | |
PopTokenStack(&Parser); | |
} else if (Token->Type == MD_TOKEN_text) { | |
DPRINTF("NL: Text\n"); | |
Token->Text.Text.Stop = Ch.At; | |
Text(Token); | |
Token = PopTokenStack(&Parser); | |
if (Token->Type == MD_TOKEN_header) { | |
LeaveBlock(Token); | |
PopTokenStack(&Parser); | |
} | |
} | |
} else { | |
if (Token->Type == MD_TOKEN_none) { | |
DPRINTF("None -> Text\n"); | |
Token->Type = MD_TOKEN_text; | |
Token->Text.Text.At = Ch.At; | |
Token->Text.Text.Stop = Ch.Stop; | |
} else if (Token->Type == MD_TOKEN_text) { | |
Token->Text.Text.Stop = Ch.Stop; | |
} | |
} | |
} | |
Parser.TextIter.At = Ch.Stop; | |
} | |
md_token* Token = Parser.TokenStack + Parser.TokenStackSize; | |
if (Token->Type == MD_TOKEN_text) { | |
Text(Token); | |
} | |
} | |
DestroyEntireFile(File); | |
return 0; | |
} |
/* Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment. */