Skip to content

Instantly share code, notes, and snippets.

@marcelofern
Created August 26, 2024 09:39
Show Gist options
  • Save marcelofern/896574e055a05d011449b00217600fe6 to your computer and use it in GitHub Desktop.
Save marcelofern/896574e055a05d011449b00217600fe6 to your computer and use it in GitHub Desktop.
Website converter (.md -> .html)
/* converter.c
*
* This code converts `.md` files into `.html` files.
*
* The main use is to write a static website in `.md` files, and then run this
* program to convert it to `.html`
*
* The program relies on the following variables:
*
* - INPUT_FOLDER: full path of the website with `.md` files.
* - OUTPUT_FOLDER: full path of the folder that receives the generated site.
* - TMP_FOLDER: a temporary folder to store the conversion results.
* - HEADER_FILE: a file containing html ending with the <body> tag.
* - FOOTER_FILE: a file containing html starting with the </body> tag.
*
* On a high level, the program does the following:
*
* - Removes the TMP_FOLDER if it exists.
* - Copies the INPUT_FOLDER onto the TMP_FOLDER path.
* - Calls nftw to read the TMP_FOLDER directory and all its md files.
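* - For each md file, fork a child process to process the md file.
* - For each md file, convert .md links into .html <a> links.
* - Wipes the OUTPUT_FOLDER (keeping `.git` and `feed`) and copies the
*   contents of the TMP_FOLDER into it.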
*/
// DEBUG
// !gcc -Wall -Werror -g -O0 % %:h/parser/*.c -o /tmp/a.out && /tmp/a.out
// RUN MEMORY ANALYSIS.
// !valgrind -s --leak-check=full --show-leak-kinds=all /tmp/a.out
// PROFILE
// !gcc -Og -pg % %:h/parser/*.c -o /tmp/prog && /tmp/prog && gprof /tmp/prog
// COMPILE FOR USE
// !gcc -O3 -std=c99 % %:h/parser/*.c -o scripts/website/converter.bin
#define MEDIUM_BUFFER_SIZE 512
#define _XOPEN_SOURCE 500 // Required for nftw
#include "../c/array.c"
#include "../c/file.c"
#include "../c/str.c"
#include "parser/md4c-html.h"
#include <ftw.h>
#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
// Filesystem locations used by the converter. Every directory path ends with
// a trailing slash.
//
// Source tree holding the `.md` files; main() copies it into TMP_FOLDER.
char *INPUT_FOLDER = "/home/x/workspace/website/";
// Destination tree: main() empties it (keeping `.git` and `feed`) and then
// copies the converted site into it.
char *OUTPUT_FOLDER = "/home/x/workspace/marcelofern.github.io/";
// Not referenced anywhere in the visible part of this file.
char *OUTPUT_FOLDER_GIT = "/home/x/workspace/marcelofern.github.io/.git/";
// Scratch copy of INPUT_FOLDER in which the `.html` files are generated.
char *TMP_FOLDER = "/tmp/website_converter/";
// HTML loaded before / after the converted markdown of every page.
char *HEADER_FILE = "/home/x/workspace/dotfiles/scripts/website/header.html";
char *FOOTER_FILE = "/home/x/workspace/dotfiles/scripts/website/footer.html";
// Flag defined outside this file; process_md_chunk() reads it to decide
// whether the chunk being emitted belongs to a link.
// NOTE(review): presumably maintained by a locally patched md4c-html.c --
// confirm.
extern bool MD4C_HTML_INSIDE_LINK_SPAN;
// Paths of the `.md` files to convert: filled by push_fpath() during the
// nftw() walk and consumed by main().
array_t *fpaths;
// Context handed to process_md_chunk() through md_html()'s `userdata` pointer.
typedef struct {
// Path of the `.md` file currently being converted.
char *fpath;
// Buffer that accumulates the generated HTML for that file.
str_t *result;
} md_html_userdata_t;
/*
 * Writes the HTML accumulated in `result` to the `.html` counterpart of
 * `fpath`.
 *
 * fpath:  path of the source `.md` file; the output path is obtained by
 *         swapping `.md` for `.html` in it.
 * result: buffer holding the complete page. It is left untouched.
 *
 * The page is written with fwrite() using the buffer's recorded length, so
 * the caller's buffer does not need a terminating NUL and the output is not
 * cut short by a NUL byte inside the page. Failing to open, write or close
 * the file terminates the process.
 */
static void store_result(const char *fpath, str_t *result) {
  char *html_fpath = swap_substr((char *)fpath, ".md", ".html");

  FILE *file_ptr = fopen(html_fpath, "w");
  if (file_ptr == NULL) {
    perror("Error opening file.");
    exit(EXIT_FAILURE);
  }

  size_t len = (size_t)result->len;
  if (len > 0 && fwrite(result->str, sizeof(char), len, file_ptr) != len) {
    perror("Error writing file.");
    exit(EXIT_FAILURE);
  }

  // fclose() flushes the stream, so a failure here means the page is
  // incomplete on disk.
  if (fclose(file_ptr) != 0) {
    perror("Error closing file.");
    exit(EXIT_FAILURE);
  }

  // Other callers in this file compare swap_substr()'s return value with its
  // input to detect "nothing replaced"; in that case the pointer still
  // belongs to the caller and must not be freed here.
  if (html_fpath != fpath) {
    free(html_fpath);
  }
}
/*
 * Tells whether an output chunk is something other than anchor markup.
 *
 * output:      start of the chunk.
 * output_size: number of bytes in the chunk.
 *
 * Returns false when the chunk starts with `<a ` or with `">`, and true for
 * every other chunk (including chunks too short to hold either marker).
 *
 * The chunk arrives with an explicit length and nothing visible here
 * guarantees a terminating NUL, so every comparison is bounded by
 * output_size instead of relying on strncmp() to stop by itself.
 */
static bool not_anchors(const char *output, MD_SIZE output_size) {
  static const char tag_open[] = "<a ";
  static const char attr_close[] = "\">";
  const size_t open_len = sizeof tag_open - 1;
  const size_t close_len = sizeof attr_close - 1;

  if (output_size >= open_len && memcmp(output, tag_open, open_len) == 0) {
    return false;
  }
  if (output_size >= close_len &&
      memcmp(output, attr_close, close_len) == 0) {
    return false;
  }
  return true;
}
/*
 * Output callback given to md_html(): appends one chunk of generated HTML to
 * the result buffer, rewriting links to other `.md` files on the way.
 *
 * output:      start of the chunk.
 * output_size: number of bytes in the chunk.
 * userdata:    pointer to the md_html_userdata_t of the file being converted.
 *
 * Chunks emitted outside a link span, and the anchor markup itself, are
 * copied verbatim. Any other chunk inside a link span gets `.md` swapped for
 * `.html` (otherwise internal links would end in a 404) and, when that
 * changed something, the directory of the current file is stripped from the
 * link so that it becomes relative to that file.
 *
 * An allocation failure terminates the process.
 */
static void process_md_chunk(const char *output, MD_SIZE output_size,
                             void *userdata) {
  md_html_userdata_t *html_userdata = (md_html_userdata_t *)userdata;

  if (!(MD4C_HTML_INSIDE_LINK_SPAN && not_anchors(output, output_size))) {
    for (size_t i = 0; i < output_size; i++) {
      str_push(html_userdata->result, output[i]);
    }
    return;
  }

  // Work on a NUL-terminated copy so the string helpers can be used.
  char *new_output = calloc((size_t)output_size + 1, sizeof(char));
  if (new_output == NULL) {
    perror("calloc");
    exit(EXIT_FAILURE);
  }
  memcpy(new_output, output, output_size);

  // swap_substr() is expected to return its input pointer when nothing was
  // replaced; that is how a link to a `.md` file is detected.
  char *parsed_output = swap_substr(new_output, ".md", ".html");
  bool is_md_link = parsed_output != new_output;

  if (is_md_link) {
    // Directory of the current file, relative to the tmp folder.
    char *fpath = html_userdata->fpath;
    char *parsed_fpath = swap_substr(fpath, TMP_FOLDER, "");
    char *fdir = extract_dir(parsed_fpath);
    // Same pointer-identity convention as above: never free the caller's
    // path when the tmp folder prefix was not found in it.
    if (parsed_fpath != fpath) {
      free(parsed_fpath);
    }

    if (fdir != NULL && strncmp(fdir, "notes", strlen("notes")) == 0 &&
        strcmp(fdir, "notes/") != 0) {
      // The notes directory is symlinked into the website, so links written
      // inside it lack the `notes/` prefix. Prepend it so that the directory
      // stripping below works properly. Remove this if the notes ever leave
      // the website.
      size_t pout_len = strlen(parsed_output);
      size_t notes_len = strlen("notes/");
      char *tmp = calloc(pout_len + notes_len + 1, sizeof(char));
      if (tmp == NULL) {
        perror("calloc");
        exit(EXIT_FAILURE);
      }
      memcpy(tmp, "notes/", notes_len);
      memcpy(tmp + notes_len, parsed_output, pout_len);
      free(parsed_output);
      parsed_output = tmp;
    }

    if (fdir != NULL) {
      // Remove the directory prefix from the link so it's not duplicated.
      char *tmp = swap_substr(parsed_output, fdir, "");
      if (tmp != parsed_output) {
        free(parsed_output);
        parsed_output = tmp;
      }
      free(fdir);
    }
  }

  // Measure once instead of calling strlen() on every iteration.
  size_t parsed_len = strlen(parsed_output);
  for (size_t i = 0; i < parsed_len; i++) {
    str_push(html_userdata->result, parsed_output[i]);
  }

  // When no swap happened parsed_output aliases new_output, which is freed
  // right below.
  if (is_md_link) {
    free(parsed_output);
  }
  free(new_output);
}
/*
 * Appends the whole content of `file` to `result`, one character at a time.
 *
 * Mostly used to load the header and the footer. The process is terminated
 * when the file cannot be opened.
 */
static void load_into_result(const char *file, str_t *result) {
  FILE *stream = fopen(file, "r");
  if (!stream) {
    perror("Error opening file.");
    exit(EXIT_FAILURE);
  }

  for (int ch = getc(stream); ch != EOF; ch = getc(stream)) {
    str_push(result, ch);
  }

  fclose(stream);
}
/*
 * Converts one `.md` file into its `.html` counterpart.
 *
 * fpath: NUL-terminated path of the `.md` file, passed as `void *`.
 *
 * The page is assembled in memory as header + converted markdown + footer
 * and then written next to the source file by store_result(). Always returns
 * NULL; any failure terminates the process with EXIT_FAILURE.
 *
 * The `void *` in / `void *` out signature matches a pthread start routine,
 * although main() calls the function directly in a forked child.
 */
static void *process_md_file(void *fpath) {
  char *md_path = fpath;

  // The result where the new parsed file will live.
  str_t *result = new_str();
  load_into_result(HEADER_FILE, result);

  // Read the .md file into `s`.
  str_t *s = new_str();
  FILE *file_ptr = fopen(md_path, "r");
  if (file_ptr == NULL) {
    perror("Error opening file");
    exit(EXIT_FAILURE);
  }
  int c;
  while ((c = getc(file_ptr)) != EOF) {
    str_push(s, c);
  }
  fclose(file_ptr);

  // Process all the md chunks.
  md_html_userdata_t userdata = {md_path, result};
  if (md_html(s->str, s->len, &process_md_chunk, (void *)&userdata, 0, 0) ==
      -1) {
    // md_html() reports failure through its return value only, so errno
    // holds nothing useful here: name the file instead of calling perror().
    fprintf(stderr, "Couldn't parse the file: %s\n", md_path);
    exit(EXIT_FAILURE);
  }
  free_str(s);

  load_into_result(FOOTER_FILE, result);

  // Store the resulting file in the disk.
  store_result(md_path, result);
  free_str(result);
  return NULL;
}
/*
 * nftw() callback: stores a heap copy of `fpath` in the global `fpaths`
 * array so the file can be converted later.
 *
 * The entry is skipped when:
 * - it isn't a regular file (tflag != FTW_F);
 * - its name doesn't end with the `.md` extension;
 * - it lives inside a `.git` folder.
 *
 * Returns 0 to keep walking, or -1 when the copy cannot be allocated, which
 * stops the walk and makes nftw() itself return -1.
 */
static int push_fpath(const char *fpath, const struct stat *sb, int tflag,
                      struct FTW *ftwbuf) {
  static const char md_ext[] = ".md";
  const size_t ext_len = sizeof md_ext - 1;

  (void)sb;
  (void)ftwbuf;

  // If not a regular file, don't do anything.
  if (tflag != FTW_F) {
    return 0;
  }

  // Compare the end of the path with the extension. Searching for the first
  // `.md` would reject valid files whenever `.md` also shows up earlier in
  // the path (for instance in a directory name).
  size_t path_len = strlen(fpath);
  if (path_len < ext_len ||
      strcmp(fpath + path_len - ext_len, md_ext) != 0) {
    return 0;
  }

  // Match `.git` as a whole path component, so that names merely containing
  // `.git` are still converted.
  if (strstr(fpath, "/.git/") != NULL) {
    return 0;
  }

  char *fpath_copy = malloc(path_len + 1);
  if (fpath_copy == NULL) {
    return -1;
  }
  memcpy(fpath_copy, fpath, path_len + 1);
  array_push(fpaths, fpath_copy);
  return 0;
}
/*
 * Entry point: converts the whole website and publishes it.
 *
 * 1. Recreates TMP_FOLDER as a copy of INPUT_FOLDER.
 * 2. Collects every `.md` file below TMP_FOLDER.
 * 3. Forks one child process per file to convert it, then waits for all of
 *    them.
 * 4. Empties OUTPUT_FOLDER (keeping `.git` and `feed`) and copies the
 *    content of TMP_FOLDER into it.
 *
 * Command line arguments are ignored. A failing child is reported on stderr
 * but does not stop the publication.
 */
int main(int argc, char *argv[]) {
  (void)argc;
  (void)argv;

  fpaths = new_array();
  rm(TMP_FOLDER);
  cp(TMP_FOLDER, INPUT_FOLDER);

  // Walk down the folder tree, and store file names in the `fpaths` array.
  if (nftw(TMP_FOLDER, push_fpath, 20, 0) == -1) {
    perror("nftw");
    exit(EXIT_FAILURE);
  }

  for (size_t i = 0; i < fpaths->len; i++) {
    pid_t pid = fork();
    if (pid < 0) {
      perror("fork");
      exit(EXIT_FAILURE);
    } else if (pid == 0) {
      // Child process: convert a single file and leave.
      process_md_file(fpaths->items[i]);
      exit(EXIT_SUCCESS);
    }
  }

  // Parent process waits for all child processes to finish.
  for (size_t i = 0; i < fpaths->len; i++) {
    int status = 0;
    if (waitpid(-1, &status, 0) == -1) {
      // `status` is meaningless when waitpid() itself fails; stop waiting
      // instead of inspecting it.
      perror("waitpid");
      break;
    }
    if (WIFEXITED(status)) {
      if (WEXITSTATUS(status) != 0) {
        fprintf(stderr, "Child process terminated with error.\n");
      }
    } else {
      fprintf(stderr, "Child process did not terminate normally.\n");
    }
  }

  // TODO: maybe possible to do this in C.
  // Wipe out the output folder, leaving only the .git directory and `feed`.
  char cmd[MEDIUM_BUFFER_SIZE];
  int written = snprintf(
      cmd, sizeof cmd,
      "find %s -mindepth 1 -not -regex '^%s.git.*' -not -name 'feed' -delete",
      OUTPUT_FOLDER, OUTPUT_FOLDER);
  if (written < 0 || (size_t)written >= sizeof cmd) {
    // Never run a truncated command: it could delete the wrong files.
    fprintf(stderr, "Cleanup command does not fit in the buffer.\n");
    exit(EXIT_FAILURE);
  }
  if (system(cmd) != 0) {
    fprintf(stderr, "Cleaning the output folder failed.\n");
  }

  // Move all the files from the TMP_FOLDER into the OUTPUT_FOLDER.
  char tmp_folder_star[MEDIUM_BUFFER_SIZE];
  written = snprintf(tmp_folder_star, sizeof tmp_folder_star, "%s*",
                     TMP_FOLDER);
  if (written < 0 || (size_t)written >= sizeof tmp_folder_star) {
    fprintf(stderr, "Temporary folder path does not fit in the buffer.\n");
    exit(EXIT_FAILURE);
  }
  cp(OUTPUT_FOLDER, tmp_folder_star);

  free_array(fpaths, true, NULL);
  exit(EXIT_SUCCESS);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment