Created
August 26, 2024 09:39
-
-
Save marcelofern/896574e055a05d011449b00217600fe6 to your computer and use it in GitHub Desktop.
Website converter (.md -> .html)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* converter.c | |
* | |
* This code converts `.md` files into `.html` files. | |
* | |
* The main use is to write a static website in `.md` files, and then run this | |
* program to convert it to `.html` | |
* | |
* The program relies on the following variables: | |
* | |
* - INPUT_FOLDER: full path of the website with `.md` files. | |
* - TMP_FOLDER: a temporary folder to store the conversion results. | |
* - HEADER_FILE: a file containing html ending with the <body> tag. | |
* - FOOTER_FILE: a file containing html starting with the </body> tag. | |
* | |
* On a high level, the program does the following: | |
* | |
* - Removes the TMP_FOLDER if it exists. | |
* - Copies the INPUT_FOLDER onto the TMP_FOLDER path. | |
* - Calls nftw to read the TMP_FOLDER directory and all its md files. | |
* - For each md file, fork a child process to process the md file.
* - For each md file, convert .md links into .html <a> links. | |
*/ | |
// DEBUG | |
// !gcc -Wall -Werror -g -O0 % %:h/parser/*.c -o /tmp/a.out && /tmp/a.out | |
// RUN MEMORY ANALYSIS. | |
// !valgrind -s --leak-check=full --show-leak-kinds=all /tmp/a.out | |
// PROFILE | |
// !gcc -Og -pg % %:h/parser/*.c -o /tmp/prog && /tmp/prog && gprof /tmp/prog | |
// COMPILE FOR USE | |
// !gcc -O3 -std=c99 % %:h/parser/*.c -o scripts/website/converter.bin | |
#define MEDIUM_BUFFER_SIZE 512 | |
#define _XOPEN_SOURCE 500 // Required for nftw | |
#include "../c/array.c" | |
#include "../c/file.c" | |
#include "../c/str.c" | |
#include "parser/md4c-html.h" | |
#include <ftw.h> | |
#include <pthread.h> | |
#include <stdio.h> | |
#include <string.h> | |
#include <sys/types.h> | |
#include <sys/wait.h> | |
#include <unistd.h> | |
char *INPUT_FOLDER = "/home/x/workspace/website/"; | |
char *OUTPUT_FOLDER = "/home/x/workspace/marcelofern.github.io/"; | |
char *OUTPUT_FOLDER_GIT = "/home/x/workspace/marcelofern.github.io/.git/"; | |
char *TMP_FOLDER = "/tmp/website_converter/"; | |
char *HEADER_FILE = "/home/x/workspace/dotfiles/scripts/website/header.html"; | |
char *FOOTER_FILE = "/home/x/workspace/dotfiles/scripts/website/footer.html"; | |
extern bool MD4C_HTML_INSIDE_LINK_SPAN; | |
array_t *fpaths; | |
typedef struct { | |
char *fpath; | |
str_t *result; | |
} md_html_userdata_t; | |
/*
 * Writes the HTML accumulated in `result` to disk.
 *
 * The destination is `fpath` with `.md` replaced by `.html` (as computed by
 * swap_substr). Failing to open, write or close the file is fatal: an error
 * is reported and the process exits with EXIT_FAILURE.
 *
 * `result` is left untouched: its bytes are written by length instead of
 * appending a NUL terminator and printing it as a C string.
 */
static void store_result(const char *fpath, str_t *result) {
  char *html_fpath = swap_substr((char *)fpath, ".md", ".html");

  FILE *file_ptr = fopen(html_fpath, "w");
  if (file_ptr == NULL) {
    perror("Error opening file.");
    exit(EXIT_FAILURE);
  }

  // Assumes `len` is the number of characters pushed so far, which is how
  // the rest of this file uses it (see the md_html call) -- confirm in str.c.
  size_t len = (size_t)result->len;
  if (len > 0 && fwrite(result->str, sizeof(char), len, file_ptr) != len) {
    perror("Error writing file");
    exit(EXIT_FAILURE);
  }

  // fclose flushes buffered data, so a failure here means data was lost.
  if (fclose(file_ptr) != 0) {
    perror("Error closing file");
    exit(EXIT_FAILURE);
  }

  // swap_substr appears to hand back its input when nothing was replaced
  // (process_md_chunk relies on this); never free the caller's string.
  if (html_fpath != fpath) {
    free(html_fpath);
  }
}
/*
 * Tells whether a rendered chunk is something other than anchor markup.
 *
 * Returns false when the chunk starts with `<a ` or with `">`, and true
 * otherwise.
 *
 * `output` is only valid for `output_size` bytes and is not NUL-terminated,
 * so the prefix comparisons are bounded by the chunk length instead of
 * reading whatever follows the chunk in memory.
 */
static bool not_anchors(const char *output, MD_SIZE output_size) {
  static const char anchor_open[] = "<a ";
  static const char attr_close[] = "\">";
  const size_t open_len = sizeof anchor_open - 1;
  const size_t close_len = sizeof attr_close - 1;

  if (output_size >= open_len && memcmp(output, anchor_open, open_len) == 0) {
    return false;
  }
  if (output_size >= close_len &&
      memcmp(output, attr_close, close_len) == 0) {
    return false;
  }
  return true;
}
/*
 * md_html() output callback: appends one rendered chunk to the result buffer
 * stored in `userdata` (an md_html_userdata_t).
 *
 * Chunks seen while MD4C_HTML_INSIDE_LINK_SPAN is set, and that are not
 * anchor markup themselves, are treated as link targets:
 *
 *   - `.md` is swapped for `.html`, because an internal link to another
 *     `.md` file would otherwise end up in a 404;
 *   - for such links, the directory of the file being converted (relative to
 *     TMP_FOLDER) is removed from the link to make it relative.
 *
 * Every other chunk is copied through unchanged. Running out of memory is
 * fatal.
 */
static void process_md_chunk(const char *output, MD_SIZE output_size,
                             void *userdata) {
  md_html_userdata_t *html_userdata = (md_html_userdata_t *)userdata;

  if (!MD4C_HTML_INSIDE_LINK_SPAN || !not_anchors(output, output_size)) {
    for (size_t i = 0; i < output_size; i++) {
      str_push(html_userdata->result, output[i]);
    }
    return;
  }

  // The chunk is not NUL-terminated; make a terminated copy to work on.
  char *new_output = calloc((size_t)output_size + 1, sizeof(char));
  if (new_output == NULL) {
    perror("calloc");
    exit(EXIT_FAILURE);
  }
  memcpy(new_output, output, output_size);

  // swap_substr returns a different pointer only when it replaced something.
  char *parsed_output = swap_substr(new_output, ".md", ".html");
  bool is_md_link = parsed_output != new_output;

  if (is_md_link) {
    char *fpath = html_userdata->fpath;

    // Directory of the current file with the tmp folder prefix removed.
    char *parsed_fpath = swap_substr(fpath, TMP_FOLDER, "");
    char *fdir = extract_dir(parsed_fpath);
    // Only free what swap_substr allocated, never the caller's path.
    if (parsed_fpath != fpath) {
      free(parsed_fpath);
    }

    if (fdir != NULL && strncmp(fdir, "notes", strlen("notes")) == 0 &&
        strcmp(fdir, "notes/") != 0) {
      // The notes directory is symlinked into the website, so links written
      // inside a notes sub-directory need the `notes/` prefix added for the
      // directory stripping below to work properly. Remove this if the
      // notes are ever removed from the website.
      size_t pout_len = strlen(parsed_output);
      size_t notes_len = strlen("notes/");
      char *tmp = calloc(pout_len + notes_len + 1, sizeof(char));
      if (tmp == NULL) {
        perror("calloc");
        exit(EXIT_FAILURE);
      }
      memcpy(tmp, "notes/", notes_len);
      memcpy(tmp + notes_len, parsed_output, pout_len);
      free(parsed_output);
      parsed_output = tmp;
    }

    if (fdir != NULL) {
      // Remove the directory prefix from the link so it's not duplicated.
      char *tmp = swap_substr(parsed_output, fdir, "");
      if (tmp != parsed_output) {
        free(parsed_output);
        parsed_output = tmp;
      }
      free(fdir);
    }
  }

  // Length is computed once rather than on every loop iteration.
  size_t parsed_len = strlen(parsed_output);
  for (size_t i = 0; i < parsed_len; i++) {
    str_push(html_userdata->result, parsed_output[i]);
  }

  // When nothing was replaced, parsed_output aliases new_output.
  if (is_md_link) {
    free(parsed_output);
  }
  free(new_output);
}
/*
 * Appends the whole content of `file` to `result`, one character at a time.
 *
 * Mostly used to load the header and the footer. If the file cannot be
 * opened, an error is reported and the process exits with EXIT_FAILURE.
 */
static void load_into_result(const char *file, str_t *result) {
  FILE *stream = fopen(file, "r");
  if (!stream) {
    perror("Error opening file.");
    exit(EXIT_FAILURE);
  }

  for (int ch = getc(stream); ch != EOF; ch = getc(stream)) {
    str_push(result, ch);
  }

  fclose(stream);
}
static void *process_md_file(void *fpath) { | |
fpath = (char *)fpath; | |
// The result where the new parsed file will live. | |
str_t *result = new_str(); | |
// load the header into `result` | |
load_into_result(HEADER_FILE, result); | |
// read the .md file into `s`. | |
str_t *s = new_str(); | |
FILE *file_ptr = fopen(fpath, "r"); | |
if (file_ptr == NULL) { | |
perror("Error opening file"); | |
exit(EXIT_FAILURE); | |
} | |
int c; | |
while ((c = getc(file_ptr)) != EOF) { | |
str_push(s, c); | |
} | |
fclose(file_ptr); | |
// process all the md chunks. | |
md_html_userdata_t userdata = {fpath, result}; | |
if (md_html(s->str, s->len, &process_md_chunk, (void *)&userdata, 0, 0) == | |
-1) { | |
perror("Couldn't parse the file."); | |
exit(EXIT_FAILURE); | |
} | |
free_str(s); | |
// load the footer into `result` | |
load_into_result(FOOTER_FILE, result); | |
// store the resulting file in the disk. | |
store_result(fpath, result); | |
free_str(result); | |
return NULL; | |
} | |
static int push_fpath(const char *fpath, const struct stat *sb, int tflag, | |
struct FTW *ftwbuf) { | |
/* | |
* Pushes the `fpath` string fetched by nftw to the global array of | |
* fpaths to process later. | |
* | |
* Skip if: | |
* - file doesn't have the `.md` extension. | |
* - file is in the `.git` folder. | |
* - file isn't a regular file (from nftw). | |
*/ | |
// If not a regular file, don't do anything. | |
if (tflag != FTW_F) { | |
return 0; | |
} | |
// If the file extension is not .md, no processing needs to take place. | |
char *match = strstr(fpath, ".md"); | |
if ((match == NULL) || (strlen(match) != 3)) { | |
return 0; | |
} | |
// If it's the `.git` folder, don't do anything either. | |
if (strstr(fpath, ".git") != NULL) { | |
return 0; | |
} | |
char *fpath_copy = malloc(sizeof(char) * (strlen(fpath) + 1)); | |
strcpy(fpath_copy, fpath); | |
array_push(fpaths, fpath_copy); | |
return 0; | |
} | |
int main(int argc, char *argv[]) { | |
fpaths = new_array(); | |
rm(TMP_FOLDER); | |
cp(TMP_FOLDER, INPUT_FOLDER); | |
// Walk down the folder tree, and store file names in the `fpaths` array. | |
if (nftw(TMP_FOLDER, push_fpath, 20, 0) == -1) { | |
perror("nftw"); | |
exit(EXIT_FAILURE); | |
} | |
for (size_t i = 0; i < fpaths->len; i++) { | |
pid_t pid = fork(); | |
if (pid < 0) { | |
// Error handling if fork fails | |
perror("fork"); | |
exit(EXIT_FAILURE); | |
} else if (pid == 0) { | |
// Child process | |
process_md_file(fpaths->items[i]); | |
exit(EXIT_SUCCESS); // Child process should exit after finishing its work | |
} | |
} | |
// Parent process waits for all child processes to finish | |
for (size_t i = 0; i < fpaths->len; i++) { | |
int status; | |
waitpid(-1, &status, 0); // Wait for each child process | |
if (WIFEXITED(status)) { | |
if (WEXITSTATUS(status) != 0) { | |
fprintf(stderr, "Child process terminated with error.\n"); | |
} | |
} else { | |
fprintf(stderr, "Child process did not terminate normally.\n"); | |
} | |
} | |
// TODO: maybe possible to do this in C. | |
// Wipe out the output folder, leaving only the .git directory. | |
char cmd[MEDIUM_BUFFER_SIZE]; | |
sprintf( | |
cmd, | |
"find %s -mindepth 1 -not -regex '^%s.git.*' -not -name 'feed' -delete", | |
OUTPUT_FOLDER, OUTPUT_FOLDER); | |
system(cmd); | |
// Move all the files from the TMP_FOLDER into the OUTPUT_FOLDER. | |
char tmp_folder_star[MEDIUM_BUFFER_SIZE]; | |
strcpy(tmp_folder_star, TMP_FOLDER); | |
strcat(tmp_folder_star, "*"); | |
cp(OUTPUT_FOLDER, tmp_folder_star); | |
free_array(fpaths, true, NULL); | |
exit(EXIT_SUCCESS); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment