marcelofern · August 26, 2024 09:39
diff --git a/converter.c b/converter.c
 /* converter.c
 *
 * This code converts `.md` files into `.html` files.
 *
 * The main use is to write a static website in `.md` files, and then run this
 * program to convert it to `.html`
 *
 * The program relies on the following variables:
 *
 * - INPUT_FOLDER: full path of the website with `.md` files.
 * - TMP_FOLDER: a temporary folder to store the conversion results.
 * - HEADER_FILE: a file containing html ending with the <body> tag.
 * - FOOTER_FILE: a file containing html starting with the </body> tag.
 *
 * On a high level, the program does the following:
 *
 * - Removes the TMP_FOLDER if it exists.
 * - Copies the INPUT_FOLDER onto the TMP_FOLDER path.
 * - Calls nftw to read the TMP_FOLDER directory and all its md files.
 * - For each md file, fork a child process to process the md file.
 * - For each md file, convert .md links into .html <a> links.
 */

 // DEBUG
 // !gcc -Wall -Werror -g -O0 % %:h/parser/*.c -o /tmp/a.out && /tmp/a.out

 // RUN MEMORY ANALYSIS.
 // !valgrind -s --leak-check=full --show-leak-kinds=all /tmp/a.out

 // PROFILE
 // !gcc -Og -pg % %:h/parser/*.c -o /tmp/prog && /tmp/prog && gprof /tmp/prog

 // COMPILE FOR USE
 // !gcc -O3 -std=c99 % %:h/parser/*.c -o scripts/website/converter.bin

 #define MEDIUM_BUFFER_SIZE 512
 #define _XOPEN_SOURCE 500 // Required for nftw
 #include "../c/array.c"
 #include "../c/file.c"
 #include "../c/str.c"
 #include "parser/md4c-html.h"
 #include <ftw.h>
 #include <pthread.h>
 #include <stdio.h>
 #include <string.h>
 #include <sys/types.h>
 #include <sys/wait.h>
 #include <unistd.h>

 // Root of the website sources (`.md` files); main copies it to TMP_FOLDER.
 char *INPUT_FOLDER = "/home/x/workspace/website/";
 // Destination of the converted site; main wipes it (sparing `.git` and
 // `feed`) before copying the conversion results in.
 char *OUTPUT_FOLDER = "/home/x/workspace/marcelofern.github.io/";
 // NOTE(review): not referenced anywhere in the visible code.
 char *OUTPUT_FOLDER_GIT = "/home/x/workspace/marcelofern.github.io/.git/";
 // Scratch folder where the conversion happens; removed at the start of main.
 char *TMP_FOLDER = "/tmp/website_converter/";
 // Html prepended to every converted page.
 char *HEADER_FILE = "/home/x/workspace/dotfiles/scripts/website/header.html";
 // Html appended to every converted page.
 char *FOOTER_FILE = "/home/x/workspace/dotfiles/scripts/website/footer.html";
 // Defined outside this file (presumably by the md parser -- confirm); read
 // by process_md_chunk to know whether the current chunk is inside a link.
 extern bool MD4C_HTML_INSIDE_LINK_SPAN;

 // Paths of the `.md` files found by push_fpath; consumed and freed by main.
 array_t *fpaths;

 // Context handed to process_md_chunk through md_html's userdata pointer.
 typedef struct {
  // Path of the `.md` file being converted.
  char *fpath;
  // Buffer accumulating the html produced so far.
  str_t *result;
 } md_html_userdata_t;

 /*
  * Writes `result` to the file named like `fpath` with `.md` swapped for
  * `.html`.
  *
  * fpath:  path of the markdown file that was converted.
  * result: buffer holding the complete html document.
  *
  * Exits the process with EXIT_FAILURE when the file cannot be opened,
  * written or closed.
  */
 static void store_result(const char *fpath, str_t *result) {
  char *html_fpath = swap_substr((char *)fpath, ".md", ".html");

  FILE *file_ptr = fopen(html_fpath, "w");
  if (file_ptr == NULL) {
    perror("Error opening file.");
    exit(EXIT_FAILURE);
  }

  // Write exactly `len` bytes instead of NUL-terminating the buffer and
  // printing it with "%s": this leaves `result` untouched and does not stop
  // at an embedded NUL byte.
  size_t total = (size_t)result->len;
  if (fwrite(result->str, sizeof(char), total, file_ptr) != total) {
    perror("Error writing file.");
    exit(EXIT_FAILURE);
  }

  // fclose flushes the stream, so a failure here means data was lost.
  if (fclose(file_ptr) != 0) {
    perror("Error closing file.");
    exit(EXIT_FAILURE);
  }

  // process_md_chunk relies on swap_substr handing back its input pointer
  // when nothing was replaced; in that case the path belongs to the caller
  // and must not be freed here.
  if (html_fpath != fpath) {
    free(html_fpath);
  }
 }

 /*
  * Tells whether a chunk is something other than the opening (`<a `) or the
  * closing (`">`) piece of an anchor tag.
  *
  * output:      chunk to inspect; it comes with an explicit size and is not
  *              assumed to be NUL-terminated.
  * output_size: number of valid bytes in `output`.
  *
  * Returns false when the chunk starts with `<a ` or with `">`, true
  * otherwise.
  */
 static bool not_anchors(const char *output, MD_SIZE output_size) {
  // Only compare bytes that belong to the chunk; strncmp would keep reading
  // past `output_size` whenever the chunk is shorter than the pattern.
  if (output_size >= 3 && memcmp(output, "<a ", 3) == 0) {
    return false;
  }
  return !(output_size >= 2 && memcmp(output, "\">", 2) == 0);
 }

 /*
  * Callback given to md_html: appends one chunk of generated html to the
  * result buffer stored in `userdata`.
  *
  * Chunks seen while MD4C_HTML_INSIDE_LINK_SPAN is set (other than the anchor
  * markup itself) are treated as link targets: `.md` is swapped for `.html`
  * and, for such links, the directory of the file being converted is removed
  * from the link so that it becomes relative to that file.
  *
  * output:      chunk of html; only `output_size` bytes are read.
  * output_size: size of the chunk in bytes.
  * userdata:    pointer to a md_html_userdata_t.
  *
  * Exits the process with EXIT_FAILURE when memory cannot be allocated.
  */
 static void process_md_chunk(const char *output, MD_SIZE output_size,
                             void *userdata) {
  md_html_userdata_t *html_userdata = (md_html_userdata_t *)userdata;

  if (!MD4C_HTML_INSIDE_LINK_SPAN || !not_anchors(output, output_size)) {
    // Nothing to rewrite: copy the chunk verbatim.
    for (size_t i = 0; i < output_size; i++) {
      str_push(html_userdata->result, output[i]);
    }
    return;
  }

  // NUL-terminated copy of the chunk so that string helpers can be used.
  char *new_output = calloc((size_t)output_size + 1, sizeof(char));
  if (new_output == NULL) {
    perror("calloc");
    exit(EXIT_FAILURE);
  }
  memcpy(new_output, output, output_size);

  // An internal link to another `.md` file would end up in a 404 unless its
  // extension is swapped. swap_substr gives back its input pointer when there
  // was nothing to swap, which is how `.md` links are detected.
  char *parsed_output = swap_substr(new_output, ".md", ".html");
  bool is_md_link = parsed_output != new_output;

  if (is_md_link) {
    // Directory of the current file, relative to the website root.
    char *fpath = html_userdata->fpath;
    char *parsed_fpath = swap_substr(fpath, TMP_FOLDER, "");
    char *fdir = extract_dir(parsed_fpath);
    // Only free the stripped path when it is a fresh allocation; otherwise it
    // is the caller's `fpath`.
    if (parsed_fpath != fpath) {
      free(parsed_fpath);
    }

    if (fdir != NULL && strstr(fdir, "notes") == fdir &&
        strcmp(fdir, "notes/") != 0) {
      // The notes directory is symlinked, so links inside its sub-folders
      // need the `notes/` prefix for the directory removal below to match.
      // Remove this if the notes ever leave the website.
      size_t pout_len = strlen(parsed_output);
      size_t notes_len = strlen("notes/");
      char *tmp = calloc(pout_len + notes_len + 1, sizeof(char));
      if (tmp == NULL) {
        perror("calloc");
        exit(EXIT_FAILURE);
      }
      memcpy(tmp, "notes/", notes_len);
      memcpy(tmp + notes_len, parsed_output, pout_len);
      free(parsed_output);
      parsed_output = tmp;
    }

    if (fdir != NULL) {
      // Remove the directory prefix from the link so it's not duplicated.
      char *tmp = swap_substr(parsed_output, fdir, "");
      if (tmp != parsed_output) {
        free(parsed_output);
        parsed_output = tmp;
      }
      free(fdir);
    }
  }

  // Measure once instead of calling strlen on every iteration.
  size_t parsed_len = strlen(parsed_output);
  for (size_t i = 0; i < parsed_len; i++) {
    str_push(html_userdata->result, parsed_output[i]);
  }

  // When no swap happened `parsed_output` aliases `new_output`, which is
  // released right below.
  if (is_md_link) {
    free(parsed_output);
  }
  free(new_output);
 }

 /*
  * Appends the whole content of `file` to `result`, one character at a time.
  * Mostly used to load the header and the footer.
  *
  * Exits the process with EXIT_FAILURE when the file cannot be opened.
  */
 static void load_into_result(const char *file, str_t *result) {
  FILE *stream = fopen(file, "r");
  if (!stream) {
    perror("Error opening file.");
    exit(EXIT_FAILURE);
  }

  for (int ch = getc(stream); ch != EOF; ch = getc(stream)) {
    str_push(result, ch);
  }

  fclose(stream);
 }

 /*
  * Converts one `.md` file into an `.html` file stored next to it.
  *
  * The html is assembled as: HEADER_FILE + converted markdown + FOOTER_FILE.
  *
  * fpath: NUL-terminated path (char *) of the `.md` file.
  *
  * Always returns NULL. Exits the process with EXIT_FAILURE when a file
  * cannot be opened or the markdown cannot be parsed.
  */
 static void *process_md_file(void *fpath) {
  char *md_path = fpath;

  // The result where the new parsed file will live, starting with the header.
  str_t *result = new_str();
  load_into_result(HEADER_FILE, result);

  // Read the .md file into `s`.
  str_t *s = new_str();
  load_into_result(md_path, s);

  // Process all the md chunks. md_html signals failure through its return
  // value, so the message names the file instead of printing errno.
  md_html_userdata_t userdata = {md_path, result};
  if (md_html(s->str, s->len, &process_md_chunk, (void *)&userdata, 0, 0) ==
      -1) {
    fprintf(stderr, "Couldn't parse the file: %s\n", md_path);
    exit(EXIT_FAILURE);
  }
  free_str(s);

  load_into_result(FOOTER_FILE, result);

  // Store the resulting file in the disk.
  store_result(md_path, result);
  free_str(result);
  return NULL;
 }

 /*
  * nftw callback: pushes a copy of `fpath` to the global array of fpaths to
  * process later.
  *
  * Skips the entry (returning 0) if:
  * - it isn't a regular file (from nftw).
  * - its name doesn't end with the `.md` extension.
  * - its path contains `.git` anywhere (substring match).
  *
  * Returns 0 to keep walking, or -1 to abort the walk when the path cannot be
  * copied.
  */
 static int push_fpath(const char *fpath, const struct stat *sb, int tflag,
                       struct FTW *ftwbuf) {
  // If not a regular file, don't do anything.
  if (tflag != FTW_F) {
    return 0;
  }

  // Compare the end of the path: looking for the first `.md` occurrence
  // would wrongly skip files such as `dir.md_old/page.md`.
  static const char md_ext[] = ".md";
  const size_t ext_len = sizeof md_ext - 1;
  const size_t path_len = strlen(fpath);
  if (path_len < ext_len ||
      strcmp(fpath + path_len - ext_len, md_ext) != 0) {
    return 0;
  }

  // If it's the `.git` folder, don't do anything either.
  if (strstr(fpath, ".git") != NULL) {
    return 0;
  }

  // nftw owns `fpath`, so keep a private copy in the array.
  char *fpath_copy = malloc(path_len + 1);
  if (fpath_copy == NULL) {
    perror("malloc");
    return -1;
  }
  memcpy(fpath_copy, fpath, path_len + 1);
  array_push(fpaths, fpath_copy);

  return 0;
 }

 /*
  * Entry point: converts the website found in INPUT_FOLDER and publishes it
  * into OUTPUT_FOLDER.
  *
  * Steps:
  * - recreate TMP_FOLDER from INPUT_FOLDER.
  * - collect every `.md` file under TMP_FOLDER.
  * - fork one child process per file to convert it.
  * - wipe OUTPUT_FOLDER (sparing `.git` and `feed`) and copy the results in.
  */
 int main(int argc, char *argv[]) {
  fpaths = new_array();

  rm(TMP_FOLDER);
  cp(TMP_FOLDER, INPUT_FOLDER);

  // Walk down the folder tree, and store file names in the `fpaths` array.
  if (nftw(TMP_FOLDER, push_fpath, 20, 0) == -1) {
    perror("nftw");
    exit(EXIT_FAILURE);
  }

  for (size_t i = 0; i < fpaths->len; i++) {
    pid_t pid = fork();

    if (pid < 0) {
      // Error handling if fork fails
      perror("fork");
      exit(EXIT_FAILURE);
    } else if (pid == 0) {
      // Child process
      process_md_file(fpaths->items[i]);
      exit(EXIT_SUCCESS); // Child process should exit after finishing its work
    }
  }

  // Parent process waits for all child processes to finish
  for (size_t i = 0; i < fpaths->len; i++) {
    int status = 0;
    // `status` is only meaningful when waitpid succeeded.
    if (waitpid(-1, &status, 0) == -1) {
      perror("waitpid");
      break;
    }
    if (WIFEXITED(status)) {
      if (WEXITSTATUS(status) != 0) {
        fprintf(stderr, "Child process terminated with error.\n");
      }
    } else {
      fprintf(stderr, "Child process did not terminate normally.\n");
    }
  }

  // TODO: maybe possible to do this in C.
  // Wipe out the output folder, leaving only the .git directory.
  // A truncated command could lose its `-not` filters before `-delete`, so
  // refuse to run anything that did not fit in the buffer.
  char cmd[MEDIUM_BUFFER_SIZE];
  int cmd_len = snprintf(
      cmd, sizeof cmd,
      "find %s -mindepth 1 -not -regex '^%s.git.*' -not -name 'feed' -delete",
      OUTPUT_FOLDER, OUTPUT_FOLDER);
  if (cmd_len < 0 || (size_t)cmd_len >= sizeof cmd) {
    fprintf(stderr, "Cleanup command does not fit in the buffer.\n");
    exit(EXIT_FAILURE);
  }
  if (system(cmd) != 0) {
    fprintf(stderr, "Cleaning the output folder failed.\n");
  }

  // Move all the files from the TMP_FOLDER into the OUTPUT_FOLDER.
  char tmp_folder_star[MEDIUM_BUFFER_SIZE];
  int star_len =
      snprintf(tmp_folder_star, sizeof tmp_folder_star, "%s*", TMP_FOLDER);
  if (star_len < 0 || (size_t)star_len >= sizeof tmp_folder_star) {
    fprintf(stderr, "Temporary folder path does not fit in the buffer.\n");
    exit(EXIT_FAILURE);
  }
  cp(OUTPUT_FOLDER, tmp_folder_star);

  free_array(fpaths, true, NULL);
  exit(EXIT_SUCCESS);
 }
	/* converter.c
	*
	* This code converts `.md` files into `.html` files.
	*
	* The main use is to write a static website in `.md` files, and then run this
	* program to convert it to `.html`
	*
	* The program relies on the following variables:
	*
	* - INPUT_FOLDER: full path of the website with `.md` files.
	* - TMP_FOLDER: a temporary folder to store the conversion results.
	* - HEADER_FILE: a file containing html ending with the <body> tag.
	* - FOOTER_FILE: a file containing html starting with the </body> tag.
	*
	* On a high level, the program does the following:
	*
	* - Removes the TMP_FOLDER if it exists.
	* - Copies the INPUT_FOLDER onto the TMP_FOLDER path.
	* - Calls nftw to read the TMP_FOLDER directory and all its md files.
	* - For each md file, fork a child process to process the md file.
	* - For each md file, convert .md links into .html <a> links.
	*/

	// DEBUG
	// !gcc -Wall -Werror -g -O0 % %:h/parser/*.c -o /tmp/a.out && /tmp/a.out

	// RUN MEMORY ANALYSIS.
	// !valgrind -s --leak-check=full --show-leak-kinds=all /tmp/a.out

	// PROFILE
	// !gcc -Og -pg % %:h/parser/*.c -o /tmp/prog && /tmp/prog && gprof /tmp/prog

	// COMPILE FOR USE
	// !gcc -O3 -std=c99 % %:h/parser/*.c -o scripts/website/converter.bin

	#define MEDIUM_BUFFER_SIZE 512
	#define _XOPEN_SOURCE 500 // Required for nftw
	#include "../c/array.c"
	#include "../c/file.c"
	#include "../c/str.c"
	#include "parser/md4c-html.h"
	#include <ftw.h>
	#include <pthread.h>
	#include <stdio.h>
	#include <string.h>
	#include <sys/types.h>
	#include <sys/wait.h>
	#include <unistd.h>

	// Root of the website sources (`.md` files); main copies it to TMP_FOLDER.
	char *INPUT_FOLDER = "/home/x/workspace/website/";
	// Destination of the converted site; main wipes it (sparing `.git` and
	// `feed`) before copying the conversion results in.
	char *OUTPUT_FOLDER = "/home/x/workspace/marcelofern.github.io/";
	// NOTE(review): not referenced anywhere in the visible code.
	char *OUTPUT_FOLDER_GIT = "/home/x/workspace/marcelofern.github.io/.git/";
	// Scratch folder where the conversion happens; removed at the start of main.
	char *TMP_FOLDER = "/tmp/website_converter/";
	// Html prepended to every converted page.
	char *HEADER_FILE = "/home/x/workspace/dotfiles/scripts/website/header.html";
	// Html appended to every converted page.
	char *FOOTER_FILE = "/home/x/workspace/dotfiles/scripts/website/footer.html";
	// Defined outside this file (presumably by the md parser -- confirm); read
	// by process_md_chunk to know whether the current chunk is inside a link.
	extern bool MD4C_HTML_INSIDE_LINK_SPAN;

	// Paths of the `.md` files found by push_fpath; consumed and freed by main.
	array_t *fpaths;

	// Context handed to process_md_chunk through md_html's userdata pointer.
	typedef struct {
	// Path of the `.md` file being converted.
	char *fpath;
	// Buffer accumulating the html produced so far.
	str_t *result;
	} md_html_userdata_t;

	/*
	 * Writes `result` to the file named like `fpath` with `.md` swapped for
	 * `.html`.
	 *
	 * fpath:  path of the markdown file that was converted.
	 * result: buffer holding the complete html document.
	 *
	 * Exits the process with EXIT_FAILURE when the file cannot be opened,
	 * written or closed.
	 */
	static void store_result(const char *fpath, str_t *result) {
	  char *html_fpath = swap_substr((char *)fpath, ".md", ".html");

	  FILE *file_ptr = fopen(html_fpath, "w");
	  if (file_ptr == NULL) {
	    perror("Error opening file.");
	    exit(EXIT_FAILURE);
	  }

	  // Write exactly `len` bytes instead of NUL-terminating the buffer and
	  // printing it with "%s": this leaves `result` untouched and does not stop
	  // at an embedded NUL byte.
	  size_t total = (size_t)result->len;
	  if (fwrite(result->str, sizeof(char), total, file_ptr) != total) {
	    perror("Error writing file.");
	    exit(EXIT_FAILURE);
	  }

	  // fclose flushes the stream, so a failure here means data was lost.
	  if (fclose(file_ptr) != 0) {
	    perror("Error closing file.");
	    exit(EXIT_FAILURE);
	  }

	  // process_md_chunk relies on swap_substr handing back its input pointer
	  // when nothing was replaced; in that case the path belongs to the caller
	  // and must not be freed here.
	  if (html_fpath != fpath) {
	    free(html_fpath);
	  }
	}

	/*
	 * Tells whether a chunk is something other than the opening (`<a `) or the
	 * closing (`">`) piece of an anchor tag.
	 *
	 * output:      chunk to inspect; it comes with an explicit size and is not
	 *              assumed to be NUL-terminated.
	 * output_size: number of valid bytes in `output`.
	 *
	 * Returns false when the chunk starts with `<a ` or with `">`, true
	 * otherwise.
	 */
	static bool not_anchors(const char *output, MD_SIZE output_size) {
	  // Only compare bytes that belong to the chunk; strncmp would keep reading
	  // past `output_size` whenever the chunk is shorter than the pattern.
	  if (output_size >= 3 && memcmp(output, "<a ", 3) == 0) {
	    return false;
	  }
	  return !(output_size >= 2 && memcmp(output, "\">", 2) == 0);
	}

	/*
	 * Callback given to md_html: appends one chunk of generated html to the
	 * result buffer stored in `userdata`.
	 *
	 * Chunks seen while MD4C_HTML_INSIDE_LINK_SPAN is set (other than the anchor
	 * markup itself) are treated as link targets: `.md` is swapped for `.html`
	 * and, for such links, the directory of the file being converted is removed
	 * from the link so that it becomes relative to that file.
	 *
	 * output:      chunk of html; only `output_size` bytes are read.
	 * output_size: size of the chunk in bytes.
	 * userdata:    pointer to a md_html_userdata_t.
	 *
	 * Exits the process with EXIT_FAILURE when memory cannot be allocated.
	 */
	static void process_md_chunk(const char *output, MD_SIZE output_size,
	                             void *userdata) {
	  md_html_userdata_t *html_userdata = (md_html_userdata_t *)userdata;

	  if (!MD4C_HTML_INSIDE_LINK_SPAN || !not_anchors(output, output_size)) {
	    // Nothing to rewrite: copy the chunk verbatim.
	    for (size_t i = 0; i < output_size; i++) {
	      str_push(html_userdata->result, output[i]);
	    }
	    return;
	  }

	  // NUL-terminated copy of the chunk so that string helpers can be used.
	  char *new_output = calloc((size_t)output_size + 1, sizeof(char));
	  if (new_output == NULL) {
	    perror("calloc");
	    exit(EXIT_FAILURE);
	  }
	  memcpy(new_output, output, output_size);

	  // An internal link to another `.md` file would end up in a 404 unless its
	  // extension is swapped. swap_substr gives back its input pointer when
	  // there was nothing to swap, which is how `.md` links are detected.
	  char *parsed_output = swap_substr(new_output, ".md", ".html");
	  bool is_md_link = parsed_output != new_output;

	  if (is_md_link) {
	    // Directory of the current file, relative to the website root.
	    char *fpath = html_userdata->fpath;
	    char *parsed_fpath = swap_substr(fpath, TMP_FOLDER, "");
	    char *fdir = extract_dir(parsed_fpath);
	    // Only free the stripped path when it is a fresh allocation; otherwise
	    // it is the caller's `fpath`.
	    if (parsed_fpath != fpath) {
	      free(parsed_fpath);
	    }

	    if (fdir != NULL && strstr(fdir, "notes") == fdir &&
	        strcmp(fdir, "notes/") != 0) {
	      // The notes directory is symlinked, so links inside its sub-folders
	      // need the `notes/` prefix for the directory removal below to match.
	      // Remove this if the notes ever leave the website.
	      size_t pout_len = strlen(parsed_output);
	      size_t notes_len = strlen("notes/");
	      char *tmp = calloc(pout_len + notes_len + 1, sizeof(char));
	      if (tmp == NULL) {
	        perror("calloc");
	        exit(EXIT_FAILURE);
	      }
	      memcpy(tmp, "notes/", notes_len);
	      memcpy(tmp + notes_len, parsed_output, pout_len);
	      free(parsed_output);
	      parsed_output = tmp;
	    }

	    if (fdir != NULL) {
	      // Remove the directory prefix from the link so it's not duplicated.
	      char *tmp = swap_substr(parsed_output, fdir, "");
	      if (tmp != parsed_output) {
	        free(parsed_output);
	        parsed_output = tmp;
	      }
	      free(fdir);
	    }
	  }

	  // Measure once instead of calling strlen on every iteration.
	  size_t parsed_len = strlen(parsed_output);
	  for (size_t i = 0; i < parsed_len; i++) {
	    str_push(html_userdata->result, parsed_output[i]);
	  }

	  // When no swap happened `parsed_output` aliases `new_output`, which is
	  // released right below.
	  if (is_md_link) {
	    free(parsed_output);
	  }
	  free(new_output);
	}

	/*
	 * Appends the whole content of `file` to `result`, one character at a time.
	 * Mostly used to load the header and the footer.
	 *
	 * file:   path of the file to read.
	 * result: buffer receiving the characters.
	 *
	 * Exits the process with EXIT_FAILURE when the file cannot be opened.
	 */
	static void load_into_result(const char *file, str_t *result) {
	  // Both parameters are pointers: fopen takes a path and str_push takes the
	  // buffer it appends to.
	  FILE *file_ptr = fopen(file, "r");
	  if (file_ptr == NULL) {
	    perror("Error opening file.");
	    exit(EXIT_FAILURE);
	  }

	  int c;
	  while ((c = getc(file_ptr)) != EOF) {
	    str_push(result, c);
	  }
	  fclose(file_ptr);
	}

	/*
	 * Converts one `.md` file into an `.html` file stored next to it.
	 *
	 * The html is assembled as: HEADER_FILE + converted markdown + FOOTER_FILE.
	 *
	 * fpath: NUL-terminated path (char *) of the `.md` file.
	 *
	 * Always returns NULL. Exits the process with EXIT_FAILURE when a file
	 * cannot be opened or the markdown cannot be parsed.
	 */
	static void *process_md_file(void *fpath) {
	  char *md_path = fpath;

	  // The result where the new parsed file will live, starting with the
	  // header.
	  str_t *result = new_str();
	  load_into_result(HEADER_FILE, result);

	  // Read the .md file into `s`.
	  str_t *s = new_str();
	  load_into_result(md_path, s);

	  // Process all the md chunks. md_html signals failure through its return
	  // value, so the message names the file instead of printing errno.
	  md_html_userdata_t userdata = {md_path, result};
	  if (md_html(s->str, s->len, &process_md_chunk, (void *)&userdata, 0, 0) ==
	      -1) {
	    fprintf(stderr, "Couldn't parse the file: %s\n", md_path);
	    exit(EXIT_FAILURE);
	  }
	  free_str(s);

	  load_into_result(FOOTER_FILE, result);

	  // Store the resulting file in the disk.
	  store_result(md_path, result);
	  free_str(result);
	  return NULL;
	}

	/*
	 * nftw callback: pushes a copy of `fpath` to the global array of fpaths to
	 * process later.
	 *
	 * Skips the entry (returning 0) if:
	 * - it isn't a regular file (from nftw).
	 * - its name doesn't end with the `.md` extension.
	 * - its path contains `.git` anywhere (substring match).
	 *
	 * Returns 0 to keep walking, or -1 to abort the walk when the path cannot
	 * be copied.
	 */
	static int push_fpath(const char *fpath, const struct stat *sb, int tflag,
	                      struct FTW *ftwbuf) {
	  // If not a regular file, don't do anything.
	  if (tflag != FTW_F) {
	    return 0;
	  }

	  // Compare the end of the path: looking for the first `.md` occurrence
	  // would wrongly skip files such as `dir.md_old/page.md`.
	  static const char md_ext[] = ".md";
	  const size_t ext_len = sizeof md_ext - 1;
	  const size_t path_len = strlen(fpath);
	  if (path_len < ext_len ||
	      strcmp(fpath + path_len - ext_len, md_ext) != 0) {
	    return 0;
	  }

	  // If it's the `.git` folder, don't do anything either.
	  if (strstr(fpath, ".git") != NULL) {
	    return 0;
	  }

	  // nftw owns `fpath`, so keep a private copy in the array.
	  char *fpath_copy = malloc(path_len + 1);
	  if (fpath_copy == NULL) {
	    perror("malloc");
	    return -1;
	  }
	  memcpy(fpath_copy, fpath, path_len + 1);
	  array_push(fpaths, fpath_copy);

	  return 0;
	}

	/*
	 * Entry point: converts the website found in INPUT_FOLDER and publishes it
	 * into OUTPUT_FOLDER.
	 *
	 * Steps:
	 * - recreate TMP_FOLDER from INPUT_FOLDER.
	 * - collect every `.md` file under TMP_FOLDER.
	 * - fork one child process per file to convert it.
	 * - wipe OUTPUT_FOLDER (sparing `.git` and `feed`) and copy the results in.
	 */
	int main(int argc, char *argv[]) {
	  fpaths = new_array();

	  rm(TMP_FOLDER);
	  cp(TMP_FOLDER, INPUT_FOLDER);

	  // Walk down the folder tree, and store file names in the `fpaths` array.
	  if (nftw(TMP_FOLDER, push_fpath, 20, 0) == -1) {
	    perror("nftw");
	    exit(EXIT_FAILURE);
	  }

	  for (size_t i = 0; i < fpaths->len; i++) {
	    pid_t pid = fork();

	    if (pid < 0) {
	      // Error handling if fork fails
	      perror("fork");
	      exit(EXIT_FAILURE);
	    } else if (pid == 0) {
	      // Child process
	      process_md_file(fpaths->items[i]);
	      exit(EXIT_SUCCESS); // Child process should exit after finishing its work
	    }
	  }

	  // Parent process waits for all child processes to finish
	  for (size_t i = 0; i < fpaths->len; i++) {
	    int status = 0;
	    // `status` is only meaningful when waitpid succeeded.
	    if (waitpid(-1, &status, 0) == -1) {
	      perror("waitpid");
	      break;
	    }
	    if (WIFEXITED(status)) {
	      if (WEXITSTATUS(status) != 0) {
	        fprintf(stderr, "Child process terminated with error.\n");
	      }
	    } else {
	      fprintf(stderr, "Child process did not terminate normally.\n");
	    }
	  }

	  // TODO: maybe possible to do this in C.
	  // Wipe out the output folder, leaving only the .git directory.
	  // A truncated command could lose its `-not` filters before `-delete`, so
	  // refuse to run anything that did not fit in the buffer.
	  char cmd[MEDIUM_BUFFER_SIZE];
	  int cmd_len = snprintf(
	      cmd, sizeof cmd,
	      "find %s -mindepth 1 -not -regex '^%s.git.*' -not -name 'feed' -delete",
	      OUTPUT_FOLDER, OUTPUT_FOLDER);
	  if (cmd_len < 0 || (size_t)cmd_len >= sizeof cmd) {
	    fprintf(stderr, "Cleanup command does not fit in the buffer.\n");
	    exit(EXIT_FAILURE);
	  }
	  if (system(cmd) != 0) {
	    fprintf(stderr, "Cleaning the output folder failed.\n");
	  }

	  // Move all the files from the TMP_FOLDER into the OUTPUT_FOLDER.
	  char tmp_folder_star[MEDIUM_BUFFER_SIZE];
	  int star_len =
	      snprintf(tmp_folder_star, sizeof tmp_folder_star, "%s*", TMP_FOLDER);
	  if (star_len < 0 || (size_t)star_len >= sizeof tmp_folder_star) {
	    fprintf(stderr, "Temporary folder path does not fit in the buffer.\n");
	    exit(EXIT_FAILURE);
	  }
	  cp(OUTPUT_FOLDER, tmp_folder_star);

	  free_array(fpaths, true, NULL);
	  exit(EXIT_SUCCESS);
	}