lebriggs · August 4, 2024 01:30
diff --git a/gistfile1.txt b/gistfile1.txt
 # Title: Automated YAML Front Matter Updater For Markdown Files In R
 # Author: L. E. Briggs
 # Date: 03 August 2024

 # Note:
 # This is only the first draft of the script. Future updates will be forthcoming.
 # Related blog post: https://lebriggs.com/blog/post_index/

 # Overview:
 # This R script updates the YAML front matter of Markdown files (.md and .Rmd) by:
 # - Adding or modifying user-specified tags and categories
 # - Ensuring the date field is set to today's date if empty
 # - Incrementing the highest post_id to maintain unique identifiers
 # - Collecting previous tags and categories from all markdown files in specified directories
 # - Ensuring all tags and categories are properly formatted and unique
 # The script provides a comprehensive solution for maintaining and updating metadata across multiple markdown files.

 # Usage:
 # 1. Define the specific file to update and the directories to search.
 # 2. Customize the selected tags and categories to be added.
 # 3. Run the script in your R environment.

 # Dependencies:
 # - yaml
 # - dplyr
 # - fs
 # - here
 # - readr

 # I. Initial Set-up: Install and Load Required Packages with Basic Feedback

 #this script installs and loads required packages with basic feedback
 #separating the installation check from the loading step ensures a more robust process

 #list of required packages
 pkg3 <- c("yaml", "dplyr", "fs", "here", "readr")

 # Install (if needed) and attach one package, reporting the outcome.
 #
 # @param package Name of the package, as a length-one character vector.
 # @return A length-one character message describing what happened. Errors
 #   raised while installing or loading are caught and returned as a message
 #   instead of being propagated.
 install_and_load <- function(package) {
   tryCatch({
     # system.file() returns "" for a package that is not installed; this
     # avoids scanning every installed package as installed.packages() does
     already_installed <- nzchar(system.file(package = package))
     if (!already_installed) {
       # install.packages() only returns once installation has finished, so no
       # pause is needed before trying to load the package
       install.packages(package, dependencies = TRUE)
     }
     # require() is used on purpose: its TRUE/FALSE result selects the message
     loaded <- require(package, character.only = TRUE)
     if (already_installed && loaded) {
       paste(package, "was already installed and loaded.")
     } else if (already_installed) {
       paste("failed to load package:", package)
     } else if (loaded) {
       paste(package, "was installed and loaded successfully.")
     } else {
       paste("failed to install or load package:", package)
     }
   }, error = function(e) {
     # report the error text instead of aborting the whole set-up
     paste("error installing or loading package:", package, "-", conditionMessage(e))
   })
 }

 # install and load every package in pkg3, keeping one status message each
 install_results <- lapply(pkg3, install_and_load)

 # print the status messages under a title, one per line; writeLines() ends
 # with a newline so later output starts on its own line
 writeLines(c("summary:", unlist(install_results)))

 # II. Subsequent Session(s): Load Required Packages

 # list of required packages that were previously installed
 pkg3 <- c("yaml", "dplyr", "fs", "here", "readr")

 # attach the packages; library() stops with an error when a package is
 # missing, whereas require() only returns FALSE and lets the script run on
 # until a later call fails
 invisible(lapply(pkg3, library, character.only = TRUE))

 # III. The Code

 # The steps are as follows:

 # Step 1: Define the file and folders to be searched.
 # Step 2: Define function to read yaml front matter.
 # Step 3: Define function to handle the date field.
 # Step 4: Define function to update field.
 # Step 5: Define function to add quotes to fields.
 # Step 6: Define function to write updated yaml front matter.
 # Step 7: Define function to update tags and categories in the YAML front matter.
 # Step 8: Write the date field in yaml_content if it's empty
 # Step 9: Check for the highest post_id and increment it.
 # Step 10: Define function to collect previous tags and categories from all markdown files.
 # Step 11: Collect and print previous unique tags and categories.
 # Step 12: Specify the selected categories and tags to be added.
 # Step 13: Update tags and categories.

 # Step 1: Define the file and folders to be searched.

 # Define the specific file to update
 # (the path is assembled by here::here() from the given components)
 file_to_update <- here::here("content", "blog", "webscraping", "index.Rmd")

 # Specify the directories to search for markdown files
 # (each entry is passed through here::here() when the files are listed in Step 11)
 directories <- c("content/blog", "content/talk", "content/dataresource", "content/project")

 # Step 2: Define function to read yaml front matter.

 # Read a markdown file and parse its YAML front matter.
 #
 # @param file Path to a file whose front matter sits between the first two
 #   lines that consist solely of "---".
 # @return A list with `yaml_parsed` (the result of yaml::yaml.load() on the
 #   front matter text), `yaml_start` and `yaml_end` (line indices of the
 #   opening and closing delimiter) and `content` (all lines of the file).
 # Stops with an error naming the file when two delimiters are not found.
 read_yaml_front_matter <- function(file) {
   content <- readr::read_lines(file)
   delimiters <- which(content == "---")
   if (length(delimiters) < 2) {
     stop("no YAML front matter delimited by '---' found in: ", file, call. = FALSE)
   }
   yaml_start <- delimiters[1]
   yaml_end <- delimiters[2]
   # logical indexing selects zero lines for an empty front matter, where
   # (yaml_start + 1):(yaml_end - 1) would count backwards
   line_no <- seq_along(content)
   inside <- line_no > yaml_start & line_no < yaml_end
   yaml_parsed <- yaml::yaml.load(paste(content[inside], collapse = "\n"))
   list(yaml_parsed = yaml_parsed, yaml_start = yaml_start, yaml_end = yaml_end, content = content)
 }

 # Step 3: Define function to handle the date field.

 # Fill in an empty `date:` line of the front matter with today's date.
 #
 # @param yaml_content Character vector holding the front matter lines.
 # @return The lines after processing: an empty date line (`date:` or
 #   `date: ""`) is replaced by `date: "<today>"`; every other line is returned
 #   as it was. A status message is printed for an empty date line and for a
 #   date line already in "YYYY-MM-DD" form.
 handle_date_field <- function(yaml_content) {
   empty_date <- "^date: *\"?\"?$"
   dated <- "^date: \"[0-9]{4}-[0-9]{2}-[0-9]{2}\"$"

   # decide what a single front matter line becomes
   fix_line <- function(line) {
     if (grepl(empty_date, line)) {
       print("empty date field found. updating it to today's date.")
       return(sprintf('date: "%s"', Sys.Date()))
     }
     if (grepl(dated, line)) {
       print("existing date field is already formatted correctly; no need to modify.")
     }
     line
   }

   unlist(sapply(yaml_content, fix_line))
 }

 # Step 4: Define function to update field.

 # Replace (or append) a top-level list field in the front matter lines.
 #
 # @param field Name of the top-level key, e.g. "tags".
 # @param values Values to write as a block list; a character vector or a list
 #   of strings.
 # @param yaml_content Character vector of front matter lines.
 # @return Character vector of lines in which `field:` is followed by one
 #   "- value" line per value. The previous value of the field is removed,
 #   whether it was written inline on the key line or as list items on the
 #   following lines. When the field is absent it is appended at the end.
 update_field <- function(field, values, yaml_content) {
   lines <- as.character(yaml_content)
   values <- as.character(unlist(values))
   # paste0("- ", character(0)) would yield "- ", so guard the empty case
   items <- if (length(values) > 0) paste0("- ", values) else character(0)
   new_block <- c(paste0(field, ":"), items)

   # literal prefix match, so the field name is never treated as a regex
   key_idx <- which(startsWith(lines, paste0(field, ":")))
   if (length(key_idx) == 0) {
     return(c(lines, new_block))
   }
   key_idx <- key_idx[1]

   # lines after the key that still belong to its old value: indented lines
   # and "- item" lines, up to the first line that is neither
   after <- lines[seq_along(lines) > key_idx]
   is_old_value <- grepl("^([[:space:]]+[^[:space:]]|-([[:space:]]|$))", after)
   n_old <- if (all(is_old_value)) length(after) else which(!is_old_value)[1] - 1

   c(lines[seq_len(key_idx - 1)], new_block, after[seq_along(after) > n_old])
 }

 # Step 5: Define function to add quotes to fields.

 # Wrap each entry in double quotes unless it already starts and ends with one.
 #
 # @param fields Character vector (or list) of entries.
 # @return A list of the same length with every entry double-quoted.
 add_quotes <- function(fields) {
   quoted <- as.list(fields)
   for (i in seq_along(quoted)) {
     item <- quoted[[i]]
     already_quoted <- grepl('^".*"$', item)
     if (already_quoted) {
       next
     }
     quoted[[i]] <- sprintf('"%s"', item)
   }
   quoted
 }

 # Step 6: Define function to write updated yaml front matter.

 # Write the file back with its front matter replaced.
 #
 # @param file Path of the file to overwrite.
 # @param yaml_content Character vector of the new front matter lines.
 # @param yaml_start,yaml_end Line indices of the opening and closing "---"
 #   in `content`.
 # @param content All lines of the original file.
 # @return The value of readr::write_lines().
 write_yaml_front_matter <- function(file, yaml_content, yaml_start, yaml_end, content) {
   line_no <- seq_along(content)
   # logical indexing yields zero lines when nothing precedes or follows the
   # front matter; (yaml_end + 1):length(content) would count backwards and
   # append a duplicated "---" plus NA when the file ends at the delimiter
   before <- content[line_no < yaml_start]
   after <- content[line_no > yaml_end]
   updated_content <- c(before, "---", yaml_content, "---", after)
   # the target is passed positionally: the argument is named `file` in
   # current readr and `path` (now deprecated) in older releases
   readr::write_lines(updated_content, file)
 }

 # Step 7: Define function to update tags and categories in the YAML front matter.

 # Return the highest numeric id among tags of the form "post_id: NNN"
 # (three digits), or 0 when no tag matches.
 # Defined at top level because the script also calls it outside of
 # update_tags_categories().
 extract_post_id <- function(tags) {
   post_ids <- grep("^post_id: [0-9]{3}$", tags, value = TRUE)
   post_ids_numeric <- sort(as.numeric(sub("post_id: ", "", post_ids)))
   # Debugging: Print extracted numeric post_ids in order
   print("Extracted numeric post_ids:")
   print(post_ids_numeric)
   if (length(post_ids_numeric) == 0) {
     return(0)
   }
   max(post_ids_numeric)
 }

 # Update the date, categories and tags in the YAML front matter of one file.
 #
 # @param file Path of the markdown file to rewrite.
 # @param selected_categories Categories to add (character vector).
 # @param selected_tags Tags to add (character vector).
 # @param post_ids "post_id: NNN" tags already in use. Defaults to the
 #   `post_ids` element of the global `tags_categories`, which is what this
 #   function read directly before the parameter existed.
 # @return The value of write_yaml_front_matter(); the file is overwritten.
 update_tags_categories <- function(file, selected_categories, selected_tags,
                                    post_ids = tags_categories$post_ids) {
   yaml_data <- read_yaml_front_matter(file)
   yaml_parsed <- yaml_data$yaml_parsed
   yaml_start <- yaml_data$yaml_start
   yaml_end <- yaml_data$yaml_end
   content <- yaml_data$content

   # Step 8: Write the date field in yaml_content if it's empty

   # logical indexing selects zero lines for an empty front matter
   line_no <- seq_along(content)
   yaml_content <- content[line_no > yaml_start & line_no < yaml_end]
   yaml_content <- handle_date_field(yaml_content)

   # Step 9: Check for the highest post_id and increment it.

   highest_post_id <- extract_post_id(post_ids)
   new_post_id <- sprintf("%03d", highest_post_id + 1)
   new_post_id_tag <- sprintf("post_id: %s", new_post_id)

   # Debugging: Print new post_id
   print("New post_id to be assigned:")
   print(new_post_id)

   # strip one pair of surrounding double quotes so that quoted and unquoted
   # spellings of the same label compare equal
   unquote <- function(x) sub('^"(.*)"$', "\\1", as.character(unlist(x)))
   # exact-name lookup of a field parsed from the file (NULL when absent)
   existing <- function(key) if (is.list(yaml_parsed)) yaml_parsed[[key]] else NULL

   # a selected tag that reuses the highest id already taken gets the new id
   selected_tags <- unquote(selected_tags)
   stale_tag <- sprintf("post_id: %03d", highest_post_id)
   selected_tags[selected_tags == stale_tag] <- new_post_id_tag

   # merge with the values already in the file, de-duplicate on the unquoted
   # text, then quote everything: an unquoted "post_id: 001" list item would
   # be read back by a YAML parser as a mapping rather than a string
   all_categories <- add_quotes(
     unique(c(unquote(existing("categories")), unquote(selected_categories)))
   )
   all_tags <- add_quotes(unique(c(unquote(existing("tags")), selected_tags)))

   # Update yaml_content with new categories and tags
   yaml_content <- update_field("categories", all_categories, yaml_content)
   yaml_content <- update_field("tags", all_tags, yaml_content)

   write_yaml_front_matter(file, yaml_content, yaml_start, yaml_end, content)
 }

 # Step 10: Define function to collect previous tags and categories from all markdown files.

 # Collect the tags, categories and post_id tags used across markdown files.
 #
 # @param files Character vector of file paths; each one is read with
 #   read_yaml_front_matter().
 # @return A list with `unique_tags` and `unique_categories` (sorted, unique
 #   character vectors) and `post_ids` (unique tags of the form
 #   "post_id: NNN", in order of first appearance).
 collect_tags_categories <- function(files) {
   # flatten to a character vector, dropping missing and empty labels
   as_labels <- function(x) {
     x <- as.character(unlist(x))
     x[!is.na(x) & nzchar(x)]
   }
   # exact-name lookup; `$` would also accept a partially matching key
   field_of <- function(parsed, key) if (is.list(parsed)) parsed[[key]] else NULL

   parsed <- lapply(files, function(file) read_yaml_front_matter(file)$yaml_parsed)

   # labels are kept as whole strings: joining them with ", " and splitting
   # again would break any label that itself contains ", "
   all_tags <- as_labels(lapply(parsed, field_of, key = "tags"))
   all_categories <- as_labels(lapply(parsed, field_of, key = "categories"))

   post_ids <- unique(grep("^post_id: [0-9]{3}$", all_tags, value = TRUE))

   list(
     unique_tags = sort(unique(all_tags)),
     unique_categories = sort(unique(all_categories)),
     post_ids = post_ids
   )
 }

 # Step 11: Collect and print previous unique tags and categories.

 # list every .md / .Rmd file found below each of the search directories
 all_files <- unlist(
   lapply(
     directories,
     function(dir) {
       fs::dir_ls(here::here(dir), recurse = TRUE, regexp = "\\.(md|Rmd)$")
     }
   )
 )

 # gather the tags, categories and post_id tags used in those files
 tags_categories <- collect_tags_categories(all_files)

 # Debugging: Print all collected post_id tags
 print("Collected post_id tags from all files:")
 print(tags_categories$post_ids)

 sorted_tags <- sort(unlist(tags_categories$unique_tags))
 sorted_categories <- sort(unlist(tags_categories$unique_categories))

 # one comma-separated line per heading
 output <- paste0(
   "\nUnique Categories:\n", paste(sorted_categories, collapse = ", "),
   "\n\nUnique Tags:\n", paste(sorted_tags, collapse = ", "),
   "\n"
 )
 cat(output)

 # Step 12: Specify the selected categories and tags to be added.

 selected_tags <- c("R tutorial", "Web scraping") # add tags for the post here
 selected_categories <- c("Blog", "Project", "R Code", "Data Analysis") # add categories for the post here

 # Find the highest post_id collected in Step 11 and build the next one.
 # The lookup is written inline so that this step relies only on
 # tags_categories and not on a helper function being visible at top level.
 existing_post_ids <- grep("^post_id: [0-9]{3}$", tags_categories$post_ids, value = TRUE)
 existing_post_ids <- as.numeric(sub("^post_id: ", "", existing_post_ids))
 # max() of an empty vector warns and returns -Inf, so start from 0 instead
 highest_post_id <- if (length(existing_post_ids) > 0) max(existing_post_ids) else 0
 new_post_id <- sprintf("%03d", highest_post_id + 1)
 new_post_id_tag <- sprintf("post_id: %s", new_post_id)
 selected_tags <- c(new_post_id_tag, selected_tags)

 # Debugging: Print selected categories and tags for debugging purposes
 cat("Laura's Selected Categories:\n", paste(selected_categories, collapse = ", "), "\n")
 cat("Laura's Selected Tags:\n", paste(selected_tags, collapse = ", "), "\n")

 # Debugging: Ensure selected_tags is correct before updating YAML
 print("Selected tags right before updating YAML:")
 print(selected_tags)

 # Step 13: Update tags and categories.

 update_tags_categories(file_to_update, selected_categories, selected_tags)
	# Title: Automated YAML Front Matter Updater For Markdown Files In R
	# Author: L. E. Briggs
	# Date: 03 August 2024

	# Note:
	# This is only the first draft of the script. Future updates will be forthcoming.
	# Related blog post: https://lebriggs.com/blog/post_index/

	# Overview:
	# This R script updates the YAML front matter of Markdown files (.md and .Rmd) by:
	# - Adding or modifying user-specified tags and categories
	# - Ensuring the date field is set to today's date if empty
	# - Incrementing the highest post_id to maintain unique identifiers
	# - Collecting previous tags and categories from all markdown files in specified directories
	# - Ensuring all tags and categories are properly formatted and unique
	# The script provides a comprehensive solution for maintaining and updating metadata across multiple markdown files.

	# Usage:
	# 1. Define the specific file to update and the directories to search.
	# 2. Customize the selected tags and categories to be added.
	# 3. Run the script in your R environment.

	# Dependencies:
	# - yaml
	# - dplyr
	# - fs
	# - here
	# - readr

	# I. Initial Set-up: Install and Load Required Packages with Basic Feedback

	#this script installs and loads required packages with basic feedback
	#separating the installation check from the loading step ensures a more robust process

	#list of required packages
	pkg3 <- c("yaml", "dplyr", "fs", "here", "readr")

	# Install (if needed) and attach one package, reporting the outcome.
	#
	# @param package Name of the package, as a length-one character vector.
	# @return A length-one character message describing what happened. Errors
	#   raised while installing or loading are caught and returned as a message
	#   instead of being propagated.
	install_and_load <- function(package) {
	  tryCatch({
	    # system.file() returns "" for a package that is not installed; this
	    # avoids scanning every installed package as installed.packages() does
	    already_installed <- nzchar(system.file(package = package))
	    if (!already_installed) {
	      # install.packages() only returns once installation has finished, so no
	      # pause is needed before trying to load the package
	      install.packages(package, dependencies = TRUE)
	    }
	    # require() is used on purpose: its TRUE/FALSE result selects the message
	    loaded <- require(package, character.only = TRUE)
	    if (already_installed && loaded) {
	      paste(package, "was already installed and loaded.")
	    } else if (already_installed) {
	      paste("failed to load package:", package)
	    } else if (loaded) {
	      paste(package, "was installed and loaded successfully.")
	    } else {
	      paste("failed to install or load package:", package)
	    }
	  }, error = function(e) {
	    # report the error text instead of aborting the whole set-up
	    paste("error installing or loading package:", package, "-", conditionMessage(e))
	  })
	}

	# install and load every package in pkg3, keeping one status message each
	install_results <- lapply(pkg3, install_and_load)

	# print the status messages under a title, one per line; writeLines() ends
	# with a newline so later output starts on its own line
	writeLines(c("summary:", unlist(install_results)))

	# II. Subsequent Session(s): Load Required Packages

	# list of required packages that were previously installed
	pkg3 <- c("yaml", "dplyr", "fs", "here", "readr")

	# attach the packages; library() stops with an error when a package is
	# missing, whereas require() only returns FALSE and lets the script run on
	# until a later call fails
	invisible(lapply(pkg3, library, character.only = TRUE))

	# III. The Code

	# The steps are as follows:

	# Step 1: Define the file and folders to be searched.
	# Step 2: Define function to read yaml front matter.
	# Step 3: Define function to handle the date field.
	# Step 4: Define function to update field.
	# Step 5: Define function to add quotes to fields.
	# Step 6: Define function to write updated yaml front matter.
	# Step 7: Define function to update tags and categories in the YAML front matter.
	# Step 8: Write the date field in yaml_content if it's empty
	# Step 9: Check for the highest post_id and increment it.
	# Step 10: Define function to collect previous tags and categories from all markdown files.
	# Step 11: Collect and print previous unique tags and categories.
	# Step 12: Specify the selected categories and tags to be added.
	# Step 13: Update tags and categories.

	# Step 1: Define the file and folders to be searched.

	# Define the specific file to update
	# (the path is assembled by here::here() from the given components)
	file_to_update <- here::here("content", "blog", "webscraping", "index.Rmd")

	# Specify the directories to search for markdown files
	# (each entry is passed through here::here() when the files are listed in Step 11)
	directories <- c("content/blog", "content/talk", "content/dataresource", "content/project")

	# Step 2: Define function to read yaml front matter.

	# Read a markdown file and parse its YAML front matter.
	#
	# @param file Path to a file whose front matter sits between the first two
	#   lines that consist solely of "---".
	# @return A list with `yaml_parsed` (the result of yaml::yaml.load() on the
	#   front matter text), `yaml_start` and `yaml_end` (line indices of the
	#   opening and closing delimiter) and `content` (all lines of the file).
	# Stops with an error naming the file when two delimiters are not found.
	read_yaml_front_matter <- function(file) {
	  content <- readr::read_lines(file)
	  delimiters <- which(content == "---")
	  if (length(delimiters) < 2) {
	    stop("no YAML front matter delimited by '---' found in: ", file, call. = FALSE)
	  }
	  yaml_start <- delimiters[1]
	  yaml_end <- delimiters[2]
	  # logical indexing selects zero lines for an empty front matter, where
	  # (yaml_start + 1):(yaml_end - 1) would count backwards
	  line_no <- seq_along(content)
	  inside <- line_no > yaml_start & line_no < yaml_end
	  yaml_parsed <- yaml::yaml.load(paste(content[inside], collapse = "\n"))
	  list(yaml_parsed = yaml_parsed, yaml_start = yaml_start, yaml_end = yaml_end, content = content)
	}

	# Step 3: Define function to handle the date field.

	# Fill in an empty `date:` line of the front matter with today's date.
	#
	# @param yaml_content Character vector holding the front matter lines.
	# @return The lines after processing: an empty date line (`date:` or
	#   `date: ""`) is replaced by `date: "<today>"`; every other line is returned
	#   as it was. A status message is printed for an empty date line and for a
	#   date line already in "YYYY-MM-DD" form.
	handle_date_field <- function(yaml_content) {
	  empty_date <- "^date: *\"?\"?$"
	  dated <- "^date: \"[0-9]{4}-[0-9]{2}-[0-9]{2}\"$"

	  # decide what a single front matter line becomes
	  fix_line <- function(line) {
	    if (grepl(empty_date, line)) {
	      print("empty date field found. updating it to today's date.")
	      return(sprintf('date: "%s"', Sys.Date()))
	    }
	    if (grepl(dated, line)) {
	      print("existing date field is already formatted correctly; no need to modify.")
	    }
	    line
	  }

	  unlist(sapply(yaml_content, fix_line))
	}

	# Step 4: Define function to update field.

	# Replace (or append) a top-level list field in the front matter lines.
	#
	# @param field Name of the top-level key, e.g. "tags".
	# @param values Values to write as a block list; a character vector or a list
	#   of strings.
	# @param yaml_content Character vector of front matter lines.
	# @return Character vector of lines in which `field:` is followed by one
	#   "- value" line per value. The previous value of the field is removed,
	#   whether it was written inline on the key line or as list items on the
	#   following lines. When the field is absent it is appended at the end.
	update_field <- function(field, values, yaml_content) {
	  lines <- as.character(yaml_content)
	  values <- as.character(unlist(values))
	  # paste0("- ", character(0)) would yield "- ", so guard the empty case
	  items <- if (length(values) > 0) paste0("- ", values) else character(0)
	  new_block <- c(paste0(field, ":"), items)

	  # literal prefix match, so the field name is never treated as a regex
	  key_idx <- which(startsWith(lines, paste0(field, ":")))
	  if (length(key_idx) == 0) {
	    return(c(lines, new_block))
	  }
	  key_idx <- key_idx[1]

	  # lines after the key that still belong to its old value: indented lines
	  # and "- item" lines, up to the first line that is neither
	  after <- lines[seq_along(lines) > key_idx]
	  is_old_value <- grepl("^([[:space:]]+[^[:space:]]|-([[:space:]]|$))", after)
	  n_old <- if (all(is_old_value)) length(after) else which(!is_old_value)[1] - 1

	  c(lines[seq_len(key_idx - 1)], new_block, after[seq_along(after) > n_old])
	}

	# Step 5: Define function to add quotes to fields.

	# Wrap each entry in double quotes unless it already starts and ends with one.
	#
	# @param fields Character vector (or list) of entries.
	# @return A list of the same length with every entry double-quoted.
	add_quotes <- function(fields) {
	  quoted <- as.list(fields)
	  for (i in seq_along(quoted)) {
	    item <- quoted[[i]]
	    already_quoted <- grepl('^".*"$', item)
	    if (already_quoted) {
	      next
	    }
	    quoted[[i]] <- sprintf('"%s"', item)
	  }
	  quoted
	}

	# Step 6: Define function to write updated yaml front matter.

	# Write the file back with its front matter replaced.
	#
	# @param file Path of the file to overwrite.
	# @param yaml_content Character vector of the new front matter lines.
	# @param yaml_start,yaml_end Line indices of the opening and closing "---"
	#   in `content`.
	# @param content All lines of the original file.
	# @return The value of readr::write_lines().
	write_yaml_front_matter <- function(file, yaml_content, yaml_start, yaml_end, content) {
	  line_no <- seq_along(content)
	  # logical indexing yields zero lines when nothing precedes or follows the
	  # front matter; (yaml_end + 1):length(content) would count backwards and
	  # append a duplicated "---" plus NA when the file ends at the delimiter
	  before <- content[line_no < yaml_start]
	  after <- content[line_no > yaml_end]
	  updated_content <- c(before, "---", yaml_content, "---", after)
	  # the target is passed positionally: the argument is named `file` in
	  # current readr and `path` (now deprecated) in older releases
	  readr::write_lines(updated_content, file)
	}

	# Step 7: Define function to update tags and categories in the YAML front matter.

	# Return the highest numeric id among tags of the form "post_id: NNN"
	# (three digits), or 0 when no tag matches.
	# Defined at top level because the script also calls it outside of
	# update_tags_categories().
	extract_post_id <- function(tags) {
	  post_ids <- grep("^post_id: [0-9]{3}$", tags, value = TRUE)
	  post_ids_numeric <- sort(as.numeric(sub("post_id: ", "", post_ids)))
	  # Debugging: Print extracted numeric post_ids in order
	  print("Extracted numeric post_ids:")
	  print(post_ids_numeric)
	  if (length(post_ids_numeric) == 0) {
	    return(0)
	  }
	  max(post_ids_numeric)
	}

	# Update the date, categories and tags in the YAML front matter of one file.
	#
	# @param file Path of the markdown file to rewrite.
	# @param selected_categories Categories to add (character vector).
	# @param selected_tags Tags to add (character vector).
	# @param post_ids "post_id: NNN" tags already in use. Defaults to the
	#   `post_ids` element of the global `tags_categories`, which is what this
	#   function read directly before the parameter existed.
	# @return The value of write_yaml_front_matter(); the file is overwritten.
	update_tags_categories <- function(file, selected_categories, selected_tags,
	                                   post_ids = tags_categories$post_ids) {
	  yaml_data <- read_yaml_front_matter(file)
	  yaml_parsed <- yaml_data$yaml_parsed
	  yaml_start <- yaml_data$yaml_start
	  yaml_end <- yaml_data$yaml_end
	  content <- yaml_data$content

	  # Step 8: Write the date field in yaml_content if it's empty

	  # logical indexing selects zero lines for an empty front matter
	  line_no <- seq_along(content)
	  yaml_content <- content[line_no > yaml_start & line_no < yaml_end]
	  yaml_content <- handle_date_field(yaml_content)

	  # Step 9: Check for the highest post_id and increment it.

	  highest_post_id <- extract_post_id(post_ids)
	  new_post_id <- sprintf("%03d", highest_post_id + 1)
	  new_post_id_tag <- sprintf("post_id: %s", new_post_id)

	  # Debugging: Print new post_id
	  print("New post_id to be assigned:")
	  print(new_post_id)

	  # strip one pair of surrounding double quotes so that quoted and unquoted
	  # spellings of the same label compare equal
	  unquote <- function(x) sub('^"(.*)"$', "\\1", as.character(unlist(x)))
	  # exact-name lookup of a field parsed from the file (NULL when absent)
	  existing <- function(key) if (is.list(yaml_parsed)) yaml_parsed[[key]] else NULL

	  # a selected tag that reuses the highest id already taken gets the new id
	  selected_tags <- unquote(selected_tags)
	  stale_tag <- sprintf("post_id: %03d", highest_post_id)
	  selected_tags[selected_tags == stale_tag] <- new_post_id_tag

	  # merge with the values already in the file, de-duplicate on the unquoted
	  # text, then quote everything: an unquoted "post_id: 001" list item would
	  # be read back by a YAML parser as a mapping rather than a string
	  all_categories <- add_quotes(
	    unique(c(unquote(existing("categories")), unquote(selected_categories)))
	  )
	  all_tags <- add_quotes(unique(c(unquote(existing("tags")), selected_tags)))

	  # Update yaml_content with new categories and tags
	  yaml_content <- update_field("categories", all_categories, yaml_content)
	  yaml_content <- update_field("tags", all_tags, yaml_content)

	  write_yaml_front_matter(file, yaml_content, yaml_start, yaml_end, content)
	}

	# Step 10: Define function to collect previous tags and categories from all markdown files.

	# Collect the tags, categories and post_id tags used across markdown files.
	#
	# @param files Character vector of file paths; each one is read with
	#   read_yaml_front_matter().
	# @return A list with `unique_tags` and `unique_categories` (sorted, unique
	#   character vectors) and `post_ids` (unique tags of the form
	#   "post_id: NNN", in order of first appearance).
	collect_tags_categories <- function(files) {
	  # flatten to a character vector, dropping missing and empty labels
	  as_labels <- function(x) {
	    x <- as.character(unlist(x))
	    x[!is.na(x) & nzchar(x)]
	  }
	  # exact-name lookup; `$` would also accept a partially matching key
	  field_of <- function(parsed, key) if (is.list(parsed)) parsed[[key]] else NULL

	  parsed <- lapply(files, function(file) read_yaml_front_matter(file)$yaml_parsed)

	  # labels are kept as whole strings: joining them with ", " and splitting
	  # again would break any label that itself contains ", "
	  all_tags <- as_labels(lapply(parsed, field_of, key = "tags"))
	  all_categories <- as_labels(lapply(parsed, field_of, key = "categories"))

	  post_ids <- unique(grep("^post_id: [0-9]{3}$", all_tags, value = TRUE))

	  list(
	    unique_tags = sort(unique(all_tags)),
	    unique_categories = sort(unique(all_categories)),
	    post_ids = post_ids
	  )
	}

	# Step 11: Collect and print previous unique tags and categories.

	# list every .md / .Rmd file found below each of the search directories
	all_files <- unlist(
	  lapply(
	    directories,
	    function(dir) {
	      # the alternation must be a plain "|": "\|" is not a valid escape in
	      # an R string literal and stops the script from parsing
	      fs::dir_ls(here::here(dir), recurse = TRUE, regexp = "\\.(md|Rmd)$")
	    }
	  )
	)

	# gather the tags, categories and post_id tags used in those files
	tags_categories <- collect_tags_categories(all_files)

	# Debugging: Print all collected post_id tags
	print("Collected post_id tags from all files:")
	print(tags_categories$post_ids)

	sorted_tags <- sort(unlist(tags_categories$unique_tags))
	sorted_categories <- sort(unlist(tags_categories$unique_categories))

	# one comma-separated line per heading
	output <- paste0(
	  "\nUnique Categories:\n", paste(sorted_categories, collapse = ", "),
	  "\n\nUnique Tags:\n", paste(sorted_tags, collapse = ", "),
	  "\n"
	)
	cat(output)

	# Step 12: Specify the selected categories and tags to be added.

	selected_tags <- c("R tutorial", "Web scraping") # add tags for the post here
	selected_categories <- c("Blog", "Project", "R Code", "Data Analysis") # add categories for the post here

	# Find the highest post_id collected in Step 11 and build the next one.
	# The lookup is written inline so that this step relies only on
	# tags_categories and not on a helper function being visible at top level.
	existing_post_ids <- grep("^post_id: [0-9]{3}$", tags_categories$post_ids, value = TRUE)
	existing_post_ids <- as.numeric(sub("^post_id: ", "", existing_post_ids))
	# max() of an empty vector warns and returns -Inf, so start from 0 instead
	highest_post_id <- if (length(existing_post_ids) > 0) max(existing_post_ids) else 0
	new_post_id <- sprintf("%03d", highest_post_id + 1)
	new_post_id_tag <- sprintf("post_id: %s", new_post_id)
	selected_tags <- c(new_post_id_tag, selected_tags)

	# Debugging: Print selected categories and tags for debugging purposes
	cat("Laura's Selected Categories:\n", paste(selected_categories, collapse = ", "), "\n")
	cat("Laura's Selected Tags:\n", paste(selected_tags, collapse = ", "), "\n")

	# Debugging: Ensure selected_tags is correct before updating YAML
	print("Selected tags right before updating YAML:")
	print(selected_tags)

	# Step 13: Update tags and categories.

	update_tags_categories(file_to_update, selected_categories, selected_tags)