Skip to content

Instantly share code, notes, and snippets.

@lebriggs
Created August 4, 2024 01:30
Show Gist options
  • Save lebriggs/dc1036e923d321eb3853a924526332d9 to your computer and use it in GitHub Desktop.
Save lebriggs/dc1036e923d321eb3853a924526332d9 to your computer and use it in GitHub Desktop.
Automated YAML Front Matter Updater For Markdown Files In R
# Title: Automated YAML Front Matter Updater For Markdown Files In R
# Author: L. E. Briggs
# Date: 03 August 2024
# Note:
# This is only the first draft of the script. Future updates will be forthcoming.
# Related blog post: https://lebriggs.com/blog/post_index/
# Overview:
# This R script updates the YAML front matter of Markdown files (.md and .Rmd) by:
# - Adding or modifying user-specified tags and categories
# - Ensuring the date field is set to today's date if empty
# - Incrementing the highest post_id to maintain unique identifiers
# - Collecting previous tags and categories from all markdown files in specified directories
# - Ensuring all tags and categories are properly formatted and unique
# The script provides a comprehensive solution for maintaining and updating metadata across multiple markdown files.
# Usage:
# 1. Define the specific file to update and the directories to search.
# 2. Customize the selected tags and categories to be added.
# 3. Run the script in your R environment.
# Dependencies:
# - yaml
# - dplyr
# - fs
# - here
# - readr
# I. Initial Set-up: Install and Load Required Packages with Basic Feedback
#this script installs and loads required packages with basic feedback
#separating the installation check from the loading step ensures a more robust process
#list of required packages
pkg3 <- c("yaml", "dplyr", "fs", "here", "readr")
#create a function to install and load packages with feedback
#installing all dependencies minimizes potential issues with missing packages
#adding messages provides feedback on the installation and loading status of each package
# Install (when missing) and attach one package, reporting what happened.
#
# Args:
#   package: character scalar, the name of the package.
#
# Returns:
#   A character scalar status message. Errors raised while installing or
#   loading are caught and reported in the message instead of being propagated.
install_and_load <- function(package) {
  tryCatch(
    {
      # system.file() returns "" for a package that is not installed; unlike
      # installed.packages() it does not have to scan every installed package
      was_installed <- nzchar(system.file(package = package))
      if (!was_installed) {
        # install.packages() returns only after it has finished, so no wait
        # or library-path refresh is needed before loading
        install.packages(package, dependencies = TRUE)
      }
      # require() is used on purpose here: its TRUE/FALSE result selects the message
      loaded <- require(package, character.only = TRUE)
      if (was_installed && loaded) {
        paste(package, "was already installed and loaded.")
      } else if (was_installed) {
        paste("failed to load package:", package)
      } else if (loaded) {
        paste(package, "was installed and loaded successfully.")
      } else {
        paste("failed to install or load package:", package)
      }
    },
    error = function(e) {
      paste("error installing or loading package:", package, "-", conditionMessage(e))
    }
  )
}
# run the install-and-load helper once per required package, keeping one status message each
install_results <- lapply(X = pkg3, FUN = install_and_load)
# print the status messages under a "summary:" heading, one per line
cat(c("summary:\n", unlist(install_results)), sep = "\n")
# II. Subsequent Session(s): Load Required Packages
#list of required packages that were previously installed
pkg3 <- c("yaml", "dplyr", "fs", "here", "readr")
# attach the packages that were previously installed; library() stops with an
# error when a package is missing, whereas require() only returns FALSE and
# lets the script fail later with a less helpful message
invisible(lapply(pkg3, library, character.only = TRUE))
# III. The Code
# The steps are as follows:
# Step 1: Define the file and folders to be searched.
# Step 2: Define function to read yaml front matter.
# Step 3: Define function to handle the date field.
# Step 4: Define function to update field.
# Step 5: Define function to add quotes to fields.
# Step 6: Define function to write updated yaml front matter.
# Step 7: Define function to update tags and categories in the YAML front matter.
# Step 8: Write the date field in yaml_content if it's empty
# Step 9: Check for the highest post_id and increment it.
# Step 10: Define function to collect previous tags and categories from all markdown files.
# Step 11: Collect and print previous unique tags and categories.
# Step 12: Specify the selected categories and tags to be added.
# Step 13: Update tags and categories.
# Step 1: Define the file and folders to be searched.
# Markdown file whose YAML front matter will be rewritten
file_to_update <- here::here(file.path("content", "blog", "webscraping", "index.Rmd"))
# Content folders that are scanned for .md/.Rmd files
directories <- file.path("content", c("blog", "talk", "dataresource", "project"))
# Step 2: Define function to read yaml front matter.
# function to read yaml front matter
# Read a Markdown file and parse the YAML front matter found between its
# first two `---` lines.
#
# Args:
#   file: path to the .md/.Rmd file.
#
# Returns:
#   A list with:
#     yaml_parsed - the value returned by yaml::yaml.load() for the front matter
#     yaml_start  - line index of the opening `---`
#     yaml_end    - line index of the closing `---`
#     content     - every line of the file
#
# Stops with an explicit error when fewer than two `---` lines are present.
read_yaml_front_matter <- function(file) {
  content <- readr::read_lines(file)
  # accept trailing whitespace after the delimiter
  delimiters <- grep("^---\\s*$", content)
  if (length(delimiters) < 2) {
    stop("no YAML front matter (two '---' lines) found in: ", file, call. = FALSE)
  }
  yaml_start <- delimiters[1]
  yaml_end <- delimiters[2]
  # seq_len() selects nothing when the delimiters are adjacent, whereas
  # (yaml_start + 1):(yaml_end - 1) would count backwards and pick up the
  # delimiter lines themselves
  yaml_lines <- content[yaml_start + seq_len(yaml_end - yaml_start - 1)]
  yaml_parsed <- yaml::yaml.load(paste(yaml_lines, collapse = "\n"))
  list(
    yaml_parsed = yaml_parsed,
    yaml_start = yaml_start,
    yaml_end = yaml_end,
    content = content
  )
}
# Step 3: Define function to handle the date field.
# handle the date field
# Fill in an empty `date:` line of the front matter with today's date.
#
# Args:
#   yaml_content: character vector, one element per front matter line.
#
# Returns:
#   An unnamed character vector of the same length. Lines such as `date:`,
#   `date: ""` or `date: ''` become `date: "YYYY-MM-DD"` (today); every other
#   line is returned unchanged. Empty input gives character(0).
handle_date_field <- function(yaml_content) {
  # as.character() also turns NULL into character(0) and drops any names
  yaml_content <- as.character(yaml_content)
  # matches `date:` followed by nothing, an empty double- or single-quoted
  # string, and optional surrounding whitespace
  is_empty_date <- grepl("^date:\\s*(\"?\"?|'')\\s*$", yaml_content)
  is_formatted_date <- grepl("^date: \"[0-9]{4}-[0-9]{2}-[0-9]{2}\"$", yaml_content)
  if (any(is_empty_date)) {
    print("empty date field found. updating it to today's date.")
    yaml_content[is_empty_date] <- sprintf('date: "%s"', format(Sys.Date(), "%Y-%m-%d"))
  }
  if (any(is_formatted_date)) {
    print("existing date field is already formatted correctly; no need to modify.")
  }
  yaml_content
}
# Step 4: Define function to update field.
# function to update field
# Replace (or append) a list-valued field in the front matter lines.
#
# Args:
#   field: name of the top-level YAML key, e.g. "tags".
#   values: character vector (or list of strings) to write as list items.
#   yaml_content: character vector, one element per front matter line.
#
# Returns:
#   A character vector in which the first `field:` entry, together with the
#   list items or indented lines that belonged to it, is replaced by
#   `field:` followed by one `- value` line per value. When the field is
#   absent the new lines are appended at the end.
update_field <- function(field, values, yaml_content) {
  yaml_content <- as.character(yaml_content)
  values <- as.character(unlist(values))
  key <- paste0(field, ":")
  new_lines <- if (length(values) > 0) {
    c(key, paste0("- ", values))
  } else {
    # an explicit empty list instead of a dangling "- "
    paste0(key, " []")
  }

  # startsWith() compares literally, so the field name is never read as a regex
  hits <- which(startsWith(yaml_content, key))
  if (length(hits) == 0) {
    return(c(yaml_content, new_lines))
  }

  first <- hits[1]
  # lines that continue the old value: list items ("- x") or indented content;
  # these must be removed too, otherwise the old items would remain below the
  # new ones and show up as duplicates
  is_continuation <- grepl("^(\\s+\\S|-(\\s|$))", yaml_content)
  last <- first
  while (last < length(yaml_content) && is_continuation[last + 1]) {
    last <- last + 1
  }

  c(yaml_content[seq_len(first - 1)], new_lines, yaml_content[-seq_len(last)])
}
# Step 5: Define function to add quotes to fields.
# function to add quotes to fields
# Wrap each entry in double quotes unless it already starts and ends with one.
#
# Args:
#   fields: character vector or list of strings.
#
# Returns:
#   A list with one string per input entry.
add_quotes <- function(fields) {
  quote_if_needed <- function(entry) {
    already_quoted <- grepl('^".*"$', entry)
    if (already_quoted) {
      entry
    } else {
      sprintf('"%s"', entry)
    }
  }
  lapply(fields, quote_if_needed)
}
# Step 6: Define function to write updated yaml front matter.
# function to write updated yaml front matter
# Write the file back with its front matter replaced by `yaml_content`.
#
# Args:
#   file: path of the file to overwrite.
#   yaml_content: character vector of front matter lines (without delimiters).
#   yaml_start: line index of the opening `---` (accepted for symmetry with
#     read_yaml_front_matter(); not used when rebuilding the file).
#   yaml_end: line index of the closing `---` in `content`.
#   content: all original lines of the file.
#
# Returns:
#   The value of readr::write_lines().
write_yaml_front_matter <- function(file, yaml_content, yaml_start, yaml_end, content) {
  # negative indexing yields an empty body when the closing delimiter is the
  # last line; (yaml_end + 1):length(content) would count backwards there and
  # append an NA plus the delimiter line
  body_lines <- content[-seq_len(yaml_end)]
  # reconstruct the content with updated yaml
  updated_content <- c("---", yaml_content, "---", body_lines)
  # the target is passed positionally because the `path` argument name is deprecated
  readr::write_lines(updated_content, file)
}
# Step 7: Define function to update tags and categories in the YAML front matter.
# function to update the tags and categories in the yaml front matter
# Find the highest numeric id among tags of the form "post_id: NNN".
#
# Args:
#   tags: character vector of tags (may be NULL or empty).
#
# Returns:
#   The largest id as a number, or 0 when no post_id tag is present.
#
# Defined at top level (not nested) so the driver code further down can call it.
extract_post_id <- function(tags) {
  post_ids <- grep("^post_id: [0-9]{3}$", tags, value = TRUE)
  # sorted only to make the debugging output easier to read
  post_ids_numeric <- sort(as.numeric(sub("post_id: ", "", post_ids)))
  # Debugging: Print extracted numeric post_ids in order
  print("Extracted numeric post_ids:")
  print(post_ids_numeric)
  if (length(post_ids_numeric) == 0) {
    return(0)
  }
  max(post_ids_numeric)
}

# Update the date, categories and tags in the YAML front matter of one file.
#
# Args:
#   file: path of the .md/.Rmd file to rewrite.
#   selected_categories: categories to add (character vector or list).
#   selected_tags: tags to add (character vector or list).
#   existing_post_ids: "post_id: NNN" tags already used across the site.
#     Defaults to the post_ids of the global `tags_categories` object, which
#     is what this function previously read implicitly.
#
# Returns:
#   The value of write_yaml_front_matter(); called for its side effect of
#   rewriting `file`.
update_tags_categories <- function(file, selected_categories, selected_tags,
                                   existing_post_ids = tags_categories$post_ids) {
  yaml_data <- read_yaml_front_matter(file)
  yaml_parsed <- yaml_data$yaml_parsed
  yaml_start <- yaml_data$yaml_start
  yaml_end <- yaml_data$yaml_end
  content <- yaml_data$content

  # Step 8: Write the date field in yaml_content if it's empty
  # seq_len() keeps the selection empty when the front matter has no lines
  front_matter_idx <- yaml_start + seq_len(yaml_end - yaml_start - 1)
  yaml_content <- handle_date_field(content[front_matter_idx])

  # Step 9: Check for the highest post_id and increment it.
  highest_post_id <- extract_post_id(existing_post_ids)
  new_post_id <- sprintf("%03d", highest_post_id + 1)
  new_post_id_tag <- sprintf("post_id: %s", new_post_id)
  # Debugging: Print new post_id
  print("New post_id to be assigned:")
  print(new_post_id)

  # compare values without surrounding quotes so that `Blog` (as parsed from
  # the file) and `"Blog"` (as typed by the user) count as the same entry
  strip_quotes <- function(x) sub('^"(.*)"$', "\\1", as.character(unlist(x)))
  selected_tags <- strip_quotes(selected_tags)
  selected_categories <- strip_quotes(selected_categories)

  # a selected tag that repeats the current highest id is moved on to the new id
  current_highest_tag <- sprintf("post_id: %03d", highest_post_id)
  selected_tags[selected_tags == current_highest_tag] <- new_post_id_tag

  # merge with what the file already has, de-duplicate, then quote once
  merged_categories <- unique(c(strip_quotes(yaml_parsed$categories), selected_categories))
  merged_tags <- unique(c(strip_quotes(yaml_parsed$tags), selected_tags))

  # Update yaml_content with new categories and tags; a field with nothing to
  # write is left untouched
  if (length(merged_categories) > 0) {
    yaml_content <- update_field("categories", unlist(add_quotes(merged_categories)), yaml_content)
  }
  if (length(merged_tags) > 0) {
    yaml_content <- update_field("tags", unlist(add_quotes(merged_tags)), yaml_content)
  }

  write_yaml_front_matter(file, yaml_content, yaml_start, yaml_end, content)
}
# Step 10: Define function to collect previous tags and categories from all markdown files.
# function to collect previous tags and categories from all markdown files
# Gather the tags, categories and post_id tags used across a set of files.
#
# Args:
#   files: character vector of .md/.Rmd paths.
#
# Returns:
#   A list with:
#     unique_tags       - sorted unique tags
#     unique_categories - sorted unique categories
#     post_ids          - unique tags of the form "post_id: NNN", in order of
#                         first appearance
#
# A file whose front matter cannot be read is skipped with a warning instead
# of aborting the whole scan.
collect_tags_categories <- function(files) {
  per_file <- lapply(files, function(file) {
    yaml_parsed <- tryCatch(
      read_yaml_front_matter(file)$yaml_parsed,
      error = function(e) {
        warning("skipping ", file, ": ", conditionMessage(e), call. = FALSE)
        NULL
      }
    )
    # front matter that is empty or not a key/value mapping has no tags
    if (!is.list(yaml_parsed)) {
      yaml_parsed <- list()
    }
    list(
      tags = as.character(unlist(yaml_parsed$tags)),
      categories = as.character(unlist(yaml_parsed$categories))
    )
  })

  # values are kept as separate vector elements throughout; joining them with
  # ", " and splitting again would break any tag that itself contains a comma
  all_tags <- as.character(unlist(lapply(per_file, `[[`, "tags"), use.names = FALSE))
  all_categories <- as.character(unlist(lapply(per_file, `[[`, "categories"), use.names = FALSE))

  post_ids <- unique(grep("^post_id: [0-9]{3}$", all_tags, value = TRUE))

  # sort() also drops NA entries
  list(
    unique_tags = sort(unique(all_tags)),
    unique_categories = sort(unique(all_categories)),
    post_ids = post_ids
  )
}
# Step 11: Collect and print previous unique tags and categories.
# list every .md/.Rmd file below the configured directories; a directory that
# does not exist is skipped with a warning instead of stopping the script
all_files <- unlist(lapply(directories, function(dir) {
  dir_path <- here::here(dir)
  if (!fs::dir_exists(dir_path)) {
    warning("directory not found, skipping: ", dir_path, call. = FALSE)
    return(character())
  }
  fs::dir_ls(dir_path, recurse = TRUE, regexp = "\\.(md|Rmd)$")
}))
tags_categories <- collect_tags_categories(all_files)
# Debugging: Print all collected post_id tags
print("Collected post_id tags from all files:")
print(tags_categories$post_ids)
sorted_tags <- sort(unlist(tags_categories$unique_tags))
sorted_categories <- sort(unlist(tags_categories$unique_categories))
# one printable block listing the categories and tags already in use
output <- paste0(
  "\nUnique Categories:\n", paste(sorted_categories, collapse = ", "),
  "\n\nUnique Tags:\n", paste(sorted_tags, collapse = ", "), "\n"
)
cat(output)
# Step 12: Specify the selected categories and tags to be added.
selected_tags <- c("R tutorial", "Web scraping") # add tags for the post here
selected_categories <- c("Blog", "Project", "R Code", "Data Analysis") # add categories for the post here
# Find the highest post_id already in use and build the next one.
# The ids are computed inline so this step does not rely on a helper that is
# only defined inside another function; max(0, ...) gives 0 when no post_id
# tag exists yet.
highest_post_id <- max(
  0,
  as.numeric(sub(
    "post_id: ", "",
    grep("^post_id: [0-9]{3}$", tags_categories$post_ids, value = TRUE)
  ))
)
new_post_id <- sprintf("%03d", highest_post_id + 1)
new_post_id_tag <- sprintf("post_id: %s", new_post_id)
selected_tags <- c(new_post_id_tag, selected_tags)
# Debugging: Print selected categories and tags for debugging purposes
cat("Laura's Selected Categories:\n", paste(selected_categories, collapse = ", "), "\n")
cat("Laura's Selected Tags:\n", paste(selected_tags, collapse = ", "), "\n")
# Debugging: Ensure selected_tags is correct before updating YAML
print("Selected tags right before updating YAML:")
print(selected_tags)
# Step 13: Update tags and categories.
update_tags_categories(file_to_update, selected_categories, selected_tags)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment