Created
August 4, 2024 01:30
-
-
Save lebriggs/dc1036e923d321eb3853a924526332d9 to your computer and use it in GitHub Desktop.
Automated YAML Front Matter Updater For Markdown Files In R
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Title: Automated YAML Front Matter Updater For Markdown Files In R | |
# Author: L. E. Briggs | |
# Date: 03 August 2024 | |
# Note: | |
# This is only the first draft of the script. Future updates will be forthcoming. | |
# Related blog post: https://lebriggs.com/blog/post_index/ | |
# Overview: | |
# This R script updates the YAML front matter of Markdown files (.md and .Rmd) by: | |
# - Adding or modifying user-specified tags and categories | |
# - Ensuring the date field is set to today's date if empty | |
# - Incrementing the highest post_id to maintain unique identifiers | |
# - Collecting previous tags and categories from all markdown files in specified directories | |
# - Ensuring all tags and categories are properly formatted and unique | |
# The script provides a comprehensive solution for maintaining and updating metadata across multiple markdown files. | |
# Usage: | |
# 1. Define the specific file to update and the directories to search. | |
# 2. Customize the selected tags and categories to be added. | |
# 3. Run the script in your R environment. | |
# Dependencies: | |
# - yaml | |
# - dplyr | |
# - fs | |
# - here | |
# - readr | |
# I. Initial Set-up: Install and Load Required Packages with Basic Feedback | |
#this script installs and loads required packages with basic feedback | |
#separating the installation check from the loading step ensures a more robust process | |
#list of required packages | |
# packages this script needs, one per line so additions diff cleanly
pkg3 <- c(
  "yaml",
  "dplyr",
  "fs",
  "here",
  "readr"
)
#create a function to install and load packages with feedback | |
#installing all dependencies minimizes potential issues with missing packages | |
#adding messages provides feedback on the installation and loading status of each package | |
# Install one package if it is missing, attach it, and describe the outcome.
#
# Args:
#   package: name of the package, as a single character string.
#
# Returns:
#   A one-element character vector saying what happened. Problems are
#   reported through this message instead of being raised, so a caller can
#   run the function over a list of packages and print a summary afterwards.
install_and_load <- function(package) {
  tryCatch({
    # system.file() returns "" when the package is in no library on
    # .libPaths(); unlike installed.packages() it does not have to build a
    # table of every installed package just to answer a yes/no question
    already_installed <- nzchar(system.file(package = package))

    if (!already_installed) {
      # install.packages() only returns once it has finished, so no pause or
      # library-path refresh is needed before trying to attach the package
      install.packages(package, dependencies = TRUE)
    }

    # require() is used on purpose: its TRUE/FALSE result selects the message
    attached <- require(package, character.only = TRUE)

    if (already_installed && attached) {
      paste(package, "was already installed and loaded.")
    } else if (already_installed) {
      paste("failed to load package:", package)
    } else if (attached) {
      paste(package, "was installed and loaded successfully.")
    } else {
      paste("failed to install or load package:", package)
    }
  }, error = function(e) {
    paste(
      "error installing or loading package:", package, "-",
      conditionMessage(e)
    )
  })
}
#install and load packages | |
# run the installer/loader over every required package, keeping one status
# message per package
install_results <- lapply(pkg3, function(pkg) install_and_load(pkg))

# show the status messages under a heading, one per line
cat(c("summary:\n", unlist(install_results)), sep = "\n")
# II. Subsequent Session(s): Load Required Packages | |
#list of required packages that were previously installed | |
# packages this script relies on (expected to be installed already)
pkg3 <- c("yaml", "dplyr", "fs", "here", "readr")

# attach each package; require() yields TRUE/FALSE per package rather than
# stopping the script when one cannot be attached
lapply(pkg3, function(pkg) require(pkg, character.only = TRUE))
# III. The Code | |
# The steps are as follows: | |
# Step 1: Define the file and folders to be searched. | |
# Step 2: Define function to read yaml front matter. | |
# Step 3: Define function to handle the date field. | |
# Step 4: Define function to update field. | |
# Step 5: Define function to add quotes to fields. | |
# Step 6: Define function to write updated yaml front matter. | |
# Step 7: Define function to update tags and categories in the YAML front matter. | |
# Step 8: Write the date field in yaml_content if it's empty | |
# Step 9: Check for the highest post_id and increment it. | |
# Step 10: Define function to collect previous tags and categories from all markdown files. | |
# Step 11: Collect and print previous unique tags and categories. | |
# Step 12: Specify the selected categories and tags to be added. | |
# Step 13: Update tags and categories. | |
# Step 1: Define the file and folders to be searched. | |
# Define the specific file to update | |
# markdown file whose front matter will be rewritten, located via here::here()
file_to_update <- here::here("content", "blog", "webscraping", "index.Rmd")

# directories (relative to the project root) that are scanned for markdown files
directories <- c(
  "content/blog",
  "content/talk",
  "content/dataresource",
  "content/project"
)
# Step 2: Define function to read yaml front matter. | |
# function to read yaml front matter | |
# Read a markdown file and parse the YAML block between its first two "---"
# delimiter lines.
#
# Args:
#   file: path of the markdown file to read.
#
# Returns:
#   A list with
#     yaml_parsed: the parsed front matter (NULL when there is none),
#     yaml_start:  line number of the opening delimiter,
#     yaml_end:    line number of the closing delimiter,
#     content:     every line of the file as a character vector.
#   When fewer than two delimiter lines are found a warning is issued and
#   yaml_parsed is NULL with yaml_start / yaml_end set to NA.
read_yaml_front_matter <- function(file) {
  content <- readr::read_lines(file)

  # accept delimiters that carry stray trailing whitespace
  delimiter_lines <- grep("^---[[:space:]]*$", content)

  if (length(delimiter_lines) < 2) {
    warning("no complete YAML front matter found in: ", file, call. = FALSE)
    return(list(
      yaml_parsed = NULL,
      yaml_start = NA_integer_,
      yaml_end = NA_integer_,
      content = content
    ))
  }

  yaml_start <- delimiter_lines[1]
  yaml_end <- delimiter_lines[2]

  # seq_len() gives an empty index when the delimiters are adjacent, whereas
  # (yaml_start + 1):(yaml_end - 1) would count backwards
  yaml_lines <- content[yaml_start + seq_len(yaml_end - yaml_start - 1)]
  yaml_parsed <- yaml::yaml.load(paste(yaml_lines, collapse = "\n"))

  list(
    yaml_parsed = yaml_parsed,
    yaml_start = yaml_start,
    yaml_end = yaml_end,
    content = content
  )
}
# Step 3: Define function to handle the date field. | |
# handle the date field | |
# Fill in an empty `date:` line of the front matter with today's date.
#
# Args:
#   yaml_content: character vector holding the front matter, one line per
#     element.
#
# Returns:
#   An unnamed character vector of the same length. Lines of the form `date:`
#   or `date: ""` become `date: "YYYY-MM-DD"` (today); every other line is
#   returned unchanged. Empty input gives character(0).
handle_date_field <- function(yaml_content) {
  today_line <- sprintf('date: "%s"', format(Sys.Date(), "%Y-%m-%d"))

  # vapply() pins the result type and, with USE.NAMES = FALSE, keeps the
  # output from being named after the input lines as sapply() would do
  vapply(yaml_content, function(line) {
    # matches both `date: ""` and a bare `date:`
    if (grepl("^date: *\"?\"?$", line)) {
      message("empty date field found. updating it to today's date.")
      return(today_line)
    }
    if (grepl("^date: \"[0-9]{4}-[0-9]{2}-[0-9]{2}\"$", line)) {
      message(
        "existing date field is already formatted correctly; no need to modify."
      )
    }
    line
  }, character(1), USE.NAMES = FALSE)
}
# Step 4: Define function to update field. | |
# function to update field | |
# Replace (or append) a list-valued field in the front matter lines.
#
# Args:
#   field: name of the YAML key, e.g. "tags"; matched literally.
#   values: the items to write, as a character vector or a list of strings.
#   yaml_content: character vector holding the front matter, one line per
#     element.
#
# Returns:
#   A character vector in which `field:` is followed by one `- value` line
#   per item. If the field already exists, its header line and the list
#   items / indented continuation lines that follow it are replaced, so old
#   items are not left behind next to the new ones. If it does not exist,
#   the field is appended at the end.
update_field <- function(field, values, yaml_content) {
  yaml_content <- as.character(yaml_content)
  values <- as.character(unlist(values))

  header <- paste0(field, ":")
  # guard on length: paste0("- ", character(0)) would yield a stray "- " line
  item_lines <- if (length(values) > 0) paste0("- ", values) else character()
  new_block <- c(header, item_lines)

  # literal prefix match, so characters in `field` are never read as a regex
  header_idx <- which(startsWith(yaml_content, header))
  if (length(header_idx) == 0) {
    return(c(yaml_content, new_block))
  }

  first <- header_idx[1]
  last <- first
  # a following line belongs to the field while it is a list item
  # ("- x", indented or not) or an indented continuation line
  continuation <- "^([[:space:]]*-([[:space:]]|$)|[[:space:]]+[^[:space:]])"
  while (last < length(yaml_content) &&
           grepl(continuation, yaml_content[last + 1])) {
    last <- last + 1
  }

  lines_before <- yaml_content[seq_len(first - 1)]
  lines_after <- yaml_content[last + seq_len(length(yaml_content) - last)]
  c(lines_before, new_block, lines_after)
}
# Step 5: Define function to add quotes to fields. | |
# function to add quotes to fields | |
# Wrap each value in double quotes unless it is already wrapped in them.
#
# Args:
#   fields: character vector or list of single strings.
#
# Returns:
#   A list with one string per input element.
add_quotes <- function(fields) {
  quote_one <- function(value) {
    already_quoted <- grepl('^".*"$', value)
    if (already_quoted) value else sprintf('"%s"', value)
  }
  lapply(fields, quote_one)
}
# Step 6: Define function to write updated yaml front matter. | |
# function to write updated yaml front matter | |
# Write a file back to disk with its front matter replaced.
#
# Args:
#   file: path of the file to overwrite.
#   yaml_content: character vector with the new front matter lines (without
#     the "---" delimiters).
#   yaml_start: line number of the original opening delimiter; accepted for
#     interface compatibility, not used here.
#   yaml_end: line number of the original closing delimiter.
#   content: all lines of the original file.
#
# Returns:
#   The value returned by readr::write_lines().
write_yaml_front_matter <- function(file, yaml_content, yaml_start, yaml_end,
                                    content) {
  # tail(x, -n) drops the first n lines and is empty when the closing
  # delimiter is the last line, where (yaml_end + 1):length(content) would
  # count backwards and re-emit the delimiter plus an NA
  body_lines <- utils::tail(content, -yaml_end)

  # reconstruct the content with updated yaml
  updated_content <- c("---", yaml_content, "---", body_lines)

  # `file =` replaces the deprecated `path =` argument of write_lines()
  readr::write_lines(updated_content, file = file)
}
# Step 7: Define function to update tags and categories in the YAML front matter. | |
# function to update the tags and categories in the yaml front matter | |
# Find the highest numeric id among tags of the form "post_id: NNN".
#
# Defined at top level so that both update_tags_categories() and top-level
# script code can call it.
#
# Args:
#   tags: character vector (or list of strings) of tags.
#
# Returns:
#   The largest three-digit post id as a number, or 0 when none is present.
extract_post_id <- function(tags) {
  tags <- as.character(unlist(tags))
  post_ids <- grep("^post_id: [0-9]{3}$", tags, value = TRUE)
  if (length(post_ids) == 0) {
    return(0)
  }
  post_ids_numeric <- sort(
    as.numeric(sub("post_id: ", "", post_ids, fixed = TRUE))
  )
  # debugging output: the ids that were found, in ascending order
  message(
    "Extracted numeric post_ids: ",
    paste(post_ids_numeric, collapse = ", ")
  )
  max(post_ids_numeric)
}

# Add categories and tags to the front matter of one markdown file and write
# the file back.
#
# Args:
#   file: path of the markdown file to update.
#   selected_categories: categories to add (character vector or list).
#   selected_tags: tags to add (character vector or list).
#   post_ids: "post_id: NNN" tags already in use. The default reads the
#     global `tags_categories$post_ids` when the function is called, which is
#     what the function previously did implicitly.
#
# Returns:
#   The value returned by write_yaml_front_matter().
update_tags_categories <- function(file, selected_categories, selected_tags,
                                   post_ids = tags_categories$post_ids) {
  yaml_data <- read_yaml_front_matter(file)
  yaml_parsed <- yaml_data$yaml_parsed
  yaml_start <- yaml_data$yaml_start
  yaml_end <- yaml_data$yaml_end
  content <- yaml_data$content

  # fail with a clear message if the delimiters could not be located
  if (is.na(yaml_start) || is.na(yaml_end)) {
    stop("no complete YAML front matter found in: ", file, call. = FALSE)
  }

  # Step 8: Write the date field in yaml_content if it's empty
  # seq_len() keeps the index empty when the two delimiters are adjacent
  yaml_content <- content[yaml_start + seq_len(yaml_end - yaml_start - 1)]
  yaml_content <- handle_date_field(yaml_content)

  # Step 9: Check for the highest post_id and increment it.
  highest_post_id <- extract_post_id(post_ids)
  new_post_id <- sprintf("%03d", highest_post_id + 1)
  new_post_id_tag <- sprintf("post_id: %s", new_post_id)
  message("New post_id to be assigned: ", new_post_id)

  # a selected tag that repeats the current highest id is moved to the new id
  selected_tags <- as.character(unlist(selected_tags))
  stale_tag <- sprintf("post_id: %03d", highest_post_id)
  selected_tags[selected_tags == stale_tag] <- new_post_id_tag

  # Existing values come back from the YAML parser without quotes while new
  # ones may carry them. Compare on the bare text so duplicates are removed,
  # then quote everything, so that e.g. `post_id: 001` is written as a string
  # and not as a YAML map.
  merge_values <- function(existing, selected) {
    combined <- as.character(c(unlist(existing), unlist(selected)))
    bare <- sub('^"(.*)"$', "\\1", combined)
    unlist(add_quotes(unique(bare)))
  }

  yaml_parsed$categories <- merge_values(
    yaml_parsed$categories, selected_categories
  )
  yaml_parsed$tags <- merge_values(yaml_parsed$tags, selected_tags)

  # Update yaml_content with new categories and tags
  yaml_content <- update_field(
    "categories", yaml_parsed$categories, yaml_content
  )
  yaml_content <- update_field("tags", yaml_parsed$tags, yaml_content)

  write_yaml_front_matter(file, yaml_content, yaml_start, yaml_end, content)
}
# Step 10: Define function to collect previous tags and categories from all markdown files. | |
# function to collect previous tags and categories from all markdown files | |
# Gather the tags, categories and post_id tags used across markdown files.
#
# Args:
#   files: character vector of markdown file paths.
#
# Returns:
#   A list with
#     unique_tags:       sorted unique tags,
#     unique_categories: sorted unique categories,
#     post_ids:          unique tags of the form "post_id: NNN", in order of
#                        first appearance.
#   A file whose front matter cannot be read is skipped with a warning
#   instead of aborting the whole scan.
collect_tags_categories <- function(files) {
  read_one <- function(file) {
    parsed <- tryCatch(
      read_yaml_front_matter(file)$yaml_parsed,
      error = function(e) {
        warning("skipping ", file, ": ", conditionMessage(e), call. = FALSE)
        NULL
      }
    )
    # front matter that is missing or not a key/value mapping has no fields
    if (!is.list(parsed)) {
      return(list(tags = character(), categories = character()))
    }
    list(
      tags = as.character(unlist(parsed$tags)),
      categories = as.character(unlist(parsed$categories))
    )
  }

  # one result per file, combined once at the end; values are kept as whole
  # strings, so a tag that itself contains ", " is not split apart
  per_file <- lapply(files, read_one)
  all_tags <- as.character(
    unlist(lapply(per_file, `[[`, "tags"), use.names = FALSE)
  )
  all_categories <- as.character(
    unlist(lapply(per_file, `[[`, "categories"), use.names = FALSE)
  )

  tidy <- function(values) {
    sort(unique(values[!is.na(values) & nzchar(values)]))
  }

  list(
    unique_tags = tidy(all_tags),
    unique_categories = tidy(all_categories),
    post_ids = unique(grep("^post_id: [0-9]{3}$", all_tags, value = TRUE))
  )
}
# Step 11: Collect and print previous unique tags and categories. | |
# list every .md / .Rmd file beneath each configured directory
all_files <- unlist(lapply(directories, function(content_dir) {
  fs::dir_ls(
    here::here(content_dir),
    recurse = TRUE,
    regexp = "\\.(md|Rmd)$"
  )
}))

# read the front matter of all of those files
tags_categories <- collect_tags_categories(all_files)

# Debugging: Print all collected post_id tags
print("Collected post_id tags from all files:")
print(tags_categories$post_ids)

# report the categories and tags found, each as one comma-separated line
sorted_tags <- sort(unlist(tags_categories$unique_tags))
sorted_categories <- sort(unlist(tags_categories$unique_categories))
output <- paste0(
  "\nUnique Categories:\n",
  paste(sorted_categories, collapse = ", "),
  "\n\nUnique Tags:\n",
  paste(sorted_tags, collapse = ", "),
  "\n"
)
cat(output)
# Step 12: Specify the selected categories and tags to be added. | |
selected_tags <- c("R tutorial", "Web scraping") # add tags for the post here
selected_categories <- c("Blog", "Project", "R Code", "Data Analysis") # add categories for the post here

# Find the highest existing post_id and derive the next one.
# This is computed inline so that the step does not depend on any helper
# function being visible at top level.
collected_post_ids <- grep(
  "^post_id: [0-9]{3}$",
  as.character(unlist(tags_categories$post_ids)),
  value = TRUE
)
post_id_numbers <- as.numeric(
  sub("post_id: ", "", collected_post_ids, fixed = TRUE)
)
# start from 0 when no post_id exists yet, so the first id becomes 001
highest_post_id <- if (length(post_id_numbers) > 0) max(post_id_numbers) else 0
new_post_id <- sprintf("%03d", highest_post_id + 1)
new_post_id_tag <- sprintf("post_id: %s", new_post_id)
selected_tags <- c(new_post_id_tag, selected_tags)

# Debugging: Print selected categories and tags for debugging purposes
cat("Laura's Selected Categories:\n", paste(selected_categories, collapse = ", "), "\n")
cat("Laura's Selected Tags:\n", paste(selected_tags, collapse = ", "), "\n")

# Debugging: Ensure selected_tags is correct before updating YAML
print("Selected tags right before updating YAML:")
print(selected_tags)

# Step 13: Update tags and categories.
update_tags_categories(file_to_update, selected_categories, selected_tags)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment