Skip to content

Instantly share code, notes, and snippets.

@muschellij2
Created May 9, 2018 19:28
Show Gist options
  • Save muschellij2/d612245564bde92bc422b223e2b698b0 to your computer and use it in GitHub Desktop.
Save muschellij2/d612245564bde92bc422b223e2b698b0 to your computer and use it in GitHub Desktop.
Consistency checks for CDS classes
library(didactr)
library(pdftools)
library(dplyr)
library(httr)
library(googledrive)
library(broom)
library(tidyr)
#' Count the pages of a PDF file
#'
#' @param file Path to a PDF file, passed on to `pdf_info()`. May be `NULL`,
#'   zero-length, or `NA` when there is no PDF to inspect.
#' @return The `pages` entry reported by `pdf_info()`, or `NA_integer_` when
#'   `file` is `NULL`, zero-length, or `NA`.
n_pdf_pages = function(file) {
  # This script records a lesson without a PDF as NA; such a value must not
  # reach pdf_info(), so it is answered here with a missing page count.
  if (length(file) == 0 || all(is.na(file))) {
    return(NA_integer_)
  }
  pdf_info(file)$pages
}
# Resources folder; image paths found in the markdown are resolved against it
res_path = file.path("manuscript", "resources")
# Collect the manuscript markdown files; the file stem is used as lesson name.
# The dot is escaped so that only a literal ".md" extension matches (an
# unescaped "." would also accept names such as "notes.Rmd", whose stem the
# sub() below would then fail to strip).
manuscript_files = list.files(pattern = "[.]md$", path = "manuscript",
                              full.names = TRUE)
man_stubs = sub("[.]md$", "", basename(manuscript_files))
# md file has highest precedence
# tibble() replaces the deprecated data_frame() constructor
df = tibble(lesson = man_stubs, md_file = manuscript_files)
# Extract the slide deck ID for each lesson from the "[Slides](http...)" link
# in its markdown file: the last component of the URL path, after dropping a
# trailing "/edit".
# Exactly one string is produced per file (NA_character_ when no slides link
# is present), so `df$id` is always a plain character column.
df$id = vapply(df$md_file, function(fname) {
  x = readLines(fname, warn = FALSE)
  x = grep(x, pattern = "\\[(S|s)lides\\]", value = TRUE)
  # keep only the URL found inside the parentheses of the markdown link
  x = sub(".*\\((http.*)\\).*", "\\1", x)
  x = unlist(lapply(x, function(r) parse_url(r)$path))
  x = sub("/edit$", "", x)
  x = unique(basename(x))
  if (length(x) == 0) {
    return(NA_character_)
  }
  if (length(x) > 1) {
    # Returning every candidate would silently turn the column into a list;
    # warn and keep the first one instead.
    warning(paste0("Multiple sheets identified! Please check ",
                   fname, " (using the first one)"))
    x = x[1]
  }
  x
}, FUN.VALUE = character(1), USE.NAMES = FALSE)
# Flag lessons whose markdown files share the same slide ID.
# Missing IDs (NA) are left out of the check: several lessons without a
# slides link are not duplicates of each other.
if (anyDuplicated(df$id[!is.na(df$id)]) > 0) {
  dup_df = df %>%
    filter(!is.na(id)) %>%
    group_by(id) %>%
    add_tally() %>%
    filter(n > 1) %>%
    ungroup()
  warning("Duplicated IDs are present! MD files are off")
  print(dup_df)
}
# Pattern for a markdown image whose target starts with "images":
#   ![text](images...)
# Capture group 1 is the text in the square brackets, group 2 the image path.
md_image_pattern = "!\\[(.*)\\]\\((images.*)\\)"
# Pull one capture group out of every image line of a markdown file.
#   fname: path of the markdown file
#   group: index of the capture group to return (1 or 2)
# Returns a character vector with one entry per matching line.
# regexec() yields only the captured text; replacing the match with sub()
# would keep whatever surrounds it on the line (indentation, trailing words).
extract_md_image_field = function(fname, group) {
  x = readLines(fname, warn = FALSE)
  hits = regmatches(x, regexec(md_image_pattern, x))
  hits = hits[lengths(hits) > 0]
  # element 1 of each hit is the full match, the groups follow
  vapply(hits, function(m) m[group + 1], FUN.VALUE = character(1))
}
# bracket text of each image, per markdown file
image_links = lapply(df$md_file, extract_md_image_field, group = 1)
# image path of each image, per markdown file
images = lapply(df$md_file, extract_md_image_field, group = 2)
# TRUE when every image referenced by the lesson exists below res_path
# (also TRUE for a lesson that references no images)
df$all_images_exist = vapply(images, function(x) {
  all(file.exists(file.path(res_path, x)))
}, FUN.VALUE = logical(1))
# Look up each slide deck on Google Drive and attach its Drive name
# (gs_name) and last modification time (mod_time) to the lesson table.
# Lessons without a slides link carry an NA id, which is not something
# drive_get() can look up, so only the known ids are requested, each once.
known_ids = unique(unname(df$id[!is.na(df$id)]))
if (length(known_ids) > 0) {
  drive_info = drive_get(id = known_ids)
  if (nrow(drive_info) > 0) {
    drive_info = drive_info %>%
      rename(gs_name = name)
    # one string per file; a resource without modifiedTime becomes NA rather
    # than turning the result into a list
    drive_info$mod_time = vapply(drive_info$drive_resource, function(x) {
      mt = x$modifiedTime
      if (is.null(mt)) NA_character_ else as.character(mt)
    }, FUN.VALUE = character(1))
    drive_info = drive_info %>%
      select(-drive_resource)
    df = left_join(df, drive_info, by = "id")
    df = distinct(df)
  }
}
# Folder holding the per-lesson image directories, and the scripts folder
path = file.path("manuscript", "resources", "images")
script_path = "scripts"
# Create whichever of the two folders is not there yet
invisible(lapply(
  c(path, script_path),
  function(folder) {
    if (!dir.exists(folder)) {
      dir.create(folder, recursive = TRUE, showWarnings = FALSE)
    }
  }
))
# Each lesson keeps its images in <path>/<lesson>
df = mutate(df, img_dir = file.path(path, lesson))
# Image folders currently on disk, named by their own path
img_dirs = list.dirs(path = path, recursive = FALSE,
                     full.names = TRUE)
img_dirs = setNames(img_dirs, img_dirs)
# Lessons whose image folder is not on disk yet get one created
bad_img_dir = !(df$img_dir %in% img_dirs)
if (any(bad_img_dir)) {
  sapply(df$img_dir[bad_img_dir], dir.create, recursive = TRUE,
         showWarnings = FALSE)
}
# Report image folders on disk that do not belong to any lesson's md file
bad_img_dir = !is.element(img_dirs, df$img_dir)
if (any(bad_img_dir)) {
  warning("An image directory exists but doesn't correspond to a ",
          "lesson. Possible naming inconsistency. Possible:")
  # one offending folder per line
  cat(paste(img_dirs[bad_img_dir], collapse = "\n"))
}
# Find the PDF stored in each lesson's image folder.
# Yields one path per lesson, or NA_character_ when the folder has no PDF.
df$pdf = vapply(df$img_dir, function(x) {
  # anchor the extension so that e.g. "slides.pdf.bak" is not picked up
  pdfs = list.files(pattern = "[.]pdf$",
                    path = x,
                    full.names = TRUE)
  if (length(pdfs) == 0) {
    return(NA_character_)
  }
  if (length(pdfs) > 1) {
    # report the lesson folder (x), not the shared images root (`path`)
    warning(paste0(x, " had more than one PDF! ",
                   "Only grabbing first"))
  }
  pdfs[1]
}, FUN.VALUE = character(1), USE.NAMES = FALSE)
# Check the number of pages of the pdf to cross-ref with the pngs.
# Lessons without a PDF (NA in df$pdf) are given NA pages up front, so no
# attempt is made to read a non-existent file.
df$pdf_pages = vapply(df$pdf, function(pdf_file) {
  if (is.na(pdf_file)) {
    return(NA_integer_)
  }
  as.integer(n_pdf_pages(pdf_file))
}, FUN.VALUE = integer(1), USE.NAMES = FALSE)
# list out the pngs of the folder (extension anchored at the end of the name)
png_names = lapply(df$img_dir, function(x) {
  list.files(pattern = "[.]png$", path = x)
})
# number of pngs per lesson; lengths() stays an integer vector even when
# there are no lessons
df$n_pngs = lengths(png_names)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment