muschellij2 · May 9, 2018 19:28
diff --git a/cds_consistency.R b/cds_consistency.R
 library(didactr)
 library(pdftools)
 library(dplyr)
 library(httr)
 library(googledrive)
 library(broom)
 library(tidyr)
 # Number of pages in a PDF, as reported by pdf_info().
 #
 # file: path to a single PDF; may be zero-length or NA when there is no
 #       PDF to inspect.
 # Returns the page count, or NA when no usable path was given.
 n_pdf_pages = function(file) {
   # length(NA) is 1, so a missing path needs its own check; otherwise NA
   # would be handed to pdf_info()
   if (length(file) == 0 || is.na(file[1])) {
     return(NA)
   }
   pdf_info(file)$pages
 }

 res_path = file.path("manuscript", "resources")
 # Every top-level .md file under manuscript/ is treated as one lesson.
 # The dot is escaped so that names such as "intro.Rmd" are not matched.
 manuscript_files = list.files(path = "manuscript", pattern = "[.]md$",
                               full.names = TRUE)
 # lesson name = file name without the .md extension
 man_stubs = sub("[.]md$", "", basename(manuscript_files))
 # md file has highest precedence
 # tibble() replaces the deprecated data_frame()
 df = tibble(lesson = man_stubs, md_file = manuscript_files)

 # Pull the slide-deck id out of each lesson's "[Slides](<url>)" link.
 # The id is the last path component of the URL once a trailing "/edit"
 # is dropped.  A lesson without such a link gets NA.
 df$id = vapply(df$md_file, function(fname) {
   x = readLines(fname, warn = FALSE)
   x = grep(x, pattern = "\\[(S|s)lides\\]", value = TRUE)
   x = sub(".*\\((http.*)\\).*", "\\1", x)
   x = unlist(lapply(x, function(r) parse_url(r)$path))
   x = sub("/edit$", "", x)
   x = basename(x)
   x = unique(x)
   if (length(x) == 0) {
     return(NA_character_)
   }
   if (length(x) > 1) {
     warning(paste0("Multiple sheets identified!  Please check ", 
                    fname))
     # keep one id per lesson so df$id stays a plain character vector
     # rather than turning into a list column
     x = x[1]
   }
   x
 }, character(1))


 # Two lessons sharing one slide id usually means a link was copy-pasted.
 # Lessons without any id (NA) are not duplicates of one another, so they
 # are left out of the check.
 known_ids = df$id[!is.na(df$id)]
 if (anyDuplicated(known_ids) > 0) {
   dup_df = df %>% 
     filter(!is.na(id)) %>% 
     group_by(id) %>% 
     add_tally() %>% 
     filter(n > 1) %>% 
     ungroup()
   warning("Duplicated IDs are present!  MD files are off")
   print(dup_df)
 }

 # Markdown image references of the form ![text](images/...).
 # Group 1 is the text in the square brackets, group 2 the images/ path.
 img_pattern = "!\\[(.*)\\]\\((images.*)\\)"

 # Read every lesson once and keep only the lines holding such a reference
 img_lines = lapply(df$md_file, function(fname) {
   x = readLines(fname, warn = FALSE)
   grep(img_pattern, x, value = TRUE)
 })

 # Per lesson: the bracket text of each reference.
 # NOTE(review): despite the name this is the text inside [ ], not a URL
 # -- confirm that is what later code expects.
 image_links = lapply(img_lines, function(x) {
   sub(x, pattern = img_pattern, replacement = "\\1")
 })

 # Per lesson: the images/ path of each reference
 images = lapply(img_lines, function(x) {
   sub(x, pattern = img_pattern, replacement = "\\2")
 })

 # TRUE when every image a lesson references is present under res_path.
 # A lesson referencing no images yields TRUE (all() of an empty vector).
 # vapply keeps the column logical even when there are no lessons.
 df$all_images_exist = vapply(images, function(x) {
   all(file.exists(file.path(res_path, x)))
 }, logical(1))

 # Look up Drive metadata for the slide decks.  Only distinct, non-missing
 # ids are requested: lessons without a slides link carry an NA id, which
 # is not something drive_get() can look up.
 slide_ids = unlist(df$id, use.names = FALSE)
 slide_ids = unique(slide_ids[!is.na(slide_ids)])
 drive_info = drive_get(id = slide_ids)
 if (nrow(drive_info) > 0) {
   # rename so the Drive-side name cannot be confused with other columns
   drive_info = drive_info %>% 
     rename(gs_name = name)
   # modifiedTime of each Drive resource; NA when the field is absent
   drive_info$mod_time = vapply(drive_info$drive_resource, function(x) {
     mod = x$modifiedTime
     if (is.null(mod)) NA_character_ else as.character(mod)
   }, character(1))
   # the nested resource list is no longer needed once mod_time is out
   drive_info = drive_info %>% 
     select(-drive_resource)
   df = left_join(df, drive_info, by = "id")
   df = distinct(df)
 }

 # Folders used further down: one for lesson images, one for scripts.
 path = file.path("manuscript", "resources", "images")
 script_path = "scripts"

 # Create whichever of the two is not there yet (images first).
 invisible(lapply(c(path, script_path), function(d) {
   if (!dir.exists(d)) {
     dir.create(d, recursive = TRUE, showWarnings = FALSE)
   }
 }))

 # Expected image folder of each lesson: <path>/<lesson>
 df = mutate(df, img_dir = file.path(path, lesson))

 # Image folders found directly under `path`, named by their own path
 img_dirs = list.dirs(path = path, full.names = TRUE, recursive = FALSE)
 img_dirs = setNames(img_dirs, img_dirs)

 # Mark the lessons whose folder was found
 df = mutate(df, has_img_dir = img_dir %in% img_dirs)

 # Lessons lacking an image folder get one created here
 bad_img_dir = !df$has_img_dir
 if (any(bad_img_dir)) {
   missing_dirs = df$img_dir[bad_img_dir]
   sapply(missing_dirs, dir.create, recursive = TRUE, showWarnings = FALSE)
 }
 # the helper column has served its purpose
 df$has_img_dir = NULL

 # Folders on disk that no lesson claims: warn and list them
 bad_img_dir = is.na(match(img_dirs, df$img_dir))
 if (any(bad_img_dir)) {
   warning("An image directory exists but doesn't correspond to a ",
           "lesson.  Possible naming inconsistency.  Possible:")
   cat(img_dirs[bad_img_dir], sep = "\n")
 }



 # Check if an image folder has a PDF.
 # Yields the PDF path per lesson, or NA when the folder holds none.
 df$pdf = vapply(df$img_dir, function(x) {
   # anchored so that only names ending in ".pdf" qualify
   pdfs = list.files(pattern = "[.]pdf$", path = x, full.names = TRUE)
   if (length(pdfs) == 0) {
     return(NA_character_)
   }
   if (length(pdfs) > 1) {
     # name the lesson folder at fault (x), not the shared images root
     warning(paste0(x, " had more than one PDF! ", 
                    "Only grabbing first"))
     pdfs = pdfs[1]
   }
   pdfs
 }, character(1))

 # Check the number of pages of the pdf to cross-ref with the pngs.
 # A lesson without a PDF (NA path) gets NA pages instead of being passed
 # on to n_pdf_pages().
 df$pdf_pages = vapply(df$pdf, function(f) {
   if (length(f) == 0 || is.na(f[1])) {
     return(NA_integer_)
   }
   as.integer(n_pdf_pages(f))
 }, integer(1))

 # list out the pngs of the folder (names ending in ".png" only)
 png_names = lapply(df$img_dir, function(x) {
   list.files(pattern = "[.]png$", path = x)
 })
 # lengths() yields an integer vector even when there are no lessons
 df$n_pngs = lengths(png_names)
	library(didactr)
	library(pdftools)
	library(dplyr)
	library(httr)
	library(googledrive)
	library(broom)
	library(tidyr)
	# Number of pages in a PDF, as reported by pdf_info().
	#
	# file: path to a single PDF; may be zero-length or NA when there is no
	#       PDF to inspect.
	# Returns the page count, or NA when no usable path was given.
	n_pdf_pages = function(file) {
	  # length(NA) is 1, so a missing path needs its own check; otherwise NA
	  # would be handed to pdf_info()
	  if (length(file) == 0 || is.na(file[1])) {
	    return(NA)
	  }
	  pdf_info(file)$pages
	}

	res_path = file.path("manuscript", "resources")
	# Every top-level .md file under manuscript/ is treated as one lesson.
	# The dot is escaped so that names such as "intro.Rmd" are not matched.
	manuscript_files = list.files(path = "manuscript", pattern = "[.]md$",
	                              full.names = TRUE)
	# lesson name = file name without the .md extension
	man_stubs = sub("[.]md$", "", basename(manuscript_files))
	# md file has highest precedence
	# tibble() replaces the deprecated data_frame()
	df = tibble(lesson = man_stubs, md_file = manuscript_files)

	# Pull the slide-deck id out of each lesson's "[Slides](<url>)" link.
	# The id is the last path component of the URL once a trailing "/edit"
	# is dropped.  A lesson without such a link gets NA.
	df$id = vapply(df$md_file, function(fname) {
	  x = readLines(fname, warn = FALSE)
	  # plain alternation: "\|" is not a valid escape in an R string
	  x = grep(x, pattern = "\\[(S|s)lides\\]", value = TRUE)
	  # ".*" on both sides so the whole line collapses to just the URL
	  x = sub(".*\\((http.*)\\).*", "\\1", x)
	  x = unlist(lapply(x, function(r) parse_url(r)$path))
	  x = sub("/edit$", "", x)
	  x = basename(x)
	  x = unique(x)
	  if (length(x) == 0) {
	    return(NA_character_)
	  }
	  if (length(x) > 1) {
	    warning(paste0("Multiple sheets identified! Please check ",
	                   fname))
	    # keep one id per lesson so df$id stays a plain character vector
	    # rather than turning into a list column
	    x = x[1]
	  }
	  x
	}, character(1))


	# Two lessons sharing one slide id usually means a link was copy-pasted.
	# Lessons without any id (NA) are not duplicates of one another, so they
	# are left out of the check.
	known_ids = df$id[!is.na(df$id)]
	if (anyDuplicated(known_ids) > 0) {
	  dup_df = df %>%
	    filter(!is.na(id)) %>%
	    group_by(id) %>%
	    add_tally() %>%
	    filter(n > 1) %>%
	    ungroup()
	  warning("Duplicated IDs are present! MD files are off")
	  print(dup_df)
	}

	# Markdown image references of the form ![text](images/...).
	# Group 1 is the text in the square brackets, group 2 the images/ path;
	# ".*" lets both be of any length.
	img_pattern = "!\\[(.*)\\]\\((images.*)\\)"

	# Read every lesson once and keep only the lines holding such a reference
	img_lines = lapply(df$md_file, function(fname) {
	  x = readLines(fname, warn = FALSE)
	  grep(img_pattern, x, value = TRUE)
	})

	# Per lesson: the bracket text of each reference.
	# NOTE(review): despite the name this is the text inside [ ], not a URL
	# -- confirm that is what later code expects.
	image_links = lapply(img_lines, function(x) {
	  sub(x, pattern = img_pattern, replacement = "\\1")
	})

	# Per lesson: the images/ path of each reference
	images = lapply(img_lines, function(x) {
	  sub(x, pattern = img_pattern, replacement = "\\2")
	})

	# TRUE when every image a lesson references is present under res_path.
	# A lesson referencing no images yields TRUE (all() of an empty vector).
	# vapply keeps the column logical even when there are no lessons.
	df$all_images_exist = vapply(images, function(x) {
	  all(file.exists(file.path(res_path, x)))
	}, logical(1))

	# Look up Drive metadata for the slide decks.  Only distinct, non-missing
	# ids are requested: lessons without a slides link carry an NA id, which
	# is not something drive_get() can look up.
	slide_ids = unlist(df$id, use.names = FALSE)
	slide_ids = unique(slide_ids[!is.na(slide_ids)])
	drive_info = drive_get(id = slide_ids)
	if (nrow(drive_info) > 0) {
	  # rename so the Drive-side name cannot be confused with other columns
	  drive_info = drive_info %>%
	    rename(gs_name = name)
	  # modifiedTime of each Drive resource; NA when the field is absent
	  drive_info$mod_time = vapply(drive_info$drive_resource, function(x) {
	    mod = x$modifiedTime
	    if (is.null(mod)) NA_character_ else as.character(mod)
	  }, character(1))
	  # the nested resource list is no longer needed once mod_time is out
	  drive_info = drive_info %>%
	    select(-drive_resource)
	  df = left_join(df, drive_info, by = "id")
	  df = distinct(df)
	}

	# Folders used further down: one for lesson images, one for scripts.
	path = file.path("manuscript", "resources", "images")
	script_path = "scripts"

	# Create whichever of the two is not there yet (images first).
	invisible(lapply(c(path, script_path), function(d) {
	  if (!dir.exists(d)) {
	    dir.create(d, recursive = TRUE, showWarnings = FALSE)
	  }
	}))

	# Expected image folder of each lesson: <path>/<lesson>
	df = mutate(df, img_dir = file.path(path, lesson))

	# Image folders found directly under `path`, named by their own path
	img_dirs = list.dirs(path = path, full.names = TRUE, recursive = FALSE)
	img_dirs = setNames(img_dirs, img_dirs)

	# Mark the lessons whose folder was found
	df = mutate(df, has_img_dir = img_dir %in% img_dirs)

	# Lessons lacking an image folder get one created here
	bad_img_dir = !df$has_img_dir
	if (any(bad_img_dir)) {
	  missing_dirs = df$img_dir[bad_img_dir]
	  sapply(missing_dirs, dir.create, recursive = TRUE, showWarnings = FALSE)
	}
	# the helper column has served its purpose
	df$has_img_dir = NULL

	# Folders on disk that no lesson claims: warn and list them
	bad_img_dir = is.na(match(img_dirs, df$img_dir))
	if (any(bad_img_dir)) {
	  warning("An image directory exists but doesn't correspond to a ",
	          "lesson. Possible naming inconsistency. Possible:")
	  cat(img_dirs[bad_img_dir], sep = "\n")
	}



	# Check if an image folder has a PDF.
	# Yields the PDF path per lesson, or NA when the folder holds none.
	df$pdf = vapply(df$img_dir, function(x) {
	  # anchored so that only names ending in ".pdf" qualify
	  pdfs = list.files(pattern = "[.]pdf$", path = x, full.names = TRUE)
	  if (length(pdfs) == 0) {
	    return(NA_character_)
	  }
	  if (length(pdfs) > 1) {
	    # name the lesson folder at fault (x), not the shared images root
	    warning(paste0(x, " had more than one PDF! ",
	                   "Only grabbing first"))
	    pdfs = pdfs[1]
	  }
	  pdfs
	}, character(1))

	# Check the number of pages of the pdf to cross-ref with the pngs.
	# A lesson without a PDF (NA path) gets NA pages instead of being passed
	# on to n_pdf_pages().
	df$pdf_pages = vapply(df$pdf, function(f) {
	  if (length(f) == 0 || is.na(f[1])) {
	    return(NA_integer_)
	  }
	  as.integer(n_pdf_pages(f))
	}, integer(1))

	# list out the pngs of the folder (names ending in ".png" only)
	png_names = lapply(df$img_dir, function(x) {
	  list.files(pattern = "[.]png$", path = x)
	})
	# lengths() yields an integer vector even when there are no lessons
	df$n_pngs = lengths(png_names)