# Author: Boris Steipe (ORCID: 0000-0002-1134-6758)
# License: (c) Author (2019) + MIT
# Date: 2019-01-17
I recently contributed to Siegfried Köstlmeier's qrandom package and forgot to update the package MD5 file. That got me thinking - these files are actually not mentioned in Writing R Extensions, and they are not mentioned in Hadley Wickham's R Packages either. We find them in the wild - but I did not find an explicit source regarding their format. In fact, there is a puzzled post from 2016 by Matthew Lueder on Stack Overflow, "What is the MD5 file in R packages used for? How is it generated?" ... and the sole answer on the page misunderstood the question.
After poking around a bit more, the following becomes clear:
- The file is used as input to tools::checkMD5sums().
- The format can be reverse engineered from the code in that function.
- The function checks, for each file listed in MD5, whether it actually exists in the directory tree and whether it has the correct MD5 hash.
This is useful to check the integrity of a downloaded package against the author's sources. There are two limitations of the check:
- Hidden files and directories are not checked. tools::checkMD5sums() uses dir() internally, which by default does not return hidden files. That's actually good, you don't want your entire .git tree in there. But if you somehow include an .Rprofile ... that won't get checked.
- Added files and directories are not checked either. The function returns TRUE if it finds all files listed in MD5 present and valid.
This is not a full security audit, but it will catch transmission problems. Keep that in mind.
Now: how to create such a file. By hand? No, seriously! But I actually couldn't find a tool immediately. So off we go: ...
- Produce a file that is valid input for tools::checkMD5sums();
- support excluding files (like e.g. MD5 itself, although that very file is actually skipped internally by checkMD5sums());
- support excluding categories of files, and directories - like everything that is specified in .Rbuildignore, because that won't be present in the installed package whose directory tree is being checked.
Here's the code:
makeMD5 <- function(myDir = getwd(), ignoreFiles, ignorePatternsIn) {
  # Purpose:
  #     Produce a file that is valid input for tools::checkMD5sums().
  # Parameters:
  #     myDir: The directory in which the resulting MD5 file will be placed.
  #            All files that are returned by a call to
  #            dir(myDir, recursive = TRUE) will be processed unless they
  #            match one of the "ignore" patterns (see below). Hidden files
  #            are not returned by that call and are therefore not processed.
  #            Defaults to the output of getwd().
  #     ignoreFiles: a vector of regular expressions. Files matching any of
  #                  these expressions will not be processed. Defaults to
  #                  "^MD5$". Set this parameter to "" to process all files.
  #     ignorePatternsIn: a vector of filenames. Regular expressions contained
  #                       in these files will be added to the ignoreFiles
  #                       list. Defaults to ".Rbuildignore"; if that default
  #                       file does not exist it is silently skipped. A file
  #                       that is named explicitly but does not exist is an
  #                       error. Set this parameter to "" to add no patterns
  #                       to the ignoreFiles list. Filenames are expected
  #                       relative to myDir; full paths are constructed with
  #                       file.path(), i.e. in a platform independent way.
  # Details:
  #     The default values are appropriate to create an MD5 file in an R
  #     package development workflow. In this scenario, files will be
  #     mentioned in .Rbuildignore that should not appear in the package MD5
  #     file since they will not be bundled with the package. Those files
  #     will not be processed into the MD5 output file.
  #
  #     Comment- or empty lines contained in files mentioned in
  #     "ignorePatternsIn" will be skipped.
  #
  #     A pattern that matches a directory name also removes every file
  #     below that directory.
  # Value:
  #     NULL (invisible). The function is executed for its side effect of
  #     creating a new MD5 file in the myDir directory or updating an
  #     existing one. This file is suitable input for
  #     tools::checkMD5sums("", myDir).

  # missing() must be evaluated before the argument is assigned a value
  usingDefaultPatternFiles <- missing(ignorePatternsIn)

  # define regex patterns, which files to ignore
  if (missing(ignoreFiles)) {
    ignoreFiles <- "^MD5$"            # default: ignore MD5 file itself
  } else if (all(ignoreFiles == "")) {
    ignoreFiles <- character()
  }

  # define files containing additional regexes
  if (usingDefaultPatternFiles) {
    ignorePatternsIn <- ".Rbuildignore"
  } else if (all(ignorePatternsIn == "")) {
    ignorePatternsIn <- character()
  }

  # process all files containing additional regexes
  for (fN in file.path(myDir, ignorePatternsIn)) {
    if (!file.exists(fN)) {
      if (usingDefaultPatternFiles) {
        next                          # no .Rbuildignore: nothing to add
      }
      stop("File with ignore patterns not found: ", fN, call. = FALSE)
    }
    x <- readLines(fN, warn = FALSE)
    x <- x[!grepl("(^\\s*#)|(^\\s*$)", x)]  # remove comment- or empty lines
    ignoreFiles <- c(ignoreFiles, x)        # add regex patterns to list
  }

  # make initial list of filenames (relative to myDir)
  fileNames <- dir(myDir, recursive = TRUE)

  # also make list of directory names - some regexes may exclude
  # directories, not files
  dirNames <- list.dirs(path = myDir, full.names = FALSE)
  dirNames <- dirNames[!grepl("(^\\.)|(^$)", dirNames)]  # not empty or hidden

  # remove all files that need to be ignored
  for (patt in ignoreFiles) {
    if (any(grepl(patt, dirNames))) {  # note: pattern matches directories!
      # Change pattern to match filepaths and remove all matching files.
      # The pattern must terminate with .Platform$file.sep, otherwise
      # filenames starting with the directory string would be targeted too.
      # Any trailing "$" anchor(s) are replaced by the separator.
      p2 <- sub("(\\$)*$", .Platform$file.sep, patt)
      fileNames <- fileNames[!grepl(p2, fileNames)]
    }
    fileNames <- fileNames[!grepl(patt, fileNames)]
  }

  # md5-process all files. The relative paths in fileNames are written to
  # the output as they are: recovering them from the fully qualified names
  # with a regex built from myDir fails when myDir contains regex
  # metacharacters.
  md5Lines <- character()
  if (length(fileNames) > 0) {
    md5 <- tools::md5sum(file.path(myDir, fileNames))
    md5Lines <- paste0(md5, " *", fileNames)
  }

  # output MD5 file
  writeLines(md5Lines, con = file.path(myDir, "MD5"))
  invisible(NULL)
}
# Source this, and setwd() to a package directory you are currently working on. Then ...
makeMD5()
tools::checkMD5sums("", getwd())
... should return TRUE.
As usual: writing the tests took longer than writing the function. (As it should. And I won't pretend for a moment that the tests did not discover a bunch of failed assumptions.)
library(testthat)

# Scratch directory in which the tests below build up a file tree step by
# step; myMD5 is the path of the MD5 file that makeMD5() writes into it.
myPath <- file.path(tempdir(), "testMD5")
dir.create(myPath)
myMD5 <- file.path(myPath, "MD5")
# list.dirs(myPath)
# list.files(myPath, recursive = TRUE)

test_that("an empty directory creates an empty MD5", {
  makeMD5(myDir = myPath, ignoreFiles = "", ignorePatternsIn = "")
  x <- readLines(myMD5)        # fails if no MD5 file was written
  expect_equal(length(x), 0)   # the file must not list anything
  unlink(myMD5)
})
# Build the directory tree used by all following tests. The tests run in
# order and share this tree: files written by one test may deliberately be
# left in place for the next one.
dir.create(file.path(myPath, "skip", "skip"), recursive = TRUE)
dir.create(file.path(myPath, "do", "do"), recursive = TRUE)
dir.create(file.path(myPath, ".hidden"), recursive = TRUE)

# Seven test files: one at top level, one in a hidden directory, two below
# "skip" and three below "do".
testFN <- sprintf("test%02d.txt", 1:7)
testFN[1] <- file.path(myPath, testFN[1])
testFN[2] <- file.path(myPath, ".hidden", testFN[2])
testFN[3] <- file.path(myPath, "skip", testFN[3])
testFN[4] <- file.path(myPath, "skip/skip", testFN[4])
testFN[5] <- file.path(myPath, "do", testFN[5])
testFN[6] <- file.path(myPath, "do", testFN[6])
testFN[7] <- file.path(myPath, "do/do", testFN[7])
txt <- "Test string"

test_that("an empty file tree creates an empty MD5", {
  makeMD5(myDir = myPath, ignoreFiles = "", ignorePatternsIn = "")
  x <- readLines(myMD5)
  expect_equal(length(x), 0)                    # empty
  expect_true(tools::checkMD5sums("", myPath))  # should skip "MD5"
  unlink(myMD5)
})

test_that("a single file in myPath creates the right MD5", {
  writeLines(txt, con = testFN[1])
  makeMD5(myDir = myPath, ignoreFiles = "", ignorePatternsIn = "")
  x <- readLines(myMD5)
  expect_equal(length(x), 1)
  expect_true(grepl(paste0("\\*", gsub("^.+/", "", testFN[1])), x))
  expect_true(tools::checkMD5sums("", myPath))
  unlink(testFN[1])  # note: we are keeping MD5
})

test_that("ignoring the MD5 file works", {
  writeLines(txt, con = testFN[1])
  # MD5 is not used per function default
  makeMD5(myDir = myPath, ignorePatternsIn = "")
  x <- readLines(myMD5)
  expect_equal(length(x), 1)  # 1, not 2
  expect_true(grepl(paste0("\\*", gsub("^.+/", "", testFN[1])), x))
  expect_true(tools::checkMD5sums("", myPath))
  unlink(c(myMD5, testFN[1]))
})

test_that("several files in the tree create the right MD5", {
  for (fN in testFN) {
    writeLines(txt, con = fN)
  }
  makeMD5(myDir = myPath, ignoreFiles = "", ignorePatternsIn = "")
  x <- readLines(myMD5)
  expect_equal(length(x), 6)
  expect_equal(sum(grepl("\\*.*test0[1-7]\\.txt", x)), 6)
  expect_false(any(grepl("\\*.*test02\\.txt", x)))  # in .hidden directory
  expect_true(tools::checkMD5sums("", myPath))
  unlink(myMD5)  # keep test files in place
})

test_that("one or more regular expressions work in the ignoreFiles parameter", {
  makeMD5(myDir = myPath,
          ignoreFiles = c("^MD5$", "[567]"),
          ignorePatternsIn = "")
  x <- readLines(myMD5)
  expect_equal(length(x), 3)
  expect_equal(sum(grepl("\\*.*test0[1-7]\\.txt", x)), 3)
  expect_false(any(grepl("[567]\\.txt", x)))
  expect_true(tools::checkMD5sums("", myPath))
  unlink(myMD5)  # keep test files in place
})

test_that("the default .Rbuildignore file works with one regex", {
  writeLines("test0[13]\\.txt$", con = file.path(myPath, ".Rbuildignore"))
  makeMD5(myDir = myPath)
  x <- readLines(myMD5)
  expect_equal(length(x), 4)
  expect_equal(sum(grepl("\\*.*test0[4-7]\\.txt", x)), 4)
  expect_true(tools::checkMD5sums("", myPath))
  unlink(c(myMD5, file.path(myPath, ".Rbuildignore")))
})

test_that("the default .Rbuildignore file works with two regexes", {
  writeLines(c("test07\\.txt$",
               "test05\\.txt$"),
             con = file.path(myPath, ".Rbuildignore"))
  makeMD5(myDir = myPath)
  x <- readLines(myMD5)
  expect_equal(length(x), 4)
  expect_equal(sum(grepl("\\*.*test0[1346]\\.txt", x)), 4)
  expect_true(tools::checkMD5sums("", myPath))
  unlink(c(myMD5, file.path(myPath, ".Rbuildignore")))
})

test_that("the default .Rbuildignore file works with two directories", {
  writeLines(c("^skip$",
               "^do/do$"),
             con = file.path(myPath, ".Rbuildignore"))
  makeMD5(myDir = myPath)
  x <- readLines(myMD5)
  expect_equal(length(x), 3)
  expect_equal(sum(grepl("\\*.*test0[156]\\.txt", x)), 3)
  expect_true(tools::checkMD5sums("", myPath))
  unlink(c(myMD5, file.path(myPath, ".Rbuildignore")))
})

test_that("a file .Rtestignore file works with two regexes", {
  writeLines(c("test01\\.txt$",
               "test06\\.txt$"),
             con = file.path(myPath, ".Rtestignore"))
  makeMD5(myDir = myPath, ignorePatternsIn = ".Rtestignore")
  x <- readLines(myMD5)
  expect_equal(length(x), 4)
  expect_equal(sum(grepl("\\*.*test0[3457]\\.txt", x)), 4)
  expect_true(tools::checkMD5sums("", myPath))
  unlink(c(myMD5, file.path(myPath, ".Rtestignore")))
})

test_that("two ignore files with two regexes each work", {
  writeLines(c("^skip$",
               "test01\\.txt$"),
             con = file.path(myPath, ".Rbuildignore"))
  writeLines(c("test05\\.txt$",
               "^do/do$"),
             con = file.path(myPath, ".Rtestignore"))
  makeMD5(myDir = myPath, ignorePatternsIn = c(".Rbuildignore",
                                               ".Rtestignore"))
  x <- readLines(myMD5)
  expect_equal(length(x), 1)
  expect_equal(sum(grepl("\\*.*test06\\.txt", x)), 1)
  expect_true(tools::checkMD5sums("", myPath))
  unlink(c(myMD5,
           file.path(myPath, ".Rtestignore"),
           file.path(myPath, ".Rbuildignore")))
})

# Cleanup:
# unlink(testFN)
# unlink(myPath, recursive = TRUE)
# list.dirs(myPath)
# list.files(myPath, recursive = TRUE)
Done. Now let's see if R-core would like to add this to tools::. :-)
The function fails if there is no .Rbuildignore, which was my case.
I had to "touch .Rbuildignore".