This gist describes a possible way to show diffs of R data files (files with extension e.g. .rds, .RData, .rda, ...) by reading the data into a temporary R session and converting the R object(s) in a "diffable" way to text output.
The following is required:
-
Rscript
executable can be found in the$PATH
-
the R package
docopt
has been installed
Install the docopt
package in R:
install.packages('docopt')
Create the script (rdata2txt
) that reads any R data file of format (.rds, .RData, etc.)
and print its content to the terminal:
#!/usr/bin/env Rscript
suppressMessages(library(docopt)) # we need docopt (>= 0.3) as on CRAN
doc <- "Usage: rdata2txt [-h] [-n] [-a] [-x EXT] [-t TYPE] [PATH]
-t --type TYPE choose type of conversion [default: default]
-a --all should attributes be checked too [default: FALSE]
-n --no-info should rds info not be attached as attribute [default: FALSE]
-x --ext EXT extension/file type if not clear [default: NULL]
-h --help show this help text"
opt <- docopt(doc)
if (is.null(opt$PATH)) {
cat('path argument is missing\n')
} else if (!is.character(opt$PATH)) {
cat('argument path must be of type character\n')
} else if (!file.exists(opt$PATH)) {
cat('path "', opt$PATH, '" does not exist\n', sep = '')
} else {
cat_nl <- function(...) cat(paste(..., '\n'))
print_all <- function(x, prefix = '.', all = FALSE) {
# class
x_class <- class(x)
is_matrix <- inherits(x, 'matrix')
is_list <- length(x_class) == 1 && x_class == 'list'
cat(prefix, ' ~ ',
switch(length(x_class)
, sprintf("class: '%s'", x_class)
, sprintf("class: '%s' and '%s'", x_class[1], x_class[2])
, sprintf(paste0("class: '", paste(x_class[seq_len(length(x_class) - 2)], collapse = ', ') ,"', '%s' and '%s'"),
x_class[length(x_class) - 1], x_class[length(x_class)])
),
': ', sep = '')
# dimension
x_dim <- dim(x)
if (x_dim_null <- is.null(x_dim)) {
# length: ... values
cat(length(x), 'values')
if (is.factor(x)) {
cat_nl(' - Levels:', paste(levels(x), collapse = ' '))
} else {
cat_nl()
}
} else if (is_matrix) {
# dim (matrix): ... rows x ... cols
cat_nl(x_dim[1], 'rows x', x_dim[2], 'columns')
} else {
# dim: .. obs. of ... variables
cat_nl(x_dim[1], 'obs. of ', x_dim[2], 'variables')
}
# names of x:
x_names <- names(x)
if (is_list) {
if (is.null(x_names)) {
# unnamed list
# vector to loop over
x_names <- paste0(prefix, '[[', seq_along(x), ']]')
} else {
# named list
# print names
cat_nl(prefix, '~ names:')
print(x_names)
# vector to loop over
x_names <- paste0(prefix, '$', x_names)
}
} else if (is_matrix) {
# matrix row/colnames?
# rownames
if (!is.null(rownames(x))) {
cat_nl(prefix, '~ rownames:')
print(rownames(x))
}
# colnames
if (!is.null(colnames(x))) {
cat_nl(prefix, '~ colnames:')
print(colnames(x))
}
} else if (!x_dim_null) {
# data.frame like -> row/colnames
# check if row == seq_along
x_rownames <- row.names(x)
if (!identical(x_rownames, sprintf('%i', seq_len(x_dim[1])))) {
# print rownames
cat_nl(prefix, '~ row.names:')
print(row.names(x))
}
# print colnames
cat_nl(prefix, '~ names:')
print(x_names)
# vector to loop over
x_names <- paste0(prefix, '$', x_names)
}
# print only if matrix or 'vector'
if (is_matrix) {
if (prefix != '') {
cat_nl(prefix, ':', sep = '')
}
print(x[seq_along(x)])
} else if (is_list || !x_dim_null) {
# loop over entries
for (l in seq_along(x_names)) {
print_all(x[[l]], x_names[l], all = all)
}
if (all) {
# check attributes (without names, row.names, dim, dimnames, class)
att_names_all <- names(attributes(x))
att_names <- setdiff(att_names_all, c('names', 'row.names', 'dim', 'dimnames', 'class'))
# loop over attributes
att_pre <- sprintf('attr(%s, "%s")', prefix, att_names)
for (l in seq_along(att_names)) {
print_all(attr(x, att_names[l]), att_pre[l], all = all)
}
}
} else {
if (prefix != '') {
cat_nl(prefix, ':', sep = '')
}
if (is.factor(x)) {
print.default(x)
} else {
print(x)
}
}
}
# set terminal width
width <- as.integer(sub('.*columns ([0-9]+)[;].*', '\\1', system('stty -a | head -n 1', intern = TRUE)))
options(width = width * 0.8)
# check extension -> rdata type
if (opt$ext != 'NULL') {
file_ext <- opt$ext
} else {
file_ext <- tolower(sub('.*[.]([a-zA-Z]{2, })$', '\\1', basename(opt$PATH)))
}
if (file_ext %in% c('rds', 'qs')) {
# get object
if (file_ext == 'rds') {
obj <- readRDS(opt$PATH)
if (!opt$no_info) attr(obj, 'infoRDS') <- infoRDS(opt$PATH)
} else {
obj <- qs::qread(opt$PATH)
}
# convert to text
switch(opt$type
, default = {
print_all(obj, all = opt$all)
}
, print = {
print(obj)
}
, print.default = {
print.default(obj)
}
, dump = {
dump('obj', '', control = NULL)
}
, dump.exact = {
dump('obj', '', control = 'exact')
}
, dump.all = {
dump('obj', '', control = 'all')
}
, dput = {
dput(obj, '', control = NULL)
}
, dput.all = {
dput(obj, '', control = 'all')
}
, dput.exact = {
dput(obj, '', control = 'exact')
}
, dim = {
print(dim(obj))
}
, attributes = {
print(attributes(obj))
}
, str = {
str(obj)
}
, {
parsed <- try(eval(parse(text = opt$type)))
if (is.function(parsed)) {
parsed <- parsed(obj)
}
if (inherits(parsed, 'try-error')) {
cat('argument -t/--type not valid! # random number for diff:', sample.int(1e9, 1), '\n')
} else {
print(parsed)
}
}
)
} else if (file_ext %in% c('rdata', 'rda', 'rdta', 'qdata')) {
# get objects
old <- c(ls(), 'old')
if (file_ext == 'qdata') {
qs::qload(opt$PATH)
} else {
load(opt$PATH)
}
new <- ls()
nms <- new[!(new %in% old)]
objs <- mget(nms)
# print object names
cat(nms, '\n')
# add prefix 'obj:'
nms <- paste0('obj:', names(objs))
# convert them to text
switch(opt$type
, default = {
for (l in seq_along(objs)) {
cat(nms[l], '\n')
print_all(objs[[l]], nms[l], opt$all)
}
}
, print = {
for (l in seq_along(nms)) {
cat('\n', nms[l], ': ', sep = '')
print(objs[[l]])
}
}
, print.default = {
for (l in seq_along(nms)) {
cat('\n', nms[l], ': ', sep = '')
print.default(objs[[l]])
}
}
, dump = {
for (l in seq_along(nms)) {
cat('\n', nms[l], ': ', sep = '')
dump(nms[l], '', control = NULL)
}
}
, dump.exact = {
for (l in seq_along(nms)) {
cat('\n', nms[l], ': ', sep = '')
dump(nms[l], '', control = 'exact')
}
}
, dump.all = {
for (l in seq_along(nms)) {
cat('\n', nms[l], ': ', sep = '')
dump(nms[l], '', control = 'all')
}
}
, dput = {
for (l in seq_along(nms)) {
cat('\n', nms[l], ': ', sep = '')
dput(objs[[l]], '', control = NULL)
}
}
, dput.all = {
for (l in seq_along(nms)) {
cat('\n', nms[l], ': ', sep = '')
dput(objs[[l]], '', control = 'all')
}
}
, dput.exact = {
for (l in seq_along(nms)) {
cat('\n', nms[l], ': ', sep = '')
dput(objs[[l]], '', control = 'exact')
}
}
, dim = {
for (l in seq_along(nms)) {
cat('\n', nms[l], ': ', sep = '')
print(dim(objs[[l]]))
}
}
, attributes = {
for (l in seq_along(nms)) {
cat('\n', nms[l], ': ', sep = '')
print(attributes(objs[[l]]))
}
}
, str = {
for (l in seq_along(nms)) {
cat('\n', nms[l], ': ', sep = '')
str(objs[[l]])
}
}
, {
parsed <- try(eval(parse(text = opt$type)))
if (is.function(parsed)) {
parsed <- parsed(obj)
}
if (inherits(parsed, 'try-error')) {
cat('argument -t/--type not valid! # random number for diff:', sample.int(1e9, 1), '\n')
} else {
print(parsed)
}
}
)
} else {
cat('File', opt$PATH, 'is not recognized as rdata file (.rds, .RData, etc.)\n')
}
}
You can name the script whatever you like, however, it should match the docopt "Usage" instruction.
The path where you save the script to (e.g. ~/.local/bin
) must exist in (or be added to) the $PATH
variable.
Finally, the script must be executable:
chmod +x rdata2txt
You have to tell git which files will use this text conversion.
Add the following to your global git attributes file (in my case: ~/.config/git/attributes
; see man gitattributes
):
# R data files
*.rds diff=rdata
*.Rds diff=rdata
*.RDS diff=rdata
*.RData diff=rdata
*.rdata diff=rdata
*.rda diff=rdata
*.Rda diff=rdata
*.Rdta diff=rdata
*.rdta diff=rdata
The following aliases give an idea on the implementation of the rdata2txt script.
Copy the following lines to your global .gitconfig
file (~/.gitconfig
):
[alias]
# completion not working, usage examples: git r -t dim show HEAD; git r -a show HEAD; git r diff HEAD~1; git r --type str log -p; etc.
r = "!f() { case "$1" in -t|--type) type="$2"; all=""; shift; shift ;; -a|--all) type=default; all="-a"; shift ;; *) type=default; all="" ;; esac && git -c diff.rdata.textconv=\"rdata2txt -t \\\"$type\\\" $all\" $@; }; f"
# completion works, usage ex.: git diff-r HEAD~1; etc.
diff-r = !git -c diff.rdata.textconv=rdata2txt diff
# completion works, can be very slow if there are many attributes, usage ex.: git diff-r-all HEAD~1; etc.
diff-r-all = !git -c diff.rdata.textconv='rdata2txt -a' diff
# completion not working, usage ex.: git diff-r2 -t str HEAD~1; etc.
diff-r2 = "!f() { case "$1" in -t|--type) type="$2"; shift ; shift ;; *) type=default ;; esac && git -c diff.rdata.textconv=\"rdata2txt -t $type\" diff $@; }; f"