Skip to content

Instantly share code, notes, and snippets.

@ChHaeni
Last active March 23, 2023 13:16
Show Gist options
  • Save ChHaeni/15da2817fe407d0dc7f533330231af7c to your computer and use it in GitHub Desktop.
Save ChHaeni/15da2817fe407d0dc7f533330231af7c to your computer and use it in GitHub Desktop.
git diff R data

Introduction

This gist describes a possible way to show diffs of R data files (files with extension e.g. .rds, .RData, .rda, ...) by reading the data into a temporary R session and converting the R object(s) in a "diffable" way to text output.

The following is required:

  • Rscript executable can be found in the $PATH

  • the R package docopt has been installed

Helper Script to Convert R Objects to Text

Install the docopt package in R:

install.packages('docopt')

Create the script (rdata2txt) that reads any R data file of format (.rds, .RData, etc.) and print its content to the terminal:

#!/usr/bin/env Rscript

suppressMessages(library(docopt))       # we need docopt (>= 0.3) as on CRAN

doc <- "Usage: rdata2txt [-h] [-n] [-a] [-x EXT] [-t TYPE] [PATH]
-t --type TYPE      choose type of conversion [default: default]
-a --all            should attributes be checked too [default: FALSE]
-n --no-info        should rds info not be attached as attribute [default: FALSE]
-x --ext EXT        extension/file type if not clear [default: NULL]
-h --help           show this help text"

opt <- docopt(doc)


if (is.null(opt$PATH)) {
    cat('path argument is missing\n')
} else if (!is.character(opt$PATH)) {
    cat('argument path must be of type character\n')
} else if (!file.exists(opt$PATH)) {
    cat('path "', opt$PATH, '" does not exist\n', sep = '')
} else {
    cat_nl <- function(...) cat(paste(..., '\n'))
    print_all <- function(x, prefix = '.', all = FALSE) {
        # class
        x_class <- class(x)
        is_matrix <- inherits(x, 'matrix')
        is_list <- length(x_class) == 1 && x_class == 'list'
        cat(prefix, ' ~ ',
            switch(length(x_class)
                , sprintf("class: '%s'", x_class)
                , sprintf("class: '%s' and '%s'", x_class[1], x_class[2])
                , sprintf(paste0("class: '", paste(x_class[seq_len(length(x_class) - 2)], collapse = ', ') ,"', '%s' and '%s'"), 
                    x_class[length(x_class) - 1], x_class[length(x_class)])
                ),
        ': ', sep = '')
        # dimension
        x_dim <- dim(x)
        if (x_dim_null <- is.null(x_dim)) {
            # length: ... values
            cat(length(x), 'values')
            if (is.factor(x)) {
                cat_nl(' - Levels:', paste(levels(x), collapse = ' '))
            } else {
                cat_nl()
            }
        } else if (is_matrix) {
            # dim (matrix): ... rows x ... cols
            cat_nl(x_dim[1], 'rows x', x_dim[2], 'columns')
        } else {
            # dim: .. obs. of ... variables
            cat_nl(x_dim[1], 'obs. of ', x_dim[2], 'variables')
        }
        # names of x:
        x_names <- names(x)
        if (is_list) {
            if (is.null(x_names)) {
                # unnamed list
                # vector to loop over
                x_names <- paste0(prefix, '[[', seq_along(x), ']]')
            } else {
                # named list
                # print names
                cat_nl(prefix, '~ names:')
                print(x_names)
                # vector to loop over
                x_names <- paste0(prefix, '$', x_names)
            }
        } else if (is_matrix) {
            # matrix row/colnames?
            # rownames
            if (!is.null(rownames(x))) {
                cat_nl(prefix, '~ rownames:')
                print(rownames(x))
            }
            # colnames
            if (!is.null(colnames(x))) {
                cat_nl(prefix, '~ colnames:')
                print(colnames(x))
            }
        } else if (!x_dim_null) {
            # data.frame like -> row/colnames
            # check if row == seq_along
            x_rownames <- row.names(x)
            if (!identical(x_rownames, sprintf('%i', seq_len(x_dim[1])))) {
                # print rownames
                cat_nl(prefix, '~ row.names:')
                print(row.names(x))
            }
            # print colnames
            cat_nl(prefix, '~ names:')
            print(x_names)
            # vector to loop over
            x_names <- paste0(prefix, '$', x_names)
        }
        # print only if matrix or 'vector'
        if (is_matrix) {
            if (prefix != '') {
                cat_nl(prefix, ':', sep = '')
            }
            print(x[seq_along(x)])
        } else if (is_list || !x_dim_null) {
            # loop over entries
            for (l in seq_along(x_names)) {
                print_all(x[[l]], x_names[l], all = all)
            }
            if (all) {
                # check attributes (without names, row.names, dim, dimnames, class)
                att_names_all <- names(attributes(x))
                att_names <- setdiff(att_names_all, c('names', 'row.names', 'dim', 'dimnames', 'class'))
                # loop over attributes
                att_pre <- sprintf('attr(%s, "%s")', prefix, att_names)
                for (l in seq_along(att_names)) {
                    print_all(attr(x, att_names[l]), att_pre[l], all = all)
                }
            }
        } else {
            if (prefix != '') {
                cat_nl(prefix, ':', sep = '')
            }
            if (is.factor(x)) {
                print.default(x)
            } else {
                print(x)
            }
        }
    }
    # set terminal width
    width <- as.integer(sub('.*columns ([0-9]+)[;].*', '\\1', system('stty -a | head -n 1', intern = TRUE)))
    options(width = width * 0.8)
    # check extension -> rdata type
    if (opt$ext != 'NULL') {
        file_ext <- opt$ext
    } else {
        file_ext <- tolower(sub('.*[.]([a-zA-Z]{2, })$', '\\1', basename(opt$PATH)))
    }
    if (file_ext %in% c('rds', 'qs')) {
        # get object
        if (file_ext == 'rds') {
            obj <- readRDS(opt$PATH)
            if (!opt$no_info) attr(obj, 'infoRDS') <- infoRDS(opt$PATH)
        } else {
            obj <- qs::qread(opt$PATH)
        }
        # convert to text
        switch(opt$type
            , default = {
                print_all(obj, all = opt$all)
            }
            , print = {
                print(obj)
            }
            , print.default = {
                print.default(obj)
            }
            , dump = {
                dump('obj', '', control = NULL)
            }
            , dump.exact = {
                dump('obj', '', control = 'exact')
            }
            , dump.all = {
                dump('obj', '', control = 'all')
            }
            , dput = {
                dput(obj, '', control = NULL)
            }
            , dput.all = {
                dput(obj, '', control = 'all')
            }
            , dput.exact = {
                dput(obj, '', control = 'exact')
            }
            , dim = {
                print(dim(obj))
            }
            , attributes = {
                print(attributes(obj))
            }
            , str = {
                str(obj)
            }
            , {
                parsed <- try(eval(parse(text = opt$type)))
                if (is.function(parsed)) {
                    parsed <- parsed(obj)
                }
                if (inherits(parsed, 'try-error')) {
                    cat('argument -t/--type not valid! # random number for diff:', sample.int(1e9, 1), '\n')
                } else {
                    print(parsed)
                }
            }
        )
    } else if (file_ext %in% c('rdata', 'rda', 'rdta', 'qdata')) {
        # get objects
        old <- c(ls(), 'old')
        if (file_ext == 'qdata') {
            qs::qload(opt$PATH)
        } else {
            load(opt$PATH)
        }
        new <- ls()
        nms <- new[!(new %in% old)]
        objs <- mget(nms)
        # print object names
        cat(nms, '\n')
        # add prefix 'obj:'
        nms <- paste0('obj:', names(objs))
        # convert them to text
        switch(opt$type
            , default = {
                for (l in seq_along(objs)) {
                    cat(nms[l], '\n')
                    print_all(objs[[l]], nms[l], opt$all)
                }
            }
            , print = {
                for (l in seq_along(nms)) {
                    cat('\n', nms[l], ': ', sep = '')
                    print(objs[[l]])
                }
            }
            , print.default = {
                for (l in seq_along(nms)) {
                    cat('\n', nms[l], ': ', sep = '')
                    print.default(objs[[l]])
                }
            }
            , dump = {
                for (l in seq_along(nms)) {
                    cat('\n', nms[l], ': ', sep = '')
                    dump(nms[l], '', control = NULL)
                }
            }
            , dump.exact = {
                for (l in seq_along(nms)) {
                    cat('\n', nms[l], ': ', sep = '')
                    dump(nms[l], '', control = 'exact')
                }
            }
            , dump.all = {
                for (l in seq_along(nms)) {
                    cat('\n', nms[l], ': ', sep = '')
                    dump(nms[l], '', control = 'all')
                }
            }
            , dput = {
                for (l in seq_along(nms)) {
                    cat('\n', nms[l], ': ', sep = '')
                    dput(objs[[l]], '', control = NULL)
                }
            }
            , dput.all = {
                for (l in seq_along(nms)) {
                    cat('\n', nms[l], ': ', sep = '')
                    dput(objs[[l]], '', control = 'all')
                }
            }
            , dput.exact = {
                for (l in seq_along(nms)) {
                    cat('\n', nms[l], ': ', sep = '')
                    dput(objs[[l]], '', control = 'exact')
                }
            }
            , dim = {
                for (l in seq_along(nms)) {
                    cat('\n', nms[l], ': ', sep = '')
                    print(dim(objs[[l]]))
                }
            }
            , attributes = {
                for (l in seq_along(nms)) {
                    cat('\n', nms[l], ': ', sep = '')
                    print(attributes(objs[[l]]))
                }
            }
            , str = {
                for (l in seq_along(nms)) {
                    cat('\n', nms[l], ': ', sep = '')
                    str(objs[[l]])
                }
            }
            , {
                parsed <- try(eval(parse(text = opt$type)))
                if (is.function(parsed)) {
                    parsed <- parsed(obj)
                }
                if (inherits(parsed, 'try-error')) {
                    cat('argument -t/--type not valid! # random number for diff:', sample.int(1e9, 1), '\n')
                } else {
                    print(parsed)
                }
            }
        )
    } else {
        cat('File', opt$PATH, 'is not recognized as rdata file (.rds, .RData, etc.)\n')
    }
}

You can name the script whatever you like, however, it should match the docopt "Usage" instruction.

The path where you save the script to (e.g. ~/.local/bin) must exist in (or be added to) the $PATH variable.

Finally, the script must be executable:

chmod +x rdata2txt

Git attributes

You have to tell git which files will use this text conversion.

Add the following to your global git attributes file (in my case: ~/.config/git/attributes; see man gitattributes):

# R data files
*.rds diff=rdata
*.Rds diff=rdata
*.RDS diff=rdata
*.RData diff=rdata
*.rdata diff=rdata
*.rda diff=rdata
*.Rda diff=rdata
*.Rdta diff=rdata
*.rdta diff=rdata

Git config - potential aliases

The following aliases give an idea on the implementation of the rdata2txt script.

Copy the following lines to your global .gitconfig file (~/.gitconfig):

[alias]
    # completion not working, usage examples: git r -t dim show HEAD; git r -a show HEAD; git r diff HEAD~1; git r --type str log -p; etc.
    r = "!f() { case "$1" in -t|--type) type="$2"; all=""; shift; shift ;; -a|--all) type=default; all="-a"; shift ;; *) type=default; all="" ;; esac && git -c diff.rdata.textconv=\"rdata2txt -t \\\"$type\\\" $all\" $@; }; f"
    # completion works, usage ex.: git diff-r HEAD~1; etc.
    diff-r = !git -c diff.rdata.textconv=rdata2txt diff
    # completion works, can be very slow if there are many attributes, usage ex.: git diff-r-all HEAD~1; etc.
    diff-r-all = !git -c diff.rdata.textconv='rdata2txt -a' diff
    # completion not working, usage ex.: git diff-r2 -t str HEAD~1; etc.
    diff-r2 = "!f() { case "$1" in -t|--type) type="$2"; shift ; shift ;; *) type=default ;; esac && git -c diff.rdata.textconv=\"rdata2txt -t $type\" diff $@; }; f"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment