daroczig · September 9, 2015 20:44
diff --git a/guess.R b/guess.R
 #' Guess a text data file's parameters
 #'
 #' Inspects at most the first 100 lines of \code{file} and guesses the
 #' settings needed to read it with \code{read.table}. The first line is
 #' treated as a potential header; all other lines are treated as data rows.
 #'
 #' @param file filename
 #' @return list of \code{sep}, \code{quote}, \code{dec}, \code{header},
 #'   \code{row.names} and \code{fileEncoding}
 #' @export
 import.text.guess <- function(file) {

     ## run count.fields() on in-memory lines; the connection is always closed
     count_fields <- function(lines, ...) {
         tc <- textConnection(lines)
         on.exit(close(tc), add = TRUE)
         count.fields(tc, ..., comment.char = '')
     }

     ## run read.table() on in-memory lines and return the class of each column
     column_classes <- function(lines, ...) {
         tc <- textConnection(lines)
         on.exit(close(tc), add = TRUE)
         vapply(read.table(tc, ..., comment.char = ''),
                function(column) class(column)[1], character(1))
     }

     ## check encoding -> encoding
     ## `file -bi` is expected to print "<mime type>; charset=<name>"; if the
     ## utility is missing or prints anything else, fall back to UTF-8
     detected <- tryCatch(
         system(sprintf('file -bi %s', shQuote(file)), intern = TRUE),
         error = function(e) character(0))
     encoding <- sub('.*charset=', '', detected)
     if (length(detected) != 1 || !grepl('charset=', detected, fixed = TRUE) ||
         encoding %in% c('???', 'unknown-8bit')) {
         warning('Could not identify encoding!')
         encoding <- 'UTF-8'
     }
     con <- file(file, encoding = encoding)
     text <- tryCatch(readLines(con, n = 100, warn = FALSE),
                      finally = close(con))

     ## every heuristic below compares the first line with the lines after it
     if (length(text) < 2)
         stop('Need at least two lines to guess the parameters!')
     rows <- text[-1]

     ## check separators -> sep
     ## a candidate is accepted when every data row splits into the same number
     ## (> 1) of fields; the candidate producing the most fields wins
     sep <- NULL
     most_fields <- 0
     for (separator in c(',', ';', ' ', '\t', '|')) {
         ## first without quoting, then retry with apostrophe / quotation mark
         n <- count_fields(rows, sep = separator, quote = NULL)
         for (quote_char in c("'", '"')) {
             if (length(unique(n)) != 1)
                 n <- count_fields(rows, sep = separator, quote = quote_char)
         }
         if (length(unique(n)) == 1 && !is.na(n[1]) &&
             n[1] > 1 && n[1] > most_fields) {
             sep <- separator
             most_fields <- n[1]
         }
     }
     ## `sep` is a local initialised to NULL, so an object called `sep` in an
     ## enclosing environment can no longer hide a failed detection
     if (is.null(sep))
         stop('Could not identify separator!')

     ## check quote -> quote
     ## NOTE: the first row has quotes with the highest probability
     ## the quote character that splits the first row into the most pieces wins
     quote <- ''
     most_fields <- 1
     for (quote_char in c('"', '\'')) {
         n <- count_fields(text[1], sep = quote_char)
         ## count.fields() gives NA for an unterminated quote and nothing at
         ## all for a blank line; neither is evidence for this candidate
         if (length(n) == 1 && !is.na(n) && n > most_fields) {
             quote <- quote_char
             most_fields <- n
         }
     }

     ## check decimal point -> dec
     ## '.' is chosen as soon as it yields a numeric column, ',' otherwise
     dec <- ','
     if (any(column_classes(rows, sep = sep, quote = quote, dec = '.') == 'numeric'))
         dec <- '.'

     ## if row.names was provided
     ## differing field counts between the rows (first line included) are taken
     ## as a sign that the first column holds row names
     row.names <- NULL
     n <- count_fields(text, sep = sep, quote = quote)
     if (length(unique(n)) != 1)
         row.names <- 1

     ## check header -> header
     ## the first line is a header when it parses into a different number of
     ## columns, or into different column classes, than the remaining rows
     first_classes <- column_classes(text[1], sep = sep, quote = quote, dec = dec,
                                     row.names = row.names)
     rest_classes <- column_classes(rows, sep = sep, quote = quote, dec = dec,
                                    row.names = row.names)
     header <- length(first_classes) != length(rest_classes) ||
         !all(first_classes == rest_classes)

     list(sep = sep, quote = quote, dec = dec, header = header,
          row.names = row.names, fileEncoding = encoding)

 }
	#' Guess a text data file's parameters
	#'
	#' Inspects at most the first 100 lines of \code{file} and guesses the
	#' settings needed to read it with \code{read.table}. The first line is
	#' treated as a potential header; all other lines are treated as data rows.
	#'
	#' @param file filename
	#' @return list of \code{sep}, \code{quote}, \code{dec}, \code{header},
	#'   \code{row.names} and \code{fileEncoding}
	#' @export
	import.text.guess <- function(file) {

	    ## run count.fields() on in-memory lines; the connection is always closed
	    count_fields <- function(lines, ...) {
	        tc <- textConnection(lines)
	        on.exit(close(tc), add = TRUE)
	        count.fields(tc, ..., comment.char = '')
	    }

	    ## run read.table() on in-memory lines and return the class of each column
	    column_classes <- function(lines, ...) {
	        tc <- textConnection(lines)
	        on.exit(close(tc), add = TRUE)
	        vapply(read.table(tc, ..., comment.char = ''),
	               function(column) class(column)[1], character(1))
	    }

	    ## check encoding -> encoding
	    ## `file -bi` is expected to print "<mime type>; charset=<name>"; if the
	    ## utility is missing or prints anything else, fall back to UTF-8
	    detected <- tryCatch(
	        system(sprintf('file -bi %s', shQuote(file)), intern = TRUE),
	        error = function(e) character(0))
	    encoding <- sub('.*charset=', '', detected)
	    if (length(detected) != 1 || !grepl('charset=', detected, fixed = TRUE) ||
	        encoding %in% c('???', 'unknown-8bit')) {
	        warning('Could not identify encoding!')
	        encoding <- 'UTF-8'
	    }
	    con <- file(file, encoding = encoding)
	    text <- tryCatch(readLines(con, n = 100, warn = FALSE),
	                     finally = close(con))

	    ## every heuristic below compares the first line with the lines after it
	    if (length(text) < 2)
	        stop('Need at least two lines to guess the parameters!')
	    rows <- text[-1]

	    ## check separators -> sep
	    ## a candidate is accepted when every data row splits into the same number
	    ## (> 1) of fields; the candidate producing the most fields wins
	    ## (the pipe needs no escaping inside an R string literal)
	    sep <- NULL
	    most_fields <- 0
	    for (separator in c(',', ';', ' ', '\t', '|')) {
	        ## first without quoting, then retry with apostrophe / quotation mark
	        n <- count_fields(rows, sep = separator, quote = NULL)
	        for (quote_char in c("'", '"')) {
	            if (length(unique(n)) != 1)
	                n <- count_fields(rows, sep = separator, quote = quote_char)
	        }
	        if (length(unique(n)) == 1 && !is.na(n[1]) &&
	            n[1] > 1 && n[1] > most_fields) {
	            sep <- separator
	            most_fields <- n[1]
	        }
	    }
	    ## `sep` is a local initialised to NULL, so an object called `sep` in an
	    ## enclosing environment can no longer hide a failed detection
	    if (is.null(sep))
	        stop('Could not identify separator!')

	    ## check quote -> quote
	    ## NOTE: the first row has quotes with the highest probability
	    ## the quote character that splits the first row into the most pieces wins
	    quote <- ''
	    most_fields <- 1
	    for (quote_char in c('"', '\'')) {
	        n <- count_fields(text[1], sep = quote_char)
	        ## count.fields() gives NA for an unterminated quote and nothing at
	        ## all for a blank line; neither is evidence for this candidate
	        if (length(n) == 1 && !is.na(n) && n > most_fields) {
	            quote <- quote_char
	            most_fields <- n
	        }
	    }

	    ## check decimal point -> dec
	    ## '.' is chosen as soon as it yields a numeric column, ',' otherwise
	    dec <- ','
	    if (any(column_classes(rows, sep = sep, quote = quote, dec = '.') == 'numeric'))
	        dec <- '.'

	    ## if row.names was provided
	    ## differing field counts between the rows (first line included) are taken
	    ## as a sign that the first column holds row names
	    row.names <- NULL
	    n <- count_fields(text, sep = sep, quote = quote)
	    if (length(unique(n)) != 1)
	        row.names <- 1

	    ## check header -> header
	    ## the first line is a header when it parses into a different number of
	    ## columns, or into different column classes, than the remaining rows
	    first_classes <- column_classes(text[1], sep = sep, quote = quote, dec = dec,
	                                    row.names = row.names)
	    rest_classes <- column_classes(rows, sep = sep, quote = quote, dec = dec,
	                                   row.names = row.names)
	    header <- length(first_classes) != length(rest_classes) ||
	        !all(first_classes == rest_classes)

	    list(sep = sep, quote = quote, dec = dec, header = header,
	         row.names = row.names, fileEncoding = encoding)

	}