Skip to content

Instantly share code, notes, and snippets.

@daroczig
Created September 9, 2015 20:44
Show Gist options
  • Save daroczig/190d8ab95b828c460a30 to your computer and use it in GitHub Desktop.
Save daroczig/190d8ab95b828c460a30 to your computer and use it in GitHub Desktop.
Guess file delimiter, quote and decimal mark in text files
#' Guess a text data file's parameters
#'
#' Inspects at most the first 100 lines of \code{file} and infers the
#' arguments needed to import it with \code{read.table}.
#'
#' The character encoding is taken from the output of the external
#' \code{file -bi} command, so that utility has to be available. When it
#' reports \code{???} or \code{unknown-8bit} (or nothing usable), a warning is
#' raised and \code{UTF-8} is assumed.
#'
#' The first line is treated as a potential header; all detection that looks
#' at "the data" uses the lines after it, so the file needs at least two lines.
#'
#' @param file filename
#' @return list of \code{sep}, \code{quote}, \code{dec}, \code{header},
#'   \code{row.names} (\code{1} when the lines do not all have the same number
#'   of fields, otherwise \code{NULL}) and \code{fileEncoding}
#' @export
import.text.guess <- function(file) {

    stopifnot(is.character(file), length(file) == 1L, !is.na(file))
    if (!file.exists(file))
        stop('File not found: ', file)

    ## Apply `reader` (count.fields / read.table) to a text connection over
    ## `lines`; the connection is closed even if `reader` fails.
    with_lines <- function(lines, reader, ...) {
        tc <- textConnection(lines)
        on.exit(close(tc), add = TRUE)
        reader(tc, ...)
    }

    ## First class of every column of a data.frame, as a character vector.
    column_classes <- function(df) {
        vapply(df, function(column) class(column)[1L], character(1))
    }

    ## check encoding -> encoding
    encoding <- sub('.*charset=', '',
                    system(sprintf('file -bi %s', shQuote(file)), intern = TRUE))
    ## `file` may print nothing or several lines; treat that as unknown too
    if (length(encoding) != 1L || encoding %in% c('???', 'unknown-8bit')) {
        warning('Could not identify encoding!')
        encoding <- 'UTF-8'
    }

    con <- file(file, encoding = encoding)
    text <- tryCatch(readLines(con, n = 100, warn = FALSE),
                     finally = close(con))

    ## Everything below compares the first line with the remaining ones.
    ## NOTE: `text[2:length(text)]` would count downwards for short files and
    ## inject NA lines, hence the explicit guard and the negative index.
    if (length(text) < 2L)
        stop('Could not identify separator! (fewer than two lines of text)')
    records <- text[-1L]

    ## check separators -> sep
    ## A candidate is accepted when every data line splits into the same
    ## number (> 1) of fields; the candidate with the most fields wins.
    sep <- NULL
    most_fields <- 0
    for (candidate in c(',', ';', ' ', '\t', '|')) {
        n <- NULL
        ## without quotes, then with apostrophe, then with quotation mark;
        ## stop at the first setting under which all lines agree
        for (quoting in list(NULL, "'", '"')) {
            n <- with_lines(records, count.fields, sep = candidate,
                            quote = quoting, comment.char = '')
            if (length(unique(n)) == 1L)
                break
        }
        fields <- unique(n)
        ## count.fields reports NA for lines inside a multi-line quoted field
        if (length(fields) == 1L && !is.na(fields) &&
            fields != 1L && fields > most_fields) {
            sep <- candidate
            most_fields <- fields
        }
    }
    ## NOTE: `sep` is a local initialised to NULL on purpose; exists('sep')
    ## would also find an unrelated `sep` in the caller's workspace.
    if (is.null(sep))
        stop('Could not identify separator!')

    ## check quote -> quote
    ## NOTE: the first row has quotes with the highest probability
    ## The quote candidate is passed as the field separator of the first
    ## line: the one that splits the line into the most pieces is chosen.
    quote <- ''
    most_pieces <- 1
    for (candidate in c('"', '\'')) {
        n <- with_lines(text[1], count.fields, sep = candidate,
                        comment.char = '')
        if (length(n) == 1L && !is.na(n) && n > most_pieces) {
            quote <- candidate
            most_pieces <- n
        }
    }

    ## check decimal point -> dec
    ## '.' when at least one column parses as numeric with it, ',' otherwise
    ## (integer-only and text-only files therefore report ',').
    parsed <- with_lines(records, read.table, sep = sep, quote = quote,
                         dec = '.', comment.char = '')
    dec <- if (any(column_classes(parsed) == 'numeric')) '.' else ','

    ## if row.names was provided
    ## (lines with differing field counts are taken as a sign that the
    ## header lacks a name for the row names column)
    row.names <- NULL
    n <- with_lines(text, count.fields, sep = sep, quote = quote,
                    comment.char = '')
    if (length(unique(n)) != 1L)
        row.names <- 1

    ## check header -> header
    ## The first line is a header when, parsed on its own, it has a different
    ## number of columns or different column classes than the data lines.
    first_row <- with_lines(text[1], read.table, sep = sep, quote = quote,
                            dec = dec, row.names = row.names,
                            comment.char = '')
    data_rows <- with_lines(records, read.table, sep = sep, quote = quote,
                            dec = dec, row.names = row.names,
                            comment.char = '')
    header <- ncol(first_row) != ncol(data_rows) ||
        !all(column_classes(first_row) == column_classes(data_rows))

    list(sep = sep, quote = quote, dec = dec, header = header,
         row.names = row.names, fileEncoding = encoding)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment