Created
September 9, 2015 20:44
-
-
Save daroczig/190d8ab95b828c460a30 to your computer and use it in GitHub Desktop.
Guess file delimiter, quote and decimal mark in text files
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Guess a text data file's parameters
#'
#' Reads at most the first 100 lines of \code{file} and heuristically guesses
#' the arguments needed to load it with \code{read.table}.
#'
#' The encoding is looked up with the external \code{file -bi} utility. When
#' that utility is unavailable or its answer is inconclusive, a warning is
#' raised and \code{'UTF-8'} is assumed. The separator is the candidate
#' (\code{,}, \code{;}, space, tab, \code{|}) that splits every data line into
#' the same number (> 1) of fields, preferring the one yielding the most
#' fields. The quote character is guessed from the first line only. The
#' decimal mark is \code{'.'} when at least one column parses as a double
#' with it, and \code{','} otherwise.
#'
#' @param file filename
#' @return list of \code{sep}, \code{quote}, \code{dec}, \code{header},
#'   \code{row.names} and \code{fileEncoding}
#' @export
import.text.guess <- function(file) {

    ## number of fields per line; the connection is released even on error
    count_fields <- function(lines, sep, quote = "\"'") {
        con <- textConnection(lines)
        on.exit(close(con), add = TRUE)
        count.fields(con, sep = sep, quote = quote, comment.char = '')
    }

    ## parse lines with read.table; the connection is released even on error
    read_lines <- function(lines, ...) {
        con <- textConnection(lines)
        on.exit(close(con), add = TRUE)
        read.table(con, ..., comment.char = '')
    }

    ## storage class of every column of a data.frame
    col_class <- function(df) {
        vapply(df, function(col) class(col)[1], character(1))
    }

    ## check encoding -> encoding
    ## `file` may be missing (e.g. on Windows) or may print something without
    ## a charset; both cases are treated like an unknown encoding
    mime <- tryCatch(
        suppressWarnings(system(sprintf('file -bi %s', shQuote(file)),
                                intern = TRUE, ignore.stderr = TRUE)),
        error = function(e) character(0)
    )
    if (length(mime) == 1 && grepl('charset=', mime, fixed = TRUE)) {
        encoding <- sub('.*charset=', '', mime)
    } else {
        encoding <- '???'
    }
    if (encoding %in% c('???', 'unknown-8bit')) {
        warning('Could not identify encoding!')
        encoding <- 'UTF-8'
    }

    ## `file` the argument is a string, so name the connection constructor
    ## explicitly
    con <- base::file(file, encoding = encoding)
    text <- tryCatch(readLines(con, n = 100, warn = FALSE),
                     finally = close(con))
    if (length(text) == 0)
        stop('Could not identify separator!')

    ## data lines: everything after the (potential) header; a one-line file
    ## is inspected as-is instead of indexing past its end
    body <- if (length(text) > 1) text[-1] else text

    ## check separators -> sep
    sep <- NULL
    most_fields <- 0
    for (candidate in c(',', ';', ' ', '\t', '|')) {
        ## try without quoting first, then with apostrophe, then with
        ## quotation mark, stopping at the first consistent field count
        n <- NULL
        for (quoting in c('', "'", '"')) {
            n <- count_fields(body, candidate, quoting)
            if (length(unique(n)) == 1)
                break
        }
        consistent <- length(unique(n)) == 1 && !is.na(n[1])
        if (consistent && n[1] > 1 && n[1] > most_fields) {
            sep <- candidate
            most_fields <- n[1]
        }
    }
    if (is.null(sep))
        stop('Could not identify separator!')

    ## check quote -> quote
    ## NOTE: the first row has quotes with the highest probability
    ## The quote candidate is deliberately passed as the *separator*: the
    ## number of resulting fields grows with the number of such characters.
    quote <- ''
    most_fields <- 1
    for (candidate in c('"', '\'')) {
        n <- count_fields(text[1], candidate)
        ## n is empty for a blank line and NA for an unterminated quote
        if (length(n) == 1 && !is.na(n) && n > most_fields) {
            quote <- candidate
            most_fields <- n
        }
    }

    ## check decimal point -> dec
    ## a column parsed as double with '.' settles it; otherwise fall back
    ## to ','
    dec <- ','
    parsed <- read_lines(body, sep = sep, quote = quote, dec = '.')
    if (any(vapply(parsed, is.double, logical(1))))
        dec <- '.'

    ## if row.names was provided: the field count is then not the same on
    ## every line (header included)
    row.names <- NULL
    if (length(unique(count_fields(text, sep, quote))) != 1)
        row.names <- 1

    ## check header -> header
    ## the first line is a header when it parses to a different shape or to
    ## different column classes than the remaining lines
    first <- read_lines(text[1], sep = sep, quote = quote, dec = dec,
                        row.names = row.names)
    rest <- read_lines(body, sep = sep, quote = quote, dec = dec,
                       row.names = row.names)
    header <- ncol(first) != ncol(rest) ||
        !all(col_class(first) == col_class(rest))

    list(sep = sep, quote = quote, dec = dec, header = header,
         row.names = row.names, fileEncoding = encoding)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment