# The delimiter must appear the same number of times in each row
# and should be present in all rows.
# In line with the guess from readr::read_csv, we use the first
# 1000 rows to guess the delimiter, but also offer to increase
# that number through guess_max (a usage sketch follows the test
# calls below).

# this is ;
csv_delim("https://datosabiertos.ayto-arganda.es/dataset/4e2b6867-0c88-4b2e-a290-bad625a9b3ac/resource/740d3e9c-735a-4145-9149-7af8eb9e8405/download/registro-de-entrada.-4-trimestre-2017.-oficinas-de-registro.csv")
# this is ,
csv_delim("https://datosabiertos.ayto-arganda.es/dataset/3b9e2138-a3c9-45da-8086-7eba0abce047/resource/e9c0b4a3-8955-46b4-a63b-ad619609588d/download/escuela-infantilalumnos.csv")
# this is ,
csv_delim('http://datos.apc.es/wcm/connect/42a6fc93-d6d7-4f4e-971b-046adf9578ed/ATRAQUES_CRUCEROS.CSV?MOD=AJPERES&CONVERT_TO=url&CACHEID=ROOTWORKSPACE-42a6fc93-d6d7-4f4e-971b-046adf9578ed-miMIved')
# this is ,
csv_delim('https://apirtod.dipucadiz.es/api/datos/casas_consistoriales.csv&rnd=1725731944')
# this is ,
csv_delim('https://datosabiertos.ayto-arganda.es/dataset/bb8079e5-1425-467c-ab87-860e3d1aba6d/resource/d3dfeb5b-fc76-4bf0-9577-0975a0342479/download/sectorpublicodocumentacionnotarialhistoricototalmunicipio20102014.csv')
# this is ,
csv_delim("1,2,3\n4,5,6")
csv_delim("a,b\n\n\n1,2")
csv_delim("a,b\n1,2\n\n\n2,3\n")
csv_delim("a,b\n1,2\n\n\n")
# Fails because the , is not repeated in over 90% of rows
csv_delim("v1,v2\n#foo\n1,2\n#bar\n3,4")
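# Hedged illustration (not one of the original tests): the comma shows up in
# 3 of the 5 rows above (60%), so lowering threshold_rows below that value
# should let the guess go through and return ','
csv_delim("v1,v2\n#foo\n1,2\n#bar\n3,4", threshold_rows = 0.5)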
csv_delim("x1,x2,x3\nA2,B2,C2\nA3#,B2,C2\nA4,A5,A6")
csv_delim("http://www.ine.es/jaxiT3/files/t/es/csv/24282.csv?nocab=1")
csv_delim("https://datosabiertos.ayto-arganda.es/dataset/3fa98747-316d-4782-aab6-a7a42b8b00b8/resource/63c7c0ab-691c-4c75-929d-ae521bd67926/download/precipitacion.csv")
csv_delim("https://data.cityofnewyork.us/api/views/kku6-nxdu/rows.csv?accessType=DOWNLOAD")
csv_delim("https://chronicdata.cdc.gov/views/g4ie-h725/rows.csv?accessType=DOWNLOAD")
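# Minimal usage sketch (an assumption about intended use, not part of the
# original tests): the guessed delimiter can be fed straight into
# readr::read_delim, and guess_max can be raised when the first 1000 rows
# may not be representative.
path <- "https://datosabiertos.ayto-arganda.es/dataset/4e2b6867-0c88-4b2e-a290-bad625a9b3ac/resource/740d3e9c-735a-4145-9149-7af8eb9e8405/download/registro-de-entrada.-4-trimestre-2017.-oficinas-de-registro.csv"
guessed <- csv_delim(path, guess_max = 5000)
if (!is.na(guessed)) registro <- readr::read_delim(path, delim = guessed)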
# Ideally we'd like to take this function out of the gist and port
# it into a package that can be called, once the function
# has been cleaned up and refactored
csv_delim <- function(file, guess_max = 1000, threshold_rows = 0.9,
                      delim = c(',', '\t', ';', ' ', ':')) {
  data <-
    tryCatch(
      readr::read_lines(file, n_max = guess_max),
      error = function(e) NA_character_
    )

  data <- strsplit(data, "\n")

  # Drop empty lines so they don't distort the proportion calculation
  filtered_data <- data[!vapply(data, function(x) is.null(x) || length(x) == 0, logical(1))]

  # Get the number of rows read after deleting the empty rows
  # in filtered_data because otherwise the threshold is calculated
  # wrongly
  rows_read <- min(length(filtered_data), guess_max)

  # Count how many times each character appears in each row
  res <- lapply(filtered_data, function(x) table(strsplit(x, "")))
  table_names <- lapply(res, names)

  # Keep only the characters that are candidate delimiters
  all_chars <- unlist(table_names)
  all_chars <- all_chars[all_chars %in% delim]

  # Proportion of rows in which each candidate delimiter appears
  prop_repetition <- table(all_chars) / rows_read

  if (one_true(prop_repetition == 1) %in% c('one true', '> one true')) {
    repeated_names <- names(which(prop_repetition == 1))
  } else if (any(prop_repetition > threshold_rows)) {
    repeated_names <- names(which(prop_repetition > threshold_rows))
  } else {
    # Because no character was matched at or over the threshold of rows
    return(NA_character_)
  }

  # For each remaining candidate, collect how many times it appears in each row
  unique_repetitions <- lapply(repeated_names, function(delimiter) {
    unique_vals <- unique(unlist(lapply(res, `[`, delimiter)))
    unique_vals[!is.na(unique_vals)]
  })
  unique_repetitions <- setNames(unique_repetitions, repeated_names)

  if (length(unique_repetitions) == 0) return(NA_character_)

  # A candidate qualifies if it appears the same number of times in every row
  same_count_delimiter <-
    sapply(unique_repetitions, function(x) length(unique(x)) == 1)

  matched_delimiters <- one_true(same_count_delimiter)

  if (matched_delimiters == "one true") {
    unique_delimiter <- names(same_count_delimiter[which(same_count_delimiter)])
    return(unique_delimiter)
  } else if (matched_delimiters == '> one true') {
    # If two or more delimiters have a single repeated count across rows
    # and clear the row threshold, check whether they are among the
    # preferred delimiters and pick them in the order of preference
    # given in `delim`
    conflicting_delims <- names(same_count_delimiter)[which(same_count_delimiter)]
    chosen_delimiter <- pick_preference(conflicting_delims, delim)
    return(chosen_delimiter)
  } else {
    undecided_delims <- names(same_count_delimiter)
    chosen_delimiter <- pick_preference(undecided_delims, delim)
    if (!is.na(chosen_delimiter)) return(chosen_delimiter)
  }

  NA_character_ # no delimiter was found
}
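# When two candidate delimiters both appear a consistent number of times in
# every row, the tie should be broken by the order of `delim` (',' first by
# default). A small inline check of that assumption:
csv_delim("a,b;c\n1,2;3\n4,5;6")  # expected ','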
# internal fun
one_true <- function(x) {
  table_trues <- table(x)

  # If there's only ONE TRUE, return 'one true';
  # if more than one TRUE, return '> one true';
  # else 'no true'
  if (any(as.logical(names(table_trues)))) {
    if (table_trues['TRUE'] == 1) {
      return("one true")
    } else {
      return("> one true")
    }
  }

  "no true"
}
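# Quick illustration of the helper's three return values (assumed behaviour
# read off the code above, not part of the original gist):
one_true(c(TRUE, FALSE, FALSE))  # "one true"
one_true(c(TRUE, TRUE, FALSE))   # "> one true"
one_true(c(FALSE, FALSE))        # "no true"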
pick_preference <- function(match, pool_matches) {
  available_delims <- match %in% pool_matches

  if (any(available_delims)) {
    # We turn to factor in order to sort according to the allowed delimiters.
    # This way when we subset we keep the order of preference of `pool_matches`.
    sorted_formats <- sort(factor(pool_matches, levels = pool_matches))
    chosen_delimiter <- sorted_formats[which(sorted_formats %in% match)[1]]
    chosen_delimiter <- as.character(chosen_delimiter)
    return(chosen_delimiter)
  }

  NA_character_
}
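# pick_preference() resolves ties by the order of the preferred delimiters
# and returns NA when no candidate is in the pool. A couple of illustrative
# calls (assumptions, not original tests):
pick_preference(c(';', ','), c(',', '\t', ';', ' ', ':'))  # ","
pick_preference('|', c(',', '\t', ';', ' ', ':'))          # NA_character_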