Last active
September 29, 2018 07:50
-
-
Save cimentadaj/10acfcf260d6065e6aded97b087cd173 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The delimiter must appear the same number of times in each row | |
# and should be present in all rows | |
# Keeping in line with the guess from readr::read_csv, we can use the | |
# first 1000 rows to guess the delimiter but also offer to increase the | |
# guess number | |
# this is ; | |
csv_delim("https://datosabiertos.ayto-arganda.es/dataset/4e2b6867-0c88-4b2e-a290-bad625a9b3ac/resource/740d3e9c-735a-4145-9149-7af8eb9e8405/download/registro-de-entrada.-4-trimestre-2017.-oficinas-de-registro.csv") | |
# this is , | |
csv_delim("https://datosabiertos.ayto-arganda.es/dataset/3b9e2138-a3c9-45da-8086-7eba0abce047/resource/e9c0b4a3-8955-46b4-a63b-ad619609588d/download/escuela-infantilalumnos.csv") | |
# this is , | |
csv_delim('http://datos.apc.es/wcm/connect/42a6fc93-d6d7-4f4e-971b-046adf9578ed/ATRAQUES_CRUCEROS.CSV?MOD=AJPERES&CONVERT_TO=url&CACHEID=ROOTWORKSPACE-42a6fc93-d6d7-4f4e-971b-046adf9578ed-miMIved') | |
# this is , | |
csv_delim('https://apirtod.dipucadiz.es/api/datos/casas_consistoriales.csv&rnd=1725731944') | |
# this is , | |
csv_delim('https://datosabiertos.ayto-arganda.es/dataset/bb8079e5-1425-467c-ab87-860e3d1aba6d/resource/d3dfeb5b-fc76-4bf0-9577-0975a0342479/download/sectorpublicodocumentacionnotarialhistoricototalmunicipio20102014.csv') | |
# this is , | |
csv_delim("1,2,3\n4,5,6") | |
csv_delim("a,b\n\n\n1,2") | |
csv_delim("a,b\n1,2\n\n\n2,3\n") | |
csv_delim("a,b\n1,2\n\n\n") | |
# Fails because the , is not repeated in over 90% of rows | |
csv_delim("v1,v2\n#foo\n1,2\n#bar\n3,4") | |
csv_delim("x1,x2,x3\nA2,B2,C2\nA3#,B2,C2\nA4,A5,A6") | |
csv_delim("http://www.ine.es/jaxiT3/files/t/es/csv/24282.csv?nocab=1") | |
csv_delim("https://datosabiertos.ayto-arganda.es/dataset/3fa98747-316d-4782-aab6-a7a42b8b00b8/resource/63c7c0ab-691c-4c75-929d-ae521bd67926/download/precipitacion.csv") | |
csv_delim("https://data.cityofnewyork.us/api/views/kku6-nxdu/rows.csv?accessType=DOWNLOAD") | |
csv_delim("https://chronicdata.cdc.gov/views/g4ie-h725/rows.csv?accessType=DOWNLOAD") | |
# Ideally we'd like to eliminate this function altogether and port | |
# it into a package that can be called once this function | |
# is cleaned and refactored | |
csv_delim <- function(file, guess_max = 1000, threshold_rows = 0.9, | |
delim = c(',', '\t', ';', ' ', ':')) { | |
data <- | |
tryCatch( | |
readr::read_lines(file, n_max = guess_max), | |
error = function(e) NA_character_ | |
) | |
data <- strsplit(data, "\n") | |
filtered_data <- data[!vapply(data, function(x) is.null(x) | length(x) == 0, logical(1))] | |
# Get the number of rows read after deleting the empty rows | |
# in filtered_data because otherwise the threshold is calculated | |
# wrongly | |
rows_read <- min(length(filtered_data), guess_max) | |
res <- lapply(filtered_data, function(x) table(strsplit(x, ""))) | |
table_names <- lapply(res, names) | |
all_chars <- unlist(table_names) | |
all_chars <- all_chars[all_chars %in% delim] | |
prop_repetition <- table(all_chars) / rows_read | |
if (one_true(prop_repetition == 1) %in% c('one true', '> one true')) { | |
repeated_names <- names(which(prop_repetition == 1)) | |
} else if (any(prop_repetition > threshold_rows)) { | |
repeated_names <- names(which(prop_repetition > threshold_rows)) | |
} else { | |
# Because no character was matched at or over the threshold of rows | |
return (NA_character_) | |
} | |
unique_repetitions <- lapply(repeated_names, function(delimiter) { | |
unique_vals <- unique(unlist(lapply(res, `[`, delimiter))) | |
unique_vals[!is.na(unique_vals)] | |
}) | |
unique_repetitions <- setNames(unique_repetitions, repeated_names) | |
if (length(unique_repetitions) == 0) return(NA_character_) | |
same_count_delimiter <- | |
sapply(unique_repetitions, function(x) length(unique(x)) == 1) | |
matched_delimiters <- one_true(same_count_delimiter) | |
if (matched_delimiters == "one true") { | |
unique_delimiter <- names(same_count_delimiter[which(same_count_delimiter)]) | |
return(unique_delimiter) | |
} else if (matched_delimiters == '> one true') { | |
# If there were two delimiters that have a single number | |
# repeated in all rows and are at 90% of the rows or more | |
# filter whether these two are in the prefered delimiters and | |
# pick in the order of preference in the preferred delimiters | |
conflicting_delims <- names(same_count_delimiter)[which(same_count_delimiter)] | |
chosen_delimiter <- pick_preference(conflicting_delims, delim) | |
return(chosen_delimiter) | |
} else { | |
undecided_delims <- names(same_count_delimiter) | |
chosen_delimiter <- pick_preference(undecided_delims, delim) | |
if (!is.na(chosen_delimiter)) return(chosen_delimiter) | |
} | |
NA_character_ # no format was found | |
} | |
# internal fun | |
one_true <- function(x) { | |
table_trues <- table(x) | |
# If there's only ONE true, return 'one true', | |
# if more than one true, return '> one true' | |
# else 'no true' | |
if (any(as.logical(names(table_trues)))) { | |
if (table_trues['TRUE'] == 1) { | |
return("one true") | |
} else { | |
return("> one true") | |
} | |
} | |
"no true" | |
} | |
pick_preference <- function(match, pool_matches) { | |
available_delims <- match %in% pool_matches | |
if (any(available_delims)) { | |
# We turn to factor in order to sort according to the allowed formats. | |
# This way when we subset we keep the order of preference of files. | |
sorted_formats <- sort(factor(pool_matches, levels = pool_matches)) | |
chosen_delimiter <- sorted_formats[which(sorted_formats %in% match)[1]] | |
chosen_delimiter <- as.character(chosen_delimiter) | |
return(chosen_delimiter) | |
} | |
NA_character_ | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment