cimentadaj · September 29, 2018 07:50
diff --git a/csv_delim.R b/csv_delim.R
 # The delimiter must appear the same number of times in each row
 # and should be present in all rows
 # Keeping in line with the guess from readr::read_csv, we can use the
 # first 1000 rows to guess the delimiter but also offer to increase the
 # guess number

 # this is ;
 csv_delim("https://datosabiertos.ayto-arganda.es/dataset/4e2b6867-0c88-4b2e-a290-bad625a9b3ac/resource/740d3e9c-735a-4145-9149-7af8eb9e8405/download/registro-de-entrada.-4-trimestre-2017.-oficinas-de-registro.csv")

 # this is ,
 csv_delim("https://datosabiertos.ayto-arganda.es/dataset/3b9e2138-a3c9-45da-8086-7eba0abce047/resource/e9c0b4a3-8955-46b4-a63b-ad619609588d/download/escuela-infantilalumnos.csv")

 # this is ,
 csv_delim('http://datos.apc.es/wcm/connect/42a6fc93-d6d7-4f4e-971b-046adf9578ed/ATRAQUES_CRUCEROS.CSV?MOD=AJPERES&CONVERT_TO=url&CACHEID=ROOTWORKSPACE-42a6fc93-d6d7-4f4e-971b-046adf9578ed-miMIved')

 # this is ,
 csv_delim('https://apirtod.dipucadiz.es/api/datos/casas_consistoriales.csv&rnd=1725731944')

 # this is ,
 csv_delim('https://datosabiertos.ayto-arganda.es/dataset/bb8079e5-1425-467c-ab87-860e3d1aba6d/resource/d3dfeb5b-fc76-4bf0-9577-0975a0342479/download/sectorpublicodocumentacionnotarialhistoricototalmunicipio20102014.csv')

 # this is ,
 csv_delim("1,2,3\n4,5,6")

 csv_delim("a,b\n\n\n1,2")

 csv_delim("a,b\n1,2\n\n\n2,3\n")

 csv_delim("a,b\n1,2\n\n\n")

 # Fails because the , is not repeated in over 90% of rows
 csv_delim("v1,v2\n#foo\n1,2\n#bar\n3,4")

 csv_delim("x1,x2,x3\nA2,B2,C2\nA3#,B2,C2\nA4,A5,A6")

 csv_delim("http://www.ine.es/jaxiT3/files/t/es/csv/24282.csv?nocab=1")

 csv_delim("https://datosabiertos.ayto-arganda.es/dataset/3fa98747-316d-4782-aab6-a7a42b8b00b8/resource/63c7c0ab-691c-4c75-929d-ae521bd67926/download/precipitacion.csv")


 csv_delim("https://data.cityofnewyork.us/api/views/kku6-nxdu/rows.csv?accessType=DOWNLOAD")

 csv_delim("https://chronicdata.cdc.gov/views/g4ie-h725/rows.csv?accessType=DOWNLOAD")

 # Ideally we'd like to eliminate this function altogether and port
 # it into a package that can be called once this function
 # is cleaned and refactored
 csv_delim <- function(file, guess_max = 1000, threshold_rows = 0.9,
                      delim = c(',', '\t', ';', ' ', ':')) {

  data <-
    tryCatch(
      readr::read_lines(file, n_max = guess_max),
      error = function(e) NA_character_
    )

  data <- strsplit(data, "\n")

  filtered_data <- data[!vapply(data, function(x) is.null(x) | length(x) == 0, logical(1))]

  # Get the number of rows read after deleting the empty rows
  # in filtered_data because otherwise the threshold is calculated
  # wrongly
  rows_read <- min(length(filtered_data), guess_max)

  res <- lapply(filtered_data, function(x) table(strsplit(x, "")))
  table_names <- lapply(res, names)

  all_chars <- unlist(table_names)

  all_chars <- all_chars[all_chars %in% delim]

  prop_repetition <- table(all_chars) / rows_read
  if (one_true(prop_repetition == 1) %in% c('one true', '> one true')) {
    repeated_names <- names(which(prop_repetition == 1))
  } else if (any(prop_repetition > threshold_rows)) {
    repeated_names <- names(which(prop_repetition > threshold_rows))
  } else {
    # Because no character was matched at or over the threshold of rows
    return (NA_character_)
  }

  unique_repetitions <- lapply(repeated_names, function(delimiter) {
    unique_vals <- unique(unlist(lapply(res, `[`, delimiter)))
    unique_vals[!is.na(unique_vals)]
  })

  unique_repetitions <- setNames(unique_repetitions, repeated_names)

  if (length(unique_repetitions) == 0) return(NA_character_)

  same_count_delimiter <-
    sapply(unique_repetitions, function(x) length(unique(x)) == 1)

  matched_delimiters <- one_true(same_count_delimiter)

  if (matched_delimiters == "one true") {

    unique_delimiter <- names(same_count_delimiter[which(same_count_delimiter)])

    return(unique_delimiter)

  } else if (matched_delimiters == '> one true') {
    # If there were two delimiters that have a single number
    # repeated in all rows and are at 90% of the rows or more
    # filter whether these two are in the prefered delimiters and
    # pick in the order of preference in the preferred delimiters

    conflicting_delims <- names(same_count_delimiter)[which(same_count_delimiter)]
    chosen_delimiter <- pick_preference(conflicting_delims, delim)

    return(chosen_delimiter)
  } else {
    undecided_delims <- names(same_count_delimiter)
    chosen_delimiter <- pick_preference(undecided_delims, delim)

    if (!is.na(chosen_delimiter)) return(chosen_delimiter)
  }

  NA_character_ # no format was found
 }


 # internal fun
 one_true <- function(x) {
  table_trues <- table(x)

  # If there's only ONE true, return 'one true',
  # if more than one true, return '> one true'
  # else 'no true'
  if (any(as.logical(names(table_trues)))) {
    if (table_trues['TRUE'] == 1) {
      return("one true")
    } else {
      return("> one true")
    }
  }

  "no true"
 }

 pick_preference <- function(match, pool_matches) {
  available_delims <- match %in% pool_matches

  if (any(available_delims)) {
    # We turn to factor in order to sort according to the allowed formats.
    # This way when we subset we keep the order of preference of files.
    sorted_formats <- sort(factor(pool_matches, levels = pool_matches))
    chosen_delimiter <- sorted_formats[which(sorted_formats %in% match)[1]]
    chosen_delimiter <- as.character(chosen_delimiter)
    return(chosen_delimiter)
  }

  NA_character_
 }
	# The delimiter must appear the same number of times in each row
	# and should be present in all rows
	# Keeping in line with the guess from readr::read_csv, we can use the
	# first 1000 rows to guess the delimiter but also offer to increase the
	# guess number

	# this is ;
	csv_delim("https://datosabiertos.ayto-arganda.es/dataset/4e2b6867-0c88-4b2e-a290-bad625a9b3ac/resource/740d3e9c-735a-4145-9149-7af8eb9e8405/download/registro-de-entrada.-4-trimestre-2017.-oficinas-de-registro.csv")

	# this is ,
	csv_delim("https://datosabiertos.ayto-arganda.es/dataset/3b9e2138-a3c9-45da-8086-7eba0abce047/resource/e9c0b4a3-8955-46b4-a63b-ad619609588d/download/escuela-infantilalumnos.csv")

	# this is ,
	csv_delim('http://datos.apc.es/wcm/connect/42a6fc93-d6d7-4f4e-971b-046adf9578ed/ATRAQUES_CRUCEROS.CSV?MOD=AJPERES&CONVERT_TO=url&CACHEID=ROOTWORKSPACE-42a6fc93-d6d7-4f4e-971b-046adf9578ed-miMIved')

	# this is ,
	csv_delim('https://apirtod.dipucadiz.es/api/datos/casas_consistoriales.csv&rnd=1725731944')

	# this is ,
	csv_delim('https://datosabiertos.ayto-arganda.es/dataset/bb8079e5-1425-467c-ab87-860e3d1aba6d/resource/d3dfeb5b-fc76-4bf0-9577-0975a0342479/download/sectorpublicodocumentacionnotarialhistoricototalmunicipio20102014.csv')

	# this is ,
	csv_delim("1,2,3\n4,5,6")

	csv_delim("a,b\n\n\n1,2")

	csv_delim("a,b\n1,2\n\n\n2,3\n")

	csv_delim("a,b\n1,2\n\n\n")

	# Fails because the , is not repeated in over 90% of rows
	csv_delim("v1,v2\n#foo\n1,2\n#bar\n3,4")

	csv_delim("x1,x2,x3\nA2,B2,C2\nA3#,B2,C2\nA4,A5,A6")

	csv_delim("http://www.ine.es/jaxiT3/files/t/es/csv/24282.csv?nocab=1")

	csv_delim("https://datosabiertos.ayto-arganda.es/dataset/3fa98747-316d-4782-aab6-a7a42b8b00b8/resource/63c7c0ab-691c-4c75-929d-ae521bd67926/download/precipitacion.csv")


	csv_delim("https://data.cityofnewyork.us/api/views/kku6-nxdu/rows.csv?accessType=DOWNLOAD")

	csv_delim("https://chronicdata.cdc.gov/views/g4ie-h725/rows.csv?accessType=DOWNLOAD")

	# Ideally we'd like to eliminate this function altogether and port
	# it into a package that can be called once this function
	# is cleaned and refactored
	csv_delim <- function(file, guess_max = 1000, threshold_rows = 0.9,
	delim = c(',', '\t', ';', ' ', ':')) {

	data <-
	tryCatch(
	readr::read_lines(file, n_max = guess_max),
	error = function(e) NA_character_
	)

	data <- strsplit(data, "\n")

	filtered_data <- data[!vapply(data, function(x) is.null(x) \| length(x) == 0, logical(1))]

	# Get the number of rows read after deleting the empty rows
	# in filtered_data because otherwise the threshold is calculated
	# wrongly
	rows_read <- min(length(filtered_data), guess_max)

	res <- lapply(filtered_data, function(x) table(strsplit(x, "")))
	table_names <- lapply(res, names)

	all_chars <- unlist(table_names)

	all_chars <- all_chars[all_chars %in% delim]

	prop_repetition <- table(all_chars) / rows_read
	if (one_true(prop_repetition == 1) %in% c('one true', '> one true')) {
	repeated_names <- names(which(prop_repetition == 1))
	} else if (any(prop_repetition > threshold_rows)) {
	repeated_names <- names(which(prop_repetition > threshold_rows))
	} else {
	# Because no character was matched at or over the threshold of rows
	return (NA_character_)
	}

	unique_repetitions <- lapply(repeated_names, function(delimiter) {
	unique_vals <- unique(unlist(lapply(res, `[`, delimiter)))
	unique_vals[!is.na(unique_vals)]
	})

	unique_repetitions <- setNames(unique_repetitions, repeated_names)

	if (length(unique_repetitions) == 0) return(NA_character_)

	same_count_delimiter <-
	sapply(unique_repetitions, function(x) length(unique(x)) == 1)

	matched_delimiters <- one_true(same_count_delimiter)

	if (matched_delimiters == "one true") {

	unique_delimiter <- names(same_count_delimiter[which(same_count_delimiter)])

	return(unique_delimiter)

	} else if (matched_delimiters == '> one true') {
	# If there were two delimiters that have a single number
	# repeated in all rows and are at 90% of the rows or more
	# filter whether these two are in the prefered delimiters and
	# pick in the order of preference in the preferred delimiters

	conflicting_delims <- names(same_count_delimiter)[which(same_count_delimiter)]
	chosen_delimiter <- pick_preference(conflicting_delims, delim)

	return(chosen_delimiter)
	} else {
	undecided_delims <- names(same_count_delimiter)
	chosen_delimiter <- pick_preference(undecided_delims, delim)

	if (!is.na(chosen_delimiter)) return(chosen_delimiter)
	}

	NA_character_ # no format was found
	}


	# internal fun
	one_true <- function(x) {
	table_trues <- table(x)

	# If there's only ONE true, return 'one true',
	# if more than one true, return '> one true'
	# else 'no true'
	if (any(as.logical(names(table_trues)))) {
	if (table_trues['TRUE'] == 1) {
	return("one true")
	} else {
	return("> one true")
	}
	}

	"no true"
	}

	pick_preference <- function(match, pool_matches) {
	available_delims <- match %in% pool_matches

	if (any(available_delims)) {
	# We turn to factor in order to sort according to the allowed formats.
	# This way when we subset we keep the order of preference of files.
	sorted_formats <- sort(factor(pool_matches, levels = pool_matches))
	chosen_delimiter <- sorted_formats[which(sorted_formats %in% match)[1]]
	chosen_delimiter <- as.character(chosen_delimiter)
	return(chosen_delimiter)
	}

	NA_character_
	}