gorkang · April 19, 2022 10:22
diff --git a/detect_file_encoding b/detect_file_encoding
 #' Detect candidate encodings for a file and read it with the best one
 #'
 #' Runs `stringi::stri_enc_detect()` on the bytes of `file_path`, tries to
 #' read the whole file with every candidate encoding, and finally parses the
 #' file as CSV (skipping its first 12 lines) with the selected encoding.
 #'
 #' The selected encoding is the first candidate, in detection order, that
 #' could be read and whose decoded text does not contain the character
 #' U+0086. If every readable candidate contains that character, the first
 #' readable candidate is used.
 #'
 #' @param file_path Path to the file to inspect.
 #' @return A list with two elements: `OUT_clean`, a tibble with one row per
 #'   candidate encoding that could be read (its `content_file` list-column
 #'   holds the decoded text, or `"BAD_CHARS detected"`), joined with the
 #'   columns returned by the detector; and `DF_proper_encoding`, the tibble
 #'   returned by `readr::read_csv()` using the selected encoding.
 detect_file_encoding <- function(file_path) {

   # MUCH SIMPLER, and should work most of the time
   # file_path = "..."
   # ENC = readr::guess_encoding(file_path, n_max = 1000)
   # readr::read_csv(file_path, skip = 12, locale = locale(encoding = ENC$encoding[1]))

   # Kept so that calling this function still attaches the same packages
   # (and `%>%`) as before; calls below are namespace-qualified regardless.
   library(cli)
   library(dplyr)
   library(purrr)
   library(readr)
   library(stringi)

   bad_chars_marker <- "BAD_CHARS detected"

   # Detect on the raw bytes: the file's encoding is not known yet, so it
   # should not be decoded (or labelled as UTF-8) before detection.
   file_raw <- readr::read_file_raw(file_path)
   encodings_found <- stringi::stri_enc_detect(file_raw)

   candidates <- encodings_found[[1]]$Encoding
   candidates <- candidates[!is.na(candidates)]
   if (length(candidates) == 0) {
     stop("No candidate encoding detected for: ", file_path, call. = FALSE)
   }

   # Read the whole file with one encoding; flag decodings containing U+0086.
   try_all_encodings <- function(file_path, encoding) {
     content <- readr::read_file(
       file_path,
       locale = readr::locale(encoding = encoding)
     )
     has_bad_chars <- grepl("\u0086", content)

     dplyr::tibble(
       encoding = encoding,
       content_file = list(if (has_bad_chars) bad_chars_marker else content)
     )
   }

   # Safe version: an encoding that cannot be read yields a NULL result
   # instead of aborting the whole loop.
   try_all_encodings_safely <- purrr::safely(try_all_encodings)

   # Iterate over the candidates themselves rather than 1:length(...)
   attempts <- purrr::map(
     candidates,
     function(encoding) try_all_encodings_safely(file_path, encoding)
   )

   # Keep only the successful reads (failed attempts contribute NULL)
   readable <- attempts %>%
     purrr::map(function(attempt) attempt$result) %>%
     dplyr::bind_rows()

   if (nrow(readable) == 0) {
     stop(
       "None of the detected encodings could be used to read: ", file_path,
       call. = FALSE
     )
   }

   # Nested clean tibble with all the working encodings and contents
   OUT_clean <- dplyr::left_join(
     readable,
     dplyr::as_tibble(encodings_found[[1]]),
     by = c("encoding" = "Encoding")
   )

   # Choose the encoding once so that the data frame and the message agree.
   is_clean <- purrr::map_lgl(
     OUT_clean$content_file,
     function(content) !identical(content, bad_chars_marker)
   )
   best_encoding <- if (any(is_clean)) {
     OUT_clean$encoding[is_clean][1]
   } else {
     OUT_clean$encoding[1]
   }

   # Read file with the most likely working encoding
   DF_proper_encoding <- suppressMessages(
     readr::read_csv(
       file_path,
       skip = 12,
       locale = readr::locale(encoding = best_encoding),
       show_col_types = FALSE,
       name_repair = "unique"
     )
   )

   # Output list
   OUT_final <- list(
     OUT_clean = OUT_clean,
     DF_proper_encoding = DF_proper_encoding
   )

   # Output message
   cli::cli_alert_info("Found {nrow(OUT_clean)} potential encodings: {paste(OUT_clean$encoding)} \n - DF_proper_encoding stored using {best_encoding}")

   return(OUT_final)
 }
	#' Detect candidate encodings for a file and read it with the best one
	#'
	#' Runs `stringi::stri_enc_detect()` on the bytes of `file_path`, tries to
	#' read the whole file with every candidate encoding, and finally parses the
	#' file as CSV (skipping its first 12 lines) with the selected encoding.
	#'
	#' The selected encoding is the first candidate, in detection order, that
	#' could be read and whose decoded text does not contain the character
	#' U+0086. If every readable candidate contains that character, the first
	#' readable candidate is used.
	#'
	#' @param file_path Path to the file to inspect.
	#' @return A list with two elements: `OUT_clean`, a tibble with one row per
	#'   candidate encoding that could be read (its `content_file` list-column
	#'   holds the decoded text, or `"BAD_CHARS detected"`), joined with the
	#'   columns returned by the detector; and `DF_proper_encoding`, the tibble
	#'   returned by `readr::read_csv()` using the selected encoding.
	detect_file_encoding <- function(file_path) {

	  # MUCH SIMPLER, and should work most of the time
	  # file_path = "..."
	  # ENC = readr::guess_encoding(file_path, n_max = 1000)
	  # readr::read_csv(file_path, skip = 12, locale = locale(encoding = ENC$encoding[1]))

	  # Kept so that calling this function still attaches the same packages
	  # (and `%>%`) as before; calls below are namespace-qualified regardless.
	  library(cli)
	  library(dplyr)
	  library(purrr)
	  library(readr)
	  library(stringi)

	  bad_chars_marker <- "BAD_CHARS detected"

	  # Detect on the raw bytes: the file's encoding is not known yet, so it
	  # should not be decoded (or labelled as UTF-8) before detection.
	  file_raw <- readr::read_file_raw(file_path)
	  encodings_found <- stringi::stri_enc_detect(file_raw)

	  candidates <- encodings_found[[1]]$Encoding
	  candidates <- candidates[!is.na(candidates)]
	  if (length(candidates) == 0) {
	    stop("No candidate encoding detected for: ", file_path, call. = FALSE)
	  }

	  # Read the whole file with one encoding; flag decodings containing U+0086.
	  try_all_encodings <- function(file_path, encoding) {
	    content <- readr::read_file(
	      file_path,
	      locale = readr::locale(encoding = encoding)
	    )
	    has_bad_chars <- grepl("\u0086", content)

	    dplyr::tibble(
	      encoding = encoding,
	      content_file = list(if (has_bad_chars) bad_chars_marker else content)
	    )
	  }

	  # Safe version: an encoding that cannot be read yields a NULL result
	  # instead of aborting the whole loop.
	  try_all_encodings_safely <- purrr::safely(try_all_encodings)

	  # Iterate over the candidates themselves rather than 1:length(...)
	  attempts <- purrr::map(
	    candidates,
	    function(encoding) try_all_encodings_safely(file_path, encoding)
	  )

	  # Keep only the successful reads (failed attempts contribute NULL)
	  readable <- attempts %>%
	    purrr::map(function(attempt) attempt$result) %>%
	    dplyr::bind_rows()

	  if (nrow(readable) == 0) {
	    stop(
	      "None of the detected encodings could be used to read: ", file_path,
	      call. = FALSE
	    )
	  }

	  # Nested clean tibble with all the working encodings and contents
	  OUT_clean <- dplyr::left_join(
	    readable,
	    dplyr::as_tibble(encodings_found[[1]]),
	    by = c("encoding" = "Encoding")
	  )

	  # Choose the encoding once so that the data frame and the message agree.
	  is_clean <- purrr::map_lgl(
	    OUT_clean$content_file,
	    function(content) !identical(content, bad_chars_marker)
	  )
	  best_encoding <- if (any(is_clean)) {
	    OUT_clean$encoding[is_clean][1]
	  } else {
	    OUT_clean$encoding[1]
	  }

	  # Read file with the most likely working encoding
	  DF_proper_encoding <- suppressMessages(
	    readr::read_csv(
	      file_path,
	      skip = 12,
	      locale = readr::locale(encoding = best_encoding),
	      show_col_types = FALSE,
	      name_repair = "unique"
	    )
	  )

	  # Output list
	  OUT_final <- list(
	    OUT_clean = OUT_clean,
	    DF_proper_encoding = DF_proper_encoding
	  )

	  # Output message
	  cli::cli_alert_info("Found {nrow(OUT_clean)} potential encodings: {paste(OUT_clean$encoding)} \n - DF_proper_encoding stored using {best_encoding}")

	  return(OUT_final)
	}