Skip to content

Instantly share code, notes, and snippets.

@gorkang
Last active April 19, 2022 10:22
Show Gist options
  • Save gorkang/0a628c363a41b80f19b7313e035bb74d to your computer and use it in GitHub Desktop.
Save gorkang/0a628c363a41b80f19b7313e035bb74d to your computer and use it in GitHub Desktop.
detect_file_encoding() function to find most likely encoding of a csv file
#' Find the most likely encoding of a CSV file
#'
#' Detects candidate encodings with `stringi::stri_enc_detect()`, tries to read
#' the whole file with each of them, and parses the file as CSV using the
#' highest-ranked candidate that could be read without producing the
#' U+0086 character (used here as the marker of a wrongly decoded file).
#'
#' Packages are called through `pkg::fun()`; nothing is attached to the
#' caller's search path.
#'
#' @param file_path Path of the file to inspect.
#' @param skip Number of lines to skip before the CSV header when reading
#'   `DF_proper_encoding`. Defaults to 12, the value this function has always
#'   used.
#' @return A list with two elements:
#'   * `OUT_clean`: a tibble with one row per candidate encoding that could be
#'     read, holding `encoding`, `content_file` (a list column with the file
#'     content, or the string `"BAD_CHARS detected"`), plus the remaining
#'     columns reported by `stringi::stri_enc_detect()`.
#'   * `DF_proper_encoding`: the file parsed with `readr::read_csv()` using the
#'     selected encoding.
detect_file_encoding <- function(file_path, skip = 12) {
  # A much simpler alternative that should work most of the time:
  #   ENC <- readr::guess_encoding(file_path, n_max = 1000)
  #   readr::read_csv(file_path, skip = 12,
  #                   locale = readr::locale(encoding = ENC$encoding[1]))

  bad_chars_marker <- "BAD_CHARS detected"

  # Read file in UTF-8 and detect encodings present
  file_raw <- readr::read_file(
    file_path,
    locale = readr::locale(encoding = "UTF-8")
  )
  encodings_found <- stringi::stri_enc_detect(file_raw)[[1]]
  candidates <- encodings_found$Encoding
  candidates <- candidates[!is.na(candidates)]

  # Read the file with one candidate encoding; content containing U+0086 is
  # replaced by a marker so the candidate is kept in the report but flagged.
  try_encoding <- function(encoding) {
    content <- readr::read_file(
      file_path,
      locale = readr::locale(encoding = encoding)
    )
    if (grepl("\u0086", content)) {
      content <- bad_chars_marker
    }
    dplyr::tibble(encoding = encoding, content_file = list(content))
  }

  # Safe version: a candidate that cannot be read yields a NULL result
  # instead of aborting the whole detection.
  try_encoding_safely <- purrr::safely(try_encoding)
  attempts <- purrr::map(candidates, try_encoding_safely)
  readable <- purrr::compact(purrr::map(attempts, "result"))

  if (length(readable) == 0L) {
    stop(
      "None of the detected encodings could be used to read the file.",
      call. = FALSE
    )
  }

  # Nested tibble with every readable encoding, its content and the
  # detection details (rows keep the detector's ranking order).
  OUT_clean <- dplyr::left_join(
    dplyr::bind_rows(readable),
    dplyr::as_tibble(encodings_found),
    by = c("encoding" = "Encoding")
  )

  # Pick the best-ranked encoding that read cleanly; if every readable
  # candidate was flagged, fall back to the best-ranked readable one.
  is_clean <- purrr::map_lgl(
    OUT_clean$content_file,
    function(content) !identical(content, bad_chars_marker)
  )
  if (any(is_clean)) {
    best_encoding <- OUT_clean$encoding[is_clean][1]
  } else {
    best_encoding <- OUT_clean$encoding[1]
  }

  # Read file with the most likely working encoding
  DF_proper_encoding <- suppressMessages(
    readr::read_csv(
      file_path,
      skip = skip,
      locale = readr::locale(encoding = best_encoding),
      show_col_types = FALSE,
      name_repair = "unique"
    )
  )

  # Report the encoding that was actually used to build DF_proper_encoding
  cli::cli_alert_info("Found {nrow(OUT_clean)} potential encodings: {OUT_clean$encoding} \n - DF_proper_encoding stored using {best_encoding}")

  list(
    OUT_clean = OUT_clean,
    DF_proper_encoding = DF_proper_encoding
  )
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment