Last active
April 19, 2022 10:22
-
-
Save gorkang/0a628c363a41b80f19b7313e035bb74d to your computer and use it in GitHub Desktop.
detect_file_encoding(): a function to find the most likely encoding of a CSV file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Find the most likely encoding of a csv file and read the file with it.
#
# Arguments:
#   file_path  Path to the csv file.
#   skip       Number of lines to skip before the header row when reading the
#              csv (forwarded to readr::read_csv()). Defaults to 12, the value
#              that was previously hard-coded.
#
# Returns a list with two elements:
#   OUT_clean           tibble with one row per candidate encoding that could
#                       be read: `encoding`, `content_file` (the text that was
#                       read, or "BAD_CHARS detected" when that text contains
#                       "\u0086"), plus the remaining columns reported by
#                       stringi::stri_enc_detect() for that encoding.
#   DF_proper_encoding  the csv read with the selected encoding.
#
# Stops with an error when none of the detected encodings can read the file.
detect_file_encoding <- function(file_path, skip = 12) {

  # MUCH SIMPLER, and should work most of the time
  # file_path <- "..."
  # ENC <- readr::guess_encoding(file_path, n_max = 1000)
  # readr::read_csv(file_path, skip = 12,
  #                 locale = readr::locale(encoding = ENC$encoding[1]))

  # Packages are used through `pkg::fun()` so that calling this function does
  # not attach anything to the caller's search path.

  # Read file in UTF-8 and detect encodings present
  file_raw <- readr::read_file(
    file_path,
    locale = readr::locale(encoding = "UTF-8")
  )
  encodings_found <- dplyr::as_tibble(stringi::stri_enc_detect(file_raw)[[1]])

  # Candidate encodings, most likely first; drop missing guesses
  candidates <- encodings_found$Encoding
  candidates <- candidates[!is.na(candidates)]

  # Read the file with one encoding and flag contents with bad characters
  try_one_encoding <- function(ENCODING) {
    FILE <- readr::read_file(
      file_path,
      locale = readr::locale(encoding = ENCODING)
    )
    HAS_BAD_CHARS <- grepl("\u0086", FILE)
    content <- if (HAS_BAD_CHARS) "BAD_CHARS detected" else FILE
    dplyr::tibble(encoding = ENCODING, content_file = list(content))
  }

  # Safe version: an encoding that cannot be used yields a NULL result
  # instead of aborting the whole detection
  try_one_encoding_safely <- purrr::safely(try_one_encoding)

  # Try every candidate and keep only the ones that could be read
  attempts <- purrr::map(candidates, try_one_encoding_safely)
  results <- purrr::compact(purrr::map(attempts, "result"))

  if (length(results) == 0) {
    stop(
      "None of the detected encodings could be used to read '", file_path, "'",
      call. = FALSE
    )
  }

  # Nested clean tibble with all the working encodings and contents
  OUT_clean <- dplyr::left_join(
    dplyr::bind_rows(results),
    encodings_found,
    by = c("encoding" = "Encoding")
  )

  # Select the most likely encoding that was read without bad characters.
  # If every readable encoding produced bad characters, fall back to the most
  # likely readable one.
  is_bad <- purrr::map_lgl(
    OUT_clean$content_file,
    function(x) identical(x, "BAD_CHARS detected")
  )
  encoding_used <- if (all(is_bad)) {
    OUT_clean$encoding[1]
  } else {
    OUT_clean$encoding[!is_bad][1]
  }

  # Read file with the selected encoding
  DF_proper_encoding <- suppressMessages(
    readr::read_csv(
      file_path,
      skip = skip,
      locale = readr::locale(encoding = encoding_used),
      show_col_types = FALSE,
      name_repair = "unique"
    )
  )

  # Output list
  OUT_final <- list(
    OUT_clean = OUT_clean,
    DF_proper_encoding = DF_proper_encoding
  )

  # Output message: report the encoding that was actually used
  cli::cli_alert_info("Found {nrow(OUT_clean)} potential encodings: {paste(OUT_clean$encoding)} \n - DF_proper_encoding stored using {encoding_used}")

  OUT_final
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment