Last active
May 24, 2020 18:35
-
-
Save pawelszydlo/936d9f2cf15d04ab80a0705d1a1bef93 to your computer and use it in GitHub Desktop.
Detect text file encoding and convert to UTF-8
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python3
# Script to auto-detect file encoding and convert it to UTF-8.
# Uses cchardet which provides very good detection.
#
# Newest version can always be found at:
# https://gist.github.com/pawelszydlo/936d9f2cf15d04ab80a0705d1a1bef93
import cchardet | |
import os | |
import sys | |
def convert_encoding(file_name, new_encoding):
    """Convert the contents of file_name to new_encoding, in place.

    The current encoding is auto-detected with cchardet. Progress and every
    reason for skipping a file are printed to stdout. A file is skipped
    (left untouched) when it is not a regular file, is larger than 1MB, has
    no detectable encoding, is already in new_encoding, or was detected as
    an encoding Python has no codec for.

    Args:
        file_name: Path of the file to convert; it is overwritten in place.
        new_encoding: Name of the target codec, e.g. 'UTF-8'.

    Returns:
        None.

    Raises:
        LookupError: If new_encoding is not a codec known to Python.
        UnicodeEncodeError: If the decoded text cannot be represented in
            new_encoding.
    """
    if not os.path.isfile(file_name):
        print('"%s" is not a file.' % file_name)
        return

    # Skip files larger than 1MB.
    if os.path.getsize(file_name) > 1024 * 1024:
        print('"%s" is too big.' % file_name)
        return

    # Use a context manager so the handle is closed even if reading fails.
    with open(file_name, 'rb') as source:
        data = source.read()

    encoding = cchardet.detect(data)['encoding']
    if not encoding:
        print('Couldn\'t detect encoding for "%s".' % file_name)
        return

    print('Detected "%s" for file "%s" ...' % (encoding, file_name))
    if new_encoding.upper() == encoding.upper():
        print('... not doing anything.')
        return

    print('... converting to "%s".' % new_encoding)
    try:
        # NOTE: errors='replace' substitutes U+FFFD for undecodable bytes, so
        # a wrong detection silently loses data once the file is overwritten.
        text = data.decode(encoding, errors='replace')
    except LookupError:
        # The detector may name a charset Python ships no codec for; skip the
        # file instead of aborting the whole run.
        print('... skipped: Python has no codec named "%s".' % encoding)
        return

    # Encode before opening for writing: if encoding fails, the original file
    # has not been truncated yet.
    converted = text.encode(new_encoding)
    with open(file_name, 'wb') as target:
        target.write(converted)
if __name__ == '__main__':
    # Command-line entry point: every argument is treated as a file name and
    # converted to UTF-8 in place.
    if len(sys.argv) < 2:
        print('Pass filenames as parameters.')
        # Exit non-zero so shells and calling scripts can tell that the
        # invocation was a usage error rather than a successful run.
        sys.exit(1)
    for file_name in sys.argv[1:]:
        convert_encoding(file_name, 'UTF-8')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment