Skip to content

Instantly share code, notes, and snippets.

@mp4096
Created October 5, 2016 09:04
Show Gist options
  • Select an option

  • Save mp4096/f23997f88eadd2ec2bcb8eff11ba2dd6 to your computer and use it in GitHub Desktop.

Select an option

Save mp4096/f23997f88eadd2ec2bcb8eff11ba2dd6 to your computer and use it in GitHub Desktop.
Convert Latin 1 encoding to UTF-8
from __future__ import print_function

import codecs
import errno
import fnmatch
import io
import os
def delete_if_exists(filename):
    """Remove *filename*, silently doing nothing if it does not exist.

    The removal is attempted directly instead of being preceded by an
    ``os.path.exists`` check, so there is no window in which the file can
    disappear between the check and the removal.

    Args:
        filename: Path of the file to remove.

    Raises:
        OSError: If the removal fails for any reason other than the path
            being absent (e.g. permission denied, path is a directory).
    """
    try:
        os.remove(filename)
    except OSError as exc:
        # errno is compared (rather than catching FileNotFoundError) so the
        # function keeps working on Python 2, which the file still supports.
        if exc.errno != errno.ENOENT:
            raise
def find_files(root_folder, filter_pattern):
    """Yield paths of files under *root_folder* whose names match a pattern.

    The tree is walked recursively with ``os.walk``; only the base name of
    each file is compared against *filter_pattern*.

    Args:
        root_folder: Directory at which the recursive walk starts.
        filter_pattern: Shell-style wildcard pattern (``fnmatch`` syntax),
            e.g. ``"*.txt"``.

    Yields:
        The path of every matching file, built by joining the directory
        reported by ``os.walk`` with the file's base name.
    """
    for root, _dirs, files in os.walk(root_folder):
        # fnmatch.filter applies the same matching rules as fnmatch.fnmatch
        # to the whole list of names in one call.
        for basename in fnmatch.filter(files, filter_pattern):
            yield os.path.join(root, basename)
def normalize_single_file(filename):
    """Re-encode one Latin-1 text file as UTF-8, keeping a ``.bak`` backup.

    The original file is renamed to the same path with its extension
    replaced by ``.bak`` (an existing backup of that name is deleted
    first). The backup is then read as Latin-1 and written back to
    *filename* as UTF-8, with every line terminated by the module-level
    ``preferred_linesep``. A final line that lacked a terminator gains one.

    Args:
        filename: Path of the file to convert in place.

    Raises:
        ValueError: If *filename* would be its own backup (it already has
            a ``.bak`` extension); proceeding would delete the only copy.
        OSError: If deleting the old backup, renaming, or opening either
            file fails.
    """
    filename_bak = os.path.splitext(filename)[0] + ".bak"
    if filename_bak == filename:
        # Without this guard the source itself would be removed by
        # delete_if_exists before the rename could run.
        raise ValueError(
            "Refusing to convert '{0}': it is its own backup path".format(
                filename))
    delete_if_exists(filename_bak)
    os.rename(filename, filename_bak)
    # io.open with newline=None gives universal-newline reading: lines are
    # split only on "\n", "\r" and "\r\n", each reported as "\n". The codecs
    # reader splits on every Unicode line boundary (including U+0085, which
    # is byte 0x85 in Latin-1) and keeps the original terminator attached.
    # The output stays on codecs.open so no platform newline translation is
    # applied to preferred_linesep.
    with io.open(filename_bak, "r", encoding="latin_1", newline=None) as f_in, \
            codecs.open(filename, "w", encoding="utf-8") as f_out:
        for line in f_in:
            # Drop the terminator that came with the line; writing it and
            # preferred_linesep would double every line break.
            f_out.write(line.rstrip("\n"))
            f_out.write(preferred_linesep)
def normalize_in_all_subfolders(root_folder, filter_pattern):
    """Convert every matching file under *root_folder*, reporting each one.

    For each path produced by ``find_files(root_folder, filter_pattern)``
    a progress line is printed and the file is handed to
    ``normalize_single_file``.

    Args:
        root_folder: Directory passed to ``find_files`` as the search root.
        filter_pattern: File-name pattern passed to ``find_files``.
    """
    for filename in find_files(root_folder, filter_pattern):
        print("Processing '{:s}'...".format(filename))
        normalize_single_file(filename)
# Line terminator written after every line by normalize_single_file.
# It must be assigned before the conversion below starts, because that
# function reads this module-level name at call time.
preferred_linesep = "\n"

# Script entry: convert all "*.txt" files found under the current directory.
normalize_in_all_subfolders(root_folder=r".", filter_pattern="*.txt")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment