Created
May 3, 2016 13:53
-
-
Save reagle/eab6412de8fe8e76415d143022c85c64 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
# -*- coding: utf-8 -*- | |
# detox filenames so as to work across most file systems
# (c) Copyright 2016 by Joseph Reagle | |
# Licensed under the GPLv3, see <http://www.gnu.org/licenses/gpl-3.0.html> | |
# | |
import codecs | |
import glob | |
import locale | |
import logging | |
import os | |
from os import chdir, environ, mkdir, rename, walk | |
from os.path import abspath, basename, exists, isdir, isfile, splitext | |
import re | |
import sys | |
import unicodedata | |
# Current user's home directory.  Fall back to expanduser() so that importing
# this module does not raise KeyError where HOME is unset (e.g. on Windows).
HOME = environ.get('HOME') or os.path.expanduser('~')

# Short aliases for the root-logger convenience functions.
critical = logging.critical
info = logging.info
dbg = logging.debug
# logging.warn was a long-deprecated alias that Python 3.13 removed;
# logging.warning is the supported spelling.
warn = logging.warning
error = logging.error
excpt = logging.exception
def remove_accents(s):
    """Return *s* with accents stripped, leaving the plain base characters.

    The text is decomposed (NFKD), every combining mark is dropped, and the
    remainder is recomposed (NFC).  The recomposition step matters: some
    characters decompose into parts that are not combining marks (Hangul
    syllables, for instance), and without it they would come back in
    decomposed form and cause a pointless rename.
    """
    decomposed = unicodedata.normalize('NFKD', s)
    stripped = ''.join(c for c in decomposed if not unicodedata.combining(c))
    return unicodedata.normalize('NFC', stripped)
# Matches every character that is NOT on the portable whitelist: word
# characters plus  - . space ^ & ' @ { } [ ] , $ = ! # ( ) % + ~
# '!' and '#' are listed individually; writing them as "!-#" would form a
# range that also lets the double quote through.
_NOT_PERMITTED_RE = re.compile(r"[^\w\-. ^&'@{}\[\],$=!#()%+~]")


def remove_reserved(s):
    """Return *s* keeping only characters liked across most file systems.

    Typographic apostrophes become plain ones, question marks are removed
    outright, and every other character outside the whitelist is replaced
    with an underscore.
    """
    # http://serverfault.com/questions/124611/special-characters-in-samba-filenames
    # https://amigotechnotes.wordpress.com/2015/04/02/invalid-characters-in-file-names/
    s = s.replace('\u2019', "'")  # I like apostrophes, but no smart ones
    s = s.replace('?', '')  # makes more sense just to remove '?'
    return _NOT_PERMITTED_RE.sub('_', s)
def detoxify_fn(path, fn, no_rename=None):
    """Rename file *fn* inside directory *path* to its detoxified name.

    The new name is obtained by passing *fn* through remove_accents() and
    remove_reserved().  Nothing happens when the name is already clean.

    path      -- directory containing the file
    fn        -- bare file name (no directory part)
    no_rename -- True for a dry run that only reports what would be done.
                 When omitted, the ``no_rename`` attribute of the
                 module-level ``args`` namespace (set in script mode) is
                 used if it exists; otherwise renaming is enabled.

    Failures are reported on stdout and never propagate, so one bad file
    does not abort a directory walk.
    """
    if no_rename is None:
        # Script mode stores the parsed command line in the global `args`.
        no_rename = getattr(globals().get('args'), 'no_rename', False)

    new_fn = remove_reserved(remove_accents(fn))
    fn = os.path.join(path, fn)
    new_fn = os.path.join(path, new_fn)
    if new_fn == fn:
        info('no need to rename %s' % fn)
        return

    try:
        # os.rename() silently replaces an existing target on POSIX, so
        # refuse to clobber a different file.  samefile() lets through the
        # case where both names resolve to the same entry, which can happen
        # on file systems that treat differently normalized names as equal.
        if os.path.lexists(new_fn) and not os.path.samefile(fn, new_fn):
            print("not detoxing '%s': '%s' already exists" % (fn, new_fn))
            return
        print("detoxing '%s' to '%s'" % (fn, new_fn))
        if not no_rename:
            os.rename(fn, new_fn)
    except (OSError, UnicodeError) as e:
        # OSError: the file system refused; UnicodeError: the name could not
        # be printed on this terminal.  Report and carry on.
        print(e)
def recurse(directory):
    """Walk *directory* and hand every file found to detoxify_fn().

    Only the entries reported as files by the walk are processed; each one
    is logged at INFO level first.
    """
    for parent, _subdirs, filenames in walk(directory):
        for name in filenames:
            info("fn = %s" % name)
            detoxify_fn(parent, name)
if '__main__' == __name__:
    # Command-line entry point: detoxify one file, or every file beneath
    # one directory.
    import argparse  # http://docs.python.org/dev/library/argparse.html

    arg_parser = argparse.ArgumentParser(
        description='detoxify filenames so as to work over most file systems')

    # positional arguments
    arg_parser.add_argument('files', nargs='+', metavar='FILE')

    # optional arguments
    arg_parser.add_argument("-n", "--no-rename",
                            action="store_true", default=False,
                            help="no renaming; perform dry run")
    arg_parser.add_argument('-L', '--log-to-file',
                            action="store_true", default=False,
                            help="log to file %(prog)s.log")
    arg_parser.add_argument('-V', '--verbose', action='count', default=0,
                            help="Increase verbosity (specify multiple times for more)")
    arg_parser.add_argument('--version', action='version', version='TBD')

    # `args` stays a module-level name: detoxify_fn() reads it for --no-rename.
    args = arg_parser.parse_args()

    log_level = 100  # default: above CRITICAL, so nothing is logged
    if args.verbose == 1:
        log_level = logging.CRITICAL  # 50
    elif args.verbose == 2:
        log_level = logging.INFO  # 20
    elif args.verbose >= 3:
        log_level = logging.DEBUG  # 10

    LOG_FORMAT = "%(levelno)s %(funcName).5s: %(message)s"
    if args.log_to_file:
        # Name the log after the program, as the --log-to-file help promises.
        logging.basicConfig(filename='%s.log' % arg_parser.prog, filemode='w',
                            level=log_level, format=LOG_FORMAT)
    else:
        logging.basicConfig(level=log_level, format=LOG_FORMAT)

    # Error exits go through sys.exit(message): the text is written to
    # stderr and the process ends with a non-zero status.
    if len(args.files) > 1:
        sys.exit("Sorry, I only take a single argument: "
                 "a directory to recurse on or filename.")

    fn = args.files[0]
    if isfile(fn):
        # detoxify_fn() wants the directory and the bare name separately.
        directory, name = os.path.split(fn)
        detoxify_fn(directory, name)
    elif isdir(fn):
        recurse(fn)
    else:
        sys.exit("Sorry, argument is unknown file type")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment