#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Normalize unicode file names."""
|
|
|
from __future__ import unicode_literals

from argparse import ArgumentParser
from os import rename, walk
from os.path import exists, isfile, join, split
from sys import version_info
from unicodedata import normalize
|
|
|
|
|
def bytes_saved(old, new):
    """Return a colored tag giving the size change, in bytes, from old to new.

    Sizes are measured on the UTF-8 encoding of each name, so the figure
    matches the "byte(s)" unit shown in the tag rather than the number of
    code points.

    Args:
        old: the original name (text).
        new: the normalized name (text).

    Returns:
        A bracketed string wrapped in ANSI color codes: green with the
        negative difference when ``new`` is smaller, red with a leading
        ``+`` when it is larger, and a blue ``=`` when the sizes are equal.
        The unit is pluralized when the absolute difference exceeds one.
    """
    # "replace" keeps code points that UTF-8 cannot encode (e.g. lone
    # surrogates) from raising; each one is then counted as a single byte.
    diff = len(new.encode("utf-8", "replace")) - len(old.encode("utf-8", "replace"))
    if diff < 0:
        label = "32m" + str(diff)
    elif diff > 0:
        label = "31m+" + str(diff)
    else:
        label = "34m="
    unit = " bytes" if abs(diff) > 1 else " byte"
    return "[\033[" + label + unit + "\033[0m]"
|
|
|
|
|
def norm(root, file, form, proceed):
    """Normalize the name of one directory entry and optionally rename it.

    Prints the planned rename (or the reason it is impossible) and, when
    ``proceed`` is true, performs it. Nothing is printed or renamed when the
    name is already in its normalized form.

    Args:
        root: directory that contains the entry.
        file: current name of the entry inside ``root``.
        form: normalization form handed to ``unicodedata.normalize``.
        proceed: rename for real when true; only report when false.
    """
    # Compatibility normalization folds the fullwidth solidus, reverse
    # solidus and colon to their ASCII forms; an ASCII slash inside a name is
    # then read as a path separator. Map the ASCII forms back to fullwidth.
    # The targets are spelled as escapes so that re-encoding or normalizing
    # this source file cannot silently turn them into their ASCII look-alikes.
    normed = (
        normalize(form, file)
        .replace("/", "\uff0f")
        .replace("\\", "\uff3c")
        .replace(":", "\uff1a")
    )
    if file == normed:
        return

    old = join(root, file)
    new = join(root, normed)
    if exists(new):
        # Never overwrite an entry that already carries the target name.
        print("%s \033[31mcannot be renamed as\033[0m %s \033[31malready exists\033[0m" % (old, normed))
        return

    print("%s ▶︎ %s %s" % (old, normed, bytes_saved(file, normed)))
    if proceed:
        rename(old, new)
|
|
|
|
|
def main():
    """Parse the command line and normalize the selected file names.

    The positional ``source`` may be a single file or a directory. Names are
    normalized to NFC, or to NFKC with ``--compatibility``. Without
    ``--proceed`` the planned renames are only printed. Without
    ``--recursive`` only the entries directly inside a source directory are
    handled.
    """
    parser = ArgumentParser(description="Normalize unicode file names.")
    parser.add_argument("source", help="the source file or directory")
    parser.add_argument(
        "-c",
        "--compatibility",
        action="store_true",
        # U+FB01 (the "fi" ligature) is written as an escape so the example
        # keeps showing a compatibility character even if this file is
        # itself normalized.
        help='normalize with compatibility (ex: "\ufb01" becomes "fi")',
    )
    parser.add_argument("-p", "--proceed", action="store_true", help="rename files")
    parser.add_argument(
        "-r",
        "--recursive",
        action="store_true",
        help="go through directories recursively",
    )
    args = parser.parse_args()

    if version_info < (3,):
        # Python 2 provides the argument as bytes; decode it to text.
        args.source = unicode(args.source, "utf8")  # noqa: F821

    norm_form = "NFKC" if args.compatibility else "NFC"

    # Source is a file
    if isfile(args.source):
        head, tail = split(args.source)
        norm(head, tail, norm_form, args.proceed)
    # Source is a directory
    else:
        # When recursing, walk bottom-up: a directory is renamed only after
        # everything below it has been visited. A top-down walk would rename
        # the directory first and then fail to descend into it under its old
        # name, silently skipping its contents.
        for root, dirs, files in walk(args.source, topdown=not args.recursive):
            for d in dirs:
                norm(root, d, norm_form, args.proceed)
            for f in files:
                norm(root, f, norm_form, args.proceed)
            if not args.recursive:
                # Top-down order yields the source directory first; stop there.
                break
|
|
|
|
|
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
I have found out why. There was a fullwidth slash (/) in the original filename. It was converted to a normal slash (/) by nfcfn.py, and the new slash was interpreted as a path separator. I globally replaced those characters with a division slash (∕), and the script then completed successfully. I think they had better be handled by the script itself.
There is another problematic character, the fullwidth backslash (\), which should be replaced with a small reverse slash (﹨) as well.
These fullwidth characters are very common in Asian languages and file names.