Skip to content

Instantly share code, notes, and snippets.

@neumond
Last active July 12, 2020 16:05
Show Gist options
  • Select an option

  • Save neumond/46e05bdde3982ef255aa210158415c16 to your computer and use it in GitHub Desktop.

Select an option

Save neumond/46e05bdde3982ef255aa210158415c16 to your computer and use it in GitHub Desktop.
Util to fix names in zip files created under Windows
"""
Util to fix names in zip files created under Windows.
Usage:
First decide whether you really need to fix your archive,
or whether this program is able to fix it correctly
(operation is read-only):
python fix_zip.py archive.zip
If result was positive, apply the fix:
python fix_zip.py archive.zip --fix
Now the resulting archive is written to a separate file, archive.fixed.zip
"""
import re
from contextlib import ExitStack
from itertools import islice
from os.path import exists
from zipfile import ZipFile, ZipInfo
import click
# Single characters treated as "recognizable" in a file name: ASCII letters,
# Russian Cyrillic letters (including ё/Ё), digits, hyphen, space and dot.
# rate_filename() counts matches of this pattern to score a name.
RECOGNIZED_CHARS = re.compile('[-a-zA-Zа-яА-ЯёЁ0-9 .]')
def decode_filename(zf, utf8name=True):
    """Re-decode a member name produced by ``zipfile`` as cp866 text.

    ``zf`` is the name as a string; its original bytes were decoded by
    ``zipfile`` either as UTF-8 or as cp437, see
    https://github.com/python/cpython/blob/master/Lib/zipfile.py#L1300

    With ``utf8name`` true, every character becomes one byte equal to its
    code point, and code points of 256 or more become ``?``. Otherwise the
    string is encoded back to cp437. In both cases the resulting bytes are
    then decoded as cp866.
    """
    if utf8name:
        raw = bytes(
            code if code < 256 else ord('?')
            for code in map(ord, zf)
        )
    else:
        raw = zf.encode('cp437')
    return raw.decode('cp866', errors='replace')
def decode_filename_from_zinfo(zinfo):
    """Return ``zinfo.filename`` re-decoded by :func:`decode_filename`.

    Bit 0x800 of ``zinfo.flag_bits`` decides the ``utf8name`` argument:
    it is true when the bit is set and false otherwise.
    """
    is_utf8 = (zinfo.flag_bits & 0x800) != 0
    return decode_filename(zinfo.filename, is_utf8)
def rate_filename(zf):
    """Return the fraction of ``zf`` covered by RECOGNIZED_CHARS matches.

    The result is the total length of all matches divided by the length
    of the name. An empty name contains nothing recognizable, so it is
    rated 0.0 rather than raising ZeroDivisionError.
    """
    if not zf:
        # Guard the division below against an empty name.
        return 0.0
    recognized = sum(len(match) for match in RECOGNIZED_CHARS.findall(zf))
    return recognized / len(zf)
def decision(orig_name, fixed_name):
    """Judge whether re-decoding improved a single file name.

    Returns 'NO' when both names are equal. Otherwise both names are
    scored with rate_filename():
      * 'YES'   - the fixed name scores above 0.95 and the original
                  below 0.4;
      * 'NO'    - the original scores above 0.95 and the fixed name
                  below 0.4;
      * 'DOUBT' - anything else.
    """
    if orig_name == fixed_name:
        return 'NO'

    before, after = rate_filename(orig_name), rate_filename(fixed_name)
    if before < 0.4 and after > 0.95:
        verdict = 'YES'
    elif after < 0.4 and before > 0.95:
        verdict = 'NO'
    else:
        verdict = 'DOUBT'
    return verdict
def overall_decision(decisions):
    """Combine per-name verdicts into one verdict for the whole archive.

    ``decisions`` is any iterable of 'YES' / 'NO' / 'DOUBT' strings (a
    generator is fine; it is materialized once).

    Returns 'YES' only if every verdict is 'YES', 'NO' if every verdict is
    'NO', and 'DOUBT' for any mixture. An empty iterable (an archive with
    no members) yields 'NO': there is nothing to fix, and without this
    guard ``all()`` over an empty sequence would vacuously report 'YES'.
    """
    decisions = list(decisions)
    if not decisions:
        return 'NO'
    if all(d == 'YES' for d in decisions):
        return 'YES'
    if all(d == 'NO' for d in decisions):
        return 'NO'
    return 'DOUBT'
def analyze(fname):
    """Report whether the archive ``fname`` needs its names fixed.

    The archive is opened for reading only. A verdict is computed over
    every member name, then up to ten "original → re-decoded" name pairs
    are echoed, followed by the verdict.

    Raises:
        click.ClickException: when the verdict is 'NO' or undecided.
    """
    with ZipFile(fname, 'r') as archive:
        members = archive.infolist()

        # The verdict is computed before anything is printed.
        dec = overall_decision([
            decision(member.filename, decode_filename_from_zinfo(member))
            for member in members
        ])

        click.echo('Some of file names:')
        for member in islice(members, 10):
            original = member.filename
            decoded = decode_filename_from_zinfo(member)
            click.echo(' {} → {}'.format(original, decoded))

        if dec == 'YES':
            click.secho('YES, this file DOES need fix', fg='green')
            return

        # Both remaining outcomes end the command with an error.
        if dec == 'NO':
            failure = click.style('NO, this file DOES NOT need fix', fg='red')
        else:
            failure = click.style("Can't decide...", fg='yellow')
        raise click.ClickException(failure)
def make_fixed_zipinfo(zinfo):
    """Create a fresh ZipInfo for ``zinfo`` carrying the re-decoded name.

    The new object gets its ``filename`` from decode_filename_from_zinfo()
    and copies the metadata attributes listed below from ``zinfo``.
    """
    copied_fields = (
        'date_time', 'compress_type', 'comment', 'extra',
        'create_system', 'create_version',
        'extract_version', 'flag_bits', 'volume',
        'internal_attr', 'external_attr',
    )
    fixed = ZipInfo()
    fixed.filename = decode_filename_from_zinfo(zinfo)
    for field in copied_fields:
        value = getattr(zinfo, field)
        setattr(fixed, field, value)
    return fixed
def fix_archive(input_fname, output_fname):
    """Write a copy of ``input_fname`` to ``output_fname`` with fixed names.

    Every member of the input archive is read fully into memory and
    written to the output archive under the ZipInfo returned by
    make_fixed_zipinfo(). The archive-level comment is copied as well.
    """
    with ExitStack() as stack:
        input_zip = stack.enter_context(ZipFile(input_fname, 'r'))
        output_zip = stack.enter_context(ZipFile(output_fname, 'w'))
        for zinfo in input_zip.infolist():
            # Read by ZipInfo object, not by name: a lookup by name resolves
            # duplicate member names to a single entry, so each duplicate
            # would otherwise be written with the wrong data.
            data = input_zip.read(zinfo)
            output_zip.writestr(make_fixed_zipinfo(zinfo), data)
        output_zip.comment = input_zip.comment
def pick_fixed_name(fname):
    """
    Build the output file name by inserting ``.fixed`` before the extension.

    A trailing ``.zip`` (in any letter case) is kept as written; when the
    name has no such suffix, ``.fixed.zip`` is appended.

    >>> pick_fixed_name('/home/user/file.zip')
    '/home/user/file.fixed.zip'
    >>> pick_fixed_name('/home/user/file.ZiP')
    '/home/user/file.fixed.ZiP'
    """
    has_zip_suffix = fname.lower().endswith('.zip')
    if has_zip_suffix:
        stem, ext = fname[:-4], fname[-4:]
    else:
        stem, ext = fname, '.zip'
    return stem + '.fixed' + ext
@click.command()
@click.option('--fix', help='Fix archive', is_flag=True)
@click.argument('zipfile', type=click.Path(exists=True))
def cmd(*, fix, zipfile):
    # Command-line entry point. Without --fix the archive is only analyzed.
    # With --fix a corrected copy is written next to it, under the name
    # chosen by pick_fixed_name().
    # Kept as comments, not a docstring: click would show a docstring as
    # the --help text.
    if not fix:
        analyze(zipfile)
        return

    target = pick_fixed_name(zipfile)
    if exists(target):
        # Refuse to overwrite an existing output file.
        raise click.FileError(target, 'Target file already exists')
    fix_archive(zipfile, target)
# Run the click command only when this file is executed as a script.
if __name__ == '__main__':
    cmd()
# Empty on purpose: `from <this module> import *` exports no names.
__all__ = ()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment