Last active
July 12, 2020 16:05
-
-
Save neumond/46e05bdde3982ef255aa210158415c16 to your computer and use it in GitHub Desktop.
Util to fix names in zip files created under Windows
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| Util to fix names in zip files created under Windows. | |
| Usage: | |
| First decide whether you really need to fix your archive, | |
| or whether this program is able to fix it correctly | |
| (operation is read-only): | |
| python fix_zip.py archive.zip | |
| If result was positive, apply the fix: | |
| python fix_zip.py archive.zip --fix | |
| Now the resulting archive is written to a separate file, archive.fixed.zip | |
| """ | |
| import re | |
| from contextlib import ExitStack | |
| from itertools import islice | |
| from os.path import exists | |
| from zipfile import ZipFile, ZipInfo | |
| import click | |
# Single characters counted as "recognized" when rating a file name: hyphen,
# ASCII letters, Russian Cyrillic letters, digits, space and dot.
# 'ё' and 'Ё' are listed explicitly because their code points lie outside
# the 'а-я' / 'А-Я' ranges.
RECOGNIZED_CHARS = re.compile('[-a-zA-Zа-яА-ЯёЁ0-9 .]')
def decode_filename(zf, utf8name=True):
    """Re-decode a zip member name as CP866 text.

    ``zf`` is the member name as a ``str``; zipfile produced it by decoding
    the stored bytes as UTF-8 or CP437, see
    https://github.com/python/cpython/blob/master/Lib/zipfile.py#L1300

    Args:
        zf: Member name as a string.
        utf8name: When true, every character with a code point below 256
            is turned into the byte of that value and any other character
            into ``b'?'``. When false, the name is encoded back to CP437,
            which raises ``UnicodeEncodeError`` for characters CP437
            cannot represent.

    Returns:
        The resulting bytes decoded as CP866.
    """
    if utf8name:
        # Latin-1 maps code points 0..255 straight to byte values; the
        # 'replace' handler substitutes b'?' for everything else.
        raw = zf.encode('latin-1', errors='replace')
    else:
        raw = zf.encode('cp437')
    return raw.decode('cp866', errors='replace')
def decode_filename_from_zinfo(zinfo):
    """Return the CP866 re-decoding of ``zinfo.filename``.

    Bit 0x800 of ``zinfo.flag_bits`` is passed to ``decode_filename`` as its
    ``utf8name`` argument, selecting how the raw bytes are recovered.
    """
    utf8_flag_set = (zinfo.flag_bits & 0x800) != 0
    return decode_filename(zinfo.filename, utf8_flag_set)
def rate_filename(zf):
    """Return the share of characters in ``zf`` that look like name text.

    Args:
        zf: File name to rate.

    Returns:
        A float between 0.0 and 1.0: the number of characters matching
        ``RECOGNIZED_CHARS`` divided by the length of ``zf``. An empty name
        is rated 0.0 instead of raising ``ZeroDivisionError``.
    """
    if not zf:
        # Nothing to rate; avoid dividing by zero.
        return 0.0
    # The pattern is a single character class, so each match is exactly one
    # character and the match count equals the recognized-character count.
    recognized = len(RECOGNIZED_CHARS.findall(zf))
    return recognized / len(zf)
def decision(orig_name, fixed_name):
    """Judge whether re-decoding improved a single file name.

    Args:
        orig_name: Name as stored in the archive.
        fixed_name: Name after re-decoding.

    Returns:
        'NO' when the names are equal or only the original rates well,
        'YES' when only the re-decoded name rates well, otherwise 'DOUBT'.
        "Rates well" means a ``rate_filename`` score above 0.95 while the
        other name scores below 0.4.
    """
    if orig_name == fixed_name:
        return 'NO'

    orig_score = rate_filename(orig_name)
    fixed_score = rate_filename(fixed_name)

    fix_helps = fixed_score > 0.95 and orig_score < 0.4
    fix_hurts = orig_score > 0.95 and fixed_score < 0.4

    if fix_helps:
        return 'YES'
    if fix_hurts:
        return 'NO'
    return 'DOUBT'
def overall_decision(decisions):
    """Combine per-name verdicts into one verdict for the whole archive.

    Args:
        decisions: Iterable of 'YES' / 'NO' / 'DOUBT' strings; it is
            consumed once.

    Returns:
        'YES' if every verdict is 'YES', 'NO' if every verdict is 'NO' or
        there are no verdicts at all, otherwise 'DOUBT'.
    """
    verdicts = list(decisions)
    if not verdicts:
        # all() over an empty sequence is vacuously true, which would report
        # an archive without entries as needing a fix. Nothing to fix -> 'NO'.
        return 'NO'
    if all(verdict == 'YES' for verdict in verdicts):
        return 'YES'
    if all(verdict == 'NO' for verdict in verdicts):
        return 'NO'
    return 'DOUBT'
def analyze(fname):
    """Report whether the archive ``fname`` looks like it needs fixing.

    Computes a verdict over every entry, prints the stored and re-decoded
    names of up to the first ten entries, then reports the verdict.

    Args:
        fname: Path of the zip archive; it is opened read-only.

    Raises:
        click.ClickException: When the verdict is 'NO' or undecided.
    """
    with ZipFile(fname, 'r') as archive:
        entries = archive.infolist()

        # The verdict is computed before anything is printed.
        verdict = overall_decision(
            decision(entry.filename, decode_filename_from_zinfo(entry))
            for entry in entries
        )

        click.echo('Some of file names:')
        for entry in islice(entries, 10):
            preview = ' {} → {}'.format(
                entry.filename,
                decode_filename_from_zinfo(entry),
            )
            click.echo(preview)

        if verdict == 'NO':
            raise click.ClickException(click.style('NO, this file DOES NOT need fix', fg='red'))
        if verdict != 'YES':
            raise click.ClickException(click.style("Can't decide...", fg='yellow'))
        click.secho('YES, this file DOES need fix', fg='green')
def make_fixed_zipinfo(zinfo):
    """Build a new ``ZipInfo`` for ``zinfo`` with the re-decoded file name.

    Args:
        zinfo: ``ZipInfo`` of an entry in the source archive.

    Returns:
        A fresh ``ZipInfo`` whose ``filename`` is the re-decoded name and
        whose other listed metadata attributes are copied from ``zinfo``.
    """
    copied_attrs = (
        'date_time', 'compress_type', 'comment', 'extra',
        'create_system', 'create_version',
        'extract_version', 'flag_bits', 'volume',
        'internal_attr', 'external_attr',
    )
    fixed = ZipInfo()
    fixed.filename = decode_filename_from_zinfo(zinfo)
    for name in copied_attrs:
        setattr(fixed, name, getattr(zinfo, name))
    return fixed
def fix_archive(input_fname, output_fname):
    """Copy an archive to a new file, re-decoding every member name.

    Args:
        input_fname: Path of the archive to read.
        output_fname: Path of the archive to create; it is opened in 'w'
            mode.

    Each member's data is read from the source and written under the
    metadata produced by ``make_fixed_zipinfo``. The archive-level comment
    is copied as well.
    """
    with ZipFile(input_fname, 'r') as input_zip, \
            ZipFile(output_fname, 'w') as output_zip:
        for zinfo in input_zip.infolist():
            # Read via the ZipInfo object, not the name: a lookup by name
            # resolves duplicate member names to a single entry, so their
            # contents would be copied incorrectly.
            data = input_zip.read(zinfo)
            output_zip.writestr(make_fixed_zipinfo(zinfo), data)
        output_zip.comment = input_zip.comment
def pick_fixed_name(fname):
    """
    Derive the output archive name by inserting ``.fixed`` before the
    ``.zip`` extension (matched case-insensitively, original case kept).

    >>> pick_fixed_name('/home/user/file.zip')
    '/home/user/file.fixed.zip'
    >>> pick_fixed_name('/home/user/file.ZiP')
    '/home/user/file.fixed.ZiP'
    """
    if fname.lower().endswith('.zip'):
        # Split off the last four characters so the extension keeps its case.
        stem, ext = fname[:-4], fname[-4:]
    else:
        stem, ext = fname, '.zip'
    return '{}.fixed{}'.format(stem, ext)
# Command-line entry point. Without --fix the archive is only analyzed;
# with --fix a re-encoded copy is written to the name chosen by
# pick_fixed_name, refusing to proceed if that file already exists.
# (Kept as comments: a docstring here would alter click's --help text.)
@click.command()
@click.option('--fix', help='Fix archive', is_flag=True)
@click.argument('zipfile', type=click.Path(exists=True))
def cmd(*, fix, zipfile):
    if not fix:
        analyze(zipfile)
        return

    target = pick_fixed_name(zipfile)
    if exists(target):
        raise click.FileError(target, 'Target file already exists')
    fix_archive(zipfile, target)
# Invoke the command only when this file is executed as a script.
if __name__ == '__main__':
    cmd()
# Empty export list: a star-import of this module brings in no names.
__all__ = ()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment