Last active
September 30, 2021 07:39
-
-
Save UserUnknownFactor/4003c2110d2eecf8286ca6a7b245d831 to your computer and use it in GitHub Desktop.
Python utility to unpack zips with unusual file name encoding
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# unzip-enc.py
import argparse
import codecs
import locale
import os
import shutil
import sys
import zipfile

from charamel import Detector
def setup_console(sys_enc="utf-8"):
    """Make console output tolerant of unencodable characters.

    Determines the encoding used by the console and arranges for
    ``sys.stdout`` / ``sys.stderr`` (when attached to a terminal) to replace
    characters they cannot encode instead of raising ``UnicodeEncodeError``.

    Args:
        sys_enc: Encoding used as the Python 2 default encoding and as the
            fallback when the console encoding cannot be determined.

    Returns:
        The name of the console encoding, or ``sys_enc`` when detection
        fails. Setup is best-effort: a failure never propagates.
    """
    try:
        if sys.version_info < (3, 0):
            # Python 2 only: reload() re-exposes sys.setdefaultencoding,
            # which site.py deletes at startup. Neither exists on Python 3.
            reload(sys)  # noqa: F821 - builtin on Python 2
            sys.setdefaultencoding(sys_enc)

        if sys.platform.startswith("win"):
            import ctypes
            # The Windows console uses the OEM code page, not the ANSI one.
            enc = "cp{}".format(ctypes.windll.kernel32.GetOEMCP())
        else:
            enc = (sys.stdout.encoding if sys.stdout.isatty() else
                   sys.stderr.encoding if sys.stderr.isatty() else
                   sys.getfilesystemencoding() or sys_enc)

        for stream_name in ("stdout", "stderr"):
            stream = getattr(sys, stream_name)
            if not stream.isatty():
                # Redirected output keeps its own encoding and error policy.
                continue
            if hasattr(stream, "reconfigure"):
                # Python 3.7+: keep the stream, only relax the error handler.
                stream.reconfigure(errors="replace")
            elif stream.encoding != enc:
                # Older interpreters: wrap the byte-level stream (``buffer``
                # on Python 3, the stream itself on Python 2).
                target = getattr(stream, "buffer", stream)
                setattr(sys, stream_name,
                        codecs.getwriter(enc)(target, "replace"))
        return enc
    except Exception:
        # Console setup is cosmetic; fall back rather than abort the tool.
        return sys_enc
# General-purpose bit 11 of a zip entry: the stored file name is UTF-8.
_UTF8_NAME_FLAG = 0x800


def _encoding_label(encoding):
    """Return a printable codec name for a str, an enum member or None."""
    if encoding is None:
        return "unknown"
    # Enum-style results carry the codec name in ``.value``.
    return str(getattr(encoding, "value", encoding))


def _is_unsafe_path(path):
    """Return True when *path* is empty, absolute or escapes the current dir."""
    if not path:
        return True
    normalized = os.path.normpath(path)
    drive, _ = os.path.splitdrive(normalized)
    return (bool(drive)
            or os.path.isabs(normalized)
            or normalized == os.pardir
            or normalized.startswith(os.pardir + os.sep))


console_chs = setup_console()

parser = argparse.ArgumentParser()
parser.add_argument("-e", help="encoding for file names like cp932, autodetected by default", metavar=('encoding'))
parser.add_argument("-l", help="list file names in a zip file but don't unzip", action="store_true")
parser.add_argument("file", help="zip archive to unpack", metavar=('filename'))
args = parser.parse_args()

zfn = args.file
print("Processing " + zfn + " ...")

detector = Detector()
with zipfile.ZipFile(zfn, "r") as zf:
    for info in zf.infolist():
        name = info.filename
        if info.flag_bits & _UTF8_NAME_FLAG:
            # zipfile already decoded this name as UTF-8; round-tripping it
            # through cp437 would drop every non-cp437 character.
            enc = "utf-8"
            utf8name = name
        else:
            # zipfile decoded the raw bytes as cp437; undo that to recover
            # the original bytes, then decode with the real encoding.
            rawname = name.encode('cp437', 'ignore')
            enc = _encoding_label(args.e if args.e else detector.detect(rawname))
            try:
                utf8name = rawname.decode(enc)
            except (LookupError, UnicodeError):
                # Unknown codec name or bytes invalid for it: keep the
                # cp437 name so the entry is still processed.
                print("Error [" + enc + "] " + name)
                utf8name = name

        if args.l:
            print("[" + enc + "] " + utf8name)
            continue

        if _is_unsafe_path(utf8name):
            # Refuse absolute paths and ".." escapes from untrusted archives.
            print("Skipping unsafe path [" + enc + "]: " + utf8name)
            continue

        print("Extracting [" + enc + "]: " + utf8name)
        if info.is_dir():
            if not os.path.isdir(utf8name):
                os.makedirs(utf8name)
            continue

        pathname = os.path.dirname(utf8name)
        if pathname and not os.path.isdir(pathname):
            os.makedirs(pathname)
        # Existing files are left untouched, never overwritten.
        if not os.path.exists(utf8name):
            # Stream the member so large files are not held in memory.
            with zf.open(info) as src, open(utf8name, 'wb') as dst:
                shutil.copyfileobj(src, dst)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment