Last active
August 22, 2023 17:08
-
-
Save mooware/f84abfdc31dd63a56d1bb7c78cbf781a to your computer and use it in GitHub Desktop.
A little script to extract old Japanese zip files with the correct text encoding for filenames. Most applications assume a US text codepage for old zip formats, which is not always correct.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# A little script to extract old Japanese zip files
# with the correct text encoding for filenames.
# Most applications assume a US text codepage for
# old zip formats, which is not always correct.
import os | |
import shutil | |
import sys | |
import zipfile | |
from datetime import datetime | |
# Codepage 437 (aka 'DOS Latin US') seems to be the default
# for old zip files in most applications. The filenames are
# re-encoded with it to get back the raw bytes stored in the archive.
_DEFAULT_CODEPAGE = 'cp437'

# Codepage the raw filename bytes are decoded with when the caller
# does not ask for a different one.
_JAPANESE_CODEPAGE = 'cp932'
# The zipfile module cannot extract a member under a different
# filename, so we have to read/write the file ourselves.
def extract_one_file(zip_file, entry, path):
    """Extract one archive member to ``path`` and restore its timestamp.

    The member's data is streamed from the archive into a newly written
    file, so the output name is independent of the name stored in the zip.

    Args:
        zip_file: An open ``zipfile.ZipFile`` to read from.
        entry: The ``zipfile.ZipInfo`` describing the member to extract.
        path: Destination file path. Missing parent directories are created.

    Raises:
        OSError: If the directories or the output file cannot be created.
    """
    # create subdirs if they don't exist yet
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # Open through the ZipInfo object instead of its name: a lookup by name
    # resolves to the last member carrying that name, so an archive with
    # duplicate names would otherwise yield the wrong data.
    with zip_file.open(entry) as infile, open(path, 'wb') as outfile:
        shutil.copyfileobj(infile, outfile)

    # Restore the modification time recorded in the archive. Some archives
    # carry an invalid DOS timestamp (e.g. month or day 0); the data is
    # already extracted at this point, so keep the file and leave its
    # freshly written timestamps alone instead of aborting.
    try:
        mtime = datetime(*entry.date_time).timestamp()
    except (ValueError, OverflowError, OSError):
        return

    # os.utime() takes (atime, mtime); the access time is set to the ctime
    # the new file reports right after being written.
    atime = os.path.getctime(path)
    os.utime(path, (atime, mtime))
# General-purpose flag bit 11: the member name is stored as UTF-8, so it
# has already been decoded correctly and must not be re-decoded.
_UTF8_FILENAME_FLAG = 0x800


def _safe_member_path(name):
    """Return ``name`` as a relative path without drive, root or '..' parts.

    Mirrors the sanitising that ``ZipFile.extract()`` documents for member
    names, so a crafted archive cannot write outside the working directory.
    An empty string is returned when nothing usable is left.
    """
    normalized = name.replace('/', os.path.sep)
    if os.path.altsep:
        normalized = normalized.replace(os.path.altsep, os.path.sep)
    normalized = os.path.splitdrive(normalized)[1]
    rejected = ('', os.path.curdir, os.path.pardir)
    return os.path.sep.join(
        part for part in normalized.split(os.path.sep) if part not in rejected
    )


def extract_with_codepage(zip_file, verbose=False, target_codepage=None):
    """Extract all files of an archive, re-decoding their names.

    Names that are not flagged as UTF-8 are turned back into their raw bytes
    via the default codepage and decoded again with ``target_codepage``. If
    that conversion fails, the name reported by ``zipfile`` is used as is.
    Files are written relative to the current working directory; directory
    entries are skipped (directories are created on demand for files).

    Args:
        zip_file: An open ``zipfile.ZipFile``.
        verbose: If true, print conversion errors and each extracted name.
        target_codepage: Codec name for the stored filenames; defaults to
            the Japanese codepage when ``None``.

    Raises:
        LookupError: If ``target_codepage`` is not a known codec.
    """
    if target_codepage is None:
        target_codepage = _JAPANESE_CODEPAGE

    for f in zip_file.infolist():
        if f.is_dir():
            continue

        filename = f.filename
        # Only legacy names need fixing; UTF-8 flagged names are already
        # correct and re-decoding them could garble them.
        if not f.flag_bits & _UTF8_FILENAME_FLAG:
            try:
                raw_name = f.filename.encode(_DEFAULT_CODEPAGE)
                filename = raw_name.decode(target_codepage)
            except UnicodeEncodeError as e:
                if verbose:
                    print('encode error:', e)
            except UnicodeDecodeError as e:
                if verbose:
                    print('decode error:', e)

        if verbose:
            # on python < 3.6, printing to windows console does not work with unicode
            try:
                print(filename)
            except UnicodeEncodeError:
                enc = sys.stdout.encoding or 'ascii'
                print(filename.encode(enc, 'backslashreplace').decode(enc))

        # Member names come from the archive and are untrusted: strip
        # absolute prefixes and '..' components before writing anything.
        target = _safe_member_path(filename)
        if not target:
            if verbose:
                print('skipped: name contains no usable path')
            continue

        extract_one_file(zip_file, f, target)
# Command-line entry point: extract_script.py zipfile [target codepage]
if __name__ == '__main__':
    if len(sys.argv) < 2:
        # __name__ is always '__main__' here, so show the script name instead
        print('usage:', sys.argv[0], 'zipfile [target codepage]')
        sys.exit(1)

    path = sys.argv[1]
    # optional second argument overrides the codepage used for filenames
    codepage = sys.argv[2] if len(sys.argv) > 2 else None

    # the context manager closes the archive even if extraction fails
    with zipfile.ZipFile(path) as zf:
        extract_with_codepage(zf, verbose=True, target_codepage=codepage)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment