Skip to content

Instantly share code, notes, and snippets.

@kylefeng28
Last active August 30, 2024 17:35
Show Gist options
  • Save kylefeng28/06a307ccaf5c08ff2a26562e805e7df5 to your computer and use it in GitHub Desktop.
Save kylefeng28/06a307ccaf5c08ff2a26562e805e7df5 to your computer and use it in GitHub Desktop.
Super hacky mojibake tools

mojibake-tools

Hacky scripts to convert Shift-JIS mojibake back into readable UTF-8 text, especially filenames garbled after unzipping zip files created on Japanese Windows computers.

Tested on macOS 10.13 on an APFS drive.

Usage

$ python3 fix_mojibake.py --create to_fix.txt

Original text:

DZDZÇ≈ï∂éöâªÇØÇÉyÅ[ÉXÉgâ∫Ç≥Ç¢

Fixed text:

ここで文字化けをペースト下さい
import sys
import os
import argparse
# Text written into a newly created file to show the user where to paste the
# garbled text: an English prompt followed by a Japanese one.
placeholder = (
    'Paste mojibake here\n'
    'ここで文字化けをペースト下さい'
)
# Source encoding used when --from is not given on the command line.
default_source_enc = 'sjis'
def write_file(filename, encoding, text):
    """Overwrite ``filename`` with ``text`` encoded in ``encoding``.

    The file is opened in binary mode, so the bytes on disk are exactly the
    encoded text followed by a single newline byte.

    Args:
        filename: Path of the file to create or overwrite.
        encoding: Name of the codec used to encode ``text`` (e.g. ``'sjis'``).
        text: String to write.

    Raises:
        LookupError: If ``encoding`` is not a known codec.
        UnicodeEncodeError: If ``text`` cannot be represented in ``encoding``.
        OSError: If the file cannot be opened or written.
    """
    # Encode before opening: mode 'wb' truncates the file immediately, so an
    # encoding failure must happen first or it would wipe an existing file.
    payload = text.encode(encoding) + b'\n'  # newline at end of file
    with open(filename, 'wb') as f:
        f.write(payload)
def read_file(filename, encoding):
    """Return the entire contents of ``filename`` decoded with ``encoding``."""
    with open(filename, mode='r', encoding=encoding) as source:
        decoded = source.read()
    return decoded
# Script entry point: optionally create a paste target and open it in
# TextEdit, then print the file's contents decoded from the source encoding.
if __name__ == '__main__':
    # Imported here because only the script entry point launches a program.
    import subprocess

    parser = argparse.ArgumentParser(description='Fix mojibake characters')
    parser.add_argument('filename', metavar='file',
                        help='file to convert')
    parser.add_argument('--create', dest='create_file', action='store_true',
                        help='create the file to paste mojibake text into, and open it in TextEdit')
    parser.add_argument('--overwrite', dest='overwrite_file', action='store_true',
                        help='(only with --create) overwrite file')
    parser.add_argument('--from', dest='from_encoding',
                        metavar='source_encoding',
                        default=default_source_enc,
                        help=f'source encoding (default: {default_source_enc})')
    args = parser.parse_args()

    from_encoding = args.from_encoding
    filename = args.filename
    create_file = args.create_file
    overwrite_file = args.overwrite_file

    if create_file:
        # 1. Write placeholder text to the file, in the source encoding.
        #    An existing file is only replaced when --overwrite was given;
        #    a missing file is always created.
        if os.path.exists(filename) and not overwrite_file:
            print('File already exists, skipping...')
        else:
            print('Writing placeholder text to file')
            write_file(filename, from_encoding, placeholder)

        # 2. Let the user paste the garbled text manually in TextEdit.
        print('Please paste the text manually into TextEdit')
        print('Press enter when ready to proceed')
        # Pass an argument list (no shell) so paths containing spaces or
        # shell metacharacters reach `open` as a single, literal argument.
        subprocess.run(['open', '-a', 'TextEdit', filename])
        input()

    # 3. Output the correctly decoded file contents. read_file() yields a
    #    Python str; print() re-encodes it with stdout's encoding, which is
    #    typically UTF-8 on modern terminals.
    converted = read_file(filename, from_encoding)
    print(converted)
import sys
import os
def read_file(file):
    """Return the lines of ``file`` as a list, without line terminators.

    The file is opened in text mode with the platform's default encoding.
    """
    with open(file) as handle:
        return handle.read().splitlines()
# Script entry point: rename files pairwise. Line N of the first list file is
# the current name; line N of the second list file is the new name.
if __name__ == '__main__':
    if len(sys.argv) != 3:
        # sys.exit rather than the bare exit(), which is only injected by the
        # `site` module and is not guaranteed to exist.
        sys.exit('invalid arguments')

    rename_from_file = sys.argv[1]
    rename_to_file = sys.argv[2]
    rename_from = read_file(rename_from_file)
    rename_to = read_file(rename_to_file)

    # zip() would silently drop the unmatched tail, and a length mismatch
    # usually means the two lists are misaligned. Refuse to rename anything
    # rather than risk assigning wrong names.
    if len(rename_from) != len(rename_to):
        sys.exit('line count mismatch: {} names in {} vs {} names in {}'.format(
            len(rename_from), rename_from_file,
            len(rename_to), rename_to_file))

    for old_name, new_name in zip(rename_from, rename_to):
        try:
            os.rename(old_name, new_name)
        except (OSError, ValueError) as e:
            # Best effort: report the failure (missing file, permission
            # problem, invalid path) and carry on with the remaining pairs.
            print(e)
Paste mojibake here
ここで文字化けをペースト下さい
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment