Last active
September 16, 2023 02:10
-
-
Save UserUnknownFactor/063a2e2cecbe4f3164a07df12aa51462 to your computer and use it in GitHub Desktop.
Convert a group of files with unknown encodings to UTF-16 (Python)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os, sys | |
| from glob import iglob | |
# Directory that is searched (recursively) for files to convert.
work_dir = os.getcwd()

# Every regular file below work_dir matching *.ks, followed by every one
# matching *.tjs.
file_list = [
    path
    for pattern in ('**/*.ks', '**/*.tjs')
    for path in iglob(os.path.join(work_dir, pattern), recursive=True)
    if os.path.isfile(path)
]

# Candidate encodings, tried in this order until one decodes without error.
# The strict UTF-8 decoders must come before the permissive Shift-JIS family:
# UTF-8 encoded Japanese text frequently also decodes "successfully" as cp932
# (yielding mojibake), whereas real cp932 text is almost never valid UTF-8.
all_codecs = [
    'utf_16',
    'utf_8_sig',
    'ascii',
    'utf_8',
    'cp932',
    'shift_jis',
    'shift_jis_2004',
    'shift_jisx0213',
]
def find_codec(text, candidates=None):
    """Guess the encoding of a byte string by trial decoding.

    Each candidate codec is tried in order and the first one that decodes
    ``text`` without error wins.

    Args:
        text: The raw ``bytes`` to decode.
        candidates: Optional iterable of codec names to try, in order.
            Defaults to the module-level ``all_codecs`` list.

    Returns:
        A ``(codec_name, decoded_text)`` tuple, or ``(None, None)`` when no
        candidate can decode the data.
    """
    import codecs

    if candidates is None:
        candidates = all_codecs

    # Python's BOM-less 'utf_16' decoder accepts almost any even-length byte
    # string, so trying it unconditionally would misreport ASCII / cp932 files
    # as UTF-16. Only consider it when the data carries a UTF-16 BOM or
    # contains NUL bytes, which do not occur in ordinary 8-bit text files.
    utf16_plausible = (
        text.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE))
        or b'\x00' in text
    )

    for name in candidates:
        if name == 'utf_16' and not utf16_plausible:
            continue
        try:
            return name, text.decode(name)
        except (UnicodeError, LookupError):
            # UnicodeError: wrong encoding; LookupError: unknown codec name.
            continue
    return None, None
# Optional first command-line argument: an encoding to try on every file
# before falling back to auto-detection.
arg_enc = None
if len(sys.argv) > 1:
    # sys.argv[0] is the script itself, so the first user argument is argv[1].
    arg_enc = sys.argv[1].strip('" ')
    print('Using provided encoding:', arg_enc)

# Rewrite every collected file in place as UTF-16, reporting one status line
# per file: OK (converted), IGNORED (already UTF-16 or empty), or FAILED
# (no codec could decode it; the file is left untouched).
for efile in file_list:
    with open(efile, 'rb') as f:
        bstr = f.read()

    enc, dstr = arg_enc, None
    if enc:
        try:
            dstr = bstr.decode(enc)
        except (UnicodeError, LookupError):
            # The requested encoding does not fit this file (or is not a
            # known codec name); fall through to auto-detection.
            pass
    if dstr is None:
        enc, dstr = find_codec(bstr)

    # enc is None when detection failed, so it cannot be concatenated as-is.
    print(efile.replace(work_dir, ''), ' ', (enc or 'unknown') + '... ', end='', flush=True)

    if dstr is None:
        print('FAILED')
    elif enc == 'utf_16' or not dstr:
        # Already in the target encoding, or nothing to convert.
        print('IGNORED')
    else:
        with open(efile, 'w', encoding='utf-16') as f:
            # Normalise line endings on the decoded text (not on the raw
            # bytes, where the byte pair 0D 0A may be part of a wider code
            # unit). Text-mode writing then emits the platform line ending.
            f.write(dstr.replace('\r\n', '\n'))
        print('OK')
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is useful when KiriKiri or Tyrano Builder games crash on systems whose ANSI code page is not Japanese.
In most cases, utf-8-sig can be used instead of utf-16 as the output encoding.