Convert a file containing literal \uXXXX escape sequences into clean UTF-8.
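As a minimal sketch of the two core steps (escape expansion and surrogate merging) on a made-up sample string; the variable names here are illustrative and not part of the script itself:

# Illustration only: expand literal \uXXXX escapes, then merge the surrogate pair.
import codecs

raw = r"smile: \ud83d\ude00"                      # 19 literal characters as stored on disk
decoded = codecs.decode(raw, 'unicode_escape')    # now ends in two surrogate code points
# Merge the pair by hand, exactly as the script below does:
merged = chr(0x10000 + ((ord(decoded[-2]) - 0xD800) << 10) + (ord(decoded[-1]) - 0xDC00))
print(merged)   # prints the single emoji U+1F600

The script below automates this, plus replacement of lone surrogates and writing the UTF-8 output file.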
#!/usr/bin/env python3
r"""Convert a file containing literal \uXXXX escape sequences into clean UTF-8.

Steps:
  1. Read raw text.
  2. Expand Python-style escape sequences (unicode_escape).
  3. Merge valid UTF-16 surrogate pairs into real code points.
  4. Replace any lone (invalid) surrogate with a replacement character (default '?').
  5. Write UTF-8 output.

Usage:
  python3 conv2utf8.py INPUT.json OUTPUT-utf8.json [replacement]

If replacement is provided, it will be used instead of '?' for lone surrogates.
"""
from __future__ import annotations

import sys
import os
import codecs


def combine_and_clean_surrogates(decoded: str, replacement: str = '?') -> tuple[str, int]:
    """Combine valid surrogate pairs; replace lone surrogates with replacement.

    A high surrogate is in D800–DBFF; a low surrogate in DC00–DFFF. A valid pair is
    a high immediately followed by a low. Lone surrogates are replaced.
    """
    out = []
    i = 0
    n = len(decoded)
    replaced = 0
    while i < n:
        cp = ord(decoded[i])
        # High surrogate?
        if 0xD800 <= cp <= 0xDBFF:
            if i + 1 < n:
                cp2 = ord(decoded[i + 1])
                if 0xDC00 <= cp2 <= 0xDFFF:
                    full = 0x10000 + ((cp - 0xD800) << 10) + (cp2 - 0xDC00)
                    out.append(chr(full))
                    i += 2
                    continue
            # Lone high surrogate
            out.append(replacement)
            replaced += 1
            i += 1
            continue
        # Low surrogate (lone)
        if 0xDC00 <= cp <= 0xDFFF:
            out.append(replacement)
            replaced += 1
            i += 1
            continue
        out.append(decoded[i])
        i += 1
    return ''.join(out), replaced

def expand_escapes(raw: str) -> str:
    r"""Expand \uXXXX and other escapes using the unicode_escape codec.

    Errors are caught and we fall back to a best-effort decode with replacement.
    """
    try:
        return codecs.decode(raw, 'unicode_escape')
    except Exception as e:
        # Fallback: decode again with errors='replace' so malformed escape
        # sequences become replacement characters instead of raising.
        sys.stderr.write(f"Warning: unicode_escape decode error: {e}\n")
        return codecs.decode(raw, 'unicode_escape', errors='replace')

def convert_file(in_path: str, out_path: str, replacement: str = '?') -> int:
    with open(in_path, 'r', encoding='utf-8', errors='replace') as f:
        raw = f.read()
    expanded = expand_escapes(raw)
    cleaned, replaced_count = combine_and_clean_surrogates(expanded, replacement=replacement)
    # Encode to UTF-8 safely; any remaining issues are replaced (should be none now).
    data = cleaned.encode('utf-8', 'replace')
    with open(out_path, 'wb') as f:
        f.write(data)
    return replaced_count


def main(argv: list[str]) -> None:
    if len(argv) < 3:
        print(f"Usage: {os.path.basename(argv[0])} INPUT.json OUTPUT-utf8.json [replacement_char]")
        sys.exit(1)
    in_path = argv[1]
    out_path = argv[2]
    replacement = argv[3] if len(argv) > 3 else '?'
    replaced = convert_file(in_path, out_path, replacement)
    print(f"Converted {in_path} -> {out_path}. Replaced {replaced} lone surrogate(s) with {repr(replacement)}.")


if __name__ == '__main__':
    main(sys.argv)
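A quick, hypothetical smoke test for the functions above, assuming the script is saved as conv2utf8.py (the name used in the Usage line) and run from the same directory; the sample string and file paths are made up:

# Hypothetical check of the surrogate handling; not part of the gist itself.
from conv2utf8 import combine_and_clean_surrogates, convert_file

# One valid pair (becomes U+1F600) and one lone high surrogate (becomes '?').
sample = "ok \ud83d\ude00 oops \ud800 end"
cleaned, replaced = combine_and_clean_surrogates(sample)
print(cleaned)    # prints: ok 😀 oops ? end
print(replaced)   # prints: 1

# End-to-end file conversion (paths are examples):
# convert_file('escaped-input.json', 'clean-utf8.json')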