Skip to content

Instantly share code, notes, and snippets.

@Norod
Created October 23, 2025 14:44
Show Gist options
  • Select an option

  • Save Norod/ed53015f3c62b96263a8626e454a7e28 to your computer and use it in GitHub Desktop.

Select an option

Save Norod/ed53015f3c62b96263a8626e454a7e28 to your computer and use it in GitHub Desktop.
Convert a file containing literal \uXXXX escape sequences into clean UTF-8.
#!/usr/bin/env python3
r"""Convert a file containing literal \uXXXX escape sequences into clean UTF-8.
Steps:
1. Read raw text.
2. Expand Python-style escape sequences (unicode_escape).
3. Merge valid UTF-16 surrogate pairs into real code points.
4. Replace any lone (invalid) surrogate with a replacement character (default '?').
5. Write UTF-8 output.
Usage:
python3 conv2utf8.py INPUT.json OUTPUT-utf8.json [replacement]
If replacement is provided, it will be used instead of '?' for lone surrogates.
"""
from __future__ import annotations
import sys
import os
import codecs
def combine_and_clean_surrogates(decoded: str, replacement: str = '?') -> tuple[str, int]:
    """Fuse UTF-16 surrogate pairs and substitute any unpaired surrogate.

    High surrogates occupy U+D800..U+DBFF and low surrogates U+DC00..U+DFFF.
    A high surrogate immediately followed by a low surrogate is merged into
    the single supplementary-plane code point it encodes. Every other
    surrogate (a high without a following low, or a low on its own) is
    swapped for ``replacement``.

    Returns:
        A ``(text, count)`` tuple: the cleaned string and the number of lone
        surrogates that were replaced.
    """
    pieces: list[str] = []
    lone_total = 0
    length = len(decoded)
    pos = 0
    while pos < length:
        ch = decoded[pos]
        code = ord(ch)
        pos += 1  # ``pos`` now points at the character after ``ch``

        # Ordinary character: copy through untouched.
        if code < 0xD800 or code > 0xDFFF:
            pieces.append(ch)
            continue

        # High surrogate with a low surrogate right behind it: merge the pair.
        if code <= 0xDBFF and pos < length:
            trail = ord(decoded[pos])
            if 0xDC00 <= trail <= 0xDFFF:
                merged = 0x10000 + ((code - 0xD800) << 10) + (trail - 0xDC00)
                pieces.append(chr(merged))
                pos += 1  # consume the low half as well
                continue

        # Anything left is an unpaired high or low surrogate.
        pieces.append(replacement)
        lone_total += 1
    return ''.join(pieces), lone_total
def expand_escapes(raw: str) -> str:
    r"""Expand \uXXXX and other backslash escapes via the unicode_escape codec.

    Characters that are already real (non-escaped) text are preserved as-is;
    only backslash escape sequences are interpreted. Surrogate escapes such
    as \ud83d\ude00 come back as two separate surrogate code points, which
    the caller is expected to merge afterwards.

    If strict decoding fails (for example on a truncated \u escape), a
    warning is written to stderr and the text is decoded again with
    ``errors='replace'`` as a best-effort result.
    """
    # The unicode_escape decoder interprets its input as Latin-1 bytes plus
    # escapes. Handing it a str makes CPython encode that str as UTF-8 first,
    # which turns every non-ASCII character into mojibake. Encoding to
    # Latin-1 ourselves keeps U+0080..U+00FF intact, and 'backslashreplace'
    # rewrites anything above U+00FF as an escape that the decoder restores.
    # Known limit: a literal backslash directly in front of a character above
    # U+00FF pairs up with the generated escape's backslash, so that
    # character comes out as escape text rather than as itself.
    payload = raw.encode('latin-1', 'backslashreplace')
    try:
        return codecs.decode(payload, 'unicode_escape')
    except UnicodeError as e:
        sys.stderr.write(f"Warning: unicode_escape decode error: {e}\n")
        return codecs.decode(payload, 'unicode_escape', errors='replace')
def convert_file(in_path: str, out_path: str, replacement: str = '?') -> int:
    """Read ``in_path``, expand its escapes, fix surrogates, write ``out_path``.

    The input is read as UTF-8 text with undecodable bytes replaced. Escape
    sequences are expanded, surrogate pairs are merged, and lone surrogates
    are swapped for ``replacement``. The result is written as UTF-8 bytes.

    Returns:
        The number of lone surrogates that were replaced.
    """
    with open(in_path, 'r', encoding='utf-8', errors='replace') as source:
        text = source.read()

    text, lone_total = combine_and_clean_surrogates(
        expand_escapes(text), replacement=replacement
    )

    # Encode before opening the output file; 'replace' guards against any
    # code point that still cannot be encoded.
    payload = text.encode('utf-8', 'replace')
    with open(out_path, 'wb') as sink:
        sink.write(payload)
    return lone_total
def main(argv: list[str]) -> None:
    """Command-line entry point.

    Expects ``argv`` in ``sys.argv`` form: program name, input path, output
    path, and an optional replacement string for lone surrogates (default
    ``'?'``). With fewer than two paths it prints a usage line and exits
    with status 1. Otherwise it converts the file and prints a summary.
    """
    if len(argv) < 3:
        prog = os.path.basename(argv[0])
        print(f"Usage: {prog} INPUT.json OUTPUT-utf8.json [replacement_char]")
        sys.exit(1)

    in_path, out_path, *extra = argv[1:]
    replacement = extra[0] if extra else '?'

    replaced = convert_file(in_path, out_path, replacement)
    summary = (
        f"Converted {in_path} -> {out_path}. "
        f"Replaced {replaced} lone surrogate(s) with {replacement!r}."
    )
    print(summary)
if __name__ == '__main__':
    # Run the command-line interface only when executed as a script, so that
    # importing this module has no side effects.
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment