Convert a file containing literal \uXXXX escape sequences into clean UTF-8.
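As a minimal sketch of the two core steps (escape expansion and surrogate merging) on a made-up sample string; the variable names here are illustrative and not part of the script itself:

# Illustration only: expand literal \uXXXX escapes, then merge the surrogate pair.
import codecs

raw = r"smile: \ud83d\ude00"                      # 19 literal characters as stored on disk
decoded = codecs.decode(raw, 'unicode_escape')    # now ends in two surrogate code points
# Merge the pair by hand, exactly as the script below does:
merged = chr(0x10000 + ((ord(decoded[-2]) - 0xD800) << 10) + (ord(decoded[-1]) - 0xDC00))
print(merged)   # prints the single emoji U+1F600

The script below automates this, plus replacement of lone surrogates and writing the UTF-8 output file.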
#!/usr/bin/env python3
r"""Convert a file containing literal \uXXXX escape sequences into clean UTF-8.

Steps:
  1. Read raw text.
  2. Expand Python-style escape sequences (unicode_escape).
  3. Merge valid UTF-16 surrogate pairs into real code points.
  4. Replace any lone (invalid) surrogate with a replacement character (default '?').
  5. Write UTF-8 output.

Usage:
  python3 conv2utf8.py INPUT.json OUTPUT-utf8.json [replacement]

If replacement is provided, it will be used instead of '?' for lone surrogates.
"""
from __future__ import annotations

import sys
import os
import codecs


def combine_and_clean_surrogates(decoded: str, replacement: str = '?') -> tuple[str, int]:
    """Combine valid surrogate pairs; replace lone surrogates with replacement.

    A high surrogate is in D800–DBFF; a low surrogate in DC00–DFFF. A valid pair is
    a high immediately followed by a low. Lone surrogates are replaced.
    """
    out = []
    i = 0
    n = len(decoded)
    replaced = 0
    while i < n:
        cp = ord(decoded[i])
        # High surrogate?
        if 0xD800 <= cp <= 0xDBFF:
            if i + 1 < n:
                cp2 = ord(decoded[i + 1])
                if 0xDC00 <= cp2 <= 0xDFFF:
                    full = 0x10000 + ((cp - 0xD800) << 10) + (cp2 - 0xDC00)
                    out.append(chr(full))
                    i += 2
                    continue
            # Lone high surrogate
            out.append(replacement)
            replaced += 1
            i += 1
            continue
        # Low surrogate (lone)
        if 0xDC00 <= cp <= 0xDFFF:
            out.append(replacement)
            replaced += 1
            i += 1
            continue
        out.append(decoded[i])
        i += 1
    return ''.join(out), replaced

def expand_escapes(raw: str) -> str:
    r"""Expand \uXXXX and other escapes using the unicode_escape codec.

    Errors are caught and we fall back to a best-effort decode with replacement.
    """
    try:
        return codecs.decode(raw, 'unicode_escape')
    except Exception as e:
        # Fallback: decode again with errors='replace' so malformed escape
        # sequences become replacement characters instead of raising.
        sys.stderr.write(f"Warning: unicode_escape decode error: {e}\n")
        return codecs.decode(raw, 'unicode_escape', errors='replace')

def convert_file(in_path: str, out_path: str, replacement: str = '?') -> int:
    with open(in_path, 'r', encoding='utf-8', errors='replace') as f:
        raw = f.read()
    expanded = expand_escapes(raw)
    cleaned, replaced_count = combine_and_clean_surrogates(expanded, replacement=replacement)
    # Encode to UTF-8 safely; any remaining issues are replaced (should be none now).
    data = cleaned.encode('utf-8', 'replace')
    with open(out_path, 'wb') as f:
        f.write(data)
    return replaced_count


def main(argv: list[str]) -> None:
    if len(argv) < 3:
        print(f"Usage: {os.path.basename(argv[0])} INPUT.json OUTPUT-utf8.json [replacement_char]")
        sys.exit(1)
    in_path = argv[1]
    out_path = argv[2]
    replacement = argv[3] if len(argv) > 3 else '?'
    replaced = convert_file(in_path, out_path, replacement)
    print(f"Converted {in_path} -> {out_path}. Replaced {replaced} lone surrogate(s) with {repr(replacement)}.")


if __name__ == '__main__':
    main(sys.argv)
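A quick, hypothetical smoke test for the functions above, assuming the script is saved as conv2utf8.py (the name used in the Usage line) and run from the same directory; the sample string and file paths are made up:

# Hypothetical check of the surrogate handling; not part of the gist itself.
from conv2utf8 import combine_and_clean_surrogates, convert_file

# One valid pair (becomes U+1F600) and one lone high surrogate (becomes '?').
sample = "ok \ud83d\ude00 oops \ud800 end"
cleaned, replaced = combine_and_clean_surrogates(sample)
print(cleaned)    # prints: ok 😀 oops ? end
print(replaced)   # prints: 1

# End-to-end file conversion (paths are examples):
# convert_file('escaped-input.json', 'clean-utf8.json')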