zacharysyoung · December 5, 2021 21:37
diff --git a/main.py b/main.py
 import re


 # Replace embedded escaped unicode with their actual unicode values:
 #
 #   `\Not wanted backslashes\ unicode: \u2019\u2026`
 #
 # to:
 #
 #   `\Not wanted backslashes\ unicode: ’…`
 #
 # or:
 #
 #   `\u0061\u0070\u0070\u006c\u0065\u0073\u0020\u0026\u0020\u006f\u0072\u0061\u006e\u0067\u0065\u0073`
 #
 # to:
 #
 #   `apples & oranges`
 #
 # The "pure" unicode example of 'apples & oranges' could be decoded with
 # bytes.decode('unicode_escape'), but that won't work for the first example: its stray
 # backslashes are themselves parsed as escapes (e.g. `\N` starts a named-character escape).


 # Matches one escaped codepoint: a literal backslash, `u`, then exactly four ASCII hex
 # digits in either case.  Group 1 captures the digits.
 UNICODE_PTN = re.compile(r'\\u([0-9a-fA-F]{4})')

 s = '\\Not wanted backslashes\\ unicode: \\u2019\\u2026 || \\u0061\\u0070\\u0070\\u006c\\u0065\\u0073\\u0020\\u0026\\u0020\\u006f\\u0072\\u0061\\u006e\\u0067\\u0065\\u0073'


 def unescape_unicode(text: str) -> str:
     """Return *text* with every ``\\uXXXX`` escape replaced by the character it names.

     Everything that is not an escape -- other backslashes, and any text before,
     between, or after the escapes -- is copied through unchanged, so a string with
     no escapes is returned as-is.  Each escape is converted on its own; a surrogate
     pair written as two escapes therefore yields two surrogate code points.
     """
     # Pattern.sub copies the unmatched slices itself, including the tail after the
     # last match, which a hand-written search/slice loop has to remember to append.
     return UNICODE_PTN.sub(lambda m: chr(int(m.group(1), 16)), text)


 new_s = unescape_unicode(s)
 # Self-check for the sample string above.
 assert new_s == '\\Not wanted backslashes\\ unicode: ’… || apples & oranges'
 print(new_s)
	import re


	# Replace embedded escaped unicode with their actual unicode values:
	#
	# `\Not wanted backslashes\ unicode: \u2019\u2026`
	#
	# to:
	#
	# `\Not wanted backslashes\ unicode: ’…`
	#
	# or:
	#
	# `\u0061\u0070\u0070\u006c\u0065\u0073\u0020\u0026\u0020\u006f\u0072\u0061\u006e\u0067\u0065\u0073`
	#
	# to:
	#
	# `apples & oranges`
	#
	# The "pure" unicode example of 'apples & oranges' could be decoded with
	# bytes.decode('unicode_escape'), but that won't work for the first example: its stray
	# backslashes are themselves parsed as escapes (e.g. `\N` starts a named-character escape).


	# Matches one escaped codepoint: a literal backslash, `u`, then exactly four ASCII hex
	# digits in either case.  Group 1 captures the digits.
	UNICODE_PTN = re.compile(r'\\u([0-9a-fA-F]{4})')

	# The backslash before each `|` is written doubled: a bare backslash-pipe is an
	# invalid escape sequence in a normal string literal.  The runtime value is unchanged.
	s = '\\Not wanted backslashes\\ unicode: \\u2019\\u2026 \\|\\| \\u0061\\u0070\\u0070\\u006c\\u0065\\u0073\\u0020\\u0026\\u0020\\u006f\\u0072\\u0061\\u006e\\u0067\\u0065\\u0073'


	def unescape_unicode(text: str) -> str:
	    """Return *text* with every ``\\uXXXX`` escape replaced by the character it names.

	    Everything that is not an escape -- other backslashes, and any text before,
	    between, or after the escapes -- is copied through unchanged, so a string with
	    no escapes is returned as-is.  Each escape is converted on its own; a surrogate
	    pair written as two escapes therefore yields two surrogate code points.
	    """
	    # Pattern.sub copies the unmatched slices itself, including the tail after the
	    # last match, which a hand-written search/slice loop has to remember to append.
	    return UNICODE_PTN.sub(lambda m: chr(int(m.group(1), 16)), text)


	new_s = unescape_unicode(s)
	# Self-check for the sample string above.
	assert new_s == '\\Not wanted backslashes\\ unicode: ’… \\|\\| apples & oranges'
	print(new_s)