Skip to content

Instantly share code, notes, and snippets.

@zacharysyoung
Created December 5, 2021 21:37
Show Gist options
  • Save zacharysyoung/2e126a025bf2d5274520c867dc6d19e9 to your computer and use it in GitHub Desktop.
Save zacharysyoung/2e126a025bf2d5274520c867dc6d19e9 to your computer and use it in GitHub Desktop.
import re
# Replace embedded escaped unicode with their actual unicode values:
#
# `\Not wanted backslashes\ unicode: \u2019\u2026`
#
# to:
#
# `\Not wanted backslashes\ unicode: ’…`
#
# or:
#
# `\u0061\u0070\u0070\u006c\u0065\u0073\u0020\u0026\u0020\u006f\u0072\u0061\u006e\u0067\u0065\u0073`
#
# to:
#
# `apples & oranges`
#
# The "pure" unicode example of 'apples & oranges' could be decoded with
# bytes.decode('unicode_escape'), but that won't work for the first example because its
# literal backslashes are not valid escape sequences.
# A literal backslash + "u" followed by exactly four ASCII hex digits (either case).
# The four digits are captured as group 1.
UNICODE_PTN = re.compile(r'\\u([0-9a-fA-F]{4})')


def unescape_unicode(text: str) -> str:
    r"""Return *text* with each embedded ``\uXXXX`` escape replaced by its character.

    Only sequences matching ``UNICODE_PTN`` are touched; every other backslash in
    *text* is left exactly as it is.

    Args:
        text: The string that may contain literal ``\uXXXX`` sequences.

    Returns:
        A new string in which each matched escape has been replaced by
        ``chr(int("XXXX", 16))``.

    NOTE: each escape is converted on its own, so a surrogate pair written as two
    escapes yields two separate surrogate code points rather than one character.
    """
    # re.sub copies the unmatched text between escapes and calls the replacement
    # function once per match, so no manual slicing/bookkeeping is needed.
    return UNICODE_PTN.sub(lambda m: chr(int(m.group(1), 16)), text)


s = '\\Not wanted backslashes\\ unicode: \\u2019\\u2026 || \\u0061\\u0070\\u0070\\u006c\\u0065\\u0073\\u0020\\u0026\\u0020\\u006f\\u0072\\u0061\\u006e\\u0067\\u0065\\u0073'

new_s = unescape_unicode(s)

# Self-check for this demo: the stray backslashes survive, the escapes are decoded.
assert new_s == '\\Not wanted backslashes\\ unicode: ’… || apples & oranges'
print(new_s)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment