Skip to content

Instantly share code, notes, and snippets.

@robertsez
Last active January 23, 2022 22:30
Show Gist options
  • Save robertsez/36b64abbac33c77d5f7bba8cdb2892f1 to your computer and use it in GitHub Desktop.
Save robertsez/36b64abbac33c77d5f7bba8cdb2892f1 to your computer and use it in GitHub Desktop.
Replace 16-bit hex Unicode escapes (\uXXXX) with named Unicode escapes (\N{...}) in Python scripts
# Example file content before: print("\u274c")
# After: print("\N{CROSS MARK}")
import re
# Matches one or more consecutive 16-bit escapes (\uXXXX). Runs are matched as a
# whole so that a surrogate pair such as \ud83d\ude00 is seen as a single unit.
# The character class lists hex digits only: inside [...] a "|" is a literal
# pipe, not alternation, so it must not appear there.
# NOTE: an escaped backslash in the scanned text (\\u274c) is not detected and
# is still matched.
REGEXP = re.compile(r"(\\u[0-9a-fA-F]{4})+")
# Load the script to convert as a list of lines; each line keeps its trailing
# newline so the lines can be written back out unchanged.
with open("file.py", mode="r", encoding="utf-8") as file:
    lines = list(file)
def replace_unicode(uc: str) -> str:
    r"""Return *uc* as ASCII text with non-ASCII characters spelled ``\N{NAME}``.

    Args:
        uc: Decoded text. It may contain surrogate code points, e.g. the two
            halves of a pair produced by decoding ``\ud83d\ude00``.

    Returns:
        An ASCII-only string. ASCII characters pass through unchanged; each
        other character becomes a ``\N{...}`` escape, or a hex escape such as
        ``\ud83d`` when the character has no Unicode name.
    """
    # Round-trip through UTF-16 so the two halves of a surrogate pair are
    # joined into one code point. "surrogatepass" is needed on the decode side
    # as well, otherwise an unpaired surrogate raises UnicodeDecodeError.
    combined = uc.encode("utf-16", "surrogatepass").decode("utf-16", "surrogatepass")
    return combined.encode("ascii", "namereplace").decode("ascii")
# Run of consecutive \uXXXX escapes whose code points lie outside ASCII, i.e.
# anything except \u0000-\u007f. ASCII escapes are deliberately excluded:
# turning \u0027, \u005c or \u000a into the raw character could break the
# string literal that contains it.
_NON_ASCII_RUN = re.compile(r"(?:\\u(?!00[0-7])[0-9a-fA-F]{4})+")


def _convert_run(run):
    r"""Return the named spelling of one run of non-ASCII ``\uXXXX`` escapes.

    *run* is a match object from ``_NON_ASCII_RUN``. Each conversion is
    reported on stdout as ``<escapes> -> <named form>``.
    """
    escaped = run[0]
    # Decode with the codec rather than eval(): no text taken from the scanned
    # file is ever executed as code.
    decoded = escaped.encode("ascii").decode("unicode_escape")
    try:
        named_unicode = replace_unicode(decoded)
    except UnicodeError:
        # Not convertible (e.g. an unpaired surrogate): keep the text as written.
        return escaped
    print(f"{escaped} -> {named_unicode}")
    return named_unicode


def _convert_match(match):
    r"""Convert the non-ASCII escapes inside one ``REGEXP`` match, keeping the rest."""
    return _NON_ASCII_RUN.sub(_convert_run, match[0])


# Substitute each match in place. Unlike str.replace() on the whole line, this
# cannot rewrite part of a longer escape run that happens to contain the same
# text as an earlier, shorter match.
res = [REGEXP.sub(_convert_match, line) for line in lines]
# Write the converted lines to a separate output file as one joined string.
with open("output.py", mode="w", encoding="utf-8") as file:
    file.write("".join(res))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment