# Gist by @MaximeKjaer (MaximeKjaer/7b141f200318e8e707012e6190f83f68),
# created January 23, 2021.
import sys
import unicodedata
def get_regex(category: str) -> str:
    """Build a regex character class covering one Unicode general category.

    Every code point from U+0000 through ``sys.maxunicode`` (inclusive) is
    tested with ``unicodedata.category``. Consecutive members are collapsed
    into ``first-last`` ranges and isolated members are emitted on their own;
    each code point is rendered with ``escape``.

    Args:
        category: General-category abbreviation to match against the value
            returned by ``unicodedata.category`` (for example ``'Lu'``).

    Returns:
        The character class as text, wrapped in ``[`` and ``]``. If no code
        point has the requested category the result is ``'[]'``.
    """
    runs = []  # inclusive (first, last) pairs of consecutive members
    start = None
    # range() excludes its stop value, so add one to include the final
    # code point in the scan.
    for i in range(sys.maxunicode + 1):
        if unicodedata.category(chr(i)) == category:
            if start is None:
                start = i
        elif start is not None:
            runs.append((start, i - 1))
            start = None
    if start is not None:
        # A run that reaches the last code point has no following
        # non-member to close it, so close it explicitly here.
        runs.append((start, sys.maxunicode))

    body = ''.join(
        escape(first) if first == last else escape(first) + '-' + escape(last)
        for first, last in runs
    )
    return '[' + body + ']'
def escape(char: int) -> str:
    r"""Return the ``unicode-escape`` text for the code point *char*.

    Printable ASCII other than the backslash is returned unchanged. Anything
    else comes back as the backslash escape produced by Python's
    ``unicode-escape`` codec (``\n``, ``\\``, ``\xNN``, ``\uNNNN`` or
    ``\UNNNNNNNN``).

    NOTE(review): characters that are special inside a regex character class
    (``]``, ``^``, ``-``) and quote characters are passed through as-is;
    confirm that is acceptable for every category this is used with.

    Args:
        char: Integer code point, as accepted by ``chr``.

    Returns:
        The escaped representation as a ``str``.
    """
    # The codec emits ASCII bytes only, so decoding as ASCII is lossless.
    return chr(char).encode('unicode-escape').decode('ascii')
# Character classes for the Unicode general categories of interest; each
# name stays bound at module level.
Lt, Lu, Lo, Nl, Ll, Sm, So = map(
    get_regex,
    ('Lt', 'Lu', 'Lo', 'Nl', 'Ll', 'Sm', 'So'),
)

# Emit every class as a ``name = u'...'`` assignment line, in the same
# order as the assignments above.
print("\n".join(
    label + " = u'" + pattern + "'"
    for label, pattern in (
        ('Lt', Lt),
        ('Lu', Lu),
        ('Lo', Lo),
        ('Nl', Nl),
        ('Ll', Ll),
        ('Sm', Sm),
        ('So', So),
    )
))
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment