marzer · August 20, 2020 12:54
diff --git a/rangify_unicode_categories.py b/rangify_unicode_categories.py
 #!/usr/bin/env python3

 # dependencies:
 # pip install --upgrade requests

 import os.path
 import sys
 import re
 import requests
 import traceback


 def print_character_range(s, e, count):
 	"""Write one ABNF hex value, or hex value range, to stdout without a trailing newline.

 	Args:
 		s: first codepoint of the range.
 		e: last codepoint of the range; when equal to s (or 0) a single value is written.
 		count: number of elements already written for the current rule. Any element
 			after the first is preceded by " / ", and a line break plus tab is
 			inserted before every fourth element.
 	"""
 	# separator that joins this element to the ones written before it
 	separator = ''
 	if count > 0:
 		separator = " / "
 		if count % 4 == 0:
 			separator += "\n\t"

 	# a range collapses to a single value when both ends coincide (or e is 0)
 	is_single_value = (s == e) or (e == 0)
 	if is_single_value:
 		element = "%x{:X}".format(s)
 	else:
 		element = "%x{:X}-{:X}".format(s, e)

 	print(separator + element, end='')


 def print_abnf_for_categories(name, categories, codepoints):
 	"""Print an ABNF rule listing every codepoint whose category is in `categories`.

 	Consecutive codepoints are merged into ranges, each written via
 	print_character_range(). A comment with the total number of matching
 	codepoints follows the rule.

 	Args:
 		name: name of the ABNF rule to emit.
 		categories: collection of category strings to include.
 		codepoints: iterable of (codepoint, category) pairs; runs are only merged
 			when adjacent codepoints arrive one after the other.
 	"""
 	print("\n; unicode codepoints from categories {}".format(', '.join(categories)))
 	print("{} = ".format(name), end='')

 	# -1 in run_first means "no run is currently open"
 	run_first = -1
 	run_last = -1
 	runs_printed = 0
 	total = 0
 	for codepoint, category in codepoints:
 		if category not in categories:
 			continue
 		if run_first != -1:
 			if run_last == codepoint - 1:
 				# directly adjacent to the open run: just grow it
 				run_last = codepoint
 				continue
 			# gap found: flush the open run before starting a new one
 			print_character_range(run_first, run_last, runs_printed)
 			total += run_last - run_first + 1
 			runs_printed += 1
 		run_first = codepoint
 		run_last = codepoint

 	# flush whatever run is still open once the input is exhausted
 	if run_first != -1:
 		print_character_range(run_first, run_last, runs_printed)
 		total += run_last - run_first + 1
 	print("\n\t; {} codepoints in total\n".format(total))


 def main():
 	"""Load UnicodeData.txt, expand it to (codepoint, category) pairs and print ABNF rules.

 	The database is read from 'UnicodeData.txt' in the directory of this script. When
 	that file does not exist it is downloaded from unicode.org and saved there first.

 	Raises:
 		requests.RequestException: if the download fails or returns an HTTP error status.
 		Exception: if a '<..., First>' entry is not immediately followed by a
 			'<..., Last>' entry.
 	"""

 	# get unicode character database
 	codepoint_file_path = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), 'UnicodeData.txt')
 	if (not os.path.exists(codepoint_file_path)):
 		print("Couldn't find unicode database file, will download")
 		response = requests.get(
 			'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt',
 			timeout=1
 		)
 		# fail here on an HTTP error; otherwise the error page would be cached
 		# on disk as the database and silently reused on every later run
 		response.raise_for_status()
 		codepoint_list = response.text
 		with open(codepoint_file_path, 'w', encoding='utf-8') as codepoint_file:
 			codepoint_file.write(codepoint_list)
 	else:
 		print("Reading unicode database file into memory")
 		with open(codepoint_file_path, 'r', encoding='utf-8') as codepoint_file:
 			codepoint_list = codepoint_file.read()

 	# parse the database file into codepoints
 	# groups: 1 = hex codepoint, 2 = name field, 3 = general category
 	re_codepoint = re.compile(r'^([0-9a-fA-F]+);(.+?);([a-zA-Z]+);')
 	current_range_start = -1  # > -1 while waiting for the entry that closes a range
 	codepoints = []
 	for codepoint_entry in codepoint_list.split('\n'):
 		match = re_codepoint.search(codepoint_entry)
 		if (match is None):
 			if (current_range_start > -1):
 				raise Exception('Previous codepoint indicated the start of a range but the next one was null')
 			continue
 		codepoint = int(match.group(1), 16)
 		entry_name = match.group(2)
 		category = match.group(3)
 		if (current_range_start > -1):
 			# the entry right after a ', First>' line must close that range
 			if (not entry_name.endswith(', Last>')):
 				raise Exception('Previous codepoint indicated the start of a range but the next one did not end it')
 			codepoints.extend((cp, category) for cp in range(current_range_start, codepoint+1))
 			current_range_start = -1
 		elif (entry_name.endswith(', First>')):
 			current_range_start = codepoint
 		else:
 			codepoints.append((codepoint, category))
 	if (current_range_start > -1):
 		# input without a trailing newline could otherwise drop an open range silently
 		raise Exception('Unicode database file ended in the middle of a codepoint range')
 	print("Parsed {} codepoints from unicode database file.".format(len(codepoints)))
 	codepoints.sort(key=lambda r:r[0])


 	# print categories
 	print_abnf_for_categories("letters", ('Ll','Lm','Lo','Lt','Lu'), codepoints)
 	print_abnf_for_categories("numbers", ('Nd', 'Nl'), codepoints)
 	print_abnf_for_categories("combining_marks", ('Mn', 'Mc'), codepoints)


 if __name__ == '__main__':
 	# script entry point: report any error on stderr and exit with status 1,
 	# otherwise exit normally
 	try:
 		main()
 	except Exception as err:
 		message = 'Fatal error: [{}] {}'.format(type(err).__name__, str(err))
 		print(message, file=sys.stderr)
 		traceback.print_exc(file=sys.stderr)
 		sys.exit(1)
 	else:
 		sys.exit()
	#!/usr/bin/env python3

	# dependencies:
	# pip install --upgrade requests

	import os.path
	import sys
	import re
	import requests
	import traceback


	def print_character_range(s, e, count):
		"""Write one ABNF hex value, or hex value range, to stdout without a trailing newline.

		Args:
			s: first codepoint of the range.
			e: last codepoint of the range; when equal to s (or 0) a single value is written.
			count: number of elements already written for the current rule. Any element
				after the first is preceded by " / ", and a line break plus tab is
				inserted before every fourth element.
		"""
		if (count > 0):
			print(" / ", end='')
			# wrap the rule onto a new, tab-indented line before every fourth element
			if (count % 4 == 0):
				print("\n\t", end='')
		if (s == e or e == 0):
			print("%x{:X}".format(s), end='')
		else:
			print("%x{:X}-{:X}".format(s, e), end='')


	def print_abnf_for_categories(name, categories, codepoints):
		"""Print an ABNF rule listing every codepoint whose category is in `categories`.

		Consecutive codepoints are merged into ranges, each written via
		print_character_range(). A comment with the total number of matching
		codepoints follows the rule.

		Args:
			name: name of the ABNF rule to emit.
			categories: collection of category strings to include.
			codepoints: iterable of (codepoint, category) pairs; runs are only merged
				when adjacent codepoints arrive one after the other.
		"""
		print("\n; unicode codepoints from categories {}".format(', '.join(categories)))
		print("{} = ".format(name), end='')
		# s/e hold the bounds of the run being accumulated; -1 means no run is open
		s = -1
		e = -1
		print_count = 0
		count = 0
		for codepoint, category in codepoints:
			if (category in categories):
				if (s == -1):
					s = codepoint
					e = codepoint
				elif (e == codepoint-1):
					e = codepoint
				else:
					# gap found: flush the open run and start a new one
					print_character_range(s, e, print_count)
					count += e - s + 1
					print_count += 1
					s = codepoint
					e = codepoint
		# flush whatever run is still open once the input is exhausted
		if (s != -1):
			print_character_range(s, e, print_count)
			count += e - s + 1
		print("\n\t; {} codepoints in total\n".format(count))


	def main():
		"""Load UnicodeData.txt, expand it to (codepoint, category) pairs and print ABNF rules.

		The database is read from 'UnicodeData.txt' in the directory of this script. When
		that file does not exist it is downloaded from unicode.org and saved there first.

		Raises:
			requests.RequestException: if the download fails or returns an HTTP error status.
			Exception: if a '<..., First>' entry is not immediately followed by a
				'<..., Last>' entry.
		"""

		# get unicode character database
		codepoint_file_path = os.path.join(os.path.dirname(os.path.realpath(sys.argv[0])), 'UnicodeData.txt')
		if (not os.path.exists(codepoint_file_path)):
			print("Couldn't find unicode database file, will download")
			response = requests.get(
				'https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt',
				timeout=1
			)
			# fail here on an HTTP error; otherwise the error page would be cached
			# on disk as the database and silently reused on every later run
			response.raise_for_status()
			codepoint_list = response.text
			with open(codepoint_file_path, 'w', encoding='utf-8') as codepoint_file:
				codepoint_file.write(codepoint_list)
		else:
			print("Reading unicode database file into memory")
			with open(codepoint_file_path, 'r', encoding='utf-8') as codepoint_file:
				codepoint_list = codepoint_file.read()

		# parse the database file into codepoints
		# groups: 1 = hex codepoint, 2 = name field, 3 = general category
		re_codepoint = re.compile(r'^([0-9a-fA-F]+);(.+?);([a-zA-Z]+);')
		current_range_start = -1  # > -1 while waiting for the entry that closes a range
		codepoints = []
		for codepoint_entry in codepoint_list.split('\n'):
			match = re_codepoint.search(codepoint_entry)
			if (match is None):
				if (current_range_start > -1):
					raise Exception('Previous codepoint indicated the start of a range but the next one was null')
				continue
			codepoint = int(match.group(1), 16)
			entry_name = match.group(2)
			category = match.group(3)
			if (current_range_start > -1):
				# the entry right after a ', First>' line must close that range
				if (not entry_name.endswith(', Last>')):
					raise Exception('Previous codepoint indicated the start of a range but the next one did not end it')
				codepoints.extend((cp, category) for cp in range(current_range_start, codepoint+1))
				current_range_start = -1
			elif (entry_name.endswith(', First>')):
				current_range_start = codepoint
			else:
				codepoints.append((codepoint, category))
		if (current_range_start > -1):
			# input without a trailing newline could otherwise drop an open range silently
			raise Exception('Unicode database file ended in the middle of a codepoint range')
		print("Parsed {} codepoints from unicode database file.".format(len(codepoints)))
		codepoints.sort(key=lambda r:r[0])


		# print categories
		print_abnf_for_categories("letters", ('Ll','Lm','Lo','Lt','Lu'), codepoints)
		print_abnf_for_categories("numbers", ('Nd', 'Nl'), codepoints)
		print_abnf_for_categories("combining_marks", ('Mn', 'Mc'), codepoints)


	if __name__ == '__main__':
		# script entry point: report any error on stderr and exit with status 1,
		# otherwise exit normally
		try:
			main()
		except Exception as err:
			print(
				'Fatal error: [{}] {}'.format(
					type(err).__name__,
					str(err)
				),
				file=sys.stderr
			)
			traceback.print_exc(file=sys.stderr)
			sys.exit(1)
		sys.exit()