acdha · June 12, 2015 22:32 · acdha · Jul 11, 2024
diff --git a/get-unicode-blocks.py b/get-unicode-blocks.py
 #!/usr/bin/env PYTHONIOENCODING=utf-8  python
 # encoding: utf-8

 from __future__ import absolute_import, print_function, unicode_literals

 import os
 import re

 import requests


 def get_block_for_codepoint(cp):
    """Look up the name of the Unicode block that contains a codepoint.

    ``cp`` is compared against the inclusive ``(start, end)`` range of each
    entry in the module-level ``UNICODE_BLOCKS`` table, in table order, and
    the name of the first range containing it is returned.  ``'No_Block'``
    is returned when no range contains ``cp``.
    """

    matching_names = (
        name for first, last, name in UNICODE_BLOCKS if first <= cp <= last
    )
    return next(matching_names, 'No_Block')


 def load_unicode_blocks_from_file(f):
    """Parse the contents of a Unicode ``Blocks.txt`` file.

    Args:
        f: A readable file-like object.  It may be opened in binary mode
            (the bytes are decoded as UTF-8) or in text mode.

    Returns:
        A list of ``(start, end, block_name)`` tuples in file order, where
        ``start`` and ``end`` are the integer values of the hexadecimal
        range bounds and ``block_name`` is the text after the semicolon.
        Entries named ``'No_Block'`` are omitted.
    """
    file_contents = f.read()
    # Binary-mode files hand back bytes; text-mode files are already decoded.
    if isinstance(file_contents, bytes):
        file_contents = file_contents.decode('utf-8')

    blocks = []
    for line in file_contents.splitlines():
        # Strip comments before matching so that explanatory text such as
        # "# @missing: 0000..10FFFF; No_Block" is never parsed as data and a
        # trailing comment cannot leak into the block name.
        entry = line.split('#', 1)[0].strip()
        match = re.match(r'([0-9A-F]+)\.\.([0-9A-F]+);\s*(\S.*)$', entry)
        if match is None:
            continue

        start, end, block_name = match.groups()
        if block_name == 'No_Block':
            continue

        blocks.append((int(start, 16), int(end, 16), block_name))

    return blocks


 def load_unicode_blocks(block_filename):
    """Load the Unicode block table, downloading ``Blocks.txt`` if needed.

    Args:
        block_filename: Path of the local copy of ``Blocks.txt``.  When no
            file exists at this path, the file is fetched from unicode.org
            and saved there before being parsed.

    Returns:
        The list of ``(start, end, block_name)`` tuples produced by
        ``load_unicode_blocks_from_file``.

    Raises:
        requests.RequestException: If the download fails, times out, or
            returns an HTTP error status.
    """
    if not os.path.exists(block_filename):
        print('Unicode block file %s does not exist. Downloading…' % block_filename)
        # HTTPS protects the data in transit, and the timeout keeps an
        # unresponsive server from hanging the caller indefinitely.
        r = requests.get('https://www.unicode.org/Public/UNIDATA/Blocks.txt', timeout=30)
        r.raise_for_status()

        # The body is already fully in memory (stream=True is not used), so
        # write it in a single call; iter_content() would default to 1-byte
        # chunks and issue one write per byte.
        with open(block_filename, 'wb') as f:
            f.write(r.content)

    with open(block_filename, 'rb') as f:
        return load_unicode_blocks_from_file(f)

 # Module-level block table read by get_block_for_codepoint().  It is built
 # when this module is executed; if 'UNIDATA-Blocks.txt' does not exist at
 # that relative path, load_unicode_blocks() downloads it first.
 UNICODE_BLOCKS = load_unicode_blocks('UNIDATA-Blocks.txt')
diff --git a/pyicu-unicode-block-names.py b/pyicu-unicode-block-names.py
 import icu 

 # Astoundingly, PyICU has no documented way to get a Unicode block name.
 # There are two ways to get the offset into the UBlockCode enum
 # – icu.Char.ublock_getCode and icu.Char.getIntPropertyValue(…, icu.UProperty.BLOCK) –
 # so we'll build a lookup table to turn that into a human-readable string:

 # Map each icu.UBlockCode attribute value to its attribute name, keeping
 # only the attributes whose names are upper-case.
 UNICODE_BLOCKS = {
     getattr(icu.UBlockCode, name): name
     for name in vars(icu.UBlockCode)
     if name.isupper()
 }
	#!/usr/bin/env PYTHONIOENCODING=utf-8 python
	# encoding: utf-8

	from __future__ import absolute_import, print_function, unicode_literals

	import os
	import re

	import requests


	def get_block_for_codepoint(cp):
	    """Return the Unicode block name for the provided numeric codepoint.

	    ``cp`` is compared against the inclusive ``(start, end)`` range of
	    each entry in the module-level ``UNICODE_BLOCKS`` table, in table
	    order.  ``'No_Block'`` is returned when no range contains ``cp``.
	    """

	    for start, end, block_name in UNICODE_BLOCKS:
	        if start <= cp <= end:
	            return block_name

	    return 'No_Block'


	def load_unicode_blocks_from_file(f):
	    """Parse the contents of a Unicode ``Blocks.txt`` file.

	    Args:
	        f: A readable file-like object.  It may be opened in binary mode
	            (the bytes are decoded as UTF-8) or in text mode.

	    Returns:
	        A list of ``(start, end, block_name)`` tuples in file order,
	        where ``start`` and ``end`` are the integer values of the
	        hexadecimal range bounds and ``block_name`` is the text after
	        the semicolon.  Entries named ``'No_Block'`` are omitted.
	    """
	    file_contents = f.read()
	    # Binary-mode files hand back bytes; text-mode files are already decoded.
	    if isinstance(file_contents, bytes):
	        file_contents = file_contents.decode('utf-8')

	    blocks = []
	    for line in file_contents.splitlines():
	        # Strip comments before matching so that explanatory text such as
	        # "# @missing: 0000..10FFFF; No_Block" is never parsed as data and
	        # a trailing comment cannot leak into the block name.
	        entry = line.split('#', 1)[0].strip()
	        match = re.match(r'([0-9A-F]+)\.\.([0-9A-F]+);\s*(\S.*)$', entry)
	        if match is None:
	            continue

	        start, end, block_name = match.groups()
	        if block_name == 'No_Block':
	            continue

	        blocks.append((int(start, 16), int(end, 16), block_name))

	    return blocks


	def load_unicode_blocks(block_filename):
	    """Load the Unicode block table, downloading ``Blocks.txt`` if needed.

	    Args:
	        block_filename: Path of the local copy of ``Blocks.txt``.  When
	            no file exists at this path, the file is fetched from
	            unicode.org and saved there before being parsed.

	    Returns:
	        The list of ``(start, end, block_name)`` tuples produced by
	        ``load_unicode_blocks_from_file``.

	    Raises:
	        requests.RequestException: If the download fails, times out, or
	            returns an HTTP error status.
	    """
	    if not os.path.exists(block_filename):
	        print('Unicode block file %s does not exist. Downloading…' % block_filename)
	        # HTTPS protects the data in transit, and the timeout keeps an
	        # unresponsive server from hanging the caller indefinitely.
	        r = requests.get('https://www.unicode.org/Public/UNIDATA/Blocks.txt', timeout=30)
	        r.raise_for_status()

	        # The body is already fully in memory (stream=True is not used),
	        # so write it in a single call; iter_content() would default to
	        # 1-byte chunks and issue one write per byte.
	        with open(block_filename, 'wb') as f:
	            f.write(r.content)

	    with open(block_filename, 'rb') as f:
	        return load_unicode_blocks_from_file(f)

	# Module-level block table read by get_block_for_codepoint().  It is built
	# when this module is executed; if 'UNIDATA-Blocks.txt' does not exist at
	# that relative path, load_unicode_blocks() downloads it first.
	UNICODE_BLOCKS = load_unicode_blocks('UNIDATA-Blocks.txt')
	import icu

	# Astoundingly, PyICU has no documented way to get a Unicode block name.
	# There are two ways to get the offset into the UBlockCode enum
	# – icu.Char.ublock_getCode and icu.Char.getIntPropertyValue(…, icu.UProperty.BLOCK) –
	# so we'll build a lookup table to turn that into a human-readable string:

	# Map each icu.UBlockCode attribute value to its attribute name, keeping
	# only the attributes whose names are upper-case.
	UNICODE_BLOCKS = {
	    getattr(icu.UBlockCode, name): name
	    for name in vars(icu.UBlockCode)
	    if name.isupper()
	}