Created
June 12, 2015 22:32
-
-
Save acdha/49a610089c2798db6fe2 to your computer and use it in GitHub Desktop.
Ways to get the name of a Unicode block for a character in Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env PYTHONIOENCODING=utf-8 python | |
# encoding: utf-8 | |
from __future__ import absolute_import, print_function, unicode_literals | |
import os | |
import re | |
import requests | |
def get_block_for_codepoint(cp):
    """Look up which Unicode block contains the numeric codepoint ``cp``.

    Walks the module-level ``UNICODE_BLOCKS`` sequence of
    ``(start, end, block_name)`` tuples in order and returns the name of the
    first block whose inclusive range contains ``cp``. Returns ``'No_Block'``
    when no range matches.
    """
    matching_names = (
        name for first, last, name in UNICODE_BLOCKS if first <= cp <= last
    )
    return next(matching_names, 'No_Block')
def load_unicode_blocks_from_file(f):
    """Parse Blocks.txt-formatted data into a list of block ranges.

    ``f`` is a file-like object whose ``read()`` returns the whole file,
    either as UTF-8 encoded bytes or as already-decoded text.

    Returns a list of ``(start, end, block_name)`` tuples in file order, where
    ``start`` and ``end`` are the inclusive integer codepoints parsed from the
    hexadecimal ``START..END; Name`` data lines. Entries named ``No_Block``
    are skipped.
    """
    file_contents = f.read()
    if isinstance(file_contents, bytes):
        file_contents = file_contents.decode('utf-8')

    # Anchor to the start of each line so that ranges mentioned inside "#"
    # comment lines (e.g. "# @missing: 0000..10FFFF; No_Block") are not
    # mistaken for data. The lazy name group plus the trailing-whitespace
    # group also allows one-character names and CRLF line endings.
    data_line = re.compile(
        r'^[ \t]*([0-9A-F]+)\.\.([0-9A-F]+);[ \t]*(\S.*?)[ \t\r]*$',
        re.MULTILINE,
    )

    return [
        (int(start, 16), int(end, 16), block_name)
        for start, end, block_name in data_line.findall(file_contents)
        if block_name != 'No_Block'
    ]
def load_unicode_blocks(block_filename):
    """Load the Unicode block table from ``block_filename``.

    If the file does not exist yet, Blocks.txt is first downloaded from
    unicode.org and saved to ``block_filename``. The file is then opened in
    binary mode and parsed with ``load_unicode_blocks_from_file``.

    Returns the list of ``(start, end, block_name)`` tuples produced by the
    parser. Raises ``requests.HTTPError`` (via ``raise_for_status``) when the
    download returns an error status.
    """
    if not os.path.exists(block_filename):
        print('Unicode block file %s does not exist. Downloading…' % block_filename)
        # Without a timeout requests can block forever on a stalled server.
        r = requests.get('http://unicode.org/Public/UNIDATA/Blocks.txt', timeout=30)
        r.raise_for_status()

        # The response is not streamed, so the body is already in memory.
        # Write it in one call rather than iterating iter_content(), whose
        # default chunk size is a single byte.
        with open(block_filename, 'wb') as f:
            f.write(r.content)

    with open(block_filename, 'rb') as f:
        return load_unicode_blocks_from_file(f)
# Block table built once at import time; load_unicode_blocks() downloads the
# data file first if it is not already present on disk.
UNICODE_BLOCKS = load_unicode_blocks(block_filename='UNIDATA-Blocks.txt')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import icu | |
# PyICU, astoundingly, documents no way to obtain a Unicode block's name.
# It offers two routes to the numeric offset into the UCodeBlock enum
# – icu.Char.ublock_getCode and icu.Char.getIntPropertyValue(…, icu.UProperty.BLOCK) –
# so this table maps each of those values back to a human-readable string,
# using the upper-case attribute names found on icu.UBlockCode:
UNICODE_BLOCKS = dict(
    (getattr(icu.UBlockCode, attr_name), attr_name)
    for attr_name in icu.UBlockCode.__dict__
    if attr_name.isupper()
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thanks for the update - amusingly, it looks like `LONG_PROPERTY_NAME` was added 9 years ago, right around when I wrote that little utility script, so if I'd been only a few months later it would have had a simple answer! https://gitlab.pyicu.org/main/pyicu/-/commit/47f2f2858aba6d6e5de21d41a809d2e46e50e0f4
My goal for this initially was to have something for a demo which didn't use anything more than stdlib Python but I definitely would recommend one of the dedicated libraries for serious use.