Skip to content

Instantly share code, notes, and snippets.

@rendello
Last active November 6, 2024 18:17
Show Gist options
  • Save rendello/d37552507a389656e248f3255a618127 to your computer and use it in GitHub Desktop.
Save rendello/d37552507a389656e248f3255a618127 to your computer and use it in GitHub Desktop.
Unicode codepoints that expand or contract when case is changed in UTF-8. Good for testing parsers. Includes the data `utf8_case_data.rs` and the script to generate it, `generate_utf8.py`.
/*
Copyright (c) 2024 Rendello
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
// ==========================================================================
//! Unicode codepoints that expand or contract when case is changed in UTF-8.
// ==========================================================================
/// Single characters whose lowercase form takes fewer UTF-8 bytes.
///
/// Each entry's trailing comment lists the lowercase result, the UTF-8 byte
/// lengths as `(before->after)`, and the byte delta.
///
/// NOTE(review): some entries (e.g. the 3-byte "K" that lowercases to the
/// 1-byte "k") look identical to ordinary letters; presumably any Unicode
/// normalization applied to this file would replace them and invalidate the
/// table — verify the bytes after copying.
pub const LOWERCASING_CONTRACTS: [&str; 22] = [
"ẞ", /* ß (3->2), -1 bytes */
"Ω", /* ω (3->2), -1 bytes */
"Å", /* å (3->2), -1 bytes */
"Ɫ", /* ɫ (3->2), -1 bytes */
"Ɽ", /* ɽ (3->2), -1 bytes */
"Ɑ", /* ɑ (3->2), -1 bytes */
"Ɱ", /* ɱ (3->2), -1 bytes */
"Ɐ", /* ɐ (3->2), -1 bytes */
"Ɒ", /* ɒ (3->2), -1 bytes */
"Ȿ", /* ȿ (3->2), -1 bytes */
"Ɀ", /* ɀ (3->2), -1 bytes */
"Ɥ", /* ɥ (3->2), -1 bytes */
"Ɦ", /* ɦ (3->2), -1 bytes */
"Ɜ", /* ɜ (3->2), -1 bytes */
"Ɡ", /* ɡ (3->2), -1 bytes */
"Ɬ", /* ɬ (3->2), -1 bytes */
"Ɪ", /* ɪ (3->2), -1 bytes */
"Ʞ", /* ʞ (3->2), -1 bytes */
"Ʇ", /* ʇ (3->2), -1 bytes */
"Ʝ", /* ʝ (3->2), -1 bytes */
"Ʂ", /* ʂ (3->2), -1 bytes */
"K", /* k (3->1), -2 bytes */
];
/// Single characters whose lowercase form takes more UTF-8 bytes while
/// remaining a single character.
///
/// Each entry's trailing comment lists the lowercase result, the UTF-8 byte
/// lengths as `(before->after)`, and the byte delta.
pub const LOWERCASING_EXPANDS: [&str; 2] = [
"Ⱥ", /* ⱥ (2->3), +1 bytes */
"Ⱦ", /* ⱦ (2->3), +1 bytes */
];
/// Single characters whose lowercase form takes more UTF-8 bytes and is made
/// of more than one character.
///
/// The trailing comment lists the lowercase result, the UTF-8 byte lengths as
/// `(before->after)`, the byte delta, and the change in character count.
pub const LOWERCASING_EXPANDS_MULTI_CHAR: [&str; 1] = [
"İ", /* i̇ (2->3), +1 bytes, +1 chars */
];
/// Single characters whose uppercase form takes fewer UTF-8 bytes while
/// remaining a single character.
///
/// Each entry's trailing comment lists the uppercase result, the UTF-8 byte
/// lengths as `(before->after)`, and the byte delta. Distinct entries may
/// share the same uppercase result (two of them map to "Т").
pub const UPPERCASING_CONTRACTS: [&str; 13] = [
"ı", /* I (2->1), -1 bytes */
"ſ", /* S (2->1), -1 bytes */
"ᲀ", /* В (3->2), -1 bytes */
"ᲁ", /* Д (3->2), -1 bytes */
"ᲂ", /* О (3->2), -1 bytes */
"ᲃ", /* С (3->2), -1 bytes */
"ᲄ", /* Т (3->2), -1 bytes */
"ᲅ", /* Т (3->2), -1 bytes */
"ᲆ", /* Ъ (3->2), -1 bytes */
"ᲇ", /* Ѣ (3->2), -1 bytes */
"ι", /* Ι (3->2), -1 bytes */
"ⱥ", /* Ⱥ (3->2), -1 bytes */
"ⱦ", /* Ⱦ (3->2), -1 bytes */
];
pub const UPPERCASING_CONTRACTS_MULTI_CHAR: [&str; 5] = [
"ff", /* FF (3->2), -1 bytes, +1 chars */
"fi", /* FI (3->2), -1 bytes, +1 chars */
"fl", /* FL (3->2), -1 bytes, +1 chars */
"ſt", /* ST (3->2), -1 bytes, +1 chars */
"st", /* ST (3->2), -1 bytes, +1 chars */
];
/// Single characters whose uppercase form takes more UTF-8 bytes while
/// remaining a single character.
///
/// Each entry's trailing comment lists the uppercase result, the UTF-8 byte
/// lengths as `(before->after)`, and the byte delta.
pub const UPPERCASING_EXPANDS: [&str; 18] = [
"ȿ", /* Ȿ (2->3), +1 bytes */
"ɀ", /* Ɀ (2->3), +1 bytes */
"ɐ", /* Ɐ (2->3), +1 bytes */
"ɑ", /* Ɑ (2->3), +1 bytes */
"ɒ", /* Ɒ (2->3), +1 bytes */
"ɜ", /* Ɜ (2->3), +1 bytes */
"ɡ", /* Ɡ (2->3), +1 bytes */
"ɥ", /* Ɥ (2->3), +1 bytes */
"ɦ", /* Ɦ (2->3), +1 bytes */
"ɪ", /* Ɪ (2->3), +1 bytes */
"ɫ", /* Ɫ (2->3), +1 bytes */
"ɬ", /* Ɬ (2->3), +1 bytes */
"ɱ", /* Ɱ (2->3), +1 bytes */
"ɽ", /* Ɽ (2->3), +1 bytes */
"ʂ", /* Ʂ (2->3), +1 bytes */
"ʇ", /* Ʇ (2->3), +1 bytes */
"ʝ", /* Ʝ (2->3), +1 bytes */
"ʞ", /* Ʞ (2->3), +1 bytes */
];
pub const UPPERCASING_EXPANDS_MULTI_CHAR: [&str; 89] = [
"ΐ", /* Ϊ́ (2->6), +4 bytes, +2 chars */
"ΰ", /* Ϋ́ (2->6), +4 bytes, +2 chars */
"ὒ", /* Υ̓̀ (3->6), +3 bytes, +2 chars */
"ὔ", /* Υ̓́ (3->6), +3 bytes, +2 chars */
"ὖ", /* Υ̓͂ (3->6), +3 bytes, +2 chars */
"ᾷ", /* Α͂Ι (3->6), +3 bytes, +2 chars */
"ῇ", /* Η͂Ι (3->6), +3 bytes, +2 chars */
"ῒ", /* Ϊ̀ (3->6), +3 bytes, +2 chars */
"ΐ", /* Ϊ́ (3->6), +3 bytes, +2 chars */
"ῗ", /* Ϊ͂ (3->6), +3 bytes, +2 chars */
"ῢ", /* Ϋ̀ (3->6), +3 bytes, +2 chars */
"ΰ", /* Ϋ́ (3->6), +3 bytes, +2 chars */
"ῧ", /* Ϋ͂ (3->6), +3 bytes, +2 chars */
"ῷ", /* Ω͂Ι (3->6), +3 bytes, +2 chars */
"և", /* ԵՒ (2->4), +2 bytes, +1 chars */
"ᾀ", /* ἈΙ (3->5), +2 bytes, +1 chars */
"ᾁ", /* ἉΙ (3->5), +2 bytes, +1 chars */
"ᾂ", /* ἊΙ (3->5), +2 bytes, +1 chars */
"ᾃ", /* ἋΙ (3->5), +2 bytes, +1 chars */
"ᾄ", /* ἌΙ (3->5), +2 bytes, +1 chars */
"ᾅ", /* ἍΙ (3->5), +2 bytes, +1 chars */
"ᾆ", /* ἎΙ (3->5), +2 bytes, +1 chars */
"ᾇ", /* ἏΙ (3->5), +2 bytes, +1 chars */
"ᾈ", /* ἈΙ (3->5), +2 bytes, +1 chars */
"ᾉ", /* ἉΙ (3->5), +2 bytes, +1 chars */
"ᾊ", /* ἊΙ (3->5), +2 bytes, +1 chars */
"ᾋ", /* ἋΙ (3->5), +2 bytes, +1 chars */
"ᾌ", /* ἌΙ (3->5), +2 bytes, +1 chars */
"ᾍ", /* ἍΙ (3->5), +2 bytes, +1 chars */
"ᾎ", /* ἎΙ (3->5), +2 bytes, +1 chars */
"ᾏ", /* ἏΙ (3->5), +2 bytes, +1 chars */
"ᾐ", /* ἨΙ (3->5), +2 bytes, +1 chars */
"ᾑ", /* ἩΙ (3->5), +2 bytes, +1 chars */
"ᾒ", /* ἪΙ (3->5), +2 bytes, +1 chars */
"ᾓ", /* ἫΙ (3->5), +2 bytes, +1 chars */
"ᾔ", /* ἬΙ (3->5), +2 bytes, +1 chars */
"ᾕ", /* ἭΙ (3->5), +2 bytes, +1 chars */
"ᾖ", /* ἮΙ (3->5), +2 bytes, +1 chars */
"ᾗ", /* ἯΙ (3->5), +2 bytes, +1 chars */
"ᾘ", /* ἨΙ (3->5), +2 bytes, +1 chars */
"ᾙ", /* ἩΙ (3->5), +2 bytes, +1 chars */
"ᾚ", /* ἪΙ (3->5), +2 bytes, +1 chars */
"ᾛ", /* ἫΙ (3->5), +2 bytes, +1 chars */
"ᾜ", /* ἬΙ (3->5), +2 bytes, +1 chars */
"ᾝ", /* ἭΙ (3->5), +2 bytes, +1 chars */
"ᾞ", /* ἮΙ (3->5), +2 bytes, +1 chars */
"ᾟ", /* ἯΙ (3->5), +2 bytes, +1 chars */
"ᾠ", /* ὨΙ (3->5), +2 bytes, +1 chars */
"ᾡ", /* ὩΙ (3->5), +2 bytes, +1 chars */
"ᾢ", /* ὪΙ (3->5), +2 bytes, +1 chars */
"ᾣ", /* ὫΙ (3->5), +2 bytes, +1 chars */
"ᾤ", /* ὬΙ (3->5), +2 bytes, +1 chars */
"ᾥ", /* ὭΙ (3->5), +2 bytes, +1 chars */
"ᾦ", /* ὮΙ (3->5), +2 bytes, +1 chars */
"ᾧ", /* ὯΙ (3->5), +2 bytes, +1 chars */
"ᾨ", /* ὨΙ (3->5), +2 bytes, +1 chars */
"ᾩ", /* ὩΙ (3->5), +2 bytes, +1 chars */
"ᾪ", /* ὪΙ (3->5), +2 bytes, +1 chars */
"ᾫ", /* ὫΙ (3->5), +2 bytes, +1 chars */
"ᾬ", /* ὬΙ (3->5), +2 bytes, +1 chars */
"ᾭ", /* ὭΙ (3->5), +2 bytes, +1 chars */
"ᾮ", /* ὮΙ (3->5), +2 bytes, +1 chars */
"ᾯ", /* ὯΙ (3->5), +2 bytes, +1 chars */
"ᾲ", /* ᾺΙ (3->5), +2 bytes, +1 chars */
"ῂ", /* ῊΙ (3->5), +2 bytes, +1 chars */
"ῲ", /* ῺΙ (3->5), +2 bytes, +1 chars */
"ʼn", /* ʼN (2->3), +1 bytes, +1 chars */
"ǰ", /* J̌ (2->3), +1 bytes, +1 chars */
"ὐ", /* Υ̓ (3->4), +1 bytes, +1 chars */
"ᾳ", /* ΑΙ (3->4), +1 bytes, +1 chars */
"ᾴ", /* ΆΙ (3->4), +1 bytes, +1 chars */
"ᾶ", /* Α͂ (3->4), +1 bytes, +1 chars */
"ᾼ", /* ΑΙ (3->4), +1 bytes, +1 chars */
"ῃ", /* ΗΙ (3->4), +1 bytes, +1 chars */
"ῄ", /* ΉΙ (3->4), +1 bytes, +1 chars */
"ῆ", /* Η͂ (3->4), +1 bytes, +1 chars */
"ῌ", /* ΗΙ (3->4), +1 bytes, +1 chars */
"ῖ", /* Ι͂ (3->4), +1 bytes, +1 chars */
"ῤ", /* Ρ̓ (3->4), +1 bytes, +1 chars */
"ῦ", /* Υ͂ (3->4), +1 bytes, +1 chars */
"ῳ", /* ΩΙ (3->4), +1 bytes, +1 chars */
"ῴ", /* ΏΙ (3->4), +1 bytes, +1 chars */
"ῶ", /* Ω͂ (3->4), +1 bytes, +1 chars */
"ῼ", /* ΩΙ (3->4), +1 bytes, +1 chars */
"ﬓ", /* ՄՆ (3->4), +1 bytes, +1 chars */
"ﬔ", /* ՄԵ (3->4), +1 bytes, +1 chars */
"ﬕ", /* ՄԻ (3->4), +1 bytes, +1 chars */
"ﬖ", /* ՎՆ (3->4), +1 bytes, +1 chars */
"ﬗ", /* ՄԽ (3->4), +1 bytes, +1 chars */
];
"""
Copyright (c) 2024 Rendello
Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
"""
import sys
from dataclasses import dataclass
from typing import List, Dict
@dataclass
class Entry:
    """One character whose case-converted form has a different UTF-8 length.

    Instances are built positionally in `create_entry_map`, so the field
    order below is part of the interface.
    """
    a: str  # the original one-character string, `chr(i)`
    b: str  # `a.upper()` or `a.lower()`; may be longer than one character
    a_len: int  # number of bytes in `a.encode("utf8")`
    b_len: int  # number of bytes in `b.encode("utf8")`
    delta: int  # `b_len - a_len`; negative when the conversion shrinks the text
    a_char_count: int  # `len(a)`
    b_char_count: int  # `len(b)`
    delta_char_count: int  # `b_char_count - a_char_count`
def sort_entries(l: List[Entry]) -> List[Entry]:
    """Return a new list of the entries in output order.

    Ordering, most significant key first:
      1. character-count delta, largest first;
      2. byte delta, largest first (so -1 comes before -2);
      3. the original character `a`, in ascending string order.

    The input list is left untouched.
    """
    def ranking(entry):
        # Negate the two deltas so that an ascending sort yields "largest first".
        return (-entry.delta_char_count, -entry.delta, entry.a)

    ordered = list(l)
    ordered.sort(key=ranking)
    return ordered
def create_entry_map() -> Dict[str, List[Entry]]:
    """Collect every character whose upper- or lowercase form changes UTF-8 size.

    Walks all code points from 0 through `sys.maxunicode`, applies
    `str.upper()` and `str.lower()` to each, and records an `Entry` whenever
    the UTF-8 byte length of the result differs from that of the original.

    Returns:
        A dict keyed by category name. A key is the underscore-joined list
        of: the operation ('uppercasing' or 'lowercasing'), the direction
        ('expands' or 'contracts'), and, when the result is more than one
        character long, 'multi_char'. Values are the entries of that
        category in ascending code point order.
    """
    # `typing.Dict`/`typing.List` are used (as elsewhere in this file) rather
    # than `dict[...]`/`list[...]`: subscripting the builtins in an annotation
    # is evaluated at definition time and fails on Python versions before 3.9.
    entry_map: Dict[str, List[Entry]] = {}
    for codepoint in range(sys.maxunicode + 1):
        a = chr(codepoint)
        for case, b in (('uppercasing', a.upper()), ('lowercasing', a.lower())):
            try:
                a_len = len(a.encode("utf8"))
                b_len = len(b.encode("utf8"))
            except UnicodeEncodeError:
                # Some code points (lone surrogates) cannot be encoded as
                # UTF-8 at all; they are not interesting here.
                continue
            if a_len == b_len:
                continue

            # The lengths differ, so the conversion either grew or shrank.
            attributes = [case, 'expands' if a_len < b_len else 'contracts']
            a_char_count = len(a)
            b_char_count = len(b)
            if b_char_count > 1:
                attributes.append('multi_char')

            entry = Entry(
                a,
                b,
                a_len,
                b_len,
                b_len - a_len,
                a_char_count,
                b_char_count,
                b_char_count - a_char_count,
            )
            entry_map.setdefault("_".join(attributes), []).append(entry)
    return entry_map
def entry_map_to_string(entry_map: Dict[str, List[Entry]]) -> str:
    """Render the entry map as the text of a Rust source file.

    The output starts with a fixed header comment, followed by one
    `pub const NAME: [&str; N] = [...]` array per map key. Keys are emitted
    in sorted order with the name upper-cased, and each key's entries are
    ordered by `sort_entries`. Every array element is the original character
    as a string literal followed by a comment giving the converted form, the
    byte lengths, the byte delta and (only when non-zero) the character-count
    delta. Characters are interpolated as-is, without escaping.

    Leading and trailing whitespace is stripped from the final text.
    """
    chunks = [
        '// =======================================================================\n',
        '//! Automatically generated using `task generate-utf8-case-data`.\n//!\n',
        '//! Unicode characters that behave oddly when the case is changed, for use\n',
        '//! with property tests.\n',
        '// =======================================================================\n\n',
    ]
    for name in sorted(entry_map):
        ordered = sort_entries(entry_map[name])
        chunks.append(f'pub const {name.upper()}: [&str; {len(ordered)}] = [\n')
        for entry in ordered:
            if entry.delta_char_count != 0:
                char_note = f", {entry.delta_char_count:+} chars"
            else:
                char_note = ""
            chunks.append(
                f' "{entry.a}",\t/* {entry.b}\t({entry.a_len}->{entry.b_len}),'
                f' {entry.delta:+} bytes{char_note} */\n'
            )
        chunks.append("];\n\n")
    return "".join(chunks).strip()
def generate_utf8_case_data():
    """Build the case-change entry map and return it rendered as Rust source text."""
    entry_map = create_entry_map()
    return entry_map_to_string(entry_map)
@rendello
Copy link
Author

rendello commented Nov 1, 2024

See also "Unicode roundtrip-unsafe characters":
https://gist.github.com/rendello/4d8266b7c52bf0e98eab2073b38829d9

@rept0id
Copy link

rept0id commented Nov 1, 2024

Make it a Git repository instead of a gist; it will be more timeless, and people can contribute as well. Provide a Jupyter notebook, the generation script, and other goodies there.

(Gists are a terrible idea and they are made just to replace pastebin in my very own personal opinion. This nice work I see here deserves more than a "pastebin")

@rendello
Copy link
Author

rendello commented Nov 1, 2024

@rept0id

This nice work I see here deserves more than a "pastebin"

Thanks! I might combine this with the "Unicode roundtrip-unsafe characters" gist and make a little data file / library. Mostly, if I'm going to put more effort into it, I'm not sure which direction to go in. This is really programming-language agnostic, so I could have a generator that spits out Rust, Python, etc. Or keep it as TSVs, or maybe I should include some of the property-testing generators I created in Rust, etc. In my own project I'm using this in automated property tests, so I suppose that's one avenue.

@rept0id
Copy link

rept0id commented Nov 1, 2024

a generator that spits out Rust, Python, etc

You can make it in the language that's more fluent to you and it will be more helpful for your own purposes and let others contribute with other languages. Something like a mega git that has directories "Rust", "Python", "Go", "C" and let other people port it to the languages that they like.

I make such repositories and call them "various" — for example, "monthy-hall-various" — and in them I keep implementations in different languages. Some famous projects, like "cern-httpd" (the first server ever), also have such a structure, but with platforms instead of languages — so it's not just my own kind of taste.

I could contribute with some languages.

@rept0id
Copy link

rept0id commented Nov 1, 2024

so I could have a generator that spits out Rust, Python, etc.

As for generators, I wouldn't trust a generator so much to do that work for me. I know a makefile can work miracles, but still... different languages, different worlds.

@rendello
Copy link
Author

rendello commented Nov 1, 2024

@rept0id If you look at the included generate_utf8.py file, it's creating the whole list. Same with the "Unicode Roundtrip" Gist I linked. It's all automatic anyway, so changing the language generator would just be changing the output string format. The main issue would be structuring the repo, should it be just the generators and have the "outputs" be "releases"? Or should the outputs live beside the generators? I feel like having two languages in the same repo might not be useful, but at the same time I might like to use this test code for both Python and Rust.

Perhaps the best solution would be to have the generator files, and have them generate the files in the repo itself so they're easily visible, with a caveat in comment form saying they're auto-generated (this is what my current project does). Then, I could potentially use the GH releases feature to build libraries for Python, Rust, etc. That way property-based testing generators (a different kind of generator, basically a type containing random values) could be bundled in.

I don't know 😆

Feel free to remix this code yourself too, the licence is in the files and is basically "do anything".

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment