-
-
Save rendello/d37552507a389656e248f3255a618127 to your computer and use it in GitHub Desktop.
/* | |
Copyright (c) 2024 Rendello | |
Permission to use, copy, modify, and/or distribute this software for any | |
purpose with or without fee is hereby granted. | |
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH | |
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY | |
AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, | |
INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM | |
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR | |
OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR | |
PERFORMANCE OF THIS SOFTWARE. | |
*/ | |
// ========================================================================== | |
//! Unicode codepoints that expand or contract when case is changed in UTF-8. | |
// ========================================================================== | |
/// Single codepoints whose lowercase form is one codepoint with a *shorter*
/// UTF-8 encoding.
///
/// Each trailing comment shows the lowercase result, the byte lengths
/// `(before->after)` and the byte delta.
///
/// OHM SIGN (U+2126), ANGSTROM SIGN (U+212B) and KELVIN SIGN (U+212A) are
/// written as `\u{...}` escapes on purpose: they are canonically equivalent to
/// the ordinary letters Ω, Å and K, so any tool that NFC-normalizes this file
/// would otherwise silently replace them with characters that no longer
/// contract when lowercased.
pub const LOWERCASING_CONTRACTS: [&str; 22] = [
    "ẞ", /* ß (3->2), -1 bytes */
    "\u{2126}", /* Ω (OHM SIGN) -> ω (3->2), -1 bytes */
    "\u{212B}", /* Å (ANGSTROM SIGN) -> å (3->2), -1 bytes */
    "Ɫ", /* ɫ (3->2), -1 bytes */
    "Ɽ", /* ɽ (3->2), -1 bytes */
    "Ɑ", /* ɑ (3->2), -1 bytes */
    "Ɱ", /* ɱ (3->2), -1 bytes */
    "Ɐ", /* ɐ (3->2), -1 bytes */
    "Ɒ", /* ɒ (3->2), -1 bytes */
    "Ȿ", /* ȿ (3->2), -1 bytes */
    "Ɀ", /* ɀ (3->2), -1 bytes */
    "Ɥ", /* ɥ (3->2), -1 bytes */
    "Ɦ", /* ɦ (3->2), -1 bytes */
    "Ɜ", /* ɜ (3->2), -1 bytes */
    "Ɡ", /* ɡ (3->2), -1 bytes */
    "Ɬ", /* ɬ (3->2), -1 bytes */
    "Ɪ", /* ɪ (3->2), -1 bytes */
    "Ʞ", /* ʞ (3->2), -1 bytes */
    "Ʇ", /* ʇ (3->2), -1 bytes */
    "Ʝ", /* ʝ (3->2), -1 bytes */
    "Ʂ", /* ʂ (3->2), -1 bytes */
    "\u{212A}", /* K (KELVIN SIGN) -> k (3->1), -2 bytes */
];
/// Single codepoints whose lowercase form is one codepoint with a *longer*
/// UTF-8 encoding.
///
/// Each trailing comment shows the lowercase result, the byte lengths
/// `(before->after)` and the byte delta.
pub const LOWERCASING_EXPANDS: [&str; 2] = [
    "Ⱥ", /* ⱥ (2->3), +1 bytes */
    "Ⱦ", /* ⱦ (2->3), +1 bytes */
];
/// Single codepoints whose lowercase form is *several* codepoints and has a
/// longer UTF-8 encoding.
///
/// The trailing comment shows the lowercase result, the byte lengths
/// `(before->after)`, the byte delta and the change in codepoint count.
pub const LOWERCASING_EXPANDS_MULTI_CHAR: [&str; 1] = [
    "İ", /* i̇ (2->3), +1 bytes, +1 chars */
];
/// Single codepoints whose uppercase form is one codepoint with a *shorter*
/// UTF-8 encoding.
///
/// Each trailing comment shows the uppercase result, the byte lengths
/// `(before->after)` and the byte delta.
///
/// GREEK PROSGEGRAMMENI (U+1FBE) is written as a `\u{...}` escape on purpose:
/// it is canonically equivalent to the ordinary two-byte ι (U+03B9), so an
/// NFC-normalizing tool would otherwise replace it with a character that does
/// not contract when uppercased.
pub const UPPERCASING_CONTRACTS: [&str; 13] = [
    "ı", /* I (2->1), -1 bytes */
    "ſ", /* S (2->1), -1 bytes */
    "ᲀ", /* В (3->2), -1 bytes */
    "ᲁ", /* Д (3->2), -1 bytes */
    "ᲂ", /* О (3->2), -1 bytes */
    "ᲃ", /* С (3->2), -1 bytes */
    "ᲄ", /* Т (3->2), -1 bytes */
    "ᲅ", /* Т (3->2), -1 bytes */
    "ᲆ", /* Ъ (3->2), -1 bytes */
    "ᲇ", /* Ѣ (3->2), -1 bytes */
    "\u{1FBE}", /* ι (GREEK PROSGEGRAMMENI) -> Ι (3->2), -1 bytes */
    "ⱥ", /* Ⱥ (3->2), -1 bytes */
    "ⱦ", /* Ⱦ (3->2), -1 bytes */
];
/// Single codepoints (Latin ligatures) whose uppercase form is *several*
/// codepoints yet has a shorter UTF-8 encoding.
///
/// Each trailing comment shows the uppercase result, the byte lengths
/// `(before->after)`, the byte delta and the change in codepoint count.
pub const UPPERCASING_CONTRACTS_MULTI_CHAR: [&str; 5] = [
    "ﬀ", /* FF (3->2), -1 bytes, +1 chars */
    "ﬁ", /* FI (3->2), -1 bytes, +1 chars */
    "ﬂ", /* FL (3->2), -1 bytes, +1 chars */
    "ﬅ", /* ST (3->2), -1 bytes, +1 chars */
    "ﬆ", /* ST (3->2), -1 bytes, +1 chars */
];
/// Single codepoints whose uppercase form is one codepoint with a *longer*
/// UTF-8 encoding.
///
/// Each trailing comment shows the uppercase result, the byte lengths
/// `(before->after)` and the byte delta.
pub const UPPERCASING_EXPANDS: [&str; 18] = [
    "ȿ", /* Ȿ (2->3), +1 bytes */
    "ɀ", /* Ɀ (2->3), +1 bytes */
    "ɐ", /* Ɐ (2->3), +1 bytes */
    "ɑ", /* Ɑ (2->3), +1 bytes */
    "ɒ", /* Ɒ (2->3), +1 bytes */
    "ɜ", /* Ɜ (2->3), +1 bytes */
    "ɡ", /* Ɡ (2->3), +1 bytes */
    "ɥ", /* Ɥ (2->3), +1 bytes */
    "ɦ", /* Ɦ (2->3), +1 bytes */
    "ɪ", /* Ɪ (2->3), +1 bytes */
    "ɫ", /* Ɫ (2->3), +1 bytes */
    "ɬ", /* Ɬ (2->3), +1 bytes */
    "ɱ", /* Ɱ (2->3), +1 bytes */
    "ɽ", /* Ɽ (2->3), +1 bytes */
    "ʂ", /* Ʂ (2->3), +1 bytes */
    "ʇ", /* Ʇ (2->3), +1 bytes */
    "ʝ", /* Ʝ (2->3), +1 bytes */
    "ʞ", /* Ʞ (2->3), +1 bytes */
];
/// Single codepoints whose uppercase form is *several* codepoints and has a
/// longer UTF-8 encoding.
///
/// Each trailing comment shows the uppercase result, the byte lengths
/// `(before->after)`, the byte delta and the change in codepoint count.
///
/// Two look-alike pairs are written as `\u{...}` escapes on purpose: U+0390 /
/// U+1FD3 and U+03B0 / U+1FE3 render identically, and the three-byte member of
/// each pair is canonically equivalent to the two-byte one. Without escapes an
/// NFC-normalizing tool would silently turn the three-byte entries into
/// duplicates of the two-byte entries.
pub const UPPERCASING_EXPANDS_MULTI_CHAR: [&str; 89] = [
    "\u{0390}", /* ΐ -> Ϊ́ (2->6), +4 bytes, +2 chars */
    "\u{03B0}", /* ΰ -> Ϋ́ (2->6), +4 bytes, +2 chars */
    "ὒ", /* Υ̓̀ (3->6), +3 bytes, +2 chars */
    "ὔ", /* Υ̓́ (3->6), +3 bytes, +2 chars */
    "ὖ", /* Υ̓͂ (3->6), +3 bytes, +2 chars */
    "ᾷ", /* Α͂Ι (3->6), +3 bytes, +2 chars */
    "ῇ", /* Η͂Ι (3->6), +3 bytes, +2 chars */
    "ῒ", /* Ϊ̀ (3->6), +3 bytes, +2 chars */
    "\u{1FD3}", /* ΐ -> Ϊ́ (3->6), +3 bytes, +2 chars */
    "ῗ", /* Ϊ͂ (3->6), +3 bytes, +2 chars */
    "ῢ", /* Ϋ̀ (3->6), +3 bytes, +2 chars */
    "\u{1FE3}", /* ΰ -> Ϋ́ (3->6), +3 bytes, +2 chars */
    "ῧ", /* Ϋ͂ (3->6), +3 bytes, +2 chars */
    "ῷ", /* Ω͂Ι (3->6), +3 bytes, +2 chars */
    "և", /* ԵՒ (2->4), +2 bytes, +1 chars */
    "ᾀ", /* ἈΙ (3->5), +2 bytes, +1 chars */
    "ᾁ", /* ἉΙ (3->5), +2 bytes, +1 chars */
    "ᾂ", /* ἊΙ (3->5), +2 bytes, +1 chars */
    "ᾃ", /* ἋΙ (3->5), +2 bytes, +1 chars */
    "ᾄ", /* ἌΙ (3->5), +2 bytes, +1 chars */
    "ᾅ", /* ἍΙ (3->5), +2 bytes, +1 chars */
    "ᾆ", /* ἎΙ (3->5), +2 bytes, +1 chars */
    "ᾇ", /* ἏΙ (3->5), +2 bytes, +1 chars */
    "ᾈ", /* ἈΙ (3->5), +2 bytes, +1 chars */
    "ᾉ", /* ἉΙ (3->5), +2 bytes, +1 chars */
    "ᾊ", /* ἊΙ (3->5), +2 bytes, +1 chars */
    "ᾋ", /* ἋΙ (3->5), +2 bytes, +1 chars */
    "ᾌ", /* ἌΙ (3->5), +2 bytes, +1 chars */
    "ᾍ", /* ἍΙ (3->5), +2 bytes, +1 chars */
    "ᾎ", /* ἎΙ (3->5), +2 bytes, +1 chars */
    "ᾏ", /* ἏΙ (3->5), +2 bytes, +1 chars */
    "ᾐ", /* ἨΙ (3->5), +2 bytes, +1 chars */
    "ᾑ", /* ἩΙ (3->5), +2 bytes, +1 chars */
    "ᾒ", /* ἪΙ (3->5), +2 bytes, +1 chars */
    "ᾓ", /* ἫΙ (3->5), +2 bytes, +1 chars */
    "ᾔ", /* ἬΙ (3->5), +2 bytes, +1 chars */
    "ᾕ", /* ἭΙ (3->5), +2 bytes, +1 chars */
    "ᾖ", /* ἮΙ (3->5), +2 bytes, +1 chars */
    "ᾗ", /* ἯΙ (3->5), +2 bytes, +1 chars */
    "ᾘ", /* ἨΙ (3->5), +2 bytes, +1 chars */
    "ᾙ", /* ἩΙ (3->5), +2 bytes, +1 chars */
    "ᾚ", /* ἪΙ (3->5), +2 bytes, +1 chars */
    "ᾛ", /* ἫΙ (3->5), +2 bytes, +1 chars */
    "ᾜ", /* ἬΙ (3->5), +2 bytes, +1 chars */
    "ᾝ", /* ἭΙ (3->5), +2 bytes, +1 chars */
    "ᾞ", /* ἮΙ (3->5), +2 bytes, +1 chars */
    "ᾟ", /* ἯΙ (3->5), +2 bytes, +1 chars */
    "ᾠ", /* ὨΙ (3->5), +2 bytes, +1 chars */
    "ᾡ", /* ὩΙ (3->5), +2 bytes, +1 chars */
    "ᾢ", /* ὪΙ (3->5), +2 bytes, +1 chars */
    "ᾣ", /* ὫΙ (3->5), +2 bytes, +1 chars */
    "ᾤ", /* ὬΙ (3->5), +2 bytes, +1 chars */
    "ᾥ", /* ὭΙ (3->5), +2 bytes, +1 chars */
    "ᾦ", /* ὮΙ (3->5), +2 bytes, +1 chars */
    "ᾧ", /* ὯΙ (3->5), +2 bytes, +1 chars */
    "ᾨ", /* ὨΙ (3->5), +2 bytes, +1 chars */
    "ᾩ", /* ὩΙ (3->5), +2 bytes, +1 chars */
    "ᾪ", /* ὪΙ (3->5), +2 bytes, +1 chars */
    "ᾫ", /* ὫΙ (3->5), +2 bytes, +1 chars */
    "ᾬ", /* ὬΙ (3->5), +2 bytes, +1 chars */
    "ᾭ", /* ὭΙ (3->5), +2 bytes, +1 chars */
    "ᾮ", /* ὮΙ (3->5), +2 bytes, +1 chars */
    "ᾯ", /* ὯΙ (3->5), +2 bytes, +1 chars */
    "ᾲ", /* ᾺΙ (3->5), +2 bytes, +1 chars */
    "ῂ", /* ῊΙ (3->5), +2 bytes, +1 chars */
    "ῲ", /* ῺΙ (3->5), +2 bytes, +1 chars */
    "ŉ", /* ʼN (2->3), +1 bytes, +1 chars */
    "ǰ", /* J̌ (2->3), +1 bytes, +1 chars */
    "ὐ", /* Υ̓ (3->4), +1 bytes, +1 chars */
    "ᾳ", /* ΑΙ (3->4), +1 bytes, +1 chars */
    "ᾴ", /* ΆΙ (3->4), +1 bytes, +1 chars */
    "ᾶ", /* Α͂ (3->4), +1 bytes, +1 chars */
    "ᾼ", /* ΑΙ (3->4), +1 bytes, +1 chars */
    "ῃ", /* ΗΙ (3->4), +1 bytes, +1 chars */
    "ῄ", /* ΉΙ (3->4), +1 bytes, +1 chars */
    "ῆ", /* Η͂ (3->4), +1 bytes, +1 chars */
    "ῌ", /* ΗΙ (3->4), +1 bytes, +1 chars */
    "ῖ", /* Ι͂ (3->4), +1 bytes, +1 chars */
    "ῤ", /* Ρ̓ (3->4), +1 bytes, +1 chars */
    "ῦ", /* Υ͂ (3->4), +1 bytes, +1 chars */
    "ῳ", /* ΩΙ (3->4), +1 bytes, +1 chars */
    "ῴ", /* ΏΙ (3->4), +1 bytes, +1 chars */
    "ῶ", /* Ω͂ (3->4), +1 bytes, +1 chars */
    "ῼ", /* ΩΙ (3->4), +1 bytes, +1 chars */
    "ﬓ", /* ՄՆ (3->4), +1 bytes, +1 chars */
    "ﬔ", /* ՄԵ (3->4), +1 bytes, +1 chars */
    "ﬕ", /* ՄԻ (3->4), +1 bytes, +1 chars */
    "ﬖ", /* ՎՆ (3->4), +1 bytes, +1 chars */
    "ﬗ", /* ՄԽ (3->4), +1 bytes, +1 chars */
];
""" | |
Copyright (c) 2024 Rendello | |
Permission to use, copy, modify, and/or distribute this software for any | |
purpose with or without fee is hereby granted. | |
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH | |
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY | |
AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, | |
INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM | |
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR | |
OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR | |
PERFORMANCE OF THIS SOFTWARE. | |
""" | |
import sys | |
from dataclasses import dataclass | |
from typing import List, Dict | |
@dataclass
class Entry:
    """A case mapping whose UTF-8 byte length differs between source and result."""

    # Source string.
    a: str
    # Result of upper- or lowercasing `a`.
    b: str
    # UTF-8 byte length of `a`.
    a_len: int
    # UTF-8 byte length of `b`.
    b_len: int
    # Byte-length change, `b_len - a_len`.
    delta: int
    # Number of codepoints in `a`.
    a_char_count: int
    # Number of codepoints in `b`.
    b_char_count: int
    # Codepoint-count change, `b_char_count - a_char_count`.
    delta_char_count: int
def sort_entries(l: List[Entry]) -> List[Entry]:
    """Return a new list of entries in output order.

    Entries are ordered by codepoint-count delta (largest first), then by
    byte-length delta (largest first), then by the source string `a` in
    ascending order. The input list is not modified.
    """
    return sorted(l, key=lambda p: (-p.delta_char_count, -p.delta, p.a))
def create_entry_map() -> dict[str, list[Entry]]:
    """Scan every codepoint for case mappings that change UTF-8 byte length.

    Returns a dict keyed by category name, built by joining with "_":
    "uppercasing" or "lowercasing", then "expands" or "contracts", then
    "multi_char" when the mapped string has more than one codepoint. Each
    value lists the matching entries in ascending codepoint order.
    """
    entry_map: dict[str, list[Entry]] = {}
    for i in range(sys.maxunicode + 1):
        a = chr(i)
        try:
            # Encoded once per codepoint; it is the same for both case directions.
            a_len = len(a.encode("utf8"))
        except UnicodeEncodeError:
            # Codepoints that cannot be encoded as UTF-8 are skipped.
            continue
        a_char_count = len(a)

        for case, b in (("uppercasing", a.upper()), ("lowercasing", a.lower())):
            b_len = len(b.encode("utf8"))
            if a_len == b_len:
                continue

            b_char_count = len(b)
            # Lengths differ here, so the mapping either expands or contracts.
            attributes = [case, "expands" if a_len < b_len else "contracts"]
            if b_char_count > 1:
                attributes.append("multi_char")

            entry = Entry(
                a,
                b,
                a_len,
                b_len,
                b_len - a_len,
                a_char_count,
                b_char_count,
                b_char_count - a_char_count,
            )
            entry_map.setdefault("_".join(attributes), []).append(entry)
    return entry_map
def entry_map_to_string(entry_map: Dict[str, List[Entry]]) -> str:
    """Render the entry map as Rust source text.

    Emits a header comment followed by one `pub const NAME: [&str; N]` array
    per map key (keys in sorted order, name upper-cased). Each element line
    holds the source string plus a comment with the mapped string, the byte
    lengths, the byte delta and, when non-zero, the codepoint-count delta.
    The result has leading and trailing whitespace stripped.
    """
    # Pieces are collected in a list and joined once at the end.
    parts = [
        '// =======================================================================\n'
        '//! Automatically generated using `task generate-utf8-case-data`.\n//!\n'
        '//! Unicode characters that behave oddly when the case is changed, for use\n'
        '//! with property tests.\n'
        '// =======================================================================\n\n'
    ]
    for key, unsorted_entries in sorted(entry_map.items()):
        entries = sort_entries(unsorted_entries)
        parts.append(f'pub const {key.upper()}: [&str; {len(entries)}] = [\n')
        for e in entries:
            ds = f", {e.delta_char_count:+} chars" if e.delta_char_count != 0 else ""
            parts.append(
                f' "{e.a}",\t/* {e.b}\t({e.a_len}->{e.b_len}), {e.delta:+} bytes{ds} */\n'
            )
        parts.append("];\n\n")
    return "".join(parts).strip()
def generate_utf8_case_data() -> str:
    """Build the entry map for all codepoints and return it rendered as Rust source."""
    return entry_map_to_string(create_entry_map())
so I could have a generator that spits out Rust, Python, etc.
As for generators, I wouldn't trust a generator that much to do that work for me. I know a makefile can work miracles, but still... different languages, different worlds.
@rept0id If you look at the included generate_utf8.py
file, it's creating the whole list. Same with the "Unicode Roundtrip" Gist I linked. It's all automatic anyway, so changing the language generator would just be changing the output string format. The main issue would be structuring the repo, should it be just the generators and have the "outputs" be "releases"? Or should the outputs live beside the generators? I feel like having two languages in the same repo might not be useful, but at the same time I might like to use this test code for both Python and Rust.
Perhaps the best solution would be to have the generator files, and have them generate the files in the repo itself so they're easily visible, with a caveat in comment form saying they're auto-generated (this is what my current project does). Then, I could potentially use the GH releases feature to build libraries for Python, Rust, etc. That way property-based testing generators (a different kind of generator, basically a type containing random values) could be bundled in.
I don't know 😆
Feel free to remix this code yourself too, the licence is in the files and is basically "do anything".
You can make it in the language that's more fluent to you and it will be more helpful for your own purposes and let others contribute with other languages. Something like a mega git that has directories "Rust", "Python", "Go", "C" and let other people port it to the languages that they like.
I make repos like that and call them "various", for example "monthy-hall-various", where I keep implementations in different languages. Some famous projects, like "cern-httpd" (the first server ever), also have such a structure, but with platforms instead of languages - so it's not just my own taste.
I could contribute with some languages.