andjc · January 6, 2025 08:56 · andjc · Jan 6, 2025
diff --git a/case_insensitive_sort.py b/case_insensitive_sort.py
 # Python functions to improve sorting of text in alphabetic scripts.

 # Copyright 2025 Enabling Languages
 # 
 # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and 
 # associated documentation files (the “Software”), to deal in the Software without restriction, including 
 # without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 
 # copies of the Software, and to permit persons to whom the Software is furnished to do so, subject 
 # to the following conditions:
 #
 # The above copyright notice and this permission notice shall be included in all copies or substantial 
 # portions of the Software.

 # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
 # TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF 
 # CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 
 # IN THE SOFTWARE.

 from collections.abc import Callable
 import regex as re
 import unicodedata as ud
 from unidecode import unidecode

 def create_mapping(data: str) -> dict[str, str]:
     """Build a simple transliteration table for the characters found in ``data``.

     The input is decomposed to NFD, after which every character in the Basic
     Latin block and every nonspacing mark (``\\p{Mn}``) is discarded. Each
     distinct character that remains is paired with whatever ``unidecode``
     returns for it.

     Args:
         data (str): text to scan for characters that need a correspondence.

     Returns:
         dict[str, str]: character correspondences, inserted in ascending
             order of the source character.
     """
     decomposed = ud.normalize('NFD', data)
     # `re` is the third-party `regex` module here; it is needed for \p{...}.
     # One pass over the whole string removes ASCII and combining marks.
     residue = re.sub(r'[\p{Block=Basic_Latin}\p{Mn}]+', '', decomposed)
     return {ch: unidecode(ch) for ch in sorted(set(residue))}

 def normalised_sort_key(
    token: str, 
    lower_first: bool = True, 
    func: Callable[[str], str] = str.casefold, 
    nf: str = 'NFD', 
    correspondences: dict[str, str] | None = None) -> tuple[str, str]:
    
    """Create normalised sort key for a alphabetic string.
    1. Normalise token to NFD or NFKD form
    2. Ensure required ordering of case pairs
    3. If lowercase is required to sort before uppercase, create a tuple of two strings:
        - case mapped or casefolded string
        - swapcase string
       Else, create a tuple of two strings:
        - case mapped or casefolded string
        - original string
    Args:
        token (str): alphabetic string
        lower_first (bool, optional): Lowercase letters sort before uppercase letters. Defaults to True.
        func (Callable[[str], str], optional): Function to apply to token. Defaults to str.casefold.
        nf (str, optional): Normalisation form. Valid values are 'NFC', 'NFD', 'NFKC', or 'NFKD'. Defaults to 'NFD'.
        correspondences (dict[str, str], optional): A dictionary of character correspondences for simple transliteration. Defaults to None.
        
    Returns:
        tuple[str]: a tuple of two strings: case mapped or casefolded string and unfolded string.
    """
    
    nform = nf.upper() if nf.upper() in ['NFD', 'NFKD'] else 'NFD'
    token = ud.normalize(nform, token)
    token = "".join([i if i.upper() < i.lower() else i.swapcase() for i in token])
    if correspondences is not None:
        corr_table: dict[int, str] = str.maketrans(correspondences)
        token = token.translate(corr_table)
    if lower_first:
        return (func(token), token.swapcase())
    return (func(token), token)

 # Worked example: tokenise a sample sentence and sort its words with transliterated keys.
 text = 'Hwæt! We Gardena in geardagum, þeodcyninga, þrym gefrunon, hu ða æþelingas ellen fremedon.'
 # Split on runs of punctuation/separators; filter(None, ...) discards the empty strings this leaves.
 tokens = list(filter(None, re.split(r'[\p{P}\p{Z}]+', text)))
 mapping = create_mapping(text)

 sorted(tokens, key=lambda word: normalised_sort_key(word, correspondences=mapping))
 # ['æþelingas', 'ða', 'ellen', 'fremedon', 'Gardena', 'geardagum', 'gefrunon', 'hu', 'Hwæt', 'in', 'þeodcyninga', 'þrym', 'We']
	# Python functions to improve sorting of text in alphabetic scripts.

	# Copyright 2025 Enabling Languages
	#
	# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
	# associated documentation files (the “Software”), to deal in the Software without restriction, including
	# without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	# copies of the Software, and to permit persons to whom the Software is furnished to do so, subject
	# to the following conditions:
	#
	# The above copyright notice and this permission notice shall be included in all copies or substantial
	# portions of the Software.

	# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
	# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
	# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
	# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
	# IN THE SOFTWARE.

	from collections.abc import Callable
	import regex as re
	import unicodedata as ud
	from unidecode import unidecode

	def create_mapping(data: str) -> dict[str, str]:
	    """Create a dictionary of character correspondences for simple transliteration.

	    The text is normalised to NFD; characters in the Basic Latin block and
	    nonspacing marks are then removed, which excludes Basic Latin letters and
	    extended Latin letters consisting of a Basic Latin base character with
	    diacritics. Each remaining distinct character is mapped to the value
	    ``unidecode`` returns for it.

	    Args:
	        data (str): text to process for character correspondences.

	    Returns:
	        dict[str, str]: a dictionary of character correspondences, inserted
	            in ascending order of the source character.
	    """
	    # `re` is the third-party `regex` module here; it is needed for \p{...}.
	    stripped = re.sub(r'[\p{Block=Basic_Latin}\p{Mn}]+', '', ud.normalize('NFD', data))
	    return {char: unidecode(char) for char in sorted(set(stripped))}

	def normalised_sort_key(
	token: str,
	lower_first: bool = True,
	func: Callable[[str], str] = str.casefold,
	nf: str = 'NFD',
	correspondences: dict[str, str] \| None = None) -> tuple[str, str]:

	"""Create normalised sort key for a alphabetic string.
	1. Normalise token to NFD or NFKD form
	2. Ensure required ordering of case pairs
	3. If lowercase is required to sort before uppercase, create a tuple of two strings:
	- case mapped or casefolded string
	- swapcase string
	Else, create a tuple of two strings:
	- case mapped or casefolded string
	- original string
	Args:
	token (str): alphabetic string
	lower_first (bool, optional): Lowercase letters sort before uppercase letters. Defaults to True.
	func (Callable[[str], str], optional): Function to apply to token. Defaults to str.casefold.
	nf (str, optional): Normalisation form. Valid values are 'NFC', 'NFD', 'NFKC', or 'NFKD'. Defaults to 'NFD'.
	correspondences (dict[str, str], optional): A dictionary of character correspondences for simple transliteration. Defaults to None.

	Returns:
	tuple[str]: a tuple of two strings: case mapped or casefolded string and unfolded string.
	"""

	nform = nf.upper() if nf.upper() in ['NFD', 'NFKD'] else 'NFD'
	token = ud.normalize(nform, token)
	token = "".join([i if i.upper() < i.lower() else i.swapcase() for i in token])
	if correspondences is not None:
	corr_table: dict[int, str] = str.maketrans(correspondences)
	token = token.translate(corr_table)
	if lower_first:
	return (func(token), token.swapcase())
	return (func(token), token)

	# Worked example: tokenise a sample sentence and sort its words with transliterated keys.
	text = 'Hwæt! We Gardena in geardagum, þeodcyninga, þrym gefrunon, hu ða æþelingas ellen fremedon.'
	# Split on runs of punctuation/separators; filter(None, ...) discards the empty strings this leaves.
	tokens = list(filter(None, re.split(r'[\p{P}\p{Z}]+', text)))
	mapping = create_mapping(text)

	sorted(tokens, key=lambda word: normalised_sort_key(word, correspondences=mapping))
	# ['æþelingas', 'ða', 'ellen', 'fremedon', 'Gardena', 'geardagum', 'gefrunon', 'hu', 'Hwæt', 'in', 'þeodcyninga', 'þrym', 'We']