Skip to content

Instantly share code, notes, and snippets.

@andjc
Last active January 6, 2025 08:56
Show Gist options
  • Save andjc/cdcd9558f2b09e4783e6fc6cad47599e to your computer and use it in GitHub Desktop.
Save andjc/cdcd9558f2b09e4783e6fc6cad47599e to your computer and use it in GitHub Desktop.
Language- and locale-insensitive, case-insensitive sort.
# Python functions to improve sorting of text in alphabetic scripts.
# Copyright 2025 Enabling Languages
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
# associated documentation files (the “Software”), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is furnished to do so, subject
# to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or substantial
# portions of the Software.
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
# TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
# CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
from collections.abc import Callable
import regex as re
import unicodedata as ud
from unidecode import unidecode
def create_mapping(data: str) -> dict[str, str]:
    """Build a simple transliteration table from the characters found in a text.

    The text is decomposed to NFD, after which every Basic Latin character and
    every nonspacing mark is discarded. As a result, Basic Latin letters and
    letters that decompose into a Basic Latin base plus diacritics produce no
    entry. Each remaining distinct character is paired with the value returned
    by ``unidecode`` for it.

    Args:
        data (str): text to scan for characters that need a correspondence.

    Returns:
        dict[str, str]: character correspondences, inserted in sorted key order.
    """
    decomposed = ud.normalize('NFD', data)
    # One pass over the whole string: strip Basic Latin characters and
    # combining marks, leaving only the characters that need a mapping.
    leftovers = re.sub(r'[\p{Block=Basic_Latin}\p{Mn}]+', '', decomposed)
    return {char: unidecode(char) for char in sorted(set(leftovers))}
def normalised_sort_key(
token: str,
lower_first: bool = True,
func: Callable[[str], str] = str.casefold,
nf: str = 'NFD',
correspondences: dict[str, str] | None = None) -> tuple[str, str]:
"""Create normalised sort key for a alphabetic string.
1. Normalise token to NFD or NFKD form
2. Ensure required ordering of case pairs
3. If lowercase is required to sort before uppercase, create a tuple of two strings:
- case mapped or casefolded string
- swapcase string
Else, create a tuple of two strings:
- case mapped or casefolded string
- original string
Args:
token (str): alphabetic string
lower_first (bool, optional): Lowercase letters sort before uppercase letters. Defaults to True.
func (Callable[[str], str], optional): Function to apply to token. Defaults to str.casefold.
nf (str, optional): Normalisation form. Valid values are 'NFC', 'NFD', 'NFKC', or 'NFKD'. Defaults to 'NFD'.
correspondences (dict[str, str], optional): A dictionary of character correspondences for simple transliteration. Defaults to None.
Returns:
tuple[str]: a tuple of two strings: case mapped or casefolded string and unfolded string.
"""
nform = nf.upper() if nf.upper() in ['NFD', 'NFKD'] else 'NFD'
token = ud.normalize(nform, token)
token = "".join([i if i.upper() < i.lower() else i.swapcase() for i in token])
if correspondences is not None:
corr_table: dict[int, str] = str.maketrans(correspondences)
token = token.translate(corr_table)
if lower_first:
return (func(token), token.swapcase())
return (func(token), token)
# Demonstration: sort the words of an Old English sentence, transliterating
# the non-Basic-Latin letters found in it for comparison purposes.
text = 'Hwæt! We Gardena in geardagum, þeodcyninga, þrym gefrunon, hu ða æþelingas ellen fremedon.'
# Split on runs of punctuation and separators, discarding empty pieces.
tokens = [piece for piece in re.split(r'[\p{P}\p{Z}]+', text) if piece]
mapping = create_mapping(text)
sorted(
    tokens,
    key=lambda word: normalised_sort_key(word, correspondences=mapping),
)
# ['æþelingas', 'ða', 'ellen', 'fremedon', 'Gardena', 'geardagum', 'gefrunon', 'hu', 'Hwæt', 'in', 'þeodcyninga', 'þrym', 'We']
@andjc
Copy link
Author

andjc commented Jan 6, 2025

Warning

This is a hack. If ICU4C is available on your system, using PyICU for sorting is preferred. Otherwise, if you are on Windows or a Linux distribution using glibc, the locale module is an alternative mechanism for sorting.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment