Created
July 25, 2019 00:37
-
-
Save tingletech/03db322757764313784a3b19f4911ff1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: UTF-8 -*- | |
import unicodedata | |
import re | |
# Matches runs of characters that are neither ASCII alphanumerics nor
# whitespace.  An explicit character class is used because "_" counts as a
# word character for \w (so [^\w\s] would keep it), and the range A-z would
# let it through as well.  "+" instead of "*" so the pattern never matches
# the empty string.
RE_ALPHANUMSPACE = re.compile(r'[^0-9A-Za-z\s]+')


def normalize_sort_field(sort_field,
                         default_missing='~title unknown',
                         missing_equivalents=('title unknown',)):
    """Return a normalized key for sorting on a title-like string.

    Steps, in order:

    1. lower-case the value;
    2. fold diacritics (NFD-decompose, then drop combining marks);
    3. drop every character that is not an ASCII letter, digit or
       whitespace -- this removes punctuation and also any remaining
       non-ASCII text such as CJK characters;
    4. collapse whitespace runs to single spaces and trim the ends;
    5. drop one leading English article ("the", "a", "an");
    6. substitute ``default_missing`` when nothing is left or the result is
       one of ``missing_equivalents``.

    Args:
        sort_field: the text to normalize.
        default_missing: value returned for empty / unknown titles.
        missing_equivalents: normalized strings that should be treated as
            a missing title.  Any container supporting ``in`` works.

    Returns:
        The normalized sort key, single-space separated.
    """
    # fold case
    folded = sort_field.lower()
    # fold diacritics https://stackoverflow.com/a/518232/1763984
    folded = ''.join(c for c in unicodedata.normalize('NFD', folded)
                     if unicodedata.category(c) != 'Mn')
    # remove punctuation (and anything else outside [0-9A-Za-z\s])
    folded = RE_ALPHANUMSPACE.sub('', folded)

    # split() with no argument also discards leading/trailing/repeated
    # whitespace, so re-joining below yields whitespace-normalized text
    # https://stackoverflow.com/a/46501496/1763984
    words = folded.split()
    # remove English initial articles
    if words and words[0] in ('the', 'a', 'an'):
        words = words[1:]

    # Compare against missing_equivalents only AFTER whitespace has been
    # normalized; otherwise "Title  unknown" or " title unknown " would not
    # be recognised as a missing title.
    normalized = u' '.join(words)
    if not normalized or normalized in missing_equivalents:
        # The fallback has always been whitespace-normalized as well.
        normalized = u' '.join(default_missing.split())
    return normalized
# Demo: print the sort key for a few sample titles made of Japanese script,
# optional leading punctuation, and a bracketed romanization.
for sample_title in (
    u'嶋巡月弓張 [Shimameguri tsuki no yumihari]',
    u'$ 容競出入湊 [Sugatakurabe deiri no minato]',
    u'菅原伝授手習鑑 [Sugawara denju tenarai kagami]',
    u'$ 崇禅寺馬場 [Sōzenji bama]',
):
    print(normalize_sort_field(sample_title))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment