Created
June 5, 2009 23:34
-
-
Save tav/124574 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_canonical_plexname(plexname):
    """Return the canonical form of ``plexname``.

    A non-unicode argument is first decoded as UTF-8.  The name is then
    processed in three steps:

    1. Every run of characters whose Unicode general category is ``Cc`` or
       ``Zs``, or whose code point is in ``SPECIALS``, is collapsed into a
       single ``-``.  Leading and trailing runs produce no ``-`` at all.
       Each remaining character is replaced by its ``CASE_MAP`` entry, or
       kept unchanged when it has none.
    2. The result is NFKD normalised and mapped through ``CASE_MAP`` again.
    3. That result is NFKC normalised and returned.

    ``CASE_MAP`` is presumably a case-folding table (the notes below refer
    to toCasefold) -- confirm against its definition.
    """
    if not isinstance(plexname, unicode):
        plexname = unicode(plexname, 'utf-8')

    # @/@ there is plenty of scope for optimisation here.  We also still
    # need to choose between:
    #     NFKD(toCasefold(NFKD(toCasefold(NFD(X)))))
    #     NFD(toCasefold(NFD(X)))
    # The initial NFD pass is only applied when the name contains
    # U+0345 COMBINING GREEK YPOGEGRAMMENI.
    if u'\u0345' in plexname:
        plexname = normalise_unicode('NFD', plexname)

    mapped = []
    separator_pending = False
    for char in plexname:
        # @/@ http://www.fileformat.info/info/unicode/category/index.htm
        if category(char) in ['Cc', 'Zs'] or ord(char) in SPECIALS:
            # Only remember that a separator was seen; the '-' is emitted
            # lazily so that trailing separators never reach the output.
            separator_pending = True
        else:
            # Skip the '-' while nothing has been emitted yet, which drops
            # leading separators.
            if separator_pending and mapped:
                mapped.append(u'-')
            separator_pending = False
            mapped.append(CASE_MAP.get(char, char))

    # Second mapping pass over the compatibility-decomposed text.
    decomposed = normalise_unicode('NFKD', u''.join(mapped))
    remapped = [CASE_MAP.get(char, char) for char in decomposed]

    # @/@ NFKC is used for the final form since it looks prettier.
    # @/@ see FC_NFKC_Closure in DerivedNormalizationProps.txt to do this
    # properly.
    return normalise_unicode('NFKC', u''.join(remapped))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment