Skip to content

Instantly share code, notes, and snippets.

@tav
Created June 5, 2009 23:34
Show Gist options
  • Save tav/124574 to your computer and use it in GitHub Desktop.
Save tav/124574 to your computer and use it in GitHub Desktop.
def get_canonical_plexname(plexname):
    """Return a canonicalised form of a plexname.

    Input that is not already ``unicode`` is decoded as UTF-8. Every run of
    characters that are in Unicode category ``Cc`` or ``Zs``, or whose code
    point is in ``SPECIALS``, collapses into a single ``u'-'`` placed between
    the kept characters on either side; such a run at the very start or very
    end of the name contributes nothing. Kept characters are passed through
    ``CASE_MAP``, the result is run through ``normalise_unicode('NFKD', ...)``
    and ``CASE_MAP`` a second time, and the final value is produced by
    ``normalise_unicode('NFKC', ...)``.
    """
    if not isinstance(plexname, unicode):
        plexname = unicode(plexname, 'utf-8')

    # @/@ this has lots of scope for optimisation. also, need to choose
    # between:
    #     NFKD(toCasefold(NFKD(toCasefold(NFD(X)))))
    #     NFD(toCasefold(NFD(X)))
    # The NFD pre-pass is only applied when the name contains
    # U+0345 COMBINING GREEK YPOGEGRAMMENI.
    if u'\u0345' in plexname:
        plexname = normalise_unicode('NFD', plexname)

    pieces = []
    pending_separator = False
    for ch in plexname:
        # @/@ category codes are listed at
        # http://www.fileformat.info/info/unicode/category/index.htm
        is_separator = category(ch) in ['Cc', 'Zs'] or ord(ch) in SPECIALS
        if is_separator:
            # Remember the gap; a dash is only written once a kept
            # character follows it.
            pending_separator = True
        else:
            # No dash before the first kept character of the name.
            if pending_separator and pieces:
                pieces.append(u'-')
            pending_separator = False
            pieces.append(CASE_MAP.get(ch, ch))

    # Second mapping pass over the compatibility-decomposed text.
    decomposed = normalise_unicode('NFKD', u''.join(pieces))
    remapped = u''.join([CASE_MAP.get(ch, ch) for ch in decomposed])

    # @/@ we're using NFKC since it looks prettier
    # @/@ see FC_NFKC_Closure to do this properly --
    # DerivedNormalizationProps.txt
    return normalise_unicode('NFKC', remapped)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment