Created
February 18, 2011 19:13
-
-
Save apeiros/834225 to your computer and use it in GitHub Desktop.
Unicode String operations, case mapping, natural sorting
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Encoding: utf-8 | |
# The following code is under BSD 2-clause license | |
# -> http://en.wikipedia.org/wiki/BSD_licenses#2-clause_license_.28.22Simplified_BSD_License.22_or_.22FreeBSD_License.22.29 | |
# | |
# Author: Stefan Rusterholz <[email protected]> - https://github.com/apeiros/ | |
# | |
# A module to help with transliteration issues. | |
# Provides methods for: | |
# * Changing the case of characters/strings, mapping most latin characters | |
# * Generate a value to perform natural sorting of strings (e.g. "10" > "2", | |
# "ä" < "z" etc.) | |
# * Transliterate a utf-8 string to 7bit chars only (e.g. map "ä" to "a" or "ae") | |
# | |
# All methods output valid utf-8 strings | |
# | |
# Beware, this module does not perform proper unicode collation based operations | |
module Transliterate | |
BiggestSortableNumber = ((128**128)-1) | |
SingleDigitToNaturalKey = %W[\u000a \u000b \u000c \u000d \u000e \u000f \u0010 \u0011 \u0012 \u0013 \u0014] | |
module_function | |
UpperToLower = Hash[%W[ | |
A a | |
\u00c5 \u00e5 | |
\u00c4 \u00e4 | |
\u00c6 \u00e6 | |
\u00c2 \u00e2 | |
\u00c1 \u00e1 | |
\u00c0 \u00e0 | |
\u00c3 \u00e3 | |
B b | |
C c | |
\u00c7 \u00e7 | |
D d | |
E e | |
\u00cb \u00eb | |
\u00c9 \u00e9 | |
\u00ca \u00ea | |
\u00c8 \u00e8 | |
F f | |
G g | |
H h | |
I i | |
\u00cf \u00ef | |
\u00cd \u00ed | |
\u00ce \u00ee | |
\u00cc \u00ec | |
J j | |
K k | |
L l | |
M m | |
N n | |
\u00d1 \u00f1 | |
O o | |
\u00d6 \u00f6 | |
\u00d3 \u00f3 | |
\u00d4 \u00f4 | |
\u00d2 \u00f2 | |
\u00d5 \u00f5 | |
\u00d8 \u00f8 | |
P p | |
Q q | |
R r | |
S s | |
\u1e9e \u00df | |
T t | |
U u | |
\u00dc \u00fc | |
\u00da \u00fa | |
\u00db \u00fb | |
\u00d9 \u00f9 | |
V v | |
W w | |
X x | |
Y y | |
\u0178 \u00ff | |
Z z | |
].each_slice(2).to_a] | |
LowerToUpper = UpperToLower.invert | |
SwapCase = UpperToLower.merge(LowerToUpper) | |
UpperCase = UpperToLower.keys.join('') | |
LowerCase = LowerToUpper.keys.join('') | |
MatchUpper = /[#{Regexp.escape(UpperCase)}]/u | |
MatchLower = /[#{Regexp.escape(LowerCase)}]/u | |
MatchChar = /[#{Regexp.escape(SwapCase.keys.join(''))}]/u | |
MatchNoChar = /[^#{Regexp.escape(SwapCase.keys.join(''))}]/u | |
mixed_case = UpperToLower.dup | |
LowerToUpper.each do |key, value| | |
mixed_case[value] += key unless UpperToLower[value] == key | |
end | |
mixed_case = mixed_case.map { |upper,lower| upper+lower } | |
CaseSensitiveOrder = "!\"\#$%&'()*+,-./0123456789:;<=>?@[\\]^_`{|}~\u00a7\u00b2\u00bf#{UpperCase}#{LowerCase}" | |
CaseInsensitiveOrder = "!\"\#$%&'()*+,-./0123456789:;<=>?@[\\]^_`{|}~\u00a7\u00b2\u00bf".chars.to_a+mixed_case | |
Alphabet = (32...([CaseSensitiveOrder.length+32, (1<<15)+22528].max)).to_a.pack("U*") | |
CaseSensitiveIndex = Hash[CaseSensitiveOrder.chars.with_index.to_a.map { |char, index| [char, Alphabet[index]] }] | |
CaseInsensitiveIndex = {} | |
CaseInsensitiveOrder.each_with_index do |equivalents, index| | |
value = Alphabet[index] | |
equivalents.each_char { |char| CaseInsensitiveIndex[char] = value } | |
end | |
NaturalSortUnknown = /[\x00-\x19]|[^#{Regexp.escape(CaseSensitiveOrder)}]/u | |
NaturalSortKnown = /[#{Regexp.escape(CaseSensitiveOrder)}]/u | |
Transliterate = Hash[%W[ | |
\u00c5 A A | |
\u00c4 A Ae | |
\u00c6 A AE | |
\u00c2 A A | |
\u00c0 A A | |
\u00c3 A A | |
\u00c7 C C | |
\u00cb E E | |
\u00c9 E E | |
\u00ca E E | |
\u00c8 E E | |
\u00cf I I | |
\u00ce I I | |
\u00cc I I | |
\u00d1 N N | |
\u00d6 O Oe | |
\u00d4 O O | |
\u00d2 O O | |
\u00d5 O O | |
\u00d8 O O | |
\u1e9e S SS | |
\u00dc U U | |
\u00db U U | |
\u00d9 U U | |
\u0178 Y Y | |
\u00e5 a a | |
\u00e4 a ae | |
\u00e6 a ae | |
\u00e1 a a | |
\u00e2 a a | |
\u00e0 a a | |
\u00e3 a a | |
\u00e7 c c | |
\u00eb e e | |
\u00ea e e | |
\u00e8 e e | |
\u00ef i i | |
\u00ed i i | |
\u00ee i i | |
\u00ec i i | |
\u00f1 n n | |
\u00f6 o oe | |
\u00f3 o o | |
\u00f4 o o | |
\u00f2 o o | |
\u00f5 o o | |
\u00f8 o o | |
\u00df s ss | |
\u00fc u u | |
\u00fa u u | |
\u00fb u u | |
\u00f9 u u | |
\u00ff y y | |
].each_slice(3).map { |char, map1, map2| [char, [map1, map2]] }] | |
# 7bit ascii characters | |
("A".."Z").each do |char| Transliterate[char] = [char, char] end | |
("a".."z").each do |char| Transliterate[char] = [char, char] end | |
("0".."9").each do |char| Transliterate[char] = [char, char] end | |
%W[! \" # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \\ ] ^ _ ` { | } ~].each do |char| Transliterate[char] = [char, char] end | |
Transliterate[' '] = [' ', ' '] | |
Transliterate1 = Hash[Transliterate.map { |key, (map1, map2)| [key, map1] }] | |
Transliterate2 = Hash[Transliterate.map { |key, (map1, map2)| [key, map2] }] | |
TransliterateMatch = /[#{Regexp.escape(Transliterate.keys.join(''))}]/u | |
# Maps non-7bit characters to a single 7 bit character, e.g. "ä" is mapped | |
# to "a" | |
# | |
# Example: | |
# Transliterate.transliterate "Größe" # => "Grose" | |
# | |
# Also see transliterate2, which tries to retain the meaning | |
def transliterate(string, replace_unknown="", extend_transliteration=nil) | |
if extend_transliteration then | |
match = Regexp.union(TransliterateMatch, /[#{Regexp.escape(extend_transliteration.keys.join(''))}]/u) | |
replace = Transliterate1.merge(extend_transliteration) | |
else | |
match = TransliterateMatch | |
replace = Transliterate1 | |
end | |
string.encode(Encoding::UTF_8).gsub(match, replace) | |
end | |
# Maps non-7bit characters to a sequence of 7 bit characters which convey | |
# the same meaning, e.g. "ä" is mapped to "ae". | |
# | |
# Example: | |
# Transliterate.transliterate2 "Größe" # => "Groesse" | |
# | |
# Also see transliterate, which maps single characters to single characters | |
def transliterate2(string, replace_unknown="", extend_transliteration=nil) | |
if extend_transliteration then | |
match = Regexp.union(TransliterateMatch, /[#{Regexp.escape(extend_transliteration.keys.join(''))}]/u) | |
replace = Transliterate2.merge(extend_transliteration) | |
else | |
match = TransliterateMatch | |
replace = Transliterate2 | |
end | |
string.encode(Encoding::UTF_8).gsub(match, replace) | |
end | |
def downcase(string) | |
string.encode(Encoding::UTF_8).gsub(MatchUpper, UpperToLower) | |
end | |
def upcase(string) | |
string.encode(Encoding::UTF_8).gsub(MatchLower, LowerToUpper) | |
end | |
def swapcase(string) | |
string.encode(Encoding::UTF_8).gsub(MatchChar, SwapCase) | |
end | |
# stable makes that differently cased letters don't | |
def case_insensitive_natural_sort_key(string, stable=true) | |
base = string.encode(Encoding::UTF_8). | |
tr("\t\n\r\v\f",' '). # convert whitespace to space | |
strip. # remove leading & trailing whitespace | |
squeeze(' '). # remove duplicate whitespace | |
gsub(NaturalSortUnknown, '') # remove unknown characters | |
part1 = base. | |
gsub(/[+-]?\d+/) { |value| digits_to_natural_key(value.to_i) }. # translate numbers (10 comes after 2, base 128 as it must be unicode-safe) | |
gsub(NaturalSortKnown, CaseInsensitiveIndex) # map characters | |
if stable then | |
part2 = case_sensitive_natural_sort_key(base.gsub(MatchNoChar, '')) | |
part1 + part2 | |
else | |
part1 | |
end | |
end | |
def case_sensitive_natural_sort_key(string) | |
string.encode(Encoding::UTF_8). | |
tr("\t\n\r\v\f",' '). # convert whitespace to space | |
strip. # remove leading & trailing whitespace | |
squeeze(' '). # remove duplicate whitespace | |
gsub(NaturalSortUnknown, ''). # remove unknown characters | |
gsub(/[+-]?\d+/) { |value| digits_to_natural_key(value.to_i) }. # translate numbers (10 comes after 2, base 128 as it must be unicode-safe) | |
gsub(NaturalSortKnown, CaseSensitiveIndex) # map characters | |
end | |
# performs a binary encoding whose result can be compared on a binary level correctly. | |
# To do that it uses a runtime length encoding of the number plus a base128 encoding. | |
# It works with numbers up to 128 digits long (in base 128), that means the biggest | |
# number correctly processed is 270 digits long in decimal: | |
# 5282945311356652463523397849165166065188473260361215221279607090266739025567248594… | |
# 7441725588765718789467439499325712867888234755950268553725053897846293957690838668399… | |
# 9005084168731517676426441053024232908211188404148028292751561738838396898767036476489… | |
# 538580897737998335 - a pretty friggin huge number. Larger numbers are truncated to | |
# that. | |
# The scheme used will never use more bytes than the ascii representation of the number | |
# used. E.g. 0-9 will still use 1 byte only. Larger numbers even use less space, e.g. | |
# 999_999_999_999 (12 bytes in decimal in ascii) can be represented by 8 bytes. | |
# Theoretically the implementation could be improved to support numbers as big as | |
# 128^(128^8). | |
def digits_to_natural_key(value) | |
# * 1-8 are for runtime length encoded negative numbers | |
# * 9 is for 1 digit (in base 128) negative numbers | |
# * 10-29 for single digits | |
# * 21 for 1 digit (in base 128) positive numbers | |
# * 22-29 for runtime length encoded positive numbers | |
if value.between?(0,9) then | |
SingleDigitToNaturalKey[value] | |
else | |
sign = value < 0 | |
value = value.abs | |
value = BiggestSortableNumber if value > BiggestSortableNumber | |
converted = base128(value) | |
if converted.size == 1 then | |
converted << (sign ? 10 : 21) | |
else | |
converted << converted.size | |
converted << (sign ? 9 : 22) # simplified version | |
# converted << (sign ? 10-converted.size : 21+converted.size) # this version would enable very large numbers | |
end | |
converted.reverse.pack("U*").encode(Encoding::UTF_8) # must be big-endian | |
end | |
end | |
def base128(value) | |
base128 = [] | |
begin | |
value, digit = value.divmod(128) | |
base128 << digit | |
end until value.zero? | |
base128 | |
end | |
end | |
if __FILE__ == $0 then | |
include Transliterate | |
require 'test/unit' | |
class TestNatcmp < Test::Unit::TestCase | |
def test_natsortkey | |
assert_equal(%w(hello3.jpg Hello5.jpg hello10.jpg Hello329.jpg hello292349282912405965338291184.jpg), %w(hello292349282912405965338291184.jpg hello3.jpg Hello329.jpg hello10.jpg Hello5.jpg).sort_by { |a| case_insensitive_natural_sort_key(a) }) | |
assert_equal(%w(Hello5.jpg hello3.jpg hello10.jpg), %w(hello3.jpg hello10.jpg Hello5.jpg).sort_by { |a| case_sensitive_natural_sort_key(a) }) | |
assert_equal(%w(Hagar Hägar Hugar), %w(Hugar Hägar Hagar).sort_by { |a| case_sensitive_natural_sort_key(a) }) | |
assert_equal(%w(Hagar Hugar Hägar), %w(Hugar Hägar Hagar).sort_by { |a| a }) | |
assert(case_sensitive_natural_sort_key("Hägar20").valid_encoding?) | |
assert(case_insensitive_natural_sort_key("Hägar20").valid_encoding?) | |
assert_equal(%w(Ä ä), %w(ä Ä).sort_by { |a| case_insensitive_natural_sort_key(a, true) }) | |
assert_equal(case_insensitive_natural_sort_key("Ä", false), case_insensitive_natural_sort_key("ä", false)) | |
end | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment