Skip to content

Instantly share code, notes, and snippets.

@asterite
Created January 5, 2018 15:11
Show Gist options
  • Save asterite/2ac6aeb59413e891d3f406072cd5e071 to your computer and use it in GitHub Desktop.
Save asterite/2ac6aeb59413e891d3f406072cd5e071 to your computer and use it in GitHub Desktop.
require "spec"
# Converts between `String` and UTF-16 code units stored in a `Slice(UInt16)`.
#
# Anything that cannot be represented as well-formed UTF-16 (a surrogate code
# point on the encoding side, an unpaired surrogate unit on the decoding side)
# is replaced by U+FFFD, the Unicode replacement character.
module UTF16
  # Encodes *string* as a sequence of UTF-16 code units.
  #
  # Code points below U+10000 take one unit; code points from U+10000 upwards
  # take a surrogate pair (two units). Code points inside the surrogate range
  # U+D800..U+DFFF are emitted as U+FFFD.
  def self.encode(string : String) : Slice(UInt16)
    # First pass: count the units so the slice can be allocated once.
    size = 0
    string.each_char do |char|
      size += char.ord < 0x10000 ? 1 : 2
    end

    slice = Slice(UInt16).new(size)

    # Second pass: write the units.
    i = 0
    string.each_char do |char|
      ord = char.ord
      # Strictly below 0xd800: U+D800 itself is a high surrogate and must
      # fall through to the replacement branch like every other surrogate.
      if ord < 0xd800 || (0xe000 <= ord < 0x10000)
        # One UInt16 is enough
        slice[i] = ord.to_u16
      elsif ord >= 0x10000
        # Needs a surrogate pair
        ord -= 0x10000
        slice[i] = 0xd800_u16 + ((ord >> 10) & 0x3ff) # Keep top 10 bits
        i += 1
        slice[i] = 0xdc00_u16 + (ord & 0x3ff) # Keep low 10 bits
      else
        # Surrogate code point: use the replacement character
        slice[i] = 0xfffd_u16
      end
      i += 1
    end

    slice
  end

  # Decodes UTF-16 code units in *slice* into a `String`.
  #
  # A high surrogate immediately followed by a low surrogate becomes a single
  # character; any other surrogate unit becomes U+FFFD.
  def self.decode(slice : Slice(UInt16)) : String
    # First pass: measure the UTF-8 byte size and the character count.
    bytesize = 0
    size = 0
    each_char(slice) do |char|
      bytesize += char.bytesize
      size += 1
    end

    # Second pass: write the UTF-8 bytes straight into the string buffer.
    String.new(bytesize) do |buffer|
      each_char(slice) do |char|
        char.each_byte do |byte|
          buffer.value = byte
          buffer += 1
        end
      end
      # The block reports the byte size and character count back to String.new.
      {bytesize, size}
    end
  end

  # Yields each character decoded from the UTF-16 code units in *slice*.
  private def self.each_char(slice : Slice(UInt16))
    i = 0
    while i < slice.size
      unit = slice[i].to_i
      if unit < 0xd800 || unit >= 0xe000
        # One code unit maps directly to the code point
        codepoint = unit
      elsif 0xd800 <= unit < 0xdc00 &&
            (i + 1) < slice.size &&
            0xdc00 <= slice[i + 1] <= 0xdfff
        # Surrogate pair: combine both units and consume the second one
        codepoint = ((unit - 0xd800) << 10) + (slice[i + 1] - 0xdc00) + 0x10000
        i += 1
      else
        # Unpaired surrogate unit
        codepoint = 0xfffd
      end

      yield codepoint.chr

      i += 1
    end
  end
end
# Specs for UTF16.encode and UTF16.decode, grouped by code point range.
describe UTF16 do
  describe "encode" do
    # Basic Multilingual Plane below the surrogate range: one unit per char.
    it "in the range U+0000..U+D7FF" do
      expected = Slice[0_u16, 0x68_u16, 0x65_u16, 0x6c_u16, 0x6c_u16, 0x6f_u16, 0xd7ff_u16]
      UTF16.encode("\u{0}hello\u{d7ff}").should eq(expected)
    end

    # Basic Multilingual Plane above the surrogate range: one unit per char.
    it "in the range U+E000 to U+FFFF" do
      expected = Slice[0xe000_u16, 0xffff_u16]
      UTF16.encode("\u{e000}\u{ffff}").should eq(expected)
    end

    # Supplementary planes: each char becomes a surrogate pair.
    it "in the range U+10000..U+10FFFF" do
      expected = Slice[0xd800_u16, 0xdc00_u16, 0xdbff_u16, 0xdfff_u16]
      UTF16.encode("\u{10000}\u{10FFFF}").should eq(expected)
    end

    # Surrogate code points are expected to come out as U+FFFD.
    it "in the range U+D800..U+DFFF" do
      expected = Slice[0xFFFD_u16, 0xFFFD_u16]
      UTF16.encode("\u{D800}\u{DFFF}").should eq(expected)
    end
  end

  describe "decode" do
    # Units below the surrogate range map one-to-one to chars.
    it "in the range U+0000..U+D7FF" do
      units = Slice[0_u16, 0x68_u16, 0x65_u16, 0x6c_u16, 0x6c_u16, 0x6f_u16, 0xd7ff_u16]
      decoded = UTF16.decode(units)
      decoded.should eq("\u{0}hello\u{d7ff}")
    end

    # Units above the surrogate range map one-to-one to chars.
    it "in the range U+E000 to U+FFFF" do
      units = Slice[0xe000_u16, 0xffff_u16]
      decoded = UTF16.decode(units)
      decoded.should eq("\u{e000}\u{ffff}")
    end

    # A high/low surrogate pair collapses into one supplementary char.
    it "in the range U+10000..U+10FFFF" do
      units = Slice[0xd800_u16, 0xdc00_u16]
      decoded = UTF16.decode(units)
      decoded.should eq("\u{10000}")
    end

    # Low-then-high is not a valid pair: both units are expected as U+FFFD.
    it "in the range U+D800..U+DFFF" do
      units = Slice[0xdc00_u16, 0xd800_u16]
      decoded = UTF16.decode(units)
      decoded.should eq("\u{fffd}\u{fffd}")
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment