Created
January 5, 2018 15:11
-
-
Save asterite/2ac6aeb59413e891d3f406072cd5e071 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "spec" | |
# Converts between `String` values and slices of UTF-16 code units.
module UTF16
  # Encodes *string* as a slice of UTF-16 code units.
  #
  # Code points below U+10000 that are not surrogates take one code unit;
  # code points from U+10000 upwards are written as a surrogate pair.
  # A char whose code point lies in the surrogate range U+D800..U+DFFF is
  # not valid on its own and is written as U+FFFD (replacement character).
  def self.encode(string : String) : Slice(UInt16)
    # First pass: count the code units so the slice is allocated only once.
    size = 0
    string.each_char do |char|
      size += char.ord < 0x10000 ? 1 : 2
    end

    slice = Slice(UInt16).new(size)

    # Second pass: write the code units.
    i = 0
    string.each_char do |char|
      ord = char.ord
      if ord < 0xd800 || (0xe000 <= ord < 0x10000)
        # One UInt16 is enough
        slice[i] = ord.to_u16
      elsif ord >= 0x10000
        # Needs surrogate pair
        ord -= 0x10000
        slice[i] = (0xd800 + ((ord >> 10) & 0x3ff)).to_u16 # Keep top 10 bits
        i += 1
        slice[i] = (0xdc00 + (ord & 0x3ff)).to_u16 # Keep low 10 bits
      else
        # Surrogate code point (U+D800..U+DFFF): use replacement
        slice[i] = 0xfffd_u16
      end
      i += 1
    end

    slice
  end

  # Decodes a slice of UTF-16 code units into a `String`.
  #
  # A high surrogate immediately followed by a low surrogate becomes one
  # char; any other surrogate code unit decodes to U+FFFD.
  def self.decode(slice : Slice(UInt16)) : String
    # First pass: measure the result so the string buffer is sized exactly.
    bytesize = 0
    size = 0
    each_char(slice) do |char|
      bytesize += char.bytesize
      size += 1
    end

    String.new(bytesize) do |buffer|
      # Second pass: write each char's bytes into the buffer.
      each_char(slice) do |char|
        char.each_byte do |byte|
          buffer.value = byte
          buffer += 1
        end
      end
      # The block returns the byte size and the char count of the new string.
      {bytesize, size}
    end
  end

  # Yields each char decoded from the UTF-16 code units in *slice*.
  private def self.each_char(slice : Slice(UInt16))
    i = 0
    while i < slice.size
      unit = slice[i].to_i
      if unit < 0xd800 || unit >= 0xe000
        # One code unit
        codepoint = unit
      elsif 0xd800 <= unit < 0xdc00 &&
            (i + 1) < slice.size &&
            0xdc00 <= slice[i + 1] <= 0xdfff
        # Surrogate pair: combine both halves and consume the low surrogate too
        codepoint = ((unit - 0xd800) << 10) + (slice[i + 1].to_i - 0xdc00) + 0x10000
        i += 1
      else
        # Invalid code unit (unpaired or out-of-order surrogate)
        codepoint = 0xfffd
      end

      yield codepoint.chr

      i += 1
    end
  end
end
# Specs for UTF16.encode and UTF16.decode, one example per code point range.
describe UTF16 do
  describe "encode" do
    # Code points below the surrogate range map to a single code unit.
    it "in the range U+0000..U+D7FF" do
      UTF16.encode("\u{0}hello\u{d7ff}").should eq(
        Slice[0_u16, 0x68_u16, 0x65_u16, 0x6c_u16, 0x6c_u16, 0x6f_u16, 0xd7ff_u16]
      )
    end

    # Code points above the surrogate range but within 16 bits map to a single code unit.
    it "in the range U+E000 to U+FFFF" do
      UTF16.encode("\u{e000}\u{ffff}").should eq(Slice[0xe000_u16, 0xffff_u16])
    end

    # Code points from U+10000 upwards are written as surrogate pairs.
    it "in the range U+10000..U+10FFFF" do
      UTF16.encode("\u{10000}\u{10FFFF}").should eq(
        Slice[0xd800_u16, 0xdc00_u16, 0xdbff_u16, 0xdfff_u16]
      )
    end

    # Surrogate code points are expected to be replaced with U+FFFD.
    it "in the range U+D800..U+DFFF" do
      UTF16.encode("\u{D800}\u{DFFF}").should eq(Slice[0xFFFD_u16, 0xFFFD_u16])
    end
  end

  describe "decode" do
    # Single code units below the surrogate range.
    it "in the range U+0000..U+D7FF" do
      units = Slice[0_u16, 0x68_u16, 0x65_u16, 0x6c_u16, 0x6c_u16, 0x6f_u16, 0xd7ff_u16]
      UTF16.decode(units).should eq("\u{0}hello\u{d7ff}")
    end

    # Single code units above the surrogate range.
    it "in the range U+E000 to U+FFFF" do
      UTF16.decode(Slice[0xe000_u16, 0xffff_u16]).should eq("\u{e000}\u{ffff}")
    end

    # A high surrogate followed by a low surrogate decodes to one char.
    it "in the range U+10000..U+10FFFF" do
      UTF16.decode(Slice[0xd800_u16, 0xdc00_u16]).should eq("\u{10000}")
    end

    # A low surrogate followed by a high surrogate is not a pair: two replacements expected.
    it "in the range U+D800..U+DFFF" do
      UTF16.decode(Slice[0xdc00_u16, 0xd800_u16]).should eq("\u{fffd}\u{fffd}")
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment