asterite · January 5, 2018 15:11
diff --git a/utf16.cr b/utf16.cr
 require "spec"

 # Conversions between `String` values and sequences of UTF-16 code units.
 module UTF16
  # Encodes *string* as a slice of UTF-16 code units.
  #
  # Code points below U+10000 outside the surrogate range are written as a
  # single `UInt16`. Code points from U+10000 up are written as a high/low
  # surrogate pair. A char whose code point lies in U+D800..U+DFFF is
  # written as the replacement character U+FFFD.
  def self.encode(string : String) : Slice(UInt16)
    # First pass: count the code units so the slice is allocated once.
    size = 0
    string.each_char do |char|
      size += char.ord < 0x10000 ? 1 : 2
    end

    slice = Slice(UInt16).new(size)

    i = 0
    string.each_char do |char|
      ord = char.ord
      if ord < 0xd800 || (0xe000 <= ord < 0x10000)
        # One UInt16 is enough (U+D800 itself is a surrogate, hence `<`)
        slice[i] = ord.to_u16
      elsif ord >= 0x10000
        # Needs surrogate pair
        ord -= 0x10000
        slice[i] = 0xd800_u16 + ((ord >> 10) & 0x3ff) # Keep top 10 bits
        i += 1
        slice[i] = 0xdc00_u16 + (ord & 0x3ff) # Keep low 10 bits
      else
        # Surrogate code point (U+D800..U+DFFF): use replacement
        slice[i] = 0xfffd_u16
      end
      i += 1
    end

    slice
  end

  # Decodes the UTF-16 code units in *slice* into a `String`.
  #
  # A high surrogate immediately followed by a low surrogate is combined
  # into one code point; any other surrogate is decoded as U+FFFD.
  def self.decode(slice : Slice(UInt16)) : String
    bytesize = 0
    size = 0

    # First pass: measure the UTF-8 byte size and the number of chars.
    each_char(slice) do |char|
      bytesize += char.bytesize
      size += 1
    end

    # Second pass: write the UTF-8 bytes straight into the string buffer.
    String.new(bytesize) do |buffer|
      each_char(slice) do |char|
        char.each_byte do |byte|
          buffer.value = byte
          buffer += 1
        end
      end
      {bytesize, size}
    end
  end

  # Yields each `Char` decoded from the UTF-16 code units in *slice*.
  private def self.each_char(slice : Slice(UInt16))
    i = 0
    while i < slice.size
      unit = slice[i].to_i
      if unit < 0xd800 || unit >= 0xe000
        # Single code unit
        codepoint = unit
      elsif 0xd800 <= unit < 0xdc00 &&
            (i + 1) < slice.size &&
            0xdc00 <= slice[i + 1] <= 0xdfff
        # Surrogate pair: 10 high bits from the first unit, 10 low bits
        # from the second, offset by 0x10000
        codepoint = ((unit - 0xd800) << 10) + (slice[i + 1] - 0xdc00) + 0x10000
        i += 1
      else
        # Unpaired surrogate: use replacement
        codepoint = 0xfffd
      end

      yield codepoint.chr

      i += 1
    end
  end
 end

 # Specs for UTF16.encode and UTF16.decode, grouped by code point range.
 describe UTF16 do
  describe "encode" do
    it "in the range U+0000..U+D7FF" do
      encoded = UTF16.encode("\u{0}hello\u{d7ff}")
      encoded.should eq(Slice[0_u16, 0x68_u16, 0x65_u16, 0x6c_u16, 0x6c_u16, 0x6f_u16, 0xd7ff_u16])
    end

    it "in the range U+E000 to U+FFFF" do
      encoded = UTF16.encode("\u{e000}\u{ffff}")
      encoded.should eq(Slice[0xe000_u16, 0xffff_u16])
    end

    it "in the range U+10000..U+10FFFF" do
      encoded = UTF16.encode("\u{10000}\u{10FFFF}")
      encoded.should eq(Slice[0xd800_u16, 0xdc00_u16, 0xdbff_u16, 0xdfff_u16])
    end

    it "in the range U+D800..U+DFFF" do
      encoded = UTF16.encode("\u{D800}\u{DFFF}")
      encoded.should eq(Slice[0xFFFD_u16, 0xFFFD_u16])
    end
  end

  describe "decode" do
    it "in the range U+0000..U+D7FF" do
      input = Slice[0_u16, 0x68_u16, 0x65_u16, 0x6c_u16, 0x6c_u16, 0x6f_u16, 0xd7ff_u16]
      UTF16.decode(input).should eq("\u{0}hello\u{d7ff}")
    end

    it "in the range U+E000 to U+FFFF" do
      input = Slice[0xe000_u16, 0xffff_u16]
      UTF16.decode(input).should eq("\u{e000}\u{ffff}")
    end

    it "in the range U+10000..U+10FFFF" do
      # Covers both ends of the range, mirroring the encode spec
      input = Slice[0xd800_u16, 0xdc00_u16, 0xdbff_u16, 0xdfff_u16]
      UTF16.decode(input).should eq("\u{10000}\u{10FFFF}")
    end

    it "in the range U+D800..U+DFFF" do
      # A low surrogate followed by a high surrogate is not a valid pair
      input = Slice[0xdc00_u16, 0xd800_u16]
      UTF16.decode(input).should eq("\u{fffd}\u{fffd}")
    end
  end
 end
	require "spec"

	# Conversions between `String` values and sequences of UTF-16 code units.
	module UTF16
	  # Encodes *string* as a slice of UTF-16 code units.
	  #
	  # Code points below U+10000 outside the surrogate range are written as a
	  # single `UInt16`. Code points from U+10000 up are written as a high/low
	  # surrogate pair. A char whose code point lies in U+D800..U+DFFF is
	  # written as the replacement character U+FFFD.
	  def self.encode(string : String) : Slice(UInt16)
	    # First pass: count the code units so the slice is allocated once.
	    size = 0
	    string.each_char do |char|
	      size += char.ord < 0x10000 ? 1 : 2
	    end

	    slice = Slice(UInt16).new(size)

	    i = 0
	    string.each_char do |char|
	      ord = char.ord
	      if ord < 0xd800 || (0xe000 <= ord < 0x10000)
	        # One UInt16 is enough (U+D800 itself is a surrogate, hence `<`)
	        slice[i] = ord.to_u16
	      elsif ord >= 0x10000
	        # Needs surrogate pair
	        ord -= 0x10000
	        slice[i] = 0xd800_u16 + ((ord >> 10) & 0x3ff) # Keep top 10 bits
	        i += 1
	        slice[i] = 0xdc00_u16 + (ord & 0x3ff) # Keep low 10 bits
	      else
	        # Surrogate code point (U+D800..U+DFFF): use replacement
	        slice[i] = 0xfffd_u16
	      end
	      i += 1
	    end

	    slice
	  end

	  # Decodes the UTF-16 code units in *slice* into a `String`.
	  #
	  # A high surrogate immediately followed by a low surrogate is combined
	  # into one code point; any other surrogate is decoded as U+FFFD.
	  def self.decode(slice : Slice(UInt16)) : String
	    bytesize = 0
	    size = 0

	    # First pass: measure the UTF-8 byte size and the number of chars.
	    each_char(slice) do |char|
	      bytesize += char.bytesize
	      size += 1
	    end

	    # Second pass: write the UTF-8 bytes straight into the string buffer.
	    String.new(bytesize) do |buffer|
	      each_char(slice) do |char|
	        char.each_byte do |byte|
	          buffer.value = byte
	          buffer += 1
	        end
	      end
	      {bytesize, size}
	    end
	  end

	  # Yields each `Char` decoded from the UTF-16 code units in *slice*.
	  private def self.each_char(slice : Slice(UInt16))
	    i = 0
	    while i < slice.size
	      unit = slice[i].to_i
	      if unit < 0xd800 || unit >= 0xe000
	        # Single code unit
	        codepoint = unit
	      elsif 0xd800 <= unit < 0xdc00 &&
	            (i + 1) < slice.size &&
	            0xdc00 <= slice[i + 1] <= 0xdfff
	        # Surrogate pair: 10 high bits from the first unit, 10 low bits
	        # from the second, offset by 0x10000
	        codepoint = ((unit - 0xd800) << 10) + (slice[i + 1] - 0xdc00) + 0x10000
	        i += 1
	      else
	        # Unpaired surrogate: use replacement
	        codepoint = 0xfffd
	      end

	      yield codepoint.chr

	      i += 1
	    end
	  end
	end

	# Specs for UTF16.encode and UTF16.decode, grouped by code point range.
	describe UTF16 do
	  describe "encode" do
	    it "in the range U+0000..U+D7FF" do
	      encoded = UTF16.encode("\u{0}hello\u{d7ff}")
	      encoded.should eq(Slice[0_u16, 0x68_u16, 0x65_u16, 0x6c_u16, 0x6c_u16, 0x6f_u16, 0xd7ff_u16])
	    end

	    it "in the range U+E000 to U+FFFF" do
	      encoded = UTF16.encode("\u{e000}\u{ffff}")
	      encoded.should eq(Slice[0xe000_u16, 0xffff_u16])
	    end

	    it "in the range U+10000..U+10FFFF" do
	      encoded = UTF16.encode("\u{10000}\u{10FFFF}")
	      encoded.should eq(Slice[0xd800_u16, 0xdc00_u16, 0xdbff_u16, 0xdfff_u16])
	    end

	    it "in the range U+D800..U+DFFF" do
	      encoded = UTF16.encode("\u{D800}\u{DFFF}")
	      encoded.should eq(Slice[0xFFFD_u16, 0xFFFD_u16])
	    end
	  end

	  describe "decode" do
	    it "in the range U+0000..U+D7FF" do
	      input = Slice[0_u16, 0x68_u16, 0x65_u16, 0x6c_u16, 0x6c_u16, 0x6f_u16, 0xd7ff_u16]
	      UTF16.decode(input).should eq("\u{0}hello\u{d7ff}")
	    end

	    it "in the range U+E000 to U+FFFF" do
	      input = Slice[0xe000_u16, 0xffff_u16]
	      UTF16.decode(input).should eq("\u{e000}\u{ffff}")
	    end

	    it "in the range U+10000..U+10FFFF" do
	      # Covers both ends of the range, mirroring the encode spec
	      input = Slice[0xd800_u16, 0xdc00_u16, 0xdbff_u16, 0xdfff_u16]
	      UTF16.decode(input).should eq("\u{10000}\u{10FFFF}")
	    end

	    it "in the range U+D800..U+DFFF" do
	      # A low surrogate followed by a high surrogate is not a valid pair
	      input = Slice[0xdc00_u16, 0xd800_u16]
	      UTF16.decode(input).should eq("\u{fffd}\u{fffd}")
	    end
	  end
	end