Last active
November 7, 2021 21:28
-
-
Save sevaa/5a50ba98762762ae960b87fb148a7df2 to your computer and use it in GitHub Desktop.
Converting a VARBINARY data block in UTF-8 to a NVARCHAR string in pure Transact-SQL
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
create function dbo.FromUTF8(@s varbinary(max)) | |
returns nvarchar(max) | |
as | |
begin | |
declare @i int = 1, @n int = datalength(@s), @r nvarchar(max) = N'' | |
declare @c int, @c2 int, @c3 int, @c4 int, @u int | |
while @i <= @n | |
begin | |
set @c = ascii(substring(@s, @i, 1)) | |
if (@c & 0x80) = 0 | |
begin | |
set @r += nchar(@c) | |
set @i += 1 | |
end | |
else if (@c & 0xE0) = 0xC0 | |
begin | |
if @i > @n-1 | |
return cast(cast('Malformed UTF-8 - two byte sequence cut short' as int) as nvarchar) | |
set @c2 = ascii(substring(@s, @i+1, 1)) | |
if (@c2 & 0xC0) <> 0x80 | |
return cast(cast('Malformed UTF-8 - continuation missing in a two byte sequence' as int) as nvarchar) | |
set @r += nchar(((@c & 0x1F) * 0x40) | (@c2 & 0x3F)) | |
set @i += 2 | |
end | |
else if (@c & 0xF0) = 0xE0 | |
begin | |
if @i > @n-2 | |
return cast(cast('Malformed UTF-8 - three byte sequence cut short' as int) as nvarchar) | |
set @c2 = ascii(substring(@s, @i+1, 1)) | |
set @c3 = ascii(substring(@s, @i+2, 1)) | |
if (@c2 & 0xC0) <> 0x80 or (@c3 & 0xC0) <> 0x80 | |
return cast(cast('Malformed UTF-8 - continuation missing in a three byte sequence' as int) as nvarchar) | |
set @r += nchar(((@c & 0xF) * 0x1000) | ((@c2 & 0x3F) * 0x40) | (@c3 & 0x3F)) | |
set @i += 3 | |
end | |
else if @c & 0xF8 = 0xF0 | |
begin | |
if @i > @n-3 | |
return cast(cast('Malformed UTF-8 - four byte sequence cut short' as int) as nvarchar) | |
set @c2 = ascii(substring(@s, @i+1, 1)) | |
set @c3 = ascii(substring(@s, @i+2, 1)) | |
set @c4 = ascii(substring(@s, @i+3, 1)) | |
if (@c2 & 0xC0) <> 0x80 or (@c3 & 0xC0) <> 0x80 or (@c4 & 0xC0) <> 0x80 | |
return cast(cast('Malformed UTF-8 - continuation missing in a four byte sequence' as int) as nvarchar) | |
set @u = (((@c & 7) * 0x40000) | ((@c2 & 0x3F) * 0x1000) | ((@c3 & 0x3F) * 0x40) | (@c4 & 0x3F)) - 0x10000 | |
set @r += nchar(0xD800 + @u / 0x400) + nchar(0xDC00 + (@u & 0x3FF)) | |
set @i += 4 | |
end | |
else | |
return cast(cast('Malformed UTF-8 - unexpected character' as int) as nvarchar) | |
end | |
return @r | |
end |
Since MSSQL 2019, there is native support for UTF-8. The following will do the same job, most likely much faster:
declare @s varchar(200) = 'Москва 東京 🙃', @t nvarchar(100)
set @t = CAST(@s AS nvarchar) COLLATE Latin1_General_100_CI_AI_SC_UTF8
print @t
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Here's a gist for the converse operation - conversion from a NVARCHAR string to a UTF-8 VARBINARY: https://gist.github.com/sevaa/f084a0a5a994c3bc28e518d5c708d5f6