Created
May 23, 2015 22:57
-
-
Save ScottPJones/f3fb082ac30d337d91bc to your computer and use it in GitHub Desktop.
Test generic vs. separate methods to check UTF-16/UTF-32/AbstractString
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This file is a part of Julia. License is MIT: http://julialang.org/license
module CheckUTF | |
# Compatibility shim: when running on a release older than 0.4, define the
# type names used by this module as aliases of their older spellings.
if VERSION < v"0.4-"
    typealias UInt8 Uint8
    typealias UInt16 Uint16
    typealias UInt32 Uint32
    typealias UInt Uint
    typealias AbstractString String
end
#=
@doc """
@brief      Error codes and message templates for Unicode / UTF support

Each UTF_ERR_* code is the index of its template in errMsgs, and UTF_ERR_MAX
is the largest valid code.  Inside a template, <<1>> stands for the error
position and <<2>> for the offending value in hexadecimal; utf_errfunc
performs the substitution.
""" ->
=#
const UTF_ERR_SHORT = 1
const UTF_ERR_CONT = 2
const UTF_ERR_LONG = 3
const UTF_ERR_NOT_LEAD = 4
const UTF_ERR_NOT_TRAIL = 5
const UTF_ERR_NOT_SURROGATE = 6
const UTF_ERR_MISSING_SURROGATE = 7
const UTF_ERR_INVALID = 8
const UTF_ERR_SURROGATE = 9
const UTF_ERR_NULL_16_TERMINATE = 10
const UTF_ERR_NULL_32_TERMINATE = 11
const UTF_ERR_MAX = 11
# Templates are listed in error-code order; every parenthesis is balanced so the
# substituted message reads as one well-formed sentence.
const errMsgs = [
    "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> missing one or more continuation bytes)",
    "invalid UTF-8 sequence starting at index <<1>> (0x<<2>> is not a continuation byte)",
    "invalid UTF-8 sequence, overlong encoding starting at index <<1>> (0x<<2>>)",
    "not a leading Unicode surrogate character at index <<1>> (0x<<2>>)",
    "not a trailing Unicode surrogate character at index <<1>> (0x<<2>>)",
    "not a valid Unicode surrogate character at index <<1>> (0x<<2>>)",
    "missing trailing Unicode surrogate character after index <<1>> (0x<<2>>)",
    "invalid Unicode character starting at index <<1>> (0x<<2>> > 0x10ffff)",
    "surrogate encoding not allowed in UTF-8 or UTF-32, at index <<1>> (0x<<2>>)",
    "UTF16String data must be NULL-terminated",
    "UTF32String data must be NULL-terminated"
]
#=
@doc """
@brief      Reports a Unicode error by throwing an ArgumentError that names the
            problem, its position and the offending value
@param[in]  errCode::Integer  one of the UTF_ERR_* codes (1 through UTF_ERR_MAX)
@param[in]  errPos::Integer   position substituted for <<1>> in the message template
@param[in]  errChar::Integer  value substituted, in hexadecimal, for <<2>>
@throws     ArgumentError on every path; the function never returns normally
""" ->
=#
function utf_errfunc(errCode::Integer, errPos::Integer, errChar::Integer)
    # An unknown code gets its own diagnostic instead of an out-of-range lookup
    if !(1 <= errCode <= UTF_ERR_MAX)
        throw(ArgumentError("Invalid error code for Unicode error: $errCode, Pos = $errPos, Char = $errChar"))
    end
    template = errMsgs[errCode]
    withpos = replace(template, "<<1>>", string(errPos))
    message = replace(withpos, "<<2>>", hex(errChar))
    throw(ArgumentError(message))
    -1  # never reached: both paths above throw
end
# Surrogate and continuation-byte predicates.
# 0xfffffc00 is ~0x003ff (keeps everything above the low 10 bits) and
# 0xfffff800 is ~0x007ff (keeps everything above the low 11 bits).

# True when `c` is a leading surrogate (0xd800-0xdbff)
function is_surrogate_lead(c::Unsigned)
    return (c & 0xfffffc00) == 0xd800
end

# True when `c` is a trailing surrogate (0xdc00-0xdfff)
function is_surrogate_trail(c::Unsigned)
    return (c & 0xfffffc00) == 0xdc00
end

# True when `c` is any surrogate (0xd800-0xdfff)
function is_surrogate_char(c::Unsigned)
    return (c & 0xfffff800) == 0xd800
end

# True when the top two bits of `c` are 10, i.e. a UTF-8 continuation byte (0x80-0xbf)
function is_valid_continuation(c)
    return (c & 0xc0) == 0x80
end
# Option bits tested against the `options` argument of the check functions
const UTF_NO_LONG_NULL  = 1 << 0   # reject 0xc0 0x80 as an encoding of '\0'
const UTF_NO_SURROGATES = 1 << 1   # reject surrogate pairs in UTF-8/UTF-32
const UTF_ACCEPT_LONG   = 1 << 2   # accept long encodings (other than long null in UTF-8)

# Result bits OR-ed into the `flags` value the check functions return
const UTF_LONG      = 1 << 0       # long encodings are present
const UTF_LATIN1    = 1 << 1       # characters in range 0x80-0xff present
const UTF_UNICODE2  = 1 << 2       # characters in range 0x100-0x7ff present
const UTF_UNICODE3  = 1 << 3       # characters in range 0x800-0xd7ff, 0xe000-0xffff present
const UTF_UNICODE4  = 1 << 4       # non-BMP characters present
const UTF_SURROGATE = 1 << 5       # surrogate pairs present
# Read the next UTF-8 continuation byte.
# Expands, in the caller's scope, to code that advances `pos` by one, stores
# `str[pos]` in `byt`, raises UTF_ERR_CONT through utf_errfunc when that byte is
# not a continuation byte, and otherwise shifts its low six bits into `ch`.
macro get_continuation!(ch, byt, str, pos)
    # Escape each argument once so the generated code refers to the caller's variables
    ech, ebyt, estr, epos = esc(ch), esc(byt), esc(str), esc(pos)
    quote
        $(ebyt) = $(estr)[$(epos) += 1]
        is_valid_continuation($(ebyt)) || utf_errfunc(UTF_ERR_CONT, $(epos), $(ebyt))
        $(ech) = ($(ech) << 6) | ($(ebyt) & 0x3f)
    end
end
# Result record returned by the check functions in this module.
#
# Non-ASCII characters are classified by value:
#   0x80-0xff                   -> LATIN1
#   0x100-0x7ff                 -> UNICODE2
#   0x800-0xd7ff, 0xe000-0xffff -> UNICODE3
#   0xd800-0xdfff               -> SURROGATE
#   0x10000-0x10ffff            -> UNICODE4
immutable UTF_String_Counts
    cntT::Int   # total number of characters
    cnt2::Int   # characters in 0x80:0x7ff (2 bytes in UTF-8)
    cnt3::Int   # characters in 0x800:0xd7ff and 0xe000:0xffff (3 bytes in UTF-8)
    cnt4::Int   # characters in 0x10000:0x10ffff (4 bytes in UTF-8)
    flags::Int  # bitwise OR of the UTF_* result flags
end
#=
@doc """
@brief      Validates a UTF-8 byte vector and counts the characters it contains
@param[in]  str      Vector{UInt8} holding the encoded bytes
@param[in]  options  bitwise OR of UTF_NO_LONG_NULL, UTF_NO_SURROGATES and
                     UTF_ACCEPT_LONG (default 0)
@return     UTF_String_Counts(total characters, 2-byte, 3-byte, 4-byte, flags)
@throws     ArgumentError (raised through utf_errfunc) for truncated, malformed,
            overlong or out-of-range sequences
""" ->
=#
function check_string_utf8(str::Vector{UInt8}, options::Integer=0)
    local byt::UInt8
    local ch::UInt32
    local surr::UInt32
    local cntT::Int=0, cnt2::Int=0, cnt3::Int=0, cnt4::Int=0, flags::Int=0
    pos = 0
    len = sizeof(str)
    # Bounds checks are off, so each multi-byte branch tests the remaining
    # length explicitly before reading its continuation bytes
    @inbounds while pos < len
        ch = str[pos += 1]
        cntT += 1
        if ch > 0x7f
            # Check UTF-8 encoding
            if ch < 0xc0
                # 0x80-0xbf are continuation bytes and can never start a sequence;
                # reported with the same code used for the other invalid lead bytes
                utf_errfunc(UTF_ERR_INVALID, pos, ch)
            elseif ch < 0xe0
                # 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff)
                (pos == len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
                ch &= 0x1f
                @get_continuation!(ch, byt, str, pos)
                if ch > 0x7f
                    cnt2 += 1
                    flags |= (ch > 0xff) ? UTF_UNICODE2 : UTF_LATIN1
                elseif (options & UTF_ACCEPT_LONG) != 0
                    flags |= UTF_LONG
                elseif (ch == 0) && ((options & UTF_NO_LONG_NULL) == 0)
                    # 0xc0 0x80 as an encoding of '\0' is tolerated unless forbidden
                    flags |= UTF_LONG
                else
                    # pos-1 is the lead byte, where the sequence starts
                    utf_errfunc(UTF_ERR_LONG, pos-1, ch)
                end
            elseif ch < 0xf0
                # 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff)
                (pos + 2 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
                ch &= 0x0f
                @get_continuation!(ch, byt, str, pos)
                @get_continuation!(ch, byt, str, pos)
                # check for surrogate pairs, make sure correct
                if is_surrogate_char(ch)
                    !is_surrogate_lead(ch) && utf_errfunc(UTF_ERR_NOT_LEAD, pos-2, ch)
                    # next character *must* be a trailing surrogate character,
                    # itself encoded as a 3-byte sequence starting with 0xed
                    (pos + 3 > len) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos-2, ch)
                    byt = str[pos += 1] ; (byt != 0xed) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, byt)
                    surr = 0xd   # payload bits of the 0xed lead byte
                    @get_continuation!(surr, byt, str, pos)
                    @get_continuation!(surr, byt, str, pos)
                    !is_surrogate_trail(surr) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos-2, surr)
                    (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, pos-2, surr)
                    flags |= UTF_SURROGATE
                    cnt4 += 1
                elseif ch > 0x07ff
                    cnt3 += 1
                elseif (options & UTF_ACCEPT_LONG) != 0
                    flags |= UTF_LONG
                    # Only 0x80-0x7ff belongs in cnt2; an overlong ASCII character
                    # is counted in cntT alone, as in the 4-byte branch below
                    ch > 0x7f && (cnt2 += 1)
                else
                    utf_errfunc(UTF_ERR_LONG, pos-2, ch)
                end
            elseif ch < 0xf5
                # 4-byte UTF-8 sequence (i.e. characters > 0xffff)
                (pos + 3 > len) && utf_errfunc(UTF_ERR_SHORT, pos, ch)
                ch &= 0x07
                @get_continuation!(ch, byt, str, pos)
                @get_continuation!(ch, byt, str, pos)
                @get_continuation!(ch, byt, str, pos)
                if ch > 0x10ffff
                    utf_errfunc(UTF_ERR_INVALID, pos-3, ch)
                elseif ch > 0xffff
                    cnt4 += 1
                elseif is_surrogate_char(ch)
                    utf_errfunc(UTF_ERR_SURROGATE, pos-3, ch)
                elseif (options & UTF_ACCEPT_LONG) != 0
                    # This is an overly long encoded character
                    flags |= UTF_LONG
                    if ch > 0x7ff
                        cnt3 += 1
                    elseif ch > 0x7f
                        cnt2 += 1
                    end
                else
                    # pos-3 is the lead byte of the 4-byte sequence
                    utf_errfunc(UTF_ERR_LONG, pos-3, ch)
                end
            else
                # 0xf5-0xff cannot start any sequence
                utf_errfunc(UTF_ERR_INVALID, pos, ch)
            end
        end
    end
    return UTF_String_Counts(cntT, cnt2, cnt3, cnt4, flags | (cnt3 == 0 ? 0 : UTF_UNICODE3) | (cnt4 == 0 ? 0 : UTF_UNICODE4))
end
#=
@doc """
@brief      Validates and calculates number of characters in a UTF-16 string
@param[in]  str  Vector{UInt16} of code units
@param[in]  len  number of code units to examine, starting at index 1; bounds
                 checks are disabled, so it must not exceed length(str)
@return     UTF_String_Counts(total characters, 2-byte, 3-byte, 4-byte, flags)
@throws     ArgumentError (raised through utf_errfunc) for unpaired surrogates
""" ->
=#
function check_string_utf16(str::Vector{UInt16}, len::Int)
    local ch::UInt32
    local cntT::Int=0, cnt2::Int=0, cnt3::Int=0, cnt4::Int=0, flags::Int=0
    local pos::Int = 0
    @inbounds while pos < len
        ch = str[pos += 1]
        cntT += 1
        if ch > 0x7f
            if ch < 0x100
                cnt2 += 1
                flags |= UTF_LATIN1
            elseif ch < 0x800
                cnt2 += 1
                flags |= UTF_UNICODE2
            elseif !is_surrogate_char(ch)
                cnt3 += 1
            elseif is_surrogate_lead(ch)
                pos == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, pos, ch)
                # next character *must* be a trailing surrogate character
                ch = str[pos += 1]
                !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, pos, ch)
                cnt4 += 1
                # Record the pair, matching what the generic check_string
                # reports for the same Vector{UInt16} input
                flags |= UTF_SURROGATE
            else
                utf_errfunc(UTF_ERR_NOT_LEAD, pos, ch)
            end
        end
    end
    return UTF_String_Counts(cntT, cnt2, cnt3, cnt4, flags | (cnt3 == 0 ? 0 : UTF_UNICODE3) | (cnt4 == 0 ? 0 : UTF_UNICODE4))
end
#=
@doc """
@brief      Validates and calculates number of characters in a UTF-32 string
@param[in]  str      Vector{UInt32} of code units
@param[in]  len      number of code units to examine, starting at index 1
@param[in]  options  flags to determine error handling (default 0); only
                     UTF_NO_SURROGATES is consulted here
@return     UTF_String_Counts(total characters, 2-byte, 3-byte, 4-byte, flags)
@throws     ArgumentError (raised through utf_errfunc)
""" ->
=#
function check_string_utf32(str::Vector{UInt32}, len::Int, options::Integer=0)
    local unit::UInt32
    local total::Int=0, n2::Int=0, n3::Int=0, n4::Int=0, seen::Int=0
    local idx::Int = 0
    reject_surrogates = (options & UTF_NO_SURROGATES) != 0
    @inbounds while idx < len
        unit = str[idx += 1]
        total += 1
        unit <= 0x7f && continue   # ASCII needs no classification
        if unit < 0x800
            n2 += 1
            seen |= (unit < 0x100) ? UTF_LATIN1 : UTF_UNICODE2
        elseif unit > 0xffff
            unit > 0x10ffff && utf_errfunc(UTF_ERR_INVALID, idx, unit)
            n4 += 1
        elseif !is_surrogate_char(unit)
            n3 += 1
        else
            # A surrogate is only acceptable as a lead immediately followed by a trail
            is_surrogate_lead(unit) || utf_errfunc(UTF_ERR_NOT_LEAD, idx, unit)
            idx == len && utf_errfunc(UTF_ERR_MISSING_SURROGATE, idx, unit)
            unit = str[idx += 1]
            is_surrogate_trail(unit) || utf_errfunc(UTF_ERR_NOT_TRAIL, idx, unit)
            reject_surrogates && utf_errfunc(UTF_ERR_SURROGATE, idx, unit)
            n4 += 1
            seen |= UTF_SURROGATE
        end
    end
    n3 != 0 && (seen |= UTF_UNICODE3)
    n4 != 0 && (seen |= UTF_UNICODE4)
    return UTF_String_Counts(total, n2, n3, n4, seen)
end
#=
@doc """
@brief      Validates and calculates number of characters in any AbstractString,
            walking it with the start/next/done iteration functions
@param[in]  str      AbstractString to examine
@param[in]  options  flags to determine error handling (default 0); only
                     UTF_NO_SURROGATES is consulted here
@return     UTF_String_Counts(total characters, 2-byte, 3-byte, 4-byte, flags)
@throws     ArgumentError (raised through utf_errfunc)
""" ->
=#
function check_string_abs(str::AbstractString, options::Integer=0)
    local ch::UInt32
    local cntT::Int=0, cnt2::Int=0, cnt3::Int=0, cnt4::Int=0, flags::Int=0
    local pos::Int = start(str)
    local idx::Int   # index of the character currently being examined
    # `pos` is always the index of the NEXT character to read, so the loop must
    # run until the iterator is done; stopping at `pos < endof(str)` would
    # leave the final character unchecked
    @inbounds while !done(str, pos)
        idx = pos
        ch, pos = next(str, pos)
        cntT += 1
        if ch > 0x7f
            if ch < 0x100
                cnt2 += 1
                flags |= UTF_LATIN1
            elseif ch < 0x800
                cnt2 += 1
                flags |= UTF_UNICODE2
            elseif ch > 0xffff
                (ch > 0x10ffff) && utf_errfunc(UTF_ERR_INVALID, idx, ch)
                cnt4 += 1
            elseif !is_surrogate_char(ch)
                cnt3 += 1
            elseif is_surrogate_lead(ch)
                # The trail is missing only when nothing is left to read
                done(str, pos) && utf_errfunc(UTF_ERR_MISSING_SURROGATE, idx, ch)
                # next character *must* be a trailing surrogate character
                idx = pos
                ch, pos = next(str, pos)
                !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, idx, ch)
                cnt4 += 1
                (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, idx, ch)
                flags |= UTF_SURROGATE
            else
                utf_errfunc(UTF_ERR_NOT_LEAD, idx, ch)
            end
        end
    end
    return UTF_String_Counts(cntT, cnt2, cnt3, cnt4, flags | (cnt3 == 0 ? 0 : UTF_UNICODE3) | (cnt4 == 0 ? 0 : UTF_UNICODE4))
end
#=
@doc """
@brief      Generic validator/counter for UTF-16 and UTF-32 code-unit vectors,
            Char vectors and AbstractStrings
@param[in]  str      data to examine
@param[in]  pos      for vectors, the index just before the first unit to read
                     (callers in this file pass 0); for strings, the index of
                     the first character to read
@param[in]  len      index of the last unit (vectors) or last character (strings)
@param[in]  options  flags to determine error handling (default 0); only
                     UTF_NO_SURROGATES is consulted, and never for Vector{UInt16}
@return     UTF_String_Counts(total characters, 2-byte, 3-byte, 4-byte, flags)
@throws     ArgumentError (raised through utf_errfunc)
""" ->
=#
function check_string{T<:Union(Vector{UInt16},Vector{UInt32},Vector{Char},AbstractString)}(str::T, pos::Int, len::Int, options::Integer=0)
    local ch::UInt32
    local idx::Int   # index of the unit currently being examined
    local cntT::Int=0, cnt2::Int=0, cnt3::Int=0, cnt4::Int=0, flags::Int=0
    # T is the concrete argument type, so strings must be recognised with `<:`
    # (a comparison against AbstractString itself is never true).
    # Vectors pre-increment `pos`, so data remains while pos < len; strings keep
    # `pos` on the next character, so data remains while pos <= len.
    limit = (T <: AbstractString) ? len + 1 : len
    @inbounds while pos < limit
        if T <: AbstractString
            idx = pos
            ch, pos = next(str, pos)
        else
            ch = str[pos += 1]
            idx = pos
        end
        cntT += 1
        if ch > 0x7f
            if ch < 0x100
                cnt2 += 1
                flags |= UTF_LATIN1
            elseif ch < 0x800
                cnt2 += 1
                flags |= UTF_UNICODE2
            elseif T != Vector{UInt16} && ch > 0xffff
                (ch > 0x10ffff) && utf_errfunc(UTF_ERR_INVALID, idx, ch)
                cnt4 += 1
            elseif !is_surrogate_char(ch)
                cnt3 += 1
            elseif is_surrogate_lead(ch)
                pos >= limit && utf_errfunc(UTF_ERR_MISSING_SURROGATE, idx, ch)
                # next character *must* be a trailing surrogate character
                if T <: AbstractString
                    idx = pos
                    ch, pos = next(str, pos)
                else
                    ch = str[pos += 1]
                    idx = pos
                end
                !is_surrogate_trail(ch) && utf_errfunc(UTF_ERR_NOT_TRAIL, idx, ch)
                cnt4 += 1
                # Surrogate pairs are the normal encoding in UTF-16, so the
                # option can only reject them for the other containers
                if T != Vector{UInt16}
                    (options & UTF_NO_SURROGATES) != 0 && utf_errfunc(UTF_ERR_SURROGATE, idx, ch)
                end
                flags |= UTF_SURROGATE
            else
                utf_errfunc(UTF_ERR_NOT_LEAD, idx, ch)
            end
        end
    end
    return UTF_String_Counts(cntT, cnt2, cnt3, cnt4, flags | (cnt3 == 0 ? 0 : UTF_UNICODE3) | (cnt4 == 0 ? 0 : UTF_UNICODE4))
end
# Benchmark driver: run the generic check_string over the code units of a
# UTF16String `max` times and return the result of the final run.
# The unit count is the byte size of the data vector divided by two.
# NOTE(review): `result` is never assigned when max < 1.
function chkutf16(str::UTF16String, max::Int)
    local result
    units = str.data
    nunits = sizeof(units) >>> 1
    for pass = 1:max
        result = check_string(units, 0, nunits)
    end
    result
end
# Benchmark driver: run the generic check_string over the data of a
# UTF32String `max` times and return the result of the final run.
# The unit count is the byte size of the data vector divided by four.
# NOTE(review): `result` is never assigned when max < 1.
function chkutf32(str::UTF32String, max::Int)
    local result
    units = str.data
    nunits = sizeof(units) >>> 2
    for pass = 1:max
        result = check_string(units, 0, nunits)
    end
    result
end
# Benchmark driver: run check_string_utf8 over the bytes of a UTF8String
# `max` times and return the result of the final run.
# NOTE(review): `result` is never assigned when max < 1.
function chkstr8(str::UTF8String, max::Int)
    local result
    bytes = str.data
    for pass = 1:max
        result = check_string_utf8(bytes)
    end
    result
end
# Benchmark driver: run the UTF-16-specific check_string_utf16 over the code
# units of a UTF16String `max` times and return the result of the final run.
# The unit count is the byte size of the data vector divided by two.
# NOTE(review): `result` is never assigned when max < 1.
function chkstr16(str::UTF16String, max::Int)
    local result
    units = str.data
    nunits = sizeof(units) >>> 1
    for pass = 1:max
        result = check_string_utf16(units, nunits)
    end
    result
end
# Benchmark driver: run the UTF-32-specific check_string_utf32 `max` times and
# return the result of the final run.  The string's data is reinterpreted as
# UInt32 first because check_string_utf32 only accepts Vector{UInt32}.
# NOTE(review): `result` is never assigned when max < 1.
function chkstr32(str::UTF32String, max::Int)
    local result
    units = reinterpret(UInt32, str.data)
    nunits = sizeof(units) >>> 2
    for pass = 1:max
        result = check_string_utf32(units, nunits)
    end
    result
end
# Time every checker on the same text in its three encodings, `n` passes each,
# printing a label in front of each @time report.
function tstchk(n::Int, strUTF8::UTF8String, strUTF16::UTF16String, strUTF32::UTF32String)
    # Encoding-specific implementations
    print("Check UTF-8 ");    @time chkstr8(strUTF8, n)
    print("Check UTF-16 ");   @time chkstr16(strUTF16, n)
    print("Check UTF-32 ");   @time chkstr32(strUTF32, n)
    # Generic implementation on the same data
    print("Generic UTF-16 "); @time chkutf16(strUTF16, n)
    print("Generic UTF-32 "); @time chkutf32(strUTF32, n)
end
# Print a header for one test case (label, pass count, character length and the
# byte size of each encoding), then time all checkers on it via tstchk.
# `str` is the label shown in front of ": Looping ...".  It is typed as
# AbstractString, the name this module uses everywhere else (and aliases on old
# releases), rather than the older `String` spelling.
function tstall(str::AbstractString, n::Int, strUTF8::UTF8String, strUTF16::UTF16String, strUTF32::UTF32String)
    println("\n\n$str: Looping $n times, length=$(length(strUTF32))")
    println("UTF-8: $(sizeof(strUTF8)), UTF-16: $(sizeof(strUTF16)), UTF-32: $(sizeof(strUTF32))\n")
    tstchk(n, strUTF8, strUTF16, strUTF32)
end
# Convert the ASCII-only and the surrogate-bearing UTF-8 inputs to UTF-16 and
# UTF-32, then run the timing suite on each set.
# Only strA_UTF8 and strS_UTF8 are used; the remaining string arguments are
# accepted but not exercised here.
function tstsiz(n,strAscii,strA_UTF8,strL_UTF8,str2_UTF8,str3_UTF8,str4_UTF8,strS_UTF8)
    strA_UTF16 = utf16(strA_UTF8)
    strS_UTF16 = utf16(strS_UTF8)
    strA_UTF32 = utf32(strA_UTF8)
    strS_UTF32 = utf32(strS_UTF8)
    # tstall prints ": " after the label itself, so the labels carry no colon
    tstall("ASCII",n,strA_UTF8,strA_UTF16,strA_UTF32)
    tstall("Surrogates",n,strS_UTF8,strS_UTF16,strS_UTF32)
end
export dotest

# Entry point of the benchmark: build large test strings and hand them to
# tstsiz, which times each checker for `n` passes.
function dotest(n)
    reps = 262144   # every sample text is repeated this many times

    # Create some ASCII, UTF8, UTF16, and UTF32 strings
    baseascii = "abcdefghijklmnop\uff"
    strAscii  = "abcdefghijklmnop"^reps
    # The first 16 characters of baseascii are ASCII only
    strA_UTF8 = baseascii[1:16]^reps
    strL_UTF8 = "abcdefghijk\uff\uff\uff\uff\uff"^reps
    str2_UTF8 = "abcdefghijk\uff\uff\uff\u7ff\u7ff"^reps
    str3_UTF8 = "abcdefghijk\uff\uff\uff\u7fff\u7fff"^reps
    str4_UTF8 = "abcdefghijk\uff\u7ff\u7fff\U7ffff\U0fffff"^reps

    # Raw bytes that include surrogates encoded as 3-byte sequences
    # (0xed 0xa0 0x80 followed by 0xed 0xb0 0x80, and so on)
    binstr = b"abcdefghijk\xc3\xbf\xdf\xbf\xe7\xbf\xbf\xed\xa0\x80\xed\xb0\x80\xed\xaf\xbf\xed\xbf\xbf"
    # Nine rounds of quadrupling repeat the bytes 4^9 = 262144 times
    for round = 1:9
        binstr = vcat(binstr, binstr, binstr, binstr)
    end

    tstsiz(n, strAscii, strA_UTF8, strL_UTF8, str2_UTF8, str3_UTF8, str4_UTF8, UTF8String(binstr))
end
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment