Last active
August 29, 2015 14:12
-
-
Save nalimilan/5652da74380c7e211131 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Base.<, Base.next, Base.nextind, Base.prevind, Base.colon, Base.length, Base.getindex, Base.first, Base.last, | |
Base.is_utf8_start, Base.utf8_trailing, Base.utf8_offset | |
# Simple index with string plus byte offset | |
immutable StringIndex{S<:AbstractString} | |
str::S | |
i::Int | |
end | |
<(a::StringIndex, b::StringIndex) = a.i < b.i | |
function next(s::UTF8String, idx::StringIndex{UTF8String}) | |
# potentially faster version | |
# d = s.data | |
# a::UInt32 = d[i] | |
# if a < 0x80; return char(a); end | |
# #if a&0xc0==0x80; return '\ufffd'; end | |
# b::UInt32 = a<<6 + d[i+1] | |
# if a < 0xe0; return char(b - 0x00003080); end | |
# c::UInt32 = b<<6 + d[i+2] | |
# if a < 0xf0; return char(c - 0x000e2080); end | |
# return char(c<<6 + d[i+3] - 0x03c82080) | |
i = idx.i | |
d = s.data | |
b = d[i] | |
if !is_utf8_start(b) | |
j = i-1 | |
while 0 < j && !is_utf8_start(d[j]) | |
j -= 1 | |
end | |
if 0 < j && i <= j+utf8_trailing[d[j]+1] <= length(d) | |
# b is a continuation byte of a valid UTF-8 character | |
error("invalid UTF-8 character index") | |
end | |
# move past 1 byte in case the data is actually Latin-1 | |
return '\ufffd', i+1 | |
end | |
trailing = utf8_trailing[b+1] | |
if length(d) < i + trailing | |
return '\ufffd', i+1 | |
end | |
c::UInt32 = 0 | |
for j = 1:trailing+1 | |
c <<= 6 | |
c += d[i] | |
i += 1 | |
end | |
c -= utf8_offset[trailing+1] | |
char(c), i | |
end | |
getindex{S<:AbstractString}(s::S, i::StringIndex{S}) = next(s,i)[1] | |
nextind(i::StringIndex) = StringIndex(i.str, nextind(i.str, i.i)) | |
prevind(i::StringIndex) = StringIndex(i.str, prevind(i.str, i.i)) | |
function +(si::StringIndex, j::Integer) | |
j < 0 && return si - (-j) | |
i = si.i | |
while j > 0 | |
i = nextind(si.str, i) | |
j -= 1 | |
end | |
return StringIndex(si.str, i) | |
end | |
function -(si::StringIndex, j::Integer) | |
j < 0 && return si + (-j) | |
i = si.i | |
while j > 0 | |
i = prevind(si.str, i) | |
j -= 1 | |
end | |
return StringIndex(si.str, i) | |
end | |
# Variant with byte index + character offset | |
immutable StringIndexOffset{S<:AbstractString} | |
i::Int | |
offset::Int | |
end | |
# StringIndexOffset{S<:AbstractString}(i::Int) = StringIndexOffset{S}(i, 0) | |
function <(a::StringIndexOffset, b::StringIndexOffset) | |
a.i == b.i || throw(ArgumentError()) | |
return a.offset < b.offset | |
end | |
@inline function byte_index(s::UTF8String, idx::StringIndexOffset{UTF8String}) | |
i = idx.i | |
idx.offset == 0 && return i | |
if idx.offset > 0 | |
for j in 1:idx.offset | |
i = nextind(s, i) | |
end | |
elseif idx.offset < 0 | |
for j in idx.offset:1 | |
i = prevind(s, i) | |
end | |
end | |
return i | |
end | |
function next(s::UTF8String, idx::StringIndexOffset{UTF8String}) | |
# potentially faster version | |
# d = s.data | |
# a::UInt32 = d[i] | |
# if a < 0x80; return char(a); end | |
# #if a&0xc0==0x80; return '\ufffd'; end | |
# b::UInt32 = a<<6 + d[i+1] | |
# if a < 0xe0; return char(b - 0x00003080); end | |
# c::UInt32 = b<<6 + d[i+2] | |
# if a < 0xf0; return char(c - 0x000e2080); end | |
# return char(c<<6 + d[i+3] - 0x03c82080) | |
i = byte_index(s, idx) | |
d = s.data | |
b = d[i] | |
if !is_utf8_start(b) | |
j = i-1 | |
while 0 < j && !is_utf8_start(d[j]) | |
j -= 1 | |
end | |
if 0 < j && i <= j+utf8_trailing[d[j]+1] <= length(d) | |
# b is a continuation byte of a valid UTF-8 character | |
error("invalid UTF-8 character index") | |
end | |
# move past 1 byte in case the data is actually Latin-1 | |
return '\ufffd', StringIndexOffset{UTF8String}(i+1, 0) | |
end | |
trailing = utf8_trailing[b+1] | |
if length(d) < i + trailing | |
return '\ufffd', StringIndexOffset{UTF8String}(i+1, 0) | |
end | |
c::UInt32 = 0 | |
for j = 1:trailing+1 | |
c <<= 6 | |
c += d[i] | |
i += 1 | |
end | |
c -= utf8_offset[trailing+1] | |
char(c), StringIndexOffset{UTF8String}(i, 0) | |
end | |
getindex{S<:AbstractString}(s::S, i::StringIndexOffset{S}) = next(s,i)[1] | |
nextind{S<:AbstractString}(s::S, i::StringIndexOffset{S}) = StringIndexOffset{S}(byte_index(s, nextind(i)), 0) | |
prevind{S<:AbstractString}(s::S, i::StringIndexOffset{S}) = StringIndexOffset{S}(byte_index(s, nextind(i)), 0) | |
nextind{S<:AbstractString}(i::StringIndexOffset{S}) = i + 1 | |
prevind{S<:AbstractString}(i::StringIndexOffset{S}) = i - 1 | |
+{S<:AbstractString}(i::StringIndexOffset{S}, j::Integer) = StringIndexOffset{S}(i.i, i.offset + j) | |
-{S<:AbstractString}(i::StringIndexOffset{S}, j::Integer) = StringIndexOffset{S}(i.i, i.offset - j) | |
# More efficient version using pointers | |
immutable StringIndexPointer{S<:AbstractString} | |
ptr::Ptr{Uint8} | |
# Required to check lower bound | |
start::Ptr{Uint8} | |
end | |
<(a::StringIndexPointer, b::StringIndexPointer) = a.ptr < b.ptr | |
isvalid(i::StringIndexPointer) = (i.start <= i.ptr && is_utf8_start(unsafe_load(i.ptr))) | |
function nextind{S<:AbstractString}(i::StringIndexPointer{S}) | |
p = i.ptr + 1 | |
while unsafe_load(p) != 0 | |
idx = StringIndexPointer{UTF8String}(p, i.start) | |
if isvalid(idx) | |
return idx | |
end | |
p += 1 | |
end | |
throw(BoundsError()) | |
end | |
function prevind{S<:AbstractString}(i::StringIndexPointer{S}) | |
p = i.ptr | |
while p >= i.ptr | |
idx = StringIndexPointer{UTF8String}(p, i.start) | |
if isvalid(idx) | |
return idx | |
end | |
p -= 1 | |
end | |
throw(BoundsError()) | |
end | |
function +(idx::StringIndexPointer{UTF8String}, j::Integer) | |
j < 0 && return idx - (-j) | |
while j > 0 | |
idx = nextind(idx) | |
j -= 1 | |
end | |
return idx | |
end | |
function -(idx::StringIndexPointer{UTF8String}, j::Integer) | |
j < 0 && return idx + (-j) | |
while j > 0 | |
idx = prevind(idx) | |
j -= 1 | |
end | |
return idx | |
end | |
function next(s::UTF8String, idx::StringIndexPointer{UTF8String}) | |
# potentially faster version | |
# d = s.data | |
# a::UInt32 = d[i] | |
# if a < 0x80; return char(a); end | |
# #if a&0xc0==0x80; return '\ufffd'; end | |
# b::UInt32 = a<<6 + d[i+1] | |
# if a < 0xe0; return char(b - 0x00003080); end | |
# c::UInt32 = b<<6 + d[i+2] | |
# if a < 0xf0; return char(c - 0x000e2080); end | |
# return char(c<<6 + d[i+3] - 0x03c82080) | |
startptr = pointer(s) | |
endptr = startptr + endof(s) | |
# Not checked in current implementation either | |
# startptr <= idx.ptr <= endptr || throw(BoundsError()) | |
i = idx.ptr | |
b = unsafe_load(i, 1) | |
if !is_utf8_start(b) | |
j = i - 1 | |
while startptr < j && !is_utf8_start(unsafe_load(j, 1)) | |
j -= 1 | |
end | |
if startptr < j && i <= j + utf8_trailing[unsafe_load(j, 1) + 1] - 1 <= endptr | |
# b is a continuation byte of a valid UTF-8 character | |
error("invalid UTF-8 character index") | |
end | |
# move past 1 byte in case the data is actually Latin-1 | |
return '\ufffd', StringIndexPointer{UTF8String}(i+1, startptr) | |
end | |
trailing = utf8_trailing[b+1] | |
if endptr < i + trailing - 1 | |
return '\ufffd', StringIndexPointer{UTF8String}(i+1, startptr) | |
end | |
c::UInt32 = 0 | |
for k = 1:trailing+1 | |
c <<= 6 | |
c += unsafe_load(i, 1) | |
i += 1 | |
end | |
c -= utf8_offset[trailing+1] | |
char(c), StringIndexPointer{UTF8String}(i, startptr) | |
end | |
getindex{S<:AbstractString}(s::S, i::StringIndexPointer{S}) = next(s,i)[1] | |
x = "café crème" | |
i = StringIndexPointer{UTF8String}(pointer(x, 4), pointer(x)) | |
j = StringIndex(x, 4) | |
k = StringIndexOffset{UTF8String}(4, 0) | |
f_ind(x, i) = for l in 1:100000 x[i] end | |
f_nextind(x, i) = for l in 1:100000 x[nextind(i)] end | |
f_plusone(x, i) = for l in 1:100000 x[i+1] end | |
g(x, i) = for l in 1:100000 x[nextind(x, i)] end | |
f_ind(x, i); f_nextind(x, i); f_plusone(x, i); | |
f_ind(x, j); f_nextind(x, j); f_plusone(x, j); | |
f_ind(x, k); f_nextind(x, k); f_plusone(x, k); | |
f_ind(x, 4); g(x, k); g(x, 4); | |
@time for l in 1:100 f_ind(x, i) end | |
@time for l in 1:100 f_nextind(x, i) end | |
@time for l in 1:100 f_plusone(x, i) end | |
@time for l in 1:100 f_ind(x, j) end | |
@time for l in 1:100 f_nextind(x, j) end | |
@time for l in 1:100 f_plusone(x, j) end | |
@time for l in 1:100 f_ind(x, k) end | |
@time for l in 1:100 f_nextind(x, k) end | |
@time for l in 1:100 f_plusone(x, k) end | |
@time for l in 1:100 g(x, k) end | |
@time for l in 1:100 f_ind(x, 4) end | |
@time for l in 1:100 g(x, 4) end | |
# julia> @time for l in 1:100 f_ind(x, i) end | |
# elapsed time: 0.105896672 seconds (0 bytes allocated) | |
# julia> @time for l in 1:100 f_nextind(x, i) end | |
# elapsed time: 0.199674082 seconds (0 bytes allocated) | |
# julia> @time for l in 1:100 f_plusone(x, i) end | |
# elapsed time: 0.238348901 seconds (0 bytes allocated) | |
# julia> @time for l in 1:100 f_ind(x, j) end | |
# elapsed time: 0.098285909 seconds (0 bytes allocated) | |
# julia> @time for l in 1:100 f_nextind(x, j) end | |
# elapsed time: 0.621622011 seconds (240000000 bytes allocated, 38.06% gc time) | |
# julia> @time for l in 1:100 f_plusone(x, j) end | |
# elapsed time: 0.655069184 seconds (240000000 bytes allocated, 34.15% gc time) | |
# julia> @time for l in 1:100 f_ind(x, k) end | |
# elapsed time: 0.109709424 seconds (0 bytes allocated) | |
# julia> @time for l in 1:100 f_nextind(x, k) end | |
# elapsed time: 0.28053811 seconds (0 bytes allocated) | |
# julia> @time for l in 1:100 f_plusone(x, k) end | |
# elapsed time: 0.279604258 seconds (0 bytes allocated) | |
# julia> @time for l in 1:100 g(x, k) end | |
# elapsed time: 0.317775374 seconds (0 bytes allocated) | |
# julia> @time for l in 1:100 f_ind(x, 4) end | |
# elapsed time: 0.089790495 seconds (0 bytes allocated) | |
# julia> @time for l in 1:100 g(x, 4) end | |
# elapsed time: 0.24096117 seconds (0 bytes allocated) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment