Skip to content

Instantly share code, notes, and snippets.

@nalimilan
Last active August 29, 2015 14:12
Show Gist options
  • Save nalimilan/5652da74380c7e211131 to your computer and use it in GitHub Desktop.
Save nalimilan/5652da74380c7e211131 to your computer and use it in GitHub Desktop.
import Base.<, Base.next, Base.nextind, Base.prevind, Base.colon, Base.length, Base.getindex, Base.first, Base.last,
Base.is_utf8_start, Base.utf8_trailing, Base.utf8_offset
# Simple index with string plus byte offset
immutable StringIndex{S<:AbstractString}
str::S
i::Int
end
<(a::StringIndex, b::StringIndex) = a.i < b.i
function next(s::UTF8String, idx::StringIndex{UTF8String})
# potentially faster version
# d = s.data
# a::UInt32 = d[i]
# if a < 0x80; return char(a); end
# #if a&0xc0==0x80; return '\ufffd'; end
# b::UInt32 = a<<6 + d[i+1]
# if a < 0xe0; return char(b - 0x00003080); end
# c::UInt32 = b<<6 + d[i+2]
# if a < 0xf0; return char(c - 0x000e2080); end
# return char(c<<6 + d[i+3] - 0x03c82080)
i = idx.i
d = s.data
b = d[i]
if !is_utf8_start(b)
j = i-1
while 0 < j && !is_utf8_start(d[j])
j -= 1
end
if 0 < j && i <= j+utf8_trailing[d[j]+1] <= length(d)
# b is a continuation byte of a valid UTF-8 character
error("invalid UTF-8 character index")
end
# move past 1 byte in case the data is actually Latin-1
return '\ufffd', i+1
end
trailing = utf8_trailing[b+1]
if length(d) < i + trailing
return '\ufffd', i+1
end
c::UInt32 = 0
for j = 1:trailing+1
c <<= 6
c += d[i]
i += 1
end
c -= utf8_offset[trailing+1]
char(c), i
end
getindex{S<:AbstractString}(s::S, i::StringIndex{S}) = next(s,i)[1]
nextind(i::StringIndex) = StringIndex(i.str, nextind(i.str, i.i))
prevind(i::StringIndex) = StringIndex(i.str, prevind(i.str, i.i))
function +(si::StringIndex, j::Integer)
j < 0 && return si - (-j)
i = si.i
while j > 0
i = nextind(si.str, i)
j -= 1
end
return StringIndex(si.str, i)
end
function -(si::StringIndex, j::Integer)
j < 0 && return si + (-j)
i = si.i
while j > 0
i = prevind(si.str, i)
j -= 1
end
return StringIndex(si.str, i)
end
# Variant with byte index + character offset
immutable StringIndexOffset{S<:AbstractString}
i::Int
offset::Int
end
# StringIndexOffset{S<:AbstractString}(i::Int) = StringIndexOffset{S}(i, 0)
function <(a::StringIndexOffset, b::StringIndexOffset)
a.i == b.i || throw(ArgumentError())
return a.offset < b.offset
end
@inline function byte_index(s::UTF8String, idx::StringIndexOffset{UTF8String})
i = idx.i
idx.offset == 0 && return i
if idx.offset > 0
for j in 1:idx.offset
i = nextind(s, i)
end
elseif idx.offset < 0
for j in idx.offset:1
i = prevind(s, i)
end
end
return i
end
function next(s::UTF8String, idx::StringIndexOffset{UTF8String})
# potentially faster version
# d = s.data
# a::UInt32 = d[i]
# if a < 0x80; return char(a); end
# #if a&0xc0==0x80; return '\ufffd'; end
# b::UInt32 = a<<6 + d[i+1]
# if a < 0xe0; return char(b - 0x00003080); end
# c::UInt32 = b<<6 + d[i+2]
# if a < 0xf0; return char(c - 0x000e2080); end
# return char(c<<6 + d[i+3] - 0x03c82080)
i = byte_index(s, idx)
d = s.data
b = d[i]
if !is_utf8_start(b)
j = i-1
while 0 < j && !is_utf8_start(d[j])
j -= 1
end
if 0 < j && i <= j+utf8_trailing[d[j]+1] <= length(d)
# b is a continuation byte of a valid UTF-8 character
error("invalid UTF-8 character index")
end
# move past 1 byte in case the data is actually Latin-1
return '\ufffd', StringIndexOffset{UTF8String}(i+1, 0)
end
trailing = utf8_trailing[b+1]
if length(d) < i + trailing
return '\ufffd', StringIndexOffset{UTF8String}(i+1, 0)
end
c::UInt32 = 0
for j = 1:trailing+1
c <<= 6
c += d[i]
i += 1
end
c -= utf8_offset[trailing+1]
char(c), StringIndexOffset{UTF8String}(i, 0)
end
getindex{S<:AbstractString}(s::S, i::StringIndexOffset{S}) = next(s,i)[1]
nextind{S<:AbstractString}(s::S, i::StringIndexOffset{S}) = StringIndexOffset{S}(byte_index(s, nextind(i)), 0)
prevind{S<:AbstractString}(s::S, i::StringIndexOffset{S}) = StringIndexOffset{S}(byte_index(s, nextind(i)), 0)
nextind{S<:AbstractString}(i::StringIndexOffset{S}) = i + 1
prevind{S<:AbstractString}(i::StringIndexOffset{S}) = i - 1
+{S<:AbstractString}(i::StringIndexOffset{S}, j::Integer) = StringIndexOffset{S}(i.i, i.offset + j)
-{S<:AbstractString}(i::StringIndexOffset{S}, j::Integer) = StringIndexOffset{S}(i.i, i.offset - j)
# More efficient version using pointers
immutable StringIndexPointer{S<:AbstractString}
ptr::Ptr{Uint8}
# Required to check lower bound
start::Ptr{Uint8}
end
<(a::StringIndexPointer, b::StringIndexPointer) = a.ptr < b.ptr
isvalid(i::StringIndexPointer) = (i.start <= i.ptr && is_utf8_start(unsafe_load(i.ptr)))
function nextind{S<:AbstractString}(i::StringIndexPointer{S})
p = i.ptr + 1
while unsafe_load(p) != 0
idx = StringIndexPointer{UTF8String}(p, i.start)
if isvalid(idx)
return idx
end
p += 1
end
throw(BoundsError())
end
function prevind{S<:AbstractString}(i::StringIndexPointer{S})
p = i.ptr
while p >= i.ptr
idx = StringIndexPointer{UTF8String}(p, i.start)
if isvalid(idx)
return idx
end
p -= 1
end
throw(BoundsError())
end
function +(idx::StringIndexPointer{UTF8String}, j::Integer)
j < 0 && return idx - (-j)
while j > 0
idx = nextind(idx)
j -= 1
end
return idx
end
function -(idx::StringIndexPointer{UTF8String}, j::Integer)
j < 0 && return idx + (-j)
while j > 0
idx = prevind(idx)
j -= 1
end
return idx
end
function next(s::UTF8String, idx::StringIndexPointer{UTF8String})
# potentially faster version
# d = s.data
# a::UInt32 = d[i]
# if a < 0x80; return char(a); end
# #if a&0xc0==0x80; return '\ufffd'; end
# b::UInt32 = a<<6 + d[i+1]
# if a < 0xe0; return char(b - 0x00003080); end
# c::UInt32 = b<<6 + d[i+2]
# if a < 0xf0; return char(c - 0x000e2080); end
# return char(c<<6 + d[i+3] - 0x03c82080)
startptr = pointer(s)
endptr = startptr + endof(s)
# Not checked in current implementation either
# startptr <= idx.ptr <= endptr || throw(BoundsError())
i = idx.ptr
b = unsafe_load(i, 1)
if !is_utf8_start(b)
j = i - 1
while startptr < j && !is_utf8_start(unsafe_load(j, 1))
j -= 1
end
if startptr < j && i <= j + utf8_trailing[unsafe_load(j, 1) + 1] - 1 <= endptr
# b is a continuation byte of a valid UTF-8 character
error("invalid UTF-8 character index")
end
# move past 1 byte in case the data is actually Latin-1
return '\ufffd', StringIndexPointer{UTF8String}(i+1, startptr)
end
trailing = utf8_trailing[b+1]
if endptr < i + trailing - 1
return '\ufffd', StringIndexPointer{UTF8String}(i+1, startptr)
end
c::UInt32 = 0
for k = 1:trailing+1
c <<= 6
c += unsafe_load(i, 1)
i += 1
end
c -= utf8_offset[trailing+1]
char(c), StringIndexPointer{UTF8String}(i, startptr)
end
getindex{S<:AbstractString}(s::S, i::StringIndexPointer{S}) = next(s,i)[1]
x = "café crème"
i = StringIndexPointer{UTF8String}(pointer(x, 4), pointer(x))
j = StringIndex(x, 4)
k = StringIndexOffset{UTF8String}(4, 0)
f_ind(x, i) = for l in 1:100000 x[i] end
f_nextind(x, i) = for l in 1:100000 x[nextind(i)] end
f_plusone(x, i) = for l in 1:100000 x[i+1] end
g(x, i) = for l in 1:100000 x[nextind(x, i)] end
f_ind(x, i); f_nextind(x, i); f_plusone(x, i);
f_ind(x, j); f_nextind(x, j); f_plusone(x, j);
f_ind(x, k); f_nextind(x, k); f_plusone(x, k);
f_ind(x, 4); g(x, k); g(x, 4);
@time for l in 1:100 f_ind(x, i) end
@time for l in 1:100 f_nextind(x, i) end
@time for l in 1:100 f_plusone(x, i) end
@time for l in 1:100 f_ind(x, j) end
@time for l in 1:100 f_nextind(x, j) end
@time for l in 1:100 f_plusone(x, j) end
@time for l in 1:100 f_ind(x, k) end
@time for l in 1:100 f_nextind(x, k) end
@time for l in 1:100 f_plusone(x, k) end
@time for l in 1:100 g(x, k) end
@time for l in 1:100 f_ind(x, 4) end
@time for l in 1:100 g(x, 4) end
# julia> @time for l in 1:100 f_ind(x, i) end
# elapsed time: 0.105896672 seconds (0 bytes allocated)
# julia> @time for l in 1:100 f_nextind(x, i) end
# elapsed time: 0.199674082 seconds (0 bytes allocated)
# julia> @time for l in 1:100 f_plusone(x, i) end
# elapsed time: 0.238348901 seconds (0 bytes allocated)
# julia> @time for l in 1:100 f_ind(x, j) end
# elapsed time: 0.098285909 seconds (0 bytes allocated)
# julia> @time for l in 1:100 f_nextind(x, j) end
# elapsed time: 0.621622011 seconds (240000000 bytes allocated, 38.06% gc time)
# julia> @time for l in 1:100 f_plusone(x, j) end
# elapsed time: 0.655069184 seconds (240000000 bytes allocated, 34.15% gc time)
# julia> @time for l in 1:100 f_ind(x, k) end
# elapsed time: 0.109709424 seconds (0 bytes allocated)
# julia> @time for l in 1:100 f_nextind(x, k) end
# elapsed time: 0.28053811 seconds (0 bytes allocated)
# julia> @time for l in 1:100 f_plusone(x, k) end
# elapsed time: 0.279604258 seconds (0 bytes allocated)
# julia> @time for l in 1:100 g(x, k) end
# elapsed time: 0.317775374 seconds (0 bytes allocated)
# julia> @time for l in 1:100 f_ind(x, 4) end
# elapsed time: 0.089790495 seconds (0 bytes allocated)
# julia> @time for l in 1:100 g(x, 4) end
# elapsed time: 0.24096117 seconds (0 bytes allocated)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment