Skip to content

Instantly share code, notes, and snippets.

@haampie
Last active March 27, 2018 12:28
Show Gist options
  • Save haampie/abac017f7220abf10395d41179ef196d to your computer and use it in GitHub Desktop.
Save haampie/abac017f7220abf10395d41179ef196d to your computer and use it in GitHub Desktop.
Range performance
With bounds check Using @inbounds
Range type Index type Current New x faster Current New x faster
StepRange{Bool,Bool} Int8 93.605 μs 46.465 μs 2.0 83.849 μs 29.769 μs 2.8
StepRange{Bool,Bool} UInt8 92.478 μs 42.743 μs 2.2 81.760 μs 29.775 μs 2.7
StepRange{Bool,Bool} Int32 91.492 μs 45.854 μs 2.0 83.108 μs 29.780 μs 2.8
StepRange{Bool,Bool} UInt32 89.812 μs 41.530 μs 2.2 82.168 μs 26.783 μs 3.1
StepRange{Bool,Bool} Int64 92.273 μs 45.849 μs 2.0 78.521 μs 29.781 μs 2.6
StepRange{Bool,Bool} UInt64 86.641 μs 44.617 μs 1.9 74.546 μs 29.793 μs 2.5
StepRange{Int8,Int8} Int8 24.084 μs 21.805 μs 1.1 11.753 μs 1.444 μs 8.1
StepRange{Int8,Int8} UInt8 24.099 μs 28.102 μs 0.9 11.755 μs 1.444 μs 8.1
StepRange{Int8,Int8} Int32 24.081 μs 15.217 μs 1.6 11.743 μs 1.760 μs 6.7
StepRange{Int8,Int8} UInt32 24.096 μs 31.887 μs 0.8 10.234 μs 1.761 μs 5.8
StepRange{Int8,Int8} Int64 24.093 μs 15.199 μs 1.6 11.764 μs 5.499 μs 2.1
StepRange{Int8,Int8} UInt64 23.916 μs 31.311 μs 0.8 12.814 μs 5.491 μs 2.3
StepRange{UInt8,UInt8} Int8 23.910 μs 17.051 μs 1.4 10.159 μs 1.444 μs 7.0
StepRange{UInt8,UInt8} UInt8 23.935 μs 16.806 μs 1.4 10.158 μs 1.443 μs 7.0
StepRange{UInt8,UInt8} Int32 23.950 μs 26.457 μs 0.9 10.189 μs 1.698 μs 6.0
StepRange{UInt8,UInt8} UInt32 23.913 μs 18.844 μs 1.3 10.180 μs 1.706 μs 6.0
StepRange{UInt8,UInt8} Int64 23.936 μs 26.469 μs 0.9 10.216 μs 5.540 μs 1.8
StepRange{UInt8,UInt8} UInt64 23.926 μs 17.925 μs 1.3 10.193 μs 5.495 μs 1.9
StepRange{Int32,Int32} Int8 22.954 μs 16.733 μs 1.4 10.483 μs 1.726 μs 6.1
StepRange{Int32,Int32} UInt8 22.932 μs 16.748 μs 1.4 10.483 μs 1.727 μs 6.1
StepRange{Int32,Int32} Int32 21.213 μs 15.457 μs 1.4 10.451 μs 1.243 μs 8.4
StepRange{Int32,Int32} UInt32 22.262 μs 29.627 μs 0.8 10.502 μs 1.282 μs 8.2
StepRange{Int32,Int32} Int64 21.435 μs 14.883 μs 1.4 10.517 μs 5.415 μs 1.9
StepRange{Int32,Int32} UInt64 22.568 μs 29.779 μs 0.8 11.716 μs 5.497 μs 2.1
StepRange{UInt32,UInt32} Int8 22.910 μs 20.571 μs 1.1 9.049 μs 1.729 μs 5.2
StepRange{UInt32,UInt32} UInt8 22.956 μs 18.893 μs 1.2 9.049 μs 1.792 μs 5.0
StepRange{UInt32,UInt32} Int32 22.924 μs 18.602 μs 1.2 9.069 μs 1.284 μs 7.1
StepRange{UInt32,UInt32} UInt32 22.965 μs 17.402 μs 1.3 9.062 μs 1.285 μs 7.1
StepRange{UInt32,UInt32} Int64 22.950 μs 25.225 μs 0.9 9.087 μs 5.495 μs 1.7
StepRange{UInt32,UInt32} UInt64 23.856 μs 15.760 μs 1.5 9.079 μs 5.500 μs 1.7
StepRange{Int64,Int64} Int8 19.992 μs 15.061 μs 1.3 4.366 μs 5.299 μs 0.8
StepRange{Int64,Int64} UInt8 19.968 μs 14.130 μs 1.4 4.368 μs 5.497 μs 0.8
StepRange{Int64,Int64} Int32 19.993 μs 15.501 μs 1.3 4.374 μs 4.342 μs 1.0
StepRange{Int64,Int64} UInt32 19.275 μs 15.462 μs 1.2 4.375 μs 4.322 μs 1.0
StepRange{Int64,Int64} Int64 20.569 μs 15.456 μs 1.3 4.498 μs 4.664 μs 1.0
StepRange{Int64,Int64} UInt64 21.181 μs 29.582 μs 0.7 9.080 μs 4.587 μs 2.0
StepRange{UInt64,UInt64} Int8 22.288 μs 19.575 μs 1.1 4.364 μs 5.501 μs 0.8
StepRange{UInt64,UInt64} UInt8 22.289 μs 19.330 μs 1.2 4.259 μs 5.494 μs 0.8
StepRange{UInt64,UInt64} Int32 22.293 μs 19.618 μs 1.1 4.388 μs 4.336 μs 1.0
StepRange{UInt64,UInt64} UInt32 22.283 μs 17.552 μs 1.3 4.375 μs 4.259 μs 1.0
StepRange{UInt64,UInt64} Int64 23.523 μs 19.394 μs 1.2 4.438 μs 4.599 μs 1.0
StepRange{UInt64,UInt64} UInt64 23.499 μs 17.566 μs 1.3 4.596 μs 4.587 μs 1.0
OneTo{Int8} Int8 13.987 μs 9.021 μs 1.6 68.712 ns 68.639 ns 1.0
OneTo{Int8} UInt8 17.611 μs 14.241 μs 1.2 10.150 μs 69.523 ns 146.0
OneTo{Int8} Int32 14.199 μs 8.683 μs 1.6 7.527 μs 873.633 ns 8.6
OneTo{Int8} UInt32 18.168 μs 11.869 μs 1.5 8.818 μs 873.323 ns 10.1
OneTo{Int8} Int64 14.242 μs 7.818 μs 1.8 6.783 μs 1.940 μs 3.5
OneTo{Int8} UInt64 18.383 μs 10.642 μs 1.7 8.836 μs 1.940 μs 4.6
OneTo{UInt8} Int8 19.073 μs 9.014 μs 2.1 10.080 μs 69.136 ns 145.8
OneTo{UInt8} UInt8 15.122 μs 10.840 μs 1.4 69.752 ns 69.798 ns 1.0
OneTo{UInt8} Int32 17.464 μs 8.681 μs 2.0 7.512 μs 873.487 ns 8.6
OneTo{UInt8} UInt32 13.545 μs 10.195 μs 1.3 7.511 μs 817.519 ns 9.2
OneTo{UInt8} Int64 17.483 μs 9.016 μs 1.9 7.511 μs 1.940 μs 3.9
OneTo{UInt8} UInt64 13.550 μs 8.746 μs 1.5 7.540 μs 1.957 μs 3.9
OneTo{Int32} Int8 11.585 μs 8.678 μs 1.3 458.773 ns 475.768 ns 1.0
OneTo{Int32} UInt8 13.537 μs 11.158 μs 1.2 475.836 ns 459.177 ns 1.0
OneTo{Int32} Int32 11.157 μs 9.015 μs 1.2 473.398 ns 473.394 ns 1.0
OneTo{Int32} UInt32 15.450 μs 11.162 μs 1.4 5.723 μs 473.656 ns 12.1
OneTo{Int32} Int64 13.539 μs 9.019 μs 1.5 7.724 μs 1.940 μs 4.0
OneTo{Int32} UInt64 17.443 μs 11.178 μs 1.6 7.899 μs 1.940 μs 4.1
OneTo{UInt32} Int8 17.380 μs 10.293 μs 1.7 5.899 μs 475.468 ns 12.4
OneTo{UInt32} UInt8 12.879 μs 10.299 μs 1.3 459.718 ns 476.452 ns 1.0
OneTo{UInt32} Int32 16.492 μs 9.016 μs 1.8 5.940 μs 478.136 ns 12.4
OneTo{UInt32} UInt32 12.877 μs 8.677 μs 1.5 483.615 ns 463.864 ns 1.0
OneTo{UInt32} Int64 17.383 μs 8.684 μs 2.0 6.000 μs 1.939 μs 3.1
OneTo{UInt32} UInt64 13.535 μs 8.964 μs 1.5 6.250 μs 1.954 μs 3.2
OneTo{Int64} Int8 11.155 μs 9.008 μs 1.2 869.208 ns 900.748 ns 1.0
OneTo{Int64} UInt8 13.523 μs 11.159 μs 1.2 869.371 ns 869.459 ns 1.0
OneTo{Int64} Int32 11.159 μs 9.012 μs 1.2 915.134 ns 912.096 ns 1.0
OneTo{Int64} UInt32 13.537 μs 11.592 μs 1.2 914.779 ns 919.678 ns 1.0
OneTo{Int64} Int64 11.585 μs 8.954 μs 1.3 955.439 ns 945.531 ns 1.0
OneTo{Int64} UInt64 14.616 μs 11.602 μs 1.3 6.351 μs 907.490 ns 7.0
OneTo{UInt64} Int8 17.385 μs 9.655 μs 1.8 5.896 μs 869.403 ns 6.8
OneTo{UInt64} UInt8 12.775 μs 9.012 μs 1.4 869.321 ns 869.346 ns 1.0
OneTo{UInt64} Int32 17.384 μs 10.297 μs 1.7 5.941 μs 916.900 ns 6.5
OneTo{UInt64} UInt32 12.886 μs 9.012 μs 1.4 913.698 ns 913.812 ns 1.0
OneTo{UInt64} Int64 17.386 μs 8.952 μs 1.9 6.225 μs 919.212 ns 6.8
OneTo{UInt64} UInt64 13.529 μs 9.010 μs 1.5 957.888 ns 936.333 ns 1.0
@haampie
Copy link
Author

haampie commented Mar 27, 2018

using Base: OneTo, throw_boundserror, @_inline_meta, step_hp, checked_mul, checked_add
using BenchmarkTools
using BenchmarkTools: prettytime
using Base: BitInteger
using Printf

import Base: mul_with_overflow, add_with_overflow

mul_with_overflow(x::Integer, y::Integer) = mul_with_overflow(promote(x,y)...)
add_with_overflow(x::Integer, y::Integer) = add_with_overflow(promote(x,y)...)

const OverflowSafe = Union{BitInteger,Bool}

function new_getindex(v::OneTo{Tr}, i::Ti) where {Tr<:OverflowSafe,Ti<:OverflowSafe}
    @_inline_meta
    @boundscheck i > 0 && i  v.stop || throw_boundserror(v, i)
    i % Tr
end

function new_getindex(v::AbstractRange{Tr}, i::Ti) where {Tr<:OverflowSafe,Ti<:OverflowSafe}
    @_inline_meta
    value = v.start + (i - Ti(1)) * step(v)
    @boundscheck begin
        # Redo the computation with overflow checking, but do not use the
        # functions that throw, cause that's too slow here.
        checked_prod, a = mul_with_overflow(i - one(i), step(v))
        checked_value, b = add_with_overflow(v.start, checked_prod)
        !a && !b && (step(v) > 0 ? v.start  value  v.stop : v.stop  value  v.start) || throw_boundserror(v, i)
    end
    value % Tr
end

function curr_impl(r::AbstractRange{T}, inds::Vector{Ti}) where {T,Ti}
    sum = zero(T)
    for i in inds
        sum += r[i]
    end
    sum
end

function new_impl(r::AbstractRange{T}, inds::Vector{Ti}) where {T,Ti}
    sum = zero(T)
    for i in inds
        sum += new_getindex(r, i)
    end
    sum
end

function curr_impl_inbounds(r::AbstractRange{T}, inds::Vector{Ti}) where {T,Ti}
    sum = zero(T)
    @inbounds for i in inds
        sum += r[i]
    end
    sum
end

function new_impl_inbounds(r::AbstractRange{T}, inds::Vector{Ti}) where {T,Ti}
    sum = zero(T)
    @inbounds for i in inds
        sum += new_getindex(r, i)
    end
    sum
end

const Ranges = (
    false : true : true,
    Int8(1) : Int8(2) : Int8(127),
    UInt8(1) : UInt8(2) : UInt8(127),
    Int32(1) : Int32(2) : Int32(200),
    UInt32(1) : UInt32(2) : UInt32(200),
    Int64(1) : Int64(2) : Int64(200),
    UInt64(1) : UInt64(2) : UInt64(200),
    OneTo(Int8(100)),
    OneTo(UInt8(100)),
    OneTo(Int32(100)),
    OneTo(UInt32(100)),
    OneTo(Int64(100)),
    OneTo(UInt64(100))
)

const NumberTypes = (Int8, UInt8, Int32, UInt32, Int64, UInt64)

function bench_indexing()
    p = 12
    println(rpad("",2p+13), rpad("With bounds check", 3p), rpad("Using @inbounds", 3p))
    println(
        rpad("Range type", p+13), 
        rpad("Index type", p), 
        rpad("Current", p), 
        rpad("New", p), 
        rpad("x faster", p),
        rpad("Current", p), 
        rpad("New", p),
        rpad("x faster", p)
    )

    for rng = Ranges, Ti = NumberTypes
        inds = [Ti(rand(eachindex(rng))) for i = 1 : 10_000]
        a = @benchmark curr_impl($rng, $inds) seconds = 1 samples = 100
        b = @benchmark new_impl($rng, $inds) seconds = 1 samples = 100
        c = @benchmark curr_impl_inbounds($rng, $inds) seconds = 1 samples = 100
        d = @benchmark new_impl_inbounds($rng, $inds) seconds = 1 samples = 100

        println(
            rpad(typeof(rng), p+13), 
            rpad(Ti, p), 
            rpad(prettytime(time(minimum(a))), p), 
            rpad(prettytime(time(minimum(b))), p), 
            rpad(@sprintf("%.1f", time(minimum(a)) / time(minimum(b))), p),
            rpad(prettytime(time(minimum(c))), p), 
            rpad(prettytime(time(minimum(d))), p),
            rpad(@sprintf("%.1f", time(minimum(c)) / time(minimum(d))), p)
        )
    end
end

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment