# Set vs Array with small data set // Cost of Set creation on top of Array
#
# Info:
#
# A performance benchmark of `Set` vs `Array` on a 5-element integer data set*.
#
# S: a class INSTANTIATED containing a Set INSTANTIATED for each `include?` query
# A: a class INSTANTIATED with an already instantiated Array for each `include?` query
# SS: an already instantiated class with an already instantiated Set for each `include?` query
# AA: an already instantiated class with an already instantiated Array for each `include?` query
#
# For a pure comparison of `Set` vs `Array`, you should only pay attention to the SS and AA benchmarks.
# For `Array#include?` vs `Set#include?` taking Set instantiation into account, see the accumulator
# benchmarks.
#
# Results:
#
# See the calculations below for reference. Basically, **ONCE** we have a `Set` of 5 integers
# instantiated, and assuming each element can be queried with the same probability, queries gain
# performance as follows:
#
# * When queries have a 100% probability of matching an element in the set: the `Set` is ~72% faster
# * When queries have a 1/6 probability of not matching any element in the set: the `Set` is ~145% faster
# * When queries have a 100% probability of matching the:
# 1st element: Array is ~30% faster
# 2nd element: Array is ~3% faster
# 3rd element: Array is ~16% slower
# 4th element: Array is ~33% slower
# 5th element: Array is ~56% slower
# Not found: Array is ~73% slower
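#
# (For reference, these per-element figures follow from the SS/AA i/s numbers further below,
# e.g. for e=200: 7596061.0 / 5830722.8 ≈ 1.30, i.e. Array ~30% faster, and for e=600 (miss):
# 5856317.5 / 3392439.5 ≈ 1.73, i.e. Array ~73% slower. The ~72% and ~145% totals above appear
# to be the sum of these per-element differences.)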
#
# When you take `Set` initialization into account, things look slightly different:
#
# * On a 5 integer Array, a Set needs to perform 15 queries of elements NOT found
# in the Array to compensate for the initialization costs. And that's just the
# best-looking case! Typical numbers for a 1/6 probability split between all elements
# and an element not found vary wildly, but they can go anywhere from the low 20s
# to more than 50 queries before the Set breaks even.
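#
# A rough way to reproduce that break-even from the figures below (e=600, i.e. the miss case,
# ignoring harness overhead):
# set setup cost ≈ 1/419418.5 - 1/5856317.5 ≈ 2.21 µs (one S iteration minus one SS query)
# per-query gain ≈ 1/3392439.5 - 1/5856317.5 ≈ 0.12 µs (one AA miss minus one SS miss)
# break-even ≈ 2.21 / 0.12 ≈ 18 queries, in the same ballpark as the ~15 above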
#
# The results highlight the need to assess your data set and the queries (workload) you are
# expecting. `Set` lookups are O(1) and `Array` lookups are O(n), but the setup cost of `Set` makes it
# vulnerable on small data sets with a limited number of queries, or on data sets where the expected
# queries are very likely to be found within the first couple of elements. For those cases `Array` is
# better suited.
#
# So if you can predict your queries and they are much more likely to hit a couple of elements in
# your data set, and you can put those elements at the head of the `Array`, this setup outperforms
# a `Set` when the element comparison cost is negligible, such as with integers.*
#
# That's because when you are way more likely to hit the 1st or the 2nd element, the `Array` only
# needs to access a couple of pointers and perform a simple comparison. That makes sense, since Set
# is likely to be implemented on top of a Hash, which, while O(1), needs to perform heavier
# computations on the key than just following a couple of pointers and comparing.
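#
# A minimal sketch of that idea (hypothetical ordering; assumes most queries hit 503 and 500):
#
# HOT_FIRST = [503, 500, 200, 404, 403].freeze # most likely hits first
# HOT_FIRST.include?(503) # Array#include? scans left to right, so this compares one element
# HOT_FIRST.include?(999) # worst case: a miss still compares all 5 elements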
#
# * Note: of course, this depends on the elements in the Set/Array. For integers, Array can win
# because comparisons are relatively cheap, but when complex comparisons are needed, Array
# performs much worse, since Set does not need to compare anything but the result of the hashing
# function.
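#
# For example (hypothetical, not benchmarked here): with long String elements, `Array#include?`
# has to run an == check against every element it scans, while `Set#include?` hashes the query
# once and then only compares against elements whose hash matches:
#
# KEYS = ["aa" * 500, "bb" * 500, "cc" * 500]
# KEYS.include?("cc" * 500) # scans and compares up to all 3 strings
# Set.new(KEYS).include?("cc" * 500) # one hash of the query, then (usually) one comparison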
#
# Numbers:
#
# Calculating -------------------------------------
# set S, e=200 33802 i/100ms
# ary A, e=200 131828 i/100ms
# set S, e=404 33707 i/100ms
# ary A, e=404 126802 i/100ms
# set S, e=403 32487 i/100ms
# ary A, e=403 124124 i/100ms
# set S, e=500 33520 i/100ms
# ary A, e=500 119245 i/100ms
# set S, e=503 32964 i/100ms
# ary A, e=503 114926 i/100ms
# set S, e=600 33280 i/100ms
# ary A, e=600 112830 i/100ms
# set SS, e=200 168433 i/100ms
# ary AA, e=200 180201 i/100ms
# set SS, e=404 166385 i/100ms
# ary AA, e=404 167936 i/100ms
# set SS, e=403 166437 i/100ms
# ary AA, e=403 161662 i/100ms
# set SS, e=500 162190 i/100ms
# ary AA, e=500 156193 i/100ms
# set SS, e=503 168459 i/100ms
# ary AA, e=503 146403 i/100ms
# set SS, e=600 169182 i/100ms
# ary AA, e=600 138955 i/100ms
# -------------------------------------------------
# set S, e=200 417717.5 (±3.0%) i/s - 2095724 in 5.022120s
# ary A, e=200 3691220.3 (±2.5%) i/s - 18455920 in 5.003649s
# set S, e=404 416981.2 (±3.0%) i/s - 2089834 in 5.016571s
# ary A, e=404 3256434.4 (±3.0%) i/s - 16357458 in 5.029058s
# set S, e=403 415218.8 (±4.6%) i/s - 2079168 in 5.021789s
# ary A, e=403 2735931.6 (±11.5%) i/s - 13529516 in 5.022639s
# set S, e=500 400631.4 (±5.7%) i/s - 2011200 in 5.038226s
# ary A, e=500 2711221.6 (±1.1%) i/s - 13593930 in 5.014543s
# set S, e=503 420076.5 (±1.1%) i/s - 2109696 in 5.022780s
# ary A, e=503 2385330.4 (±4.5%) i/s - 11952304 in 5.022747s
# set S, e=600 419418.5 (±1.4%) i/s - 2096640 in 4.999904s
# ary A, e=600 2235370.1 (±4.4%) i/s - 11170170 in 5.007962s
# set SS, e=200 5830722.8 (±6.9%) i/s - 29138909 in 5.029212s => Array is ~30% faster
# ary AA, e=200 7596061.0 (±7.1%) i/s - 37842210 in 5.019906s
# set SS, e=404 5820524.9 (±5.7%) i/s - 29117375 in 5.025010s => Array is ~3% faster
# ary AA, e=404 5981772.0 (±1.2%) i/s - 30060544 in 5.026147s
# set SS, e=403 5872626.8 (±1.5%) i/s - 29459349 in 5.017558s => Array is ~16% slower
# ary AA, e=403 5042044.8 (±0.9%) i/s - 25219272 in 5.002244s
# set SS, e=500 5793039.2 (±5.2%) i/s - 28869820 in 5.000658s => Array is ~33% slower
# ary AA, e=500 4350833.6 (±4.1%) i/s - 21710827 in 5.000478s
# set SS, e=503 5835314.1 (±1.9%) i/s - 29311866 in 5.025333s => Array is ~56% slower
# ary AA, e=503 3729213.4 (±1.9%) i/s - 18739584 in 5.027186s
# set SS, e=600 5856317.5 (±2.1%) i/s - 29268486 in 5.000189s => Array is ~73% slower
# ary AA, e=600 3392439.5 (±1.7%) i/s - 17091465 in 5.039622s
#
# No numbers are reported here for the accumulator test; you can run those yourself :)
# Read the 'Results' section above for conclusions.
#
# Instructions:
#
# Modify the constants below to your tastes.
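# (Assumes the benchmark-ips gem is installed: `gem install benchmark-ips`, then run this file with ruby.)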
#
require 'benchmark/ips'
require 'set'
DATASET = [200, 404, 403, 500, 503].freeze # the 5 integers held in the Set/Array
QUERYSPACE = DATASET + [600] # query values; 600 is never found in the data set
QUERYSET = QUERYSPACE.dup.freeze
# QUERYSET = [200]
# QUERYSET = [404]
MINQUERIES = 1    # accumulated test: initial number of queries per batch
MAXQUERIES = 1000 # accumulated test: upper bound on queries per batch
STEP = 1          # accumulated test: batch size increment
ENABLE = {
  :perf => false,
  :accumulated => true
}
# END OF CONFIGURABLE PARAMETERS
#
# S: builds a new Set from the given values on every instantiation
class S
  def initialize(values)
    @values = Set.new values
  end

  def include?(val)
    @values.include? val
  end
end

# A: stores the given, already instantiated Array as-is
class A
  def initialize(values)
    @values = values
  end

  def include?(val)
    @values.include? val
  end
end
# The already instantiated Array handed to S.new/A.new in the benchmarks below
an_array = DATASET.dup
# SS: class-level, pre-instantiated Set
class SS
  VALUES = Set.new DATASET

  def self.include?(val)
    VALUES.include? val
  end
end

# AA: class-level, pre-instantiated, frozen Array
class AA
  VALUES = DATASET.dup.freeze

  def self.include?(val)
    VALUES.include? val
  end
end
if ENABLE[:perf]
  # Per-element benchmarks: S/A pay instantiation costs on every iteration, SS/AA are pre-built
  Benchmark.ips do |x|
    QUERYSPACE.each do |e|
      x.report "set S, e=#{e}" do
        S.new(an_array).include? e
      end
      x.report "ary A, e=#{e}" do
        A.new(an_array).include? e
      end
    end
    QUERYSPACE.each do |e|
      x.report "set SS, e=#{e}" do
        SS.include? e
      end
      x.report "ary AA, e=#{e}" do
        AA.include? e
      end
    end
  end
end
if ENABLE[:accumulated]
  # Build a query stream of at least MAXQUERIES elements by repeating QUERYSET, then shuffle it
  threshold_q = QUERYSET.dup
  threshold_q += QUERYSET while threshold_q.size < MAXQUERIES
  threshold_q.shuffle!
  puts "DATA SET: #{QUERYSET.join ','}"
  puts "QUERY SET: #{threshold_q.join ','}"
  # Grow the number of accumulated queries until the Set (paying its instantiation cost once per
  # batch) beats the Array for the first time
  MINQUERIES.step(MAXQUERIES, STEP).each do |n|
    t = threshold_q.first n
    GC.start
    b = Benchmark.ips do |x|
      q = t.dup
      x.report "set #{n}" do
        s = S.new an_array
        q.each do |e|
          s.include? e
        end
      end
      q = t.dup
      x.report "ary #{n}" do
        a = A.new an_array
        q.each do |e|
          a.include? e
        end
      end
    end
    # The entry with the highest i/s wins; stop as soon as the Set variant comes out on top
    if b.entries.sort_by { |e| e.ips }.last.label =~ /\Aset /
      puts "First time set comes in first is on the #{n}th query"
      break n
    end
  end
end