Last active
December 22, 2015 13:48
-
-
Save sirupsen/6481061 to your computer and use it in GitHub Desktop.
Benchmarks for a variety of ways to search through an array of strings for a prefix match.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class BBT | |
def initialize(array) | |
@set = array.sort | |
end | |
def lower_bound(element) | |
lower, upper = 0, @set.size - 1 | |
while lower != upper | |
middle = (lower + upper) / 2 | |
if element > @set[middle] | |
lower = middle + 1 | |
else | |
upper = middle | |
end | |
end | |
lower | |
end | |
def upper_bound(element) | |
lower_bound(element.next) | |
end | |
end |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "benchmark" | |
require "set" | |
require_relative "segment_search" | |
require_relative "trie" | |
require_relative "bbt" | |
strings = ('a'..'z').map do |letter| | |
Array.new(5000) { |number| "#{letter}bba#{number}" } | |
end.flatten | |
dict = File.read("/usr/share/dict/words").split("\n").map { |e| e.downcase } | |
datasets = { | |
strings: { | |
body: strings, | |
exact: "zbba42", | |
partial: "zbba" | |
}, | |
zygo: { | |
body: dict, | |
exact: "zygodont", | |
partial: "zygo" | |
}, | |
abietene: { | |
body: dict, | |
exact: "abietene", | |
partial: "abie" | |
}, | |
abiogene: { | |
body: dict, | |
exact: "abiogenetic", | |
partial: "abiogene" | |
}, | |
} | |
Benchmark.bm do |b| | |
datasets.each do |name, info| | |
segment = SegmentSearch.new(info[:body]) | |
set = Set.new(info[:body]) | |
trie = Trie.new | |
info[:body].each do |string| | |
trie << string | |
end | |
bbt = BBT.new(info[:body]) | |
sorted = info[:body].sort | |
b.report("#{name} set (exact match)") { set.member?(info[:exact]) } | |
b.report("#{name} segmented linear (exact match)") { segment.find(info[:exact]) } | |
b.report("#{name} full linear (exact match)") { strings.detect { |e| e == info[:exact] } } | |
b.report("#{name} trie (exact match)") { trie.detect(info[:exact]) } | |
b.report("#{name} balanced binary tree (exact match)") { bbt.lower_bound(info[:exact])[0] } | |
puts "" | |
b.report("#{name} segmented linear (partial match)") { segment.grep(info[:partial]) } | |
b.report("#{name} full linear (partial match)") { strings.grep(/\A#{info[:partial]}/) } | |
b.report("#{name} trie (partial match)") { trie.grep(info[:partial]) } | |
b.report("#{name} balanced binary tree (partial match)") { strings[bbt.lower_bound(info[:partial])..bbt.upper_bound(info[:partial].next)] } | |
# ruby 2.0 only | |
b.report("#{name} array bsearch (partial match)") { sorted.bsearch { |e| e >= info[:partial] && e < info[:partial] } } | |
puts "\n\n" | |
end | |
end | |
__END__ | |
user system total real | |
strings set (exact match) 0.000000 0.000000 0.000000 ( 0.000011) | |
strings segmented linear (exact match) 0.000000 0.000000 0.000000 ( 0.000020) | |
strings full linear (exact match) 0.010000 0.000000 0.010000 ( 0.013388) | |
strings trie (exact match) 0.000000 0.000000 0.000000 ( 0.000017) | |
strings balanced binary tree (exact match) 0.000000 0.000000 0.000000 ( 0.000025) | |
strings segmented linear (partial match) 0.000000 0.000000 0.000000 ( 0.001781) | |
strings full linear (partial match) 0.050000 0.000000 0.050000 ( 0.052005) | |
strings trie (partial match) 0.010000 0.000000 0.010000 ( 0.003586) | |
strings balanced binary tree (partial match) 0.000000 0.000000 0.000000 ( 0.000035) | |
strings array bsearch (partial match) 0.000000 0.000000 0.000000 ( 0.000016) | |
zygo set (exact match) 0.000000 0.000000 0.000000 ( 0.000010) | |
zygo segmented linear (exact match) 0.010000 0.000000 0.010000 ( 0.000080) | |
zygo full linear (exact match) 0.010000 0.000000 0.010000 ( 0.014900) | |
zygo trie (exact match) 0.000000 0.000000 0.000000 ( 0.000027) | |
zygo balanced binary tree (exact match) 0.000000 0.000000 0.000000 ( 0.000015) | |
zygo segmented linear (partial match) 0.000000 0.000000 0.000000 ( 0.000418) | |
zygo full linear (partial match) 0.060000 0.000000 0.060000 ( 0.054991) | |
zygo trie (partial match) 0.000000 0.000000 0.000000 ( 0.000301) | |
zygo balanced binary tree (partial match) 0.000000 0.000000 0.000000 ( 0.000025) | |
zygo array bsearch (partial match) 0.000000 0.000000 0.000000 ( 0.000011) | |
abietene set (exact match) 0.000000 0.000000 0.000000 ( 0.000008) | |
abietene segmented linear (exact match) 0.000000 0.000000 0.000000 ( 0.000032) | |
abietene full linear (exact match) 0.020000 0.000000 0.020000 ( 0.014711) | |
abietene trie (exact match) 0.000000 0.000000 0.000000 ( 0.000018) | |
abietene balanced binary tree (exact match) 0.000000 0.000000 0.000000 ( 0.000014) | |
abietene segmented linear (partial match) 0.000000 0.000000 0.000000 ( 0.006834) | |
abietene full linear (partial match) 0.060000 0.000000 0.060000 ( 0.052371) | |
abietene trie (partial match) 0.000000 0.000000 0.000000 ( 0.000039) | |
abietene balanced binary tree (partial match) 0.000000 0.000000 0.000000 ( 0.000020) | |
abietene array bsearch (partial match) 0.000000 0.000000 0.000000 ( 0.000014) | |
abiogene set (exact match) 0.000000 0.000000 0.000000 ( 0.000009) | |
abiogene segmented linear (exact match) 0.000000 0.000000 0.000000 ( 0.000035) | |
abiogene full linear (exact match) 0.010000 0.000000 0.010000 ( 0.013668) | |
abiogene trie (exact match) 0.000000 0.000000 0.000000 ( 0.000019) | |
abiogene balanced binary tree (exact match) 0.000000 0.000000 0.000000 ( 0.000014) | |
abiogene segmented linear (partial match) 0.010000 0.000000 0.010000 ( 0.007112) | |
abiogene full linear (partial match) 0.060000 0.000000 0.060000 ( 0.055548) | |
abiogene trie (partial match) 0.000000 0.000000 0.000000 ( 0.000044) | |
abiogene balanced binary tree (partial match) 0.000000 0.000000 0.000000 ( 0.000030) | |
abiogene array bsearch (partial match) 0.000000 0.000000 0.000000 ( 0.000029) | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# | |
# A very large array of strings can be broken up into much smaller | |
# arrays by grouping all strings that share the same first (downcased) | |
# letter. The smaller arrays are stored in a hash that uses the | |
# first (downcased) letter as a key. | |
# | |
# Example mapping: | |
# { "a" => ["ab", "abb", "abbb", "abbbb"], "b" => ["ba", "baaaa"] } | |
# | |
# When a query or search is made, the first letter of the query string | |
# can be used to lookup the array of strings that begin with that | |
# letter. A linear search is then performed on a smaller (subset) of the | |
# original array. | |
# | |
# This can be faster than a linear search over the entire array, | |
# especially if you end up searching for "Zog", in a 10-000 | |
# alphabetically sorted array that has no other strings beginning | |
# with Z. | |
# | |
# It turns out 'set' from the standard library is actually even | |
# slightly faster for equality comparisons in these examples, so do | |
# consider that if you are solving problems like that. See bench.rb | |
# in this repository for a lookup in a 100k+ element array that ends | |
# up as a set or segmented into smaller arrays referencable by | |
# keys(internally). | |
# | |
class SegmentSearch | |
def initialize(string_array) | |
@map = string_array.group_by { |s| s[0].downcase } | |
@map.default = [] # a single array for failed key lookups (a no-op search). | |
end | |
def segments | |
@map.values | |
end | |
# Return the segment that would be used for +string+. | |
def [](string) | |
segment_for(string).dup | |
end | |
# Partial match (returns array). | |
def grep(string) | |
regexp = /\A#{string}/ | |
segment_for(string).grep(regexp) | |
end | |
# First partial match. | |
def fgrep(string) | |
regexp = /\A#{string}/ | |
segment_for(string).detect { |e| regexp =~ e } | |
end | |
# First exact match. | |
def find(string) | |
segment_for(string).detect { |e| e == string } | |
end | |
private | |
def segment_for(string) | |
c = string[0].downcase | |
@map[c] | |
end | |
end |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class Trie | |
attr_accessor :word, :trie | |
def initialize | |
@trie = {} | |
@word = false | |
end | |
def <<(string) | |
node = string.each_char.inject(self) { |node, char| node.trie[char] ||= Trie.new } | |
node.word = true | |
end | |
def detect(string) | |
node = string.each_char.inject(self) { |node, char| | |
return false unless node.trie[char] | |
node = node.trie[char] | |
} | |
node.word | |
end | |
def grep(pattern) | |
branch = pattern.each_char.inject(self) { |node, char| | |
return false unless node.trie[char] | |
node = node.trie[char] | |
} | |
subtree(branch, pattern) | |
end | |
private | |
def subtree(node, suffix) | |
ary = [] | |
ary << suffix if node.word | |
node.trie.each do |char, sub_node| | |
ary.concat(subtree(sub_node, suffix.concat(char))) | |
end | |
ary | |
end | |
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment