-
-
Save jsuchal/54e26cc6601bf737b94d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Infinity = 1.0 / 0 | |
class MinHasher | |
def initialize(n, universe_size, seed = nil) | |
srand(seed) unless seed.nil? | |
@hash_functions = (1..n).collect do | |
create_random_hash_function(universe_size) | |
end | |
end | |
def generate_signature(set) | |
@hash_functions.collect do |hash_function| | |
calculate_minhash(hash_function, set) | |
end | |
end | |
private | |
def create_random_hash_function(universe_size) | |
a = rand(universe_size) | |
b = rand(universe_size) | |
lambda { |x| (a * x + b).modulo universe_size } | |
end | |
def calculate_minhash(hash_function, set) | |
minhash = Infinity | |
set.each do |item| | |
value = hash_function.call(item) | |
minhash = value if value < minhash | |
end | |
minhash | |
end | |
end | |
def jaccard(s1, s2) | |
(s1 & s2).size.to_f / (s1 | s2).size | |
end | |
def similarity(s1, s2) | |
matches = 0 | |
s1.each_with_index do |hash, idx| | |
matches += 1 if hash == s2[idx] | |
end | |
matches / s1.size.to_f | |
end | |
minhasher = MinHasher.new(10, 12549826247007890, 1234567) | |
u1 = (1..10000).to_a | |
u2 = (5001..20000).to_a | |
s1 = minhasher.generate_signature u1 | |
s2 = minhasher.generate_signature u2 | |
puts "J(u1, u2) = #{jaccard(u1, u2)}", "sim(s1, s2) = #{similarity(s1, s2)}", nil | |
puts s1.inspect, s2.inspect |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment