Skip to content

Instantly share code, notes, and snippets.

@ultraist
Created December 4, 2010 21:34
Show Gist options
  • Save ultraist/728499 to your computer and use it in GitHub Desktop.
Save ultraist/728499 to your computer and use it in GitHub Desktop.
$KCODE='UTF8'
module Zch
class TrigramIndex
class Record
attr_reader :key, :value, :similarity
def initialize(key, value, similarity)
@key = key
@value = value
@similarity = similarity
end
end
def initialize
@data = {}
@inv_index = {}
end
def add(key, value)
ti = trigram_vector(key)
@data[key] = {
:key => key,
:value => value,
:ti => ti,
:ti_norm => norm(ti)
}
ti.each do |k, v|
if (@inv_index.member?(k))
@inv_index[k] << key
else
@inv_index[k] = [key]
end
end
end
def search(query, limit = nil)
results = []
ti = trigram_vector(query)
ti_norm = norm(ti)
cand = []
ti.each do |k, v|
if (@inv_index.member?(k))
cand << @inv_index[k]
end
end
cand.flatten.uniq.each do |key|
data = @data[key]
results << Record.new(data[:key], data[:value],
dot(data[:ti], ti) / Math.sqrt(data[:ti_norm] * ti_norm))
end
results.sort! do |a, b|
b.similarity - a.similarity
end
if (limit && results.length > limit)
results = results[0 ... limit]
end
results
end
private
def trigram_vector(s)
ti = {}
chars = multibyte_chars(s)
i = 0
while (i < chars.length - 2)
trigram = chars[i ... (i + 3)].join
ti[trigram] = 1.0
i += 1
end
ti
end
def multibyte_chars(s)
chars = []
s.chars.each do |c|
chars << c
end
chars
end
def norm(ti)
ti.size
end
def dot(ti1, ti2)
x = 0.0
ti1.each do |k, v|
if (ti2.member?(k))
x += 1.0
end
end
x
end
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment