Skip to content

Instantly share code, notes, and snippets.

@jaydonnell
Created June 13, 2011 18:21
Show Gist options
  • Save jaydonnell/1023359 to your computer and use it in GitHub Desktop.
Save jaydonnell/1023359 to your computer and use it in GitHub Desktop.
jruby fuzzy autocomplete
#require "completor/version"
require 'rubygems'
require 'bundler'
require 'java'
Bundler.require(:default, :development)
require 'csv'
# Absolute path of the directory that contains this file.
COMPLETOR_ROOT = File.expand_path('..', __FILE__)

# Require every jar in the "jars" directory that sits one level above
# COMPLETOR_ROOT.
Dir[File.join(COMPLETOR_ROOT, '..', 'jars', '*.jar')].each do |jar_path|
  require jar_path
end
# Namespace for the Lucene (Java) pieces used by this script.
module Lucene
  # Makes the classes of the org.apache.lucene.store Java package reachable
  # as constants of this module (the script uses Lucene::Store::RAMDirectory).
  module Store
    include_package 'org.apache.lucene.store'
  end

  # Analyzer whose token stream comes straight from Lucene's NGramTokenizer,
  # constructed with only the reader (i.e. the tokenizer's default gram sizes).
  class NGramAnalyzer < org.apache.lucene.analysis.Analyzer
    # Builds the token stream for a field. The camelCase name is kept so the
    # method lines up with the Java Analyzer method it overrides.
    #
    # @param field_name [String] name of the field being analyzed (unused)
    # @param reader [Object] text source handed over by Lucene
    # @return [Object] a new NGramTokenizer reading from +reader+
    def tokenStream(field_name, reader)
      tokenizer_class = org.apache.lucene.analysis.ngram.NGramTokenizer
      tokenizer_class.new(reader)
    end
  end
end
# Core-class extension: adds a cheap, case-insensitive positional distance.
class String
  # Counts how many positions differ between this string and +other_str+,
  # ignoring case. Characters are compared index by index (no insertions or
  # deletions are considered, so this is not an edit distance); every
  # character by which the longer string exceeds the shorter one counts as
  # one additional difference.
  #
  # The comparison works on characters rather than bytes, so multibyte text
  # is measured consistently with String#length and the result is symmetric.
  #
  # @param other_str [String] the string to compare against
  # @return [Integer] 0 when the strings are equal ignoring case, otherwise
  #   the number of differing positions plus the length difference
  def distance(other_str)
    own = downcase
    other = other_str.downcase
    return 0 if own == other

    longer, shorter = own.length > other.length ? [own, other] : [other, own]

    # Surplus characters of the longer string can never match anything.
    surplus = longer.length - shorter.length

    # zip stops at the end of +shorter+, so only the overlapping prefix of
    # +longer+ is compared. An empty +shorter+ yields zero mismatches, which
    # makes the result equal to the length of the other string.
    mismatches = shorter.chars.zip(longer.chars).count { |left, right| left != right }

    surplus + mismatches
  end
end
# Fuzzy title autocompleter: titles are indexed as n-grams in an in-memory
# Lucene index, and hits are re-ranked by a word-prefix distance to the query.
class Completor
  # Value min_distance returns when no word of the title scores lower.
  MAX_DISTANCE = 99
  # Maximum number of hits requested from Lucene per search.
  MAX_HITS = 15

  # Reads titles from a CSV file and builds a Completor over them.
  #
  # @param path [String] CSV file to read; its first row is skipped as a
  #   header and the first column of each later row is taken as the title
  # @return [Completor] a completor backed by a freshly built RAMDirectory
  def self.load(path = "data.csv")
    titles = read_titles(path)
    dir = Lucene::Store::RAMDirectory.new
    writer = org.apache.lucene.index.IndexWriter.new(
      dir,
      Lucene::NGramAnalyzer.new,
      org.apache.lucene.index.IndexWriter::MaxFieldLength::UNLIMITED
    )
    begin
      titles.each do |title|
        field = org.apache.lucene.document.Field.new(
          "title",
          title,
          org.apache.lucene.document.Field::Store::YES,
          org.apache.lucene.document.Field::Index::ANALYZED
        )
        doc = org.apache.lucene.document.Document.new
        doc.add(field)
        writer.addDocument(doc)
      end
    ensure
      # Close the writer even when indexing fails part-way through.
      writer.close
    end
    new(dir)
  end

  # Collects the non-blank first-column values of a CSV file, skipping the
  # first row. Blank lines and rows whose first cell is empty are ignored
  # instead of raising on nil.
  def self.read_titles(path)
    titles = []
    header_skipped = false
    CSV.foreach(path) do |row|
      unless header_skipped
        header_skipped = true
        next
      end
      title = row[0].to_s.strip
      titles << title unless title.empty?
    end
    titles
  end
  private_class_method :read_titles

  # Runs a Lucene query against the "title" field.
  #
  # The searcher is created on first use and then reused, rather than
  # opening a new, never-closed IndexSearcher for every query.
  #
  # @param q [String] query text, parsed with Lucene's QueryParser syntax
  # @return [Object] the result of IndexSearcher#search, limited to MAX_HITS
  def search(q)
    parser = org.apache.lucene.queryParser.QueryParser.new(
      org.apache.lucene.util.Version::LUCENE_32,
      "title",
      Lucene::NGramAnalyzer.new
    )
    query = parser.parse(q)
    @is ||= org.apache.lucene.search.IndexSearcher.new(@dir)
    @is.search(query, MAX_HITS)
  end

  # Smallest String#distance between +q+ and the leading characters of any
  # word in +s+. Each word is cut to at most q.length + 2 characters before
  # it is compared.
  #
  # Words are separated by any single non-word character. The Unicode-aware
  # [[:word:]] class is used so accented letters do not split a word; for
  # ASCII input this splits exactly like \W, including the empty words that
  # consecutive separators produce.
  #
  # @param q [String] the query typed so far
  # @param s [String] a candidate title
  # @return [Integer] the minimum distance, never more than MAX_DISTANCE
  def self.min_distance(q, s)
    prefix_range = 0..q.length + 1
    distances = s.split(/[^[:word:]]/).map { |word| word.slice(prefix_range).distance(q) }
    [MAX_DISTANCE, *distances].min
  end

  # Searches for +q+ and prints the matching titles, closest first. Titles
  # with equal distance keep the order in which Lucene returned them.
  #
  # @param q [String] the query typed so far
  # @return [Array<String>] the titles in the order they were printed
  def print_search(q)
    hits = search(q)
    titles = hits.scoreDocs.map { |hit| @is.doc(hit.doc).get("title") }
    # Compute each distance once; the hit position is the tie-breaker.
    ranked = titles.each_with_index
                   .sort_by { |title, position| [Completor.min_distance(q, title), position] }
                   .map { |title, _position| title }
    ranked.each { |title| puts title }
  end

  # @param dir [Object] Lucene directory holding the title index
  def initialize(dir)
    @dir = dir
  end
end
# Demo driver: build the completor from the default CSV file and print the
# ranked completions for the query 'bu'.
se = Completor.load
se.print_search 'bu'
# NOTE(review): `debugger` is not defined anywhere in this file; presumably it
# comes from a debugging gem pulled in by Bundler.require -- confirm. The
# trailing `1` looks like a statement for the breakpoint to stop on, leaving
# `se` available for interactive inspection.
debugger
1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment