Created
February 5, 2013 16:15
-
-
Save brendte/4715478 to your computer and use it in GitHub Desktop.
Complete index and query solution. Naively implemented with nested loops.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/ruby | |
require 'rubygems' | |
require 'fast_stemmer' | |
# Normalizes raw document strings into lists of stemmed index terms.
#
# For each document: punctuation is removed, the text is lower-cased and split
# on whitespace, stop words are dropped, and every remaining word is stemmed
# with String#stem (not a core method; the file requires 'fast_stemmer').
#
# @param docs [Array<String>] raw document texts
# @return [Hash{Integer => Array<String>}] stemmed terms keyed by a 1-based
#   document id assigned in the order the documents were given
def doc_prep(docs)
  stop_words = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,use,used".split(',')
  prepped_docs = {}
  docs.each_with_index do |doc, position|
    words = doc.gsub(/[[:punct:]]/, '').downcase.split
    # Array#- removes every stop word while keeping the order and the
    # duplicates of the remaining words, replacing a linear scan per word.
    prepped_docs[position + 1] = (words - stop_words).map { |word| word.stem }
  end
  prepped_docs
end
# Builds an inverted index (dictionary plus postings lists) from prepared documents.
#
# @param docs [Hash{Integer => Array<String>}] terms of each document keyed by
#   document id (the shape returned by doc_prep)
# @return [Hash{Symbol => Hash}] one entry per distinct term, holding:
#   :term_id  - sequential id, starting at 1, in order of first appearance
#   :cf       - collection frequency: occurrences across all documents
#   :df       - document frequency: number of documents containing the term
#   :postings - array of {doc_id:, tf:} hashes, one per containing document,
#               where :tf counts the term's occurrences in that document
def create_dictionary_and_postings(docs)
  dictionary = {}
  term_id = 0
  docs.each do |doc_id, doc|
    doc.each do |word|
      term = word.to_sym
      entry = dictionary[term] ||= { term_id: term_id += 1, cf: 0, df: 0, postings: [] }
      entry[:cf] += 1
      # Documents are processed one at a time, so if this term was already seen
      # in the current document its posting is the most recently appended one.
      posting = entry[:postings].last
      if posting && posting[:doc_id] == doc_id
        posting[:tf] += 1
      else
        entry[:postings] << { doc_id: doc_id, tf: 1 }
        entry[:df] += 1
      end
    end
  end
  dictionary
end
# Every command-line argument is taken as the text of one document.
docs = doc_prep(ARGV.dup)
# Inverted index queried by the boolean searches below.
index = create_dictionary_and_postings(docs)
###################
### index AND query
###################
puts "index AND query"
# Ids of the documents containing a word. A word that is absent from the index
# yields an empty list rather than raising NoMethodError on a nil entry.
doc_ids_for = lambda do |word|
  entry = index[word.stem.to_sym]
  entry ? entry[:postings].map { |posting| posting[:doc_id] } : []
end
# AND is the intersection of the two document-id lists.
document_hits = doc_ids_for.call("index") & doc_ids_for.call("query")
puts "hit list: #{document_hits.sort.inspect}"
###################
### (search AND query) OR (search AND retrieve)
###################
puts "(search AND query) OR (search AND retrieve)"
# Ids of the documents containing a word. A word that is absent from the index
# yields an empty list rather than raising NoMethodError on a nil entry.
doc_ids_for = lambda do |word|
  entry = index[word.stem.to_sym]
  entry ? entry[:postings].map { |posting| posting[:doc_id] } : []
end
search_hits = doc_ids_for.call("search")
# search AND query
document_hits_1 = search_hits & doc_ids_for.call("query")
# search AND retrieve
document_hits_2 = search_hits & doc_ids_for.call("retrieve")
# (search AND query) OR (search AND retrieve): OR is the duplicate-free union.
document_hits = (document_hits_1 | document_hits_2).sort
puts "hit list: #{document_hits.inspect}"
###################
### (search AND engine AND web) OR feedback
###################
puts "(search AND engine AND web) OR feedback"
# Ids of the documents containing a word. A word that is absent from the index
# yields an empty list rather than raising NoMethodError on a nil entry.
doc_ids_for = lambda do |word|
  entry = index[word.stem.to_sym]
  entry ? entry[:postings].map { |posting| posting[:doc_id] } : []
end
# search AND engine AND web
document_hits_1 = doc_ids_for.call("search") & doc_ids_for.call("engine") & doc_ids_for.call("web")
# feedback
document_hits_2 = doc_ids_for.call("feedback")
# (search AND engine AND web) OR feedback: OR is the duplicate-free union.
document_hits = (document_hits_1 | document_hits_2).sort
puts "hit list: #{document_hits.inspect}"
###################
### (index OR cluster) AND (web OR system)
###################
puts "(index OR cluster) AND (web OR system)"
# Ids of the documents containing a word. A word that is absent from the index
# yields an empty list rather than raising NoMethodError on a nil entry.
doc_ids_for = lambda do |word|
  entry = index[word.stem.to_sym]
  entry ? entry[:postings].map { |posting| posting[:doc_id] } : []
end
# (index OR cluster)
document_hits_l = doc_ids_for.call("index") | doc_ids_for.call("cluster")
# (web OR system)
document_hits_r = doc_ids_for.call("web") | doc_ids_for.call("system")
# (index OR cluster) AND (web OR system): AND is the intersection of both sides.
document_hits = (document_hits_l & document_hits_r).sort
puts "hit list: #{document_hits.inspect}"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment