brendte · February 5, 2013 02:42
diff --git a/build_inverted_index.rb b/build_inverted_index.rb
 #!/usr/bin/ruby

 require 'rubygems'
 require 'fast_stemmer'

 # Normalises raw document strings into lists of stemmed terms.
 #
 # Each document has its punctuation deleted (not replaced by a space, so
 # "well-known" becomes "wellknown"), is lower-cased, split on whitespace,
 # filtered against a fixed English stop-word list and finally stemmed with
 # String#stem (expected to be supplied by the fast_stemmer gem this file
 # requires).
 #
 # docs - a collection of document strings, e.g. an Array.
 #
 # Returns a Hash mapping 1-based document ids (the document's position in
 # docs) to the Array of stemmed terms of that document, in word order.
 def doc_prep(docs)
   stop_list = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,use,used".split(',')

   # A Hash keeps the stop-word test constant-time per token instead of
   # scanning the whole list for every word.
   stop_words = {}
   stop_list.each { |word| stop_words[word] = true }

   prepped_docs = {}
   docs.each_with_index do |doc, position|
     tokens = doc.gsub(/[[:punct:]]/, '').downcase.split
     kept = tokens.reject { |word| stop_words.key?(word) }
     # Document ids start at 1, not 0.
     prepped_docs[position + 1] = kept.map { |word| word.stem }
   end

   prepped_docs
 end

 # Placeholder with no behaviour yet: takes no arguments and evaluates to nil.
 # Nothing in the visible script calls it.
 def create_posting
   nil
 end

 # Builds an inverted index (dictionary plus postings lists) from prepared
 # documents.
 #
 # docs - a Hash of doc_id => Array of term strings (the shape produced by
 #        doc_prep).
 #
 # Returns a Hash keyed by the term as a Symbol. Each value is a Hash with:
 #   :term_id  - 1-based id assigned in order of first appearance
 #   :cf       - total number of occurrences of the term across all documents
 #   :df       - number of documents that contain the term
 #   :postings - Array of { doc_id:, tf: } Hashes, one per containing
 #               document, where tf is the occurrence count in that document
 def create_dictionary_and_postings(docs)
   dictionary = {}
   term_id = 0

   docs.each do |doc_id, doc|
     # Maps each term already seen in this document to its posting, so a
     # repeated word bumps tf directly instead of rescanning the postings list.
     postings_in_doc = {}

     doc.each do |word|
       term = word.to_sym

       entry = dictionary[term]
       unless entry
         term_id += 1
         entry = dictionary[term] = { term_id: term_id, cf: 0, df: 0, postings: [] }
       end
       entry[:cf] += 1

       posting = postings_in_doc[term]
       if posting
         posting[:tf] += 1
       else
         # First occurrence of the term in this document.
         posting = { doc_id: doc_id, tf: 1 }
         postings_in_doc[term] = posting
         entry[:df] += 1
         entry[:postings] << posting
       end
     end
   end

   dictionary
 end


 # Script entry point: every command-line argument is treated as the text of
 # one document. Prints the prepared documents, then one "term=>entry" line
 # per dictionary term in sorted order.
 docs = doc_prep(ARGV.dup)
 p docs
 index = create_dictionary_and_postings(docs)
 # index.each { |term, values| puts "#{term}=>#{values}\n" }
 sorted_index = index.sort
 sorted_index.each do |term, entry|
   puts "#{term}=>#{entry}"
 end
	#!/usr/bin/ruby

	require 'rubygems'
	require 'fast_stemmer'

	# Normalises raw document strings into lists of stemmed terms.
	#
	# Each document has its punctuation deleted (not replaced by a space, so
	# "well-known" becomes "wellknown"), is lower-cased, split on whitespace,
	# filtered against a fixed English stop-word list and finally stemmed with
	# String#stem (expected to be supplied by the fast_stemmer gem this file
	# requires).
	#
	# docs - a collection of document strings, e.g. an Array.
	#
	# Returns a Hash mapping 1-based document ids (the document's position in
	# docs) to the Array of stemmed terms of that document, in word order.
	def doc_prep(docs)
	  stop_list = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your,use,used".split(',')

	  # A Hash keeps the stop-word test constant-time per token instead of
	  # scanning the whole list for every word.
	  stop_words = {}
	  stop_list.each { |word| stop_words[word] = true }

	  prepped_docs = {}
	  docs.each_with_index do |doc, position|
	    tokens = doc.gsub(/[[:punct:]]/, '').downcase.split
	    kept = tokens.reject { |word| stop_words.key?(word) }
	    # Document ids start at 1, not 0.
	    prepped_docs[position + 1] = kept.map { |word| word.stem }
	  end

	  prepped_docs
	end

	# Placeholder with no behaviour yet: takes no arguments and evaluates to nil.
	# Nothing in the visible script calls it.
	def create_posting
	  nil
	end

	# Builds an inverted index (dictionary plus postings lists) from prepared
	# documents.
	#
	# docs - a Hash of doc_id => Array of term strings (the shape produced by
	#        doc_prep).
	#
	# Returns a Hash keyed by the term as a Symbol. Each value is a Hash with:
	#   :term_id  - 1-based id assigned in order of first appearance
	#   :cf       - total number of occurrences of the term across all documents
	#   :df       - number of documents that contain the term
	#   :postings - Array of { doc_id:, tf: } Hashes, one per containing
	#               document, where tf is the occurrence count in that document
	def create_dictionary_and_postings(docs)
	  dictionary = {}
	  term_id = 0

	  docs.each do |doc_id, doc|
	    # Maps each term already seen in this document to its posting, so a
	    # repeated word bumps tf directly instead of rescanning the postings list.
	    postings_in_doc = {}

	    doc.each do |word|
	      term = word.to_sym

	      entry = dictionary[term]
	      unless entry
	        term_id += 1
	        entry = dictionary[term] = { term_id: term_id, cf: 0, df: 0, postings: [] }
	      end
	      entry[:cf] += 1

	      posting = postings_in_doc[term]
	      if posting
	        posting[:tf] += 1
	      else
	        # First occurrence of the term in this document.
	        posting = { doc_id: doc_id, tf: 1 }
	        postings_in_doc[term] = posting
	        entry[:df] += 1
	        entry[:postings] << posting
	      end
	    end
	  end

	  dictionary
	end


	# Script entry point: every command-line argument is treated as the text of
	# one document. Prints the prepared documents, then one "term=>entry" line
	# per dictionary term in sorted order.
	docs = doc_prep(ARGV.dup)
	puts docs.inspect
	index = create_dictionary_and_postings(docs)
	# index.each { |term, values| puts "#{term}=>#{values}\n" }
	sorted_index = index.sort
	sorted_index.each { |term, entry| puts "#{term}=>#{entry}" }