mh-github · April 12, 2013 22:05
diff --git a/docdist-v0.rb b/docdist-v0.rb
 #!/usr/bin/ruby

 # Read the whole text file named +filename+ and return its contents as a
 # single String (not a list of lines).
 #
 # If opening or reading raises a StandardError, an error message is written
 # to standard error and the process exits with status 1.
 def read_file(filename)
     # File.read opens, reads and closes the file in one call, so no file
     # handle is left open.
     File.read(filename)
 rescue StandardError
     warn "Error opening or reading input file: #{filename}"
     exit 1
 end

 ################################################
 # Operation 2: split the text line into words ##
 ################################################
 # Break +text+ into lowercase words.
 #
 # The text is lowercased and every maximal run of alphanumeric characters
 # becomes one word; all other characters act as separators.
 # Returns an Array of the words in order of appearance.
 def get_words_from_line_list(text)
     lowered = text.downcase
     lowered.scan(/[[:alnum:]]+/)
 end

 ###############################################
 # Operation 3 : count frequency of each word ##
 ###############################################
 # Count how many times each word occurs in +word_list+.
 #
 # Returns a plain Hash mapping each distinct word to its Integer count;
 # words that never occur are absent (looking them up yields nil).
 def count_frequency(word_list)
     word_list.each_with_object({}) do |word, counts|
         # Hash#fetch looks the word up directly, instead of building and
         # scanning the key list for every word.
         counts[word] = counts.fetch(word, 0) + 1
     end
 end

 ############################################
 # compute word frequencies for input file ##
 ############################################
 # Read the file named +filename+ and return a Hash mapping each word in it
 # to the number of times it occurs.
 #
 # Also prints a one-line summary (line, word and distinct-word counts) to
 # standard output.
 def word_frequencies_for_file(filename)
     text = read_file(filename)
     word_list = get_words_from_line_list(text)
     freq_mapping = count_frequency(word_list)

     # read_file returns the whole file as one String, so String#length would
     # report characters; count the lines explicitly instead.
     line_count = text.each_line.count

     print "File ", filename, ":"
     print line_count, " lines,"
     print word_list.length, " words,"
     print freq_mapping.length, " distinct words\n"

     freq_mapping
 end

 # Inner product between two vectors, where each vector is represented as a
 # Hash of word => frequency pairs.
 #
 # Only words present in both hashes contribute to the result.
 # Example: inner_product({"and" => 3, "of" => 2, "the" => 5},
 #                        {"and" => 4, "in" => 1, "of" => 1, "this" => 2}) # => 14.0
 #
 # Returns the sum as a Float (0.0 when no words are shared).
 def inner_product(d1, d2)
     d1.inject(0.0) do |total, (word, freq)|
         # Hash#key? looks the word up directly, instead of building and
         # scanning d2.keys for every word of d1.
         d2.key?(word) ? total + freq * d2[word] : total
     end
 end

 # Return the angle, in radians, between two word-frequency vectors.
 #
 # d1 and d2 are Hashes of word => frequency, as accepted by inner_product
 # (not sorted lists of pairs).
 def vector_angle(d1, d2)
     numerator = inner_product(d1, d2)
     denominator = Math.sqrt(inner_product(d1, d1) * inner_product(d2, d2))
     cosine = numerator / denominator
     # Floating-point rounding can push the ratio marginally outside [-1, 1],
     # where Math.acos raises Math::DomainError; pull it back into range.
     # A NaN ratio fails both comparisons and is passed to Math.acos unchanged.
     cosine = 1.0 if cosine > 1.0
     cosine = -1.0 if cosine < -1.0
     Math.acos(cosine)
 end

 # Script entry point: expects exactly two file names in ARGV and prints the
 # angle (in radians) between the word-frequency vectors of the two files.
 # With any other number of arguments it prints a usage line instead.
 def main
     unless ARGV.length == 2
         puts "Usage: docdist.rb filename_1 filename_2"
         return
     end

     first_path, second_path = ARGV
     first_freqs = word_frequencies_for_file(first_path)
     second_freqs = word_frequencies_for_file(second_path)
     angle = vector_angle(first_freqs, second_freqs)
     print "The distance between the documents is: %0.6f (radians)\n" % angle
 end

 # Run main only when this file is executed directly, not when it is loaded
 # by another script.
 main if __FILE__ == $PROGRAM_NAME
	#!/usr/bin/ruby

	# Read the whole text file named +filename+ and return its contents as a
	# single String (not a list of lines).
	#
	# If opening or reading raises a StandardError, an error message is written
	# to standard error and the process exits with status 1.
	def read_file(filename)
	    # File.read opens, reads and closes the file in one call, so no file
	    # handle is left open.
	    File.read(filename)
	rescue StandardError
	    warn "Error opening or reading input file: #{filename}"
	    exit 1
	end

	################################################
	# Operation 2: split the text line into words ##
	################################################
	# Break +text+ into lowercase words.
	#
	# The text is lowercased and every maximal run of alphanumeric characters
	# becomes one word; all other characters act as separators.
	# Returns an Array of the words in order of appearance.
	def get_words_from_line_list(text)
	    lowered = text.downcase
	    lowered.scan(/[[:alnum:]]+/)
	end

	###############################################
	# Operation 3 : count frequency of each word ##
	###############################################
	# Count how many times each word occurs in +word_list+.
	#
	# Returns a plain Hash mapping each distinct word to its Integer count;
	# words that never occur are absent (looking them up yields nil).
	def count_frequency(word_list)
	    word_list.each_with_object({}) do |word, counts|
	        # Hash#fetch looks the word up directly, instead of building and
	        # scanning the key list for every word.
	        counts[word] = counts.fetch(word, 0) + 1
	    end
	end

	############################################
	# compute word frequencies for input file ##
	############################################
	# Read the file named +filename+ and return a Hash mapping each word in it
	# to the number of times it occurs.
	#
	# Also prints a one-line summary (line, word and distinct-word counts) to
	# standard output.
	def word_frequencies_for_file(filename)
	    text = read_file(filename)
	    word_list = get_words_from_line_list(text)
	    freq_mapping = count_frequency(word_list)

	    # read_file returns the whole file as one String, so String#length would
	    # report characters; count the lines explicitly instead.
	    line_count = text.each_line.count

	    print "File ", filename, ":"
	    print line_count, " lines,"
	    print word_list.length, " words,"
	    print freq_mapping.length, " distinct words\n"

	    freq_mapping
	end

	# Inner product between two vectors, where each vector is represented as a
	# Hash of word => frequency pairs.
	#
	# Only words present in both hashes contribute to the result.
	# Example: inner_product({"and" => 3, "of" => 2, "the" => 5},
	#                        {"and" => 4, "in" => 1, "of" => 1, "this" => 2}) # => 14.0
	#
	# Returns the sum as a Float (0.0 when no words are shared).
	def inner_product(d1, d2)
	    d1.inject(0.0) do |total, (word, freq)|
	        # Hash#key? looks the word up directly, instead of building and
	        # scanning d2.keys for every word of d1.
	        d2.key?(word) ? total + freq * d2[word] : total
	    end
	end

	# Return the angle, in radians, between two word-frequency vectors.
	#
	# d1 and d2 are Hashes of word => frequency, as accepted by inner_product
	# (not sorted lists of pairs).
	def vector_angle(d1, d2)
	    numerator = inner_product(d1, d2)
	    denominator = Math.sqrt(inner_product(d1, d1) * inner_product(d2, d2))
	    cosine = numerator / denominator
	    # Floating-point rounding can push the ratio marginally outside [-1, 1],
	    # where Math.acos raises Math::DomainError; pull it back into range.
	    # A NaN ratio fails both comparisons and is passed to Math.acos unchanged.
	    cosine = 1.0 if cosine > 1.0
	    cosine = -1.0 if cosine < -1.0
	    Math.acos(cosine)
	end

	# Script entry point: expects exactly two file names in ARGV and prints the
	# angle (in radians) between the word-frequency vectors of the two files.
	# With any other number of arguments it prints a usage line instead.
	def main
	    unless ARGV.length == 2
	        puts "Usage: docdist.rb filename_1 filename_2"
	        return
	    end

	    first_path, second_path = ARGV
	    first_freqs = word_frequencies_for_file(first_path)
	    second_freqs = word_frequencies_for_file(second_path)
	    angle = vector_angle(first_freqs, second_freqs)
	    print "The distance between the documents is: %0.6f (radians)\n" % angle
	end

	# Run main only when this file is executed directly, not when it is loaded
	# by another script.
	main if __FILE__ == $PROGRAM_NAME
No results found