Created
April 12, 2013 22:05
-
-
Save mh-github/5375522 to your computer and use it in GitHub Desktop.
Ruby program that computes the "distance" between two text files as the angle between their word frequency vectors (in radians). This is a Ruby version of Erik Demaine's (MIT) docdist8.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/ruby | |
# Read the whole text file with the given filename.
#
# @param filename [String] path of the file to read
# @return [String] the complete contents of the file as one string
#   (not a list of lines)
#
# On any error while opening or reading, a message is written to standard
# error and the process exits with a non-zero status.
def read_file(filename)
  # File.read opens, reads and closes the file in one call, so no handle leaks.
  File.read(filename)
rescue => e
  warn "Error opening or reading input file: #{filename} (#{e.message})"
  exit 1
end
| ################################################ | |
| # Operation 2: split the text line into words ## | |
| ################################################ | |
# Split the given text into lowercase words.
#
# A word is a maximal run of alphanumeric characters; every other character
# (punctuation, whitespace, underscores, ...) acts as a separator.
#
# @param text [String] the text to tokenize
# @return [Array<String>] all words found, in order of appearance
def get_words_from_line_list(text)
  lowered = text.downcase
  lowered.scan(/[[:alnum:]]+/)
end
| ############################################### | |
| # Operation 3 : count frequency of each word ## | |
| ############################################### | |
# Count how many times each word occurs.
#
# @param word_list [Array<String>] the words to count
# @return [Hash{String => Integer}] mapping of each distinct word to its
#   number of occurrences; a plain Hash with no default value, so looking up
#   a word that never occurred yields nil
def count_frequency(word_list)
  word_list.each_with_object({}) do |word, counts|
    # fetch is a constant-time hash lookup; the previous keys.include? test
    # rebuilt and scanned the key array for every single word.
    counts[word] = counts.fetch(word, 0) + 1
  end
end
| ############################################ | |
| # compute word frequencies for input file ## | |
| ############################################ | |
# Compute the word-frequency mapping for a file and print a one-line summary.
#
# @param filename [String] path of the file to analyse
# @return [Hash{String => Integer}] mapping of each distinct word in the
#   file to its number of occurrences
#
# Side effect: prints the file name and its line, word and distinct-word
# counts to standard output.
def word_frequencies_for_file(filename)
  text = read_file(filename)
  word_list = get_words_from_line_list(text)
  freq_mapping = count_frequency(word_list)
  # read_file returns one String, so its length is a character count; count
  # the lines explicitly so the figure labelled "lines" really is lines.
  line_count = text.lines.count
  puts "File #{filename}: #{line_count} lines, " \
       "#{word_list.length} words, #{freq_mapping.length} distinct words"
  freq_mapping
end
# Inner product between two vectors, where vectors are represented as
# hashes of word => frequency.
#
# Example:
#   inner_product({ "and" => 3, "of" => 2, "the" => 5 },
#                 { "and" => 4, "in" => 1, "of" => 1, "this" => 2 }) # => 14.0
#
# @param d1 [Hash] first word => frequency mapping
# @param d2 [Hash] second word => frequency mapping
# @return [Float] sum of d1[word] * d2[word] over the words present in both
#   mappings; 0.0 when they share no words
def inner_product(d1, d2)
  total = 0.0
  d1.each_pair do |word, freq|
    # key? is a constant-time lookup; keys.include? rebuilt and scanned the
    # key array of d2 for every word of d1.
    total += freq * d2[word] if d2.key?(word)
  end
  total
end
# Angle between two word-frequency vectors.
#
# @param d1 [Hash] first word => frequency mapping
# @param d2 [Hash] second word => frequency mapping
# @return [Float] the angle in radians between the two vectors, or NaN when
#   either vector has zero length (e.g. an empty document), because the
#   angle is undefined in that case
def vector_angle(d1, d2)
  numerator = inner_product(d1, d2)
  denominator = Math.sqrt(inner_product(d1, d1) * inner_product(d2, d2))
  # 0.0 / 0.0 is NaN; return it explicitly rather than relying on how
  # Math.acos treats NaN.
  return Float::NAN if denominator.zero?

  cosine = numerator / denominator
  # Rounding error can leave the ratio marginally outside [-1, 1], where
  # Math.acos raises Math::DomainError; clamp it back into range.
  cosine = [[cosine, -1.0].max, 1.0].min
  Math.acos(cosine)
end
# Command-line entry point.
#
# Expects exactly two file names in ARGV. Prints a usage line when the
# argument count is wrong; otherwise prints the distance (in radians)
# between the word-frequency vectors of the two files.
def main
  unless ARGV.length == 2
    puts "Usage: docdist.rb filename_1 filename_2"
    return
  end

  first_path, second_path = ARGV
  first_freqs = word_frequencies_for_file(first_path)
  second_freqs = word_frequencies_for_file(second_path)
  angle = vector_angle(first_freqs, second_freqs)
  printf("The distance between the documents is: %0.6f (radians)\n", angle)
end
# Run main only when this file is executed directly, not when it is loaded
# from another script.
main if __FILE__ == $PROGRAM_NAME
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment