Last active
August 29, 2015 14:12
-
-
Save jacksonhoose/3a763a5b2ac2dbab824c to your computer and use it in GitHub Desktop.
documentDistance
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Original version by Erik D. Demaine on January 31, 2011, | |
// based on code by Ronald L. Rivest (see docdist[1-7].py). | |
// Usage: | |
// node documentDistance.js file1.txt file2.txt | |
// # This program computes the "distance" between two text files | |
// # as the angle between their word frequency vectors (in radians). | |
// # | |
// # For each input file, a word-frequency vector is computed as follows: | |
// # (1) the specified file is read in | |
// # (2) it is converted into a list of alphanumeric "words" | |
// # Here a "word" is a sequence of consecutive alphanumeric | |
// # characters. Non-alphanumeric characters are treated as blanks. | |
// # Case is not significant. | |
// # (3) for each word, its frequency of occurrence is determined | |
// # (4) the word/frequency lists are sorted into order alphabetically | |
// # | |
// # The "distance" between two vectors is the angle between them. | |
// # If x = (x1, x2, ..., xn) is the first vector (xi = freq of word i) | |
// # and y = (y1, y2, ..., yn) is the second vector, | |
// # then the angle between them is defined as: | |
// # d(x,y) = arccos(inner_product(x,y) / (norm(x)*norm(y))) | |
// # where: | |
// # inner_product(x,y) = x1*y1 + x2*y2 + ... xn*yn | |
// # norm(x) = sqrt(inner_product(x,x)) | |
var readFileSync = require('fs').readFileSync; | |
var files = process.argv.slice(2); | |
function documentDistance (files) { | |
var file1 = wordFrequenciesForFile(files[0]); | |
var file2 = wordFrequenciesForFile(files[1]); | |
return vectorAngle(file1, file2); | |
} | |
function readFile (file) { | |
return readFileSync(file, 'utf8'); | |
} | |
function getWordsFromDocument (doc) { | |
return doc.replace(/[^a-zA-Z\d\s:]/g, '').replace(/\n\n/g, ' '); | |
} | |
function getWordFrequency (words) { | |
return words.split(' ').reduce(function(hash, word){ | |
hash[word] = hash[word] ? hash[word] + 1 : 1; | |
return hash; | |
}, {}); | |
} | |
function getInnerProduct (file1, file2) { | |
var sum = 0.0; | |
for(var key in file1){ | |
if(file2[key]) { | |
sum += file1[key] * file2[key]; | |
} | |
} | |
return sum; | |
} | |
function vectorAngle (file1, file2) { | |
var numerator = getInnerProduct(file1, file2); | |
var denominator = Math.sqrt(getInnerProduct(file1, file1) * getInnerProduct(file2, file2)); | |
return Math.acos(numerator/denominator); | |
} | |
function wordFrequenciesForFile(file) { | |
file = readFile(file); | |
file = getWordsFromDocument(file); | |
return getWordFrequency(file); | |
} | |
console.log(documentDistance(files)); | |
module.exports = documentDistance; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment