Last active
August 29, 2015 14:23
-
-
Save oiva/14dd1324931d912c3853 to your computer and use it in GitHub Desktop.
Muhku
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
var fs = require('fs'); | |
var _ = require('lodash'); | |
var originalWords = []; | |
fs.readFile('alastalon_salissa.txt', 'utf8', function (err,data) { | |
if (err) { | |
return console.log(err); | |
} | |
process(data); | |
}); | |
// takes text data and returns all words in an array | |
function parseWords(data) { | |
data = data.toLowerCase(); | |
// remove anything not a letter or white space | |
data = data.replace(/[^a-zåäö\s]/gi, ''); | |
var words = data.split(/\s+/) | |
// cheat / optimisation: minimum length for words, | |
// takes 1-2 seconds off the total time. | |
var minimumLength = 6; | |
words = words.filter(function(word) {return word && word.length > minimumLength;}); | |
return words; | |
} | |
// takes a word and returns its unique letters sorted, for example: | |
// 'muhku' => 'hkmu' | |
function mapWordToLetters(word) { | |
var letters = word.split(''); | |
letters = letters.sort(); | |
letters = _.uniq(letters, true); | |
reducedWord = letters.join(''); | |
// store original word to global variable | |
originalWords[reducedWord] = word; | |
return reducedWord; | |
} | |
// removes consecutive words that are each others' substrings. Fast. | |
// for example: ['abc', 'muhk', 'muhku', 'muhkua'] => ['abc', 'muhkua'] | |
function filterSimilarWords(words) { | |
var filteredWords = []; | |
var previousWord = ''; | |
var word; | |
for (var i = words.length - 1; i >= 0; i--) { | |
word = words[i] | |
if (previousWord.indexOf(word) === -1) { | |
filteredWords.push(word); | |
} | |
previousWord = word; | |
}; | |
return filteredWords; | |
} | |
// removes words whose letters are found in other words. Preserves words that | |
// have only one less letter than a better word. | |
// ['abcd', 'abcdex', 'abcdefx'] => ['abcdex', 'abcdefx'] | |
// Returns character arrays because we use those later on. | |
function filterSubsets(words) { | |
var filteredWords = [], a1, a2, length, discard; | |
letterArrays = _.map(words, function(word) { | |
return word.split(''); | |
}); | |
length = letterArrays.length; | |
for (var i = 0; i < length; i++) { | |
a1 = letterArrays[i]; | |
discard = false; | |
for (var j = i + 1; j < length; j++) { | |
a2 = letterArrays[j]; | |
// don't discard 'muhku' if comparing to 'muhkua' | |
if (a1.length >= a2.length -1) { | |
continue; | |
} | |
// a1 is subset of a2 | |
if (a1.length === _.intersection(a1, a2).length) { | |
discard = true; | |
break; | |
} | |
}; | |
if (!discard) { | |
filteredWords.push(a1); | |
} | |
}; | |
return filteredWords; | |
} | |
// calculates muhkeus score by counting the unique letters in the two given words. | |
// input: character arrays | |
// ['a', 'b', 'c'], ['a', 'e'] => 4 | |
function calculateScore(word1, word2) { | |
return _.union(word1, word2).length; | |
} | |
// go through given words and calculate score for each possible pair. return | |
// highest scoring pairs | |
function findPairs(words) { | |
var word1, word2, pair, pairs = [], bestPairs, score, maxScore = 0; | |
for (var i = words.length - 1; i >= 0; i--) { | |
word1 = words[i]; | |
for (var j = i - 1; j >= 0; j--) { | |
word2 = words[j]; | |
score = calculateScore(word1, word2); | |
if (score >= maxScore) { | |
// convert arrays back to string, find original word from global array | |
pairs.push([score, originalWords[word1.join('')], originalWords[word2.join('')]]); | |
maxScore = score; | |
} | |
}; | |
}; | |
bestPairs = _.filter(pairs, function(pair) { | |
return pair[0] == maxScore; | |
}); | |
return bestPairs; | |
} | |
function process(data) { | |
var words = parseWords(data); | |
words = _.map(words, mapWordToLetters); | |
words = words.sort(); | |
console.log('1: '+words.length); | |
words = _.uniq(words, true); // unique for sorted data | |
console.log('2: '+words.length); | |
words = filterSimilarWords(words); | |
console.log('3: '+words.length); | |
words = filterSubsets(words); | |
console.log('4: '+words.length); | |
pairs = findPairs(words); | |
//console.log(_.slice(words, 0, 100)); | |
console.log('word count before calculating score '+words.length); | |
console.log(pairs.length + ' pairs '); | |
console.log(pairs); | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"name": "Muhk", | |
"description": "Find the muhkiest word", | |
"version": "1.0.0", | |
"dependencies": { | |
"lodash": "^3.9.3" | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment