Last active
January 6, 2021 19:01
-
-
Save timwhitlock/2876736 to your computer and use it in GitHub Desktop.
Searchable JavaScript Dictionary
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* Searchable JavaScript dictionary. | |
* | |
* Usage: | |
* var dict = require('path/to/this').create(); | |
* dict.add( 'somedata', 'Text to index' ); | |
* var items = dict.find('text'); | |
*/ | |
exports.create = function(){ | |
return new Dict; | |
} | |
/** | |
* Dictionary constructor | |
*/ | |
function Dict(){ | |
var tree, data; | |
// public access to private vars | |
this.clear = function(){ | |
this.length = 0; | |
tree = {}; | |
data = []; | |
}; | |
this.getTree = function(){ | |
return tree; | |
}; | |
this.getData = function(){ | |
return data; | |
}; | |
this.clear(); | |
} | |
var proto = Dict.prototype; | |
/** | |
* Maximum depth of search tree | |
* @var Number | |
*/ | |
proto.depth = 0; | |
/** | |
* Whether result require all words in query match, | |
* @var Boolean | |
*/ | |
proto.matchall = true; | |
/** | |
* Whether to normalize all words to lower case | |
* @var Boolean | |
*/ | |
proto.ignorecase = true; | |
/** | |
* Default expression to strip out of words | |
* @var RegExp | |
*/ | |
proto.nonword = /[\-.?!;:,_*^+=~`"'(){}<>[\]\/\\\u00a0\u1680\u180e\u2000-\u206f\u2e00-\u2e7f\u3000-\u303f]+/g; | |
/** | |
* Configure custom transliteration table | |
* @param Object | |
* @param RegExp | |
*/ | |
proto.translit = function( charTable, charMatch ){ | |
charMatch = charMatch || /[^a-z0-9]/g; | |
function replacer( chr ){ | |
return charTable[chr] || chr; | |
} | |
// add transliterator function to call custom replacer | |
this.trans = function( word ){ | |
return word.replace( charMatch, replacer ); | |
}; | |
} | |
/** | |
* Configure custom stop words to match after normalisation | |
* @param Object | |
*/ | |
proto.stoppers = function( wordsTable ){ | |
this.stopped = function( word ){ | |
return Boolean( wordsTable[word] ); | |
}; | |
} | |
/** | |
* @param string arbitrary data being looked up | |
* @param string text index to search for data if different from data | |
* @return Dict | |
*/ | |
proto.add = function( data, text ){ | |
// fall back to data as index, if not passing custom text index | |
if( null == text ){ | |
text = String( data ); | |
} | |
var w = -1, word, leaf, branch, depth, c, chr, | |
words = this.normalize( text ), | |
store = this.getData(), | |
idx = store.length; | |
// store data and increment number of records | |
store.push( data ); | |
this.length ++; | |
// index individual words | |
while( ++w < words.length ){ | |
word = words[w]; | |
if( this.stopped(word) ){ | |
continue; | |
} | |
// start at root of tree and index word to depth | |
branch = this.getTree(); | |
depth = Math.min( word.length, this.depth ) || word.length; | |
for( c = 0; c < depth; c++ ){ | |
chr = word.charAt(c); | |
branch = branch[chr] || ( branch[chr] = {} ); | |
} | |
leaf = branch[' '] || ( branch[' '] = [] ); | |
leaf.push( idx ); | |
} | |
return this; | |
} | |
/** | |
* Lookup data from text | |
* @param string one or more words to search index | |
* @return array unique list of found data items | |
*/ | |
proto.find = function( text, meta ){ | |
var w = -1, word, branch, depth, c, chr, idx, | |
matches = {}, | |
results = [], | |
words = this.normalize( text ), | |
store = this.getData(); | |
// recursive function drills into matched branches | |
function descend( branch, word ){ | |
var i, match, node; | |
for( chr in branch ){ | |
node = branch[chr]; | |
if( ' ' === chr ){ | |
// end of branch - have data | |
for( i in node ){ | |
idx = node[i]; | |
match = matches[idx] || ( matches[idx] = { length: 0, words: {} } ); | |
match.length += match.words[word] ? 0 : 1; | |
match.words[word] = 1 + ( match.words[word] || 0 ); | |
} | |
continue; | |
} | |
// else descend into all paths | |
descend( node, word ); | |
} | |
} | |
// search on all words passed and count matches | |
wordloop: | |
while( ++w < words.length ){ | |
word = words[w]; | |
branch = this.getTree(); | |
depth = Math.min( word.length, this.depth ) || word.length; | |
// drill into dictionary as far as this partial word will go | |
for( c = 0; c < depth; c++ ){ | |
chr = word.charAt(c); | |
if( ! branch[chr] ){ | |
// overshot branch word not found | |
continue wordloop; | |
} | |
branch = branch[chr]; | |
} | |
// word reaches valid point in tree - collect all data under this branch | |
descend( branch, word ); | |
} | |
// return unique data matches | |
for( idx in matches ){ | |
if( this.matchall && ( matches[idx].length < words.length ) ){ | |
continue; | |
} | |
results.push( store[idx] ); | |
} | |
// allow callee access to full info | |
if( meta ){ | |
meta.query = text; | |
meta.words = words; | |
} | |
return results; | |
} | |
/** | |
* utility for normalizing a string into matchable words | |
* @param string raw string | |
* @return array unique normalized words | |
*/ | |
proto.normalize = function( str ){ | |
var i = -1, | |
word, | |
unique = {}, | |
words = [], | |
parts = this.split(str); | |
while( ++i < parts.length ){ | |
word = parts[i]; | |
if( ! word ){ | |
continue; | |
} | |
// convert case first so table can be smaller | |
if( this.ignorecase ){ | |
word = word.toLowerCase(); | |
} | |
// strip ignore list, assuming everything else is significant | |
word = this.strip( word ); | |
if( ! word ){ | |
continue; | |
} | |
// transliterate via custom mappings | |
if( this.trans ){ | |
word = this.trans( word ); | |
} | |
if( unique[word] ){ | |
continue; | |
} | |
words.push( word ); | |
unique[word] = true; | |
} | |
return words; | |
} | |
/** | |
* default stop word checker, can be overridden with this.stops( table ) | |
*/ | |
proto.stopped = function( word ){ | |
return 1 === word.length; | |
} | |
/** | |
* split a sentence into rough word chunks | |
*/ | |
proto.split = function( str ){ | |
return str.split(/\s+/); | |
} | |
/** | |
* strip ignorable characters from a word | |
*/ | |
proto.strip = function( word ){ | |
return word.replace( this.nonword, '' ); | |
} | |
/** | |
* Debug function for inspecting search tree and indexed data | |
* | |
proto.dump = function(){ | |
var store = this.getData(); | |
function collect( leaf ){ | |
var i = -1, data = []; | |
while( ++i < leaf.length ){ | |
data.push( store[ leaf[i] ] ); | |
} | |
return data; | |
} | |
function descend( branch, word ){ | |
var chr, node; | |
for( chr in branch ){ | |
node = branch[chr]; | |
if( ' ' === chr ){ | |
console.log( word+': [ '+collect(node).join(', ')+' ]' ); | |
} | |
else { | |
//word && console.log( word ); | |
descend( node, word+chr ) | |
} | |
} | |
} | |
descend( this.getTree(), '' ); | |
} | |
//*/ | |
proto = null; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment