timwhitlock · January 6, 2021 19:01
diff --git a/dict.js b/dict.js
 /**
 * Searchable JavaScript dictionary.
 * 
 * Usage:
 *  var dict = require('path/to/this').create();
 *  dict.add( 'somedata', 'Text to index' );
 *  var items = dict.find('text');
 */
 exports.create = function(){
    return new Dict;
 }


 /**
 * Dictionary constructor
 */
 function Dict(){
   var tree, data; 
   // public access to private vars
   this.clear = function(){
       this.length = 0;
       tree = {};
       data = [];
   };
   this.getTree = function(){
       return tree; 
   };
   this.getData = function(){
       return data;
   };
   this.clear();
 }



 var proto = Dict.prototype;


 /**
 * Maximum depth of search tree
 * @var Number
 */
 proto.depth = 0;


 /**
 * Whether result require all words in query match,
 * @var Boolean
 */
 proto.matchall = true;


 /**
 * Whether to normalize all words to lower case
 * @var Boolean
 */
 proto.ignorecase = true;


 /**
 * Default expression to strip out of words
 * @var RegExp
 */ 
 proto.nonword = /[\-.?!;:,_*^+=~`"'(){}<>[\]\/\\\u00a0\u1680\u180e\u2000-\u206f\u2e00-\u2e7f\u3000-\u303f]+/g;


 /**
 * Configure custom transliteration table
 * @param Object
 * @param RegExp
 */ 
 proto.translit = function( charTable, charMatch ){
    charMatch = charMatch || /[^a-z0-9]/g;
    function replacer( chr ){
        return charTable[chr] || chr;
    }
    // add transliterator function to call custom replacer
    this.trans = function( word ){
        return word.replace( charMatch, replacer );
    };
 }



 /**
 * Configure custom stop words to match after normalisation
 * @param Object
 */ 
 proto.stoppers = function( wordsTable ){
    this.stopped = function( word ){
        return Boolean( wordsTable[word] );
    };
 }




 /**
 * @param string arbitrary data being looked up
 * @param string text index to search for data if different from data
 * @return Dict
 */
 proto.add = function( data, text ){
    // fall back to data as index, if not passing custom text index
    if( null == text ){
        text = String( data );
    }
    var w = -1, word, leaf, branch, depth, c, chr,
        words = this.normalize( text ),
        store = this.getData(), 
        idx = store.length;
    // store data and increment number of records
    store.push( data );
    this.length ++;
    // index individual words
    while( ++w < words.length ){
        word = words[w];
        if( this.stopped(word) ){
            continue;
        }
        // start at root of tree and index word to depth
        branch = this.getTree();
        depth  = Math.min( word.length, this.depth ) || word.length;
        for( c = 0; c < depth; c++ ){
            chr = word.charAt(c);
            branch = branch[chr] || ( branch[chr] = {} );
        }
        leaf = branch[' '] || ( branch[' '] = [] );
        leaf.push( idx );
    }
    return this;
 }



 /**
 * Lookup data from text
 * @param string one or more words to search index
 * @return array unique list of found data items
 */
 proto.find = function( text, meta ){
    var w = -1, word, branch, depth, c, chr, idx, 
        matches = {}, 
        results = [], 
        words = this.normalize( text ),
        store = this.getData();
    // recursive function drills into matched branches
    function descend( branch, word ){
        var i, match, node;
        for( chr in branch ){
            node = branch[chr];
            if( ' ' === chr ){
                // end of branch - have data
                for( i in node ){
                    idx = node[i];
                    match = matches[idx] || ( matches[idx] = { length: 0, words: {} } );
                    match.length += match.words[word] ? 0 : 1;
                    match.words[word] = 1 + ( match.words[word] || 0 );
                }
                continue;
            }    
            // else descend into all paths
            descend( node, word );
        }
    }
    // search on all words passed and count matches        
    wordloop:
    while( ++w < words.length ){
        word = words[w];
        branch = this.getTree();
        depth = Math.min( word.length, this.depth ) || word.length;
        // drill into dictionary as far as this partial word will go
        for( c = 0; c < depth; c++ ){
            chr = word.charAt(c);
            if( ! branch[chr] ){
                // overshot branch word not found
                continue wordloop;
            }
            branch = branch[chr];
        }
        // word reaches valid point in tree - collect all data under this branch
        descend( branch, word );
    }
    // return unique data matches
    for( idx in matches ){
        if( this.matchall && ( matches[idx].length < words.length ) ){
            continue;
        }
        results.push( store[idx] );
    }
    // allow callee access to full info
    if( meta ){
        meta.query = text;
        meta.words = words;
    }
    return results;
 }



 /**
 * utility for normalizing a string into matchable words
 * @param string raw string
 * @return array unique normalized words
 */
 proto.normalize = function( str ){
    var i = -1, 
        word, 
        unique = {},
        words = [],
        parts = this.split(str);
    while( ++i < parts.length ){
        word = parts[i];
        if( ! word ){
            continue;
        }
        // convert case first so table can be smaller
        if( this.ignorecase ){
            word = word.toLowerCase();
        }
        // strip ignore list, assuming everything else is significant
        word = this.strip( word );
        if( ! word ){
            continue;
        }
        // transliterate via custom mappings
        if( this.trans ){
            word = this.trans( word );
        }
        if( unique[word] ){
            continue;
        }
        words.push( word );
        unique[word] = true;
    }
    return words;
 }



 /**
 * default stop word checker, can be overridden with this.stops( table )
 */
 proto.stopped = function( word ){
    return 1 === word.length;
 }


 /**
 * split a sentence into rough word chunks
 */
 proto.split = function( str ){
    return str.split(/\s+/);
 }


 /**
 * strip ignorable characters from a word
 */
 proto.strip = function( word ){
    return word.replace( this.nonword, '' );
 }





 /**
 * Debug function for inspecting search tree and indexed data
 *
 proto.dump = function(){
    var store = this.getData();
    function collect( leaf ){
        var i = -1, data = [];
        while( ++i < leaf.length ){
            data.push( store[ leaf[i] ] );
        }
        return data;
    }
    function descend( branch, word ){
        var chr, node;
        for( chr in branch ){
            node = branch[chr];
            if( ' ' === chr ){
                console.log( word+': [ '+collect(node).join(', ')+' ]' );
            }
            else {
                //word && console.log( word );
                descend( node, word+chr ) 
            }
        }
    }
    descend( this.getTree(), '' );
 }
 //*/




 proto = null;
	/**
	* Searchable JavaScript dictionary.
	*
	* Usage:
	* var dict = require('path/to/this').create();
	* dict.add( 'somedata', 'Text to index' );
	* var items = dict.find('text');
	*/
	exports.create = function(){
	return new Dict;
	}


	/**
	* Dictionary constructor
	*/
	function Dict(){
	var tree, data;
	// public access to private vars
	this.clear = function(){
	this.length = 0;
	tree = {};
	data = [];
	};
	this.getTree = function(){
	return tree;
	};
	this.getData = function(){
	return data;
	};
	this.clear();
	}



	var proto = Dict.prototype;


	/**
	* Maximum depth of search tree
	* @var Number
	*/
	proto.depth = 0;


	/**
	* Whether result require all words in query match,
	* @var Boolean
	*/
	proto.matchall = true;


	/**
	* Whether to normalize all words to lower case
	* @var Boolean
	*/
	proto.ignorecase = true;


	/**
	* Default expression to strip out of words
	* @var RegExp
	*/
	proto.nonword = /[\-.?!;:,_*^+=~`"'(){}<>[\]\/\\\u00a0\u1680\u180e\u2000-\u206f\u2e00-\u2e7f\u3000-\u303f]+/g;


	/**
	* Configure custom transliteration table
	* @param Object
	* @param RegExp
	*/
	proto.translit = function( charTable, charMatch ){
	charMatch = charMatch \|\| /[^a-z0-9]/g;
	function replacer( chr ){
	return charTable[chr] \|\| chr;
	}
	// add transliterator function to call custom replacer
	this.trans = function( word ){
	return word.replace( charMatch, replacer );
	};
	}



	/**
	* Configure custom stop words to match after normalisation
	* @param Object
	*/
	proto.stoppers = function( wordsTable ){
	this.stopped = function( word ){
	return Boolean( wordsTable[word] );
	};
	}




	/**
	* @param string arbitrary data being looked up
	* @param string text index to search for data if different from data
	* @return Dict
	*/
	proto.add = function( data, text ){
	// fall back to data as index, if not passing custom text index
	if( null == text ){
	text = String( data );
	}
	var w = -1, word, leaf, branch, depth, c, chr,
	words = this.normalize( text ),
	store = this.getData(),
	idx = store.length;
	// store data and increment number of records
	store.push( data );
	this.length ++;
	// index individual words
	while( ++w < words.length ){
	word = words[w];
	if( this.stopped(word) ){
	continue;
	}
	// start at root of tree and index word to depth
	branch = this.getTree();
	depth = Math.min( word.length, this.depth ) \|\| word.length;
	for( c = 0; c < depth; c++ ){
	chr = word.charAt(c);
	branch = branch[chr] \|\| ( branch[chr] = {} );
	}
	leaf = branch[' '] \|\| ( branch[' '] = [] );
	leaf.push( idx );
	}
	return this;
	}



	/**
	* Lookup data from text
	* @param string one or more words to search index
	* @return array unique list of found data items
	*/
	proto.find = function( text, meta ){
	var w = -1, word, branch, depth, c, chr, idx,
	matches = {},
	results = [],
	words = this.normalize( text ),
	store = this.getData();
	// recursive function drills into matched branches
	function descend( branch, word ){
	var i, match, node;
	for( chr in branch ){
	node = branch[chr];
	if( ' ' === chr ){
	// end of branch - have data
	for( i in node ){
	idx = node[i];
	match = matches[idx] \|\| ( matches[idx] = { length: 0, words: {} } );
	match.length += match.words[word] ? 0 : 1;
	match.words[word] = 1 + ( match.words[word] \|\| 0 );
	}
	continue;
	}
	// else descend into all paths
	descend( node, word );
	}
	}
	// search on all words passed and count matches
	wordloop:
	while( ++w < words.length ){
	word = words[w];
	branch = this.getTree();
	depth = Math.min( word.length, this.depth ) \|\| word.length;
	// drill into dictionary as far as this partial word will go
	for( c = 0; c < depth; c++ ){
	chr = word.charAt(c);
	if( ! branch[chr] ){
	// overshot branch word not found
	continue wordloop;
	}
	branch = branch[chr];
	}
	// word reaches valid point in tree - collect all data under this branch
	descend( branch, word );
	}
	// return unique data matches
	for( idx in matches ){
	if( this.matchall && ( matches[idx].length < words.length ) ){
	continue;
	}
	results.push( store[idx] );
	}
	// allow callee access to full info
	if( meta ){
	meta.query = text;
	meta.words = words;
	}
	return results;
	}



	/**
	* utility for normalizing a string into matchable words
	* @param string raw string
	* @return array unique normalized words
	*/
	proto.normalize = function( str ){
	var i = -1,
	word,
	unique = {},
	words = [],
	parts = this.split(str);
	while( ++i < parts.length ){
	word = parts[i];
	if( ! word ){
	continue;
	}
	// convert case first so table can be smaller
	if( this.ignorecase ){
	word = word.toLowerCase();
	}
	// strip ignore list, assuming everything else is significant
	word = this.strip( word );
	if( ! word ){
	continue;
	}
	// transliterate via custom mappings
	if( this.trans ){
	word = this.trans( word );
	}
	if( unique[word] ){
	continue;
	}
	words.push( word );
	unique[word] = true;
	}
	return words;
	}



	/**
	* default stop word checker, can be overridden with this.stops( table )
	*/
	proto.stopped = function( word ){
	return 1 === word.length;
	}


	/**
	* split a sentence into rough word chunks
	*/
	proto.split = function( str ){
	return str.split(/\s+/);
	}


	/**
	* strip ignorable characters from a word
	*/
	proto.strip = function( word ){
	return word.replace( this.nonword, '' );
	}





	/**
	* Debug function for inspecting search tree and indexed data
	*
	proto.dump = function(){
	var store = this.getData();
	function collect( leaf ){
	var i = -1, data = [];
	while( ++i < leaf.length ){
	data.push( store[ leaf[i] ] );
	}
	return data;
	}
	function descend( branch, word ){
	var chr, node;
	for( chr in branch ){
	node = branch[chr];
	if( ' ' === chr ){
	console.log( word+': [ '+collect(node).join(', ')+' ]' );
	}
	else {
	//word && console.log( word );
	descend( node, word+chr )
	}
	}
	}
	descend( this.getTree(), '' );
	}
	//*/




	proto = null;