klappy · September 16, 2019 20:51
diff --git a/Tiny JavaScript tokenizer.js b/Tiny JavaScript tokenizer.js
 /**
 * Tiny tokenizer - https://gist.github.com/borgar/451393
 * @param {String} string - string to be tokenized
 * @param {Object} parsers - { word:/\w+/, whitespace:/\s+/, punctuation:/[^\w\s]/ }
 * @param {String} deftok - type to label tokens that are not classified with the above parsers
 * @return {Array} - array of objects => [{ token:"this", type:"word" },{ token:" ", type:"whitespace" }, Object { token:"is", type:"word" }, ... ]
 **/
 export const classifyTokens = (string, parsers, deftok) => {
  string = (!string) ? '' : string; // if string is undefined, make it an empty string
  if (typeof string !== 'string') {
    throw new Error(`tokenizer.tokenize() string is not String: ${string}`);
  }
  let m;
  let r;
  let t;
  let tokens = [];
  while (string) {
    t = null;
    m = string.length;
    let key;
    for (key in parsers) {
      if (Object.prototype.hasOwnProperty.call(parsers, key)) {
        r = parsers[key].exec( string );
        // try to choose the best match if there are several
        // where "best" is the closest to the current starting point
        if ( r && ( r.index < m ) ) {
          t = {
            token: r[0],
            type: key,
            matches: r.slice(1),
          };
          m = r.index;
        }
      }
    }
    if ( m ) {
      // there is text between last token and currently
      // matched token - push that out as default or "unknown"
      tokens.push({
        token: string.substr( 0, m ),
        type: deftok || 'unknown',
      });
    }
    if ( t ) {
      // push current token onto sequence
      tokens.push( t );
    }
    string = string.substr( m + (t ? t.token.length : 0) );
  }
  return tokens;
 };
	/**
	* Tiny tokenizer - https://gist.github.com/borgar/451393
	* @param {String} string - string to be tokenized
	* @param {Object} parsers - { word:/\w+/, whitespace:/\s+/, punctuation:/[^\w\s]/ }
	* @param {String} deftok - type to label tokens that are not classified with the above parsers
	* @return {Array} - array of objects => [{ token:"this", type:"word" },{ token:" ", type:"whitespace" }, Object { token:"is", type:"word" }, ... ]
	**/
	export const classifyTokens = (string, parsers, deftok) => {
	string = (!string) ? '' : string; // if string is undefined, make it an empty string
	if (typeof string !== 'string') {
	throw new Error(`tokenizer.tokenize() string is not String: ${string}`);
	}
	let m;
	let r;
	let t;
	let tokens = [];
	while (string) {
	t = null;
	m = string.length;
	let key;
	for (key in parsers) {
	if (Object.prototype.hasOwnProperty.call(parsers, key)) {
	r = parsers[key].exec( string );
	// try to choose the best match if there are several
	// where "best" is the closest to the current starting point
	if ( r && ( r.index < m ) ) {
	t = {
	token: r[0],
	type: key,
	matches: r.slice(1),
	};
	m = r.index;
	}
	}
	}
	if ( m ) {
	// there is text between last token and currently
	// matched token - push that out as default or "unknown"
	tokens.push({
	token: string.substr( 0, m ),
	type: deftok \|\| 'unknown',
	});
	}
	if ( t ) {
	// push current token onto sequence
	tokens.push( t );
	}
	string = string.substr( m + (t ? t.token.length : 0) );
	}
	return tokens;
	};