zcorpan · January 14, 2016 13:46
diff --git a/parse-coords.html b/parse-coords.html
 <!doctype html>
 <meta charset=utf-8>
 <title>coords</title>
 <style>
 table { table-layout:fixed; width:100%; border-collapse:collapse }
 td { max-width:25%; overflow:hidden; border:2px solid gray; padding:0.5em; font-family:monospace }
 </style>
 <table>
 <tr><th>test<th>old parser<th>new parser (POC)<th>new parser (new-spec-compliant)
 <script>
 // Parses `input` as a list of integers, following the spec steps quoted in the comments
 // below (this is the "old parser" column of the comparison table).
 //
 // Entries are separated by U+0020 SPACE, U+002C COMMA or U+003B SEMICOLON. Parsing aborts
 // early, returning what has been collected so far, when an entry contains no digits (as in
 // "1,2,x,4") or when an "any other" character appears after an entry has started.
 //
 // input:   the string being parsed.
 // returns: an array of the integers that were parsed.
 function parseListOfInts(input) {
  var spaceCommaSemicolon = /^[ ,;]$/;
  var asciiDigit = /^\d$/;
  // Punctuation that ends a number without aborting the whole parse.
  // NOTE(review): the spec's range list may also cover U+007B-U+007E; confirm against the spec text.
  var otherPunctuation = /^[\u0001-\u001f\u0021-\u002b\u002d-\u002f\u003a\u003c-\u0040\u005b-\u0060\u007f]$/;
  // Let position be a pointer into input, initially pointing at the start of the string.
  var pos = 0;
  // Let numbers be an initially empty list of integers. This list is the result.
  var numbers = [];
  start: while (true) {
    // Step 4: if the current character is a separator, advance past it.
    if (spaceCommaSemicolon.test(input[pos])) {
      pos++;
    }
    // If position points to beyond the end of input, return numbers and abort.
    if (input[pos] === undefined) {
      return numbers;
    }
    // If the current character is still a separator, return to step 4.
    if (spaceCommaSemicolon.test(input[pos])) {
      continue start;
    }
    // Per-entry state.
    var negated = false;
    var value = 0;
    // started: set once the parser sees a digit or a U+002D HYPHEN-MINUS character.
    var started = false;
    // gotNumber: set once the parser sees a digit.
    var gotNumber = false;
    // finished: once set, characters are ignored until the next separator.
    var finished = false;
    var bogus = false;
    // Parser: dispatch on the character at position.
    while (true) {
      var c = input[pos];
      if (c === '-') {
        // A hyphen after digits ends the number; otherwise it toggles the sign state.
        if (gotNumber) {
          finished = true;
        }
        if (!finished) {
          if (started) {
            negated = false;
          } else if (!bogus) {
            negated = true;
          }
          started = true;
        }
      } else if (asciiDigit.test(c)) {
        // Accumulate the digit unless the number has already finished.
        if (!finished) {
          value *= 10;
          value += parseInt(c, 10);
          started = true;
          gotNumber = true;
        }
      } else if (spaceCommaSemicolon.test(c)) {
        // A separator closes the entry. An entry without digits aborts the whole parse.
        if (!gotNumber) {
          return numbers;
        }
        if (negated) {
          value *= -1;
        }
        numbers.push(value);
        // Jump to step 4 in the overall set of steps.
        continue start;
      } else if (otherPunctuation.test(c)) {
        // Punctuation after digits ends the number; before digits it cancels a pending sign.
        if (gotNumber) {
          finished = true;
        }
        if (!finished) {
          negated = false;
        }
      } else if (!finished) {
        // Any other character. When finished is true the spec says to skip to the next step
        // (advance position), so this branch must not restart the entry or drop its value.
        negated = false;
        bogus = true;
        if (started) {
          // The value is not appended to the list first; it is dropped.
          return numbers;
        }
      }
      // Advance position to the next character, or to beyond the end of the string.
      pos++;
      // If position still points to a character, jump back to the Parser step.
      if (input[pos] !== undefined) {
        continue;
      }
      // End of input: flush the pending entry, if any, and return.
      if (negated) {
        value *= -1;
      }
      if (gotNumber) {
        numbers.push(value);
      }
      return numbers;
    }
  }
 }
 // Proof-of-concept coords parser: splits `input` on runs of whitespace and commas and
 // parses each token as a float.
 //
 // input:   the string being parsed.
 // returns: an array with one number per non-empty token; a token that does not parse as a
 //          number contributes 0.
 function newCoords(input) {
  var numbers = [];
  // Splitting on separator runs yields an empty token only at the very start or end of the
  // string (leading/trailing separators, or an empty input). Those are skipped below so they
  // do not show up as spurious 0 entries.
  var tokens = input.split(/[\s,]+/);
  for (var i = 0; i < tokens.length; ++i) {
    if (tokens[i] === '') {
      continue;
    }
    // Replace garbage with spaces so that leading junk does not block the float parse.
    var token = tokens[i].replace(/[^\d\.-]/g, ' ');
    // parseFloat yields NaN on failure; `||` maps that to 0.
    numbers.push(parseFloat(token) || 0);
  }
  return numbers;
 }

 // Collects the run of consecutive characters, starting at `pos`, that each match `regex`.
 //
 // input:   the string being scanned.
 // pos:     index at which to start collecting.
 // regex:   tested against one character at a time.
 // returns: a two-element array [collected substring, index just past the collected run].
 function collectCharacters(input, pos, regex) {
  var startPos = pos;
  // Check the bound first: input[pos] past the end is undefined, and regex.test(undefined)
  // would test the string "undefined" rather than fail.
  while (pos < input.length && regex.test(input[pos])) {
    pos++;
  }
  // slice takes (start, end); substr's second argument is a length, which over-collects.
  return [input.slice(startPos, pos), pos];
 }

 // Parses `input` into a list of floating-point numbers, one per delimiter-separated entry.
 // Delimiters are space characters, U+002C COMMA and U+003B SEMICOLON. An entry that does
 // not parse as a number contributes 0.
 //
 // input:   the string being parsed.
 // returns: an array of numbers.
 function newSpecCoords(input) {
  var delimiter = /^[\s,;]$/;
  var result = [];
  // Skip past any leading delimiters.
  var cursor = collectCharacters(input, 0, delimiter)[1];
  // While the cursor is not past the end of input:
  while (input[cursor] !== undefined) {
    // Skip leading garbage: anything that is not a delimiter, an ASCII digit,
    // U+002E FULL STOP or U+002D HYPHEN-MINUS.
    cursor = collectCharacters(input, cursor, /^[^\s,;\d\.-]/)[1];
    // Collect everything up to the next delimiter as the unparsed number.
    var collected = collectCharacters(input, cursor, /^[^\s,;]$/);
    cursor = collected[1];
    // Parse it as a floating-point number; a parse error becomes zero.
    var parsed = parseFloat(collected[0]);
    result.push(isNaN(parsed) ? 0 : parsed);
    // Skip past the delimiter(s) that follow the entry.
    cursor = collectCharacters(input, cursor, delimiter)[1];
  }
  return result;
 }
 // Sample coords attribute values fed to every parser.
 var tests = [
   // a few from webdevdata
   "142,130,140,139,152,139,149,127,",
   "138,10,13.5",
   "594,72,779,72,779,142,594,142,5shop.com.tw/return/ef_return.html",
   "“0,12,625,478\"",
   "137,6 151,1,163,4,235,76,206,76",
   "557,328,705,329,706,517,658,518,656ls/spain/holidays/regions/3/Canary+Islands/Canary+Islands.html",
   "59,46,64,45,65,46,65,48,67,49,69,50,71,52,70,52,69,56,67,58,67,60,61,60,60,65,58,67,59,69,57,70,50,69,48,71,43,69,46,72,,215,6,218,3,220,2,223,1,228,2,234,4,238,6,240,6,244,10,246,14,247,18,250,23,254,27,257,32,259,34,255,33,251,35,250,38,249,38,246,40,245,40,243,37,241,37,239,38,236,37,233,36,231,38,228,39,226,39,220,37,218,35,216,35,214,37,212,38,210,38,206,35,204,31,201,28,199,25,195,23,192,19,188,13,187,10,188,6,192",
   "='69,8,153,86' ", // babyneo.de

   // https://lists.w3.org/Archives/Public/public-html/2009Jan/0086.html
   ",1,2,3,4"
 ];
 // Write one table row per sample: the input followed by the output of each parser.
 tests.forEach(function(sample) {
  var cells = [sample, parseListOfInts(sample), newCoords(sample), newSpecCoords(sample)];
  var row = '<tr>';
  cells.forEach(function(cell) {
    row += '<td>' + esc(cell);
  });
  document.writeln(row);
 });
 // Converts `s` to a string and escapes it for insertion into HTML markup.
 //
 // s:       any value; it is stringified with String().
 // returns: the string with &, <, > and " replaced by character references.
 function esc(s) {
  // The ampersand must be escaped first; otherwise the ampersands introduced by the other
  // replacements would themselves be escaped again.
  return String(s)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;');
 }
 </script>
	<!doctype html>
	<meta charset=utf-8>
	<title>coords</title>
	<style>
	table { table-layout:fixed; width:100%; border-collapse:collapse }
	td { max-width:25%; overflow:hidden; border:2px solid gray; padding:0.5em; font-family:monospace }
	</style>
	<table>
	<tr><th>test<th>old parser<th>new parser (POC)<th>new parser (new-spec-compliant)
	<script>
	// Parses `input` as a list of integers, following the spec steps quoted in the comments
	// below (this is the "old parser" column of the comparison table).
	//
	// Entries are separated by U+0020 SPACE, U+002C COMMA or U+003B SEMICOLON. Parsing aborts
	// early, returning what has been collected so far, when an entry contains no digits (as in
	// "1,2,x,4") or when an "any other" character appears after an entry has started.
	//
	// input:   the string being parsed.
	// returns: an array of the integers that were parsed.
	function parseListOfInts(input) {
	  var spaceCommaSemicolon = /^[ ,;]$/;
	  var asciiDigit = /^\d$/;
	  // Punctuation that ends a number without aborting the whole parse.
	  // NOTE(review): the spec's range list may also cover U+007B-U+007E; confirm against the spec text.
	  var otherPunctuation = /^[\u0001-\u001f\u0021-\u002b\u002d-\u002f\u003a\u003c-\u0040\u005b-\u0060\u007f]$/;
	  // Let position be a pointer into input, initially pointing at the start of the string.
	  var pos = 0;
	  // Let numbers be an initially empty list of integers. This list is the result.
	  var numbers = [];
	  start: while (true) {
	    // Step 4: if the current character is a separator, advance past it.
	    if (spaceCommaSemicolon.test(input[pos])) {
	      pos++;
	    }
	    // If position points to beyond the end of input, return numbers and abort.
	    if (input[pos] === undefined) {
	      return numbers;
	    }
	    // If the current character is still a separator, return to step 4.
	    if (spaceCommaSemicolon.test(input[pos])) {
	      continue start;
	    }
	    // Per-entry state.
	    var negated = false;
	    var value = 0;
	    // started: set once the parser sees a digit or a U+002D HYPHEN-MINUS character.
	    var started = false;
	    // gotNumber: set once the parser sees a digit.
	    var gotNumber = false;
	    // finished: once set, characters are ignored until the next separator.
	    var finished = false;
	    var bogus = false;
	    // Parser: dispatch on the character at position.
	    while (true) {
	      var c = input[pos];
	      if (c === '-') {
	        // A hyphen after digits ends the number; otherwise it toggles the sign state.
	        if (gotNumber) {
	          finished = true;
	        }
	        if (!finished) {
	          if (started) {
	            negated = false;
	          } else if (!bogus) {
	            negated = true;
	          }
	          started = true;
	        }
	      } else if (asciiDigit.test(c)) {
	        // Accumulate the digit unless the number has already finished.
	        if (!finished) {
	          value *= 10;
	          value += parseInt(c, 10);
	          started = true;
	          gotNumber = true;
	        }
	      } else if (spaceCommaSemicolon.test(c)) {
	        // A separator closes the entry. An entry without digits aborts the whole parse.
	        if (!gotNumber) {
	          return numbers;
	        }
	        if (negated) {
	          value *= -1;
	        }
	        numbers.push(value);
	        // Jump to step 4 in the overall set of steps.
	        continue start;
	      } else if (otherPunctuation.test(c)) {
	        // Punctuation after digits ends the number; before digits it cancels a pending sign.
	        if (gotNumber) {
	          finished = true;
	        }
	        if (!finished) {
	          negated = false;
	        }
	      } else if (!finished) {
	        // Any other character. When finished is true the spec says to skip to the next step
	        // (advance position), so this branch must not restart the entry or drop its value.
	        negated = false;
	        bogus = true;
	        if (started) {
	          // The value is not appended to the list first; it is dropped.
	          return numbers;
	        }
	      }
	      // Advance position to the next character, or to beyond the end of the string.
	      pos++;
	      // If position still points to a character, jump back to the Parser step.
	      if (input[pos] !== undefined) {
	        continue;
	      }
	      // End of input: flush the pending entry, if any, and return.
	      if (negated) {
	        value *= -1;
	      }
	      if (gotNumber) {
	        numbers.push(value);
	      }
	      return numbers;
	    }
	  }
	}
	// Proof-of-concept coords parser: splits `input` on runs of whitespace and commas and
	// parses each token as a float.
	//
	// input:   the string being parsed.
	// returns: an array with one number per non-empty token; a token that does not parse as a
	//          number contributes 0.
	function newCoords(input) {
	  var numbers = [];
	  // Splitting on separator runs yields an empty token only at the very start or end of the
	  // string (leading/trailing separators, or an empty input). Those are skipped below so they
	  // do not show up as spurious 0 entries.
	  var tokens = input.split(/[\s,]+/);
	  for (var i = 0; i < tokens.length; ++i) {
	    if (tokens[i] === '') {
	      continue;
	    }
	    // Replace garbage with spaces so that leading junk does not block the float parse.
	    var token = tokens[i].replace(/[^\d\.-]/g, ' ');
	    // parseFloat yields NaN on failure; `||` maps that to 0.
	    numbers.push(parseFloat(token) || 0);
	  }
	  return numbers;
	}

	// Collects the run of consecutive characters, starting at `pos`, that each match `regex`.
	//
	// input:   the string being scanned.
	// pos:     index at which to start collecting.
	// regex:   tested against one character at a time.
	// returns: a two-element array [collected substring, index just past the collected run].
	function collectCharacters(input, pos, regex) {
	  var startPos = pos;
	  // Check the bound first: input[pos] past the end is undefined, and regex.test(undefined)
	  // would test the string "undefined" rather than fail.
	  while (pos < input.length && regex.test(input[pos])) {
	    pos++;
	  }
	  // slice takes (start, end); substr's second argument is a length, which over-collects.
	  return [input.slice(startPos, pos), pos];
	}

	// Parses `input` into a list of floating-point numbers, one per delimiter-separated entry.
	// Delimiters are space characters, U+002C COMMA and U+003B SEMICOLON. An entry that does
	// not parse as a number contributes 0.
	//
	// input:   the string being parsed.
	// returns: an array of numbers.
	function newSpecCoords(input) {
	  var delimiter = /^[\s,;]$/;
	  var result = [];
	  // Skip past any leading delimiters.
	  var cursor = collectCharacters(input, 0, delimiter)[1];
	  // While the cursor is not past the end of input:
	  while (input[cursor] !== undefined) {
	    // Skip leading garbage: anything that is not a delimiter, an ASCII digit,
	    // U+002E FULL STOP or U+002D HYPHEN-MINUS.
	    cursor = collectCharacters(input, cursor, /^[^\s,;\d\.-]/)[1];
	    // Collect everything up to the next delimiter as the unparsed number.
	    var collected = collectCharacters(input, cursor, /^[^\s,;]$/);
	    cursor = collected[1];
	    // Parse it as a floating-point number; a parse error becomes zero.
	    var parsed = parseFloat(collected[0]);
	    result.push(isNaN(parsed) ? 0 : parsed);
	    // Skip past the delimiter(s) that follow the entry.
	    cursor = collectCharacters(input, cursor, delimiter)[1];
	  }
	  return result;
	}
	// Sample coords attribute values fed to every parser.
	var tests = [
	  // a few from webdevdata
	  "142,130,140,139,152,139,149,127,",
	  "138,10,13.5",
	  "594,72,779,72,779,142,594,142,5shop.com.tw/return/ef_return.html",
	  "“0,12,625,478\"",
	  "137,6 151,1,163,4,235,76,206,76",
	  "557,328,705,329,706,517,658,518,656ls/spain/holidays/regions/3/Canary+Islands/Canary+Islands.html",
	  "59,46,64,45,65,46,65,48,67,49,69,50,71,52,70,52,69,56,67,58,67,60,61,60,60,65,58,67,59,69,57,70,50,69,48,71,43,69,46,72,,215,6,218,3,220,2,223,1,228,2,234,4,238,6,240,6,244,10,246,14,247,18,250,23,254,27,257,32,259,34,255,33,251,35,250,38,249,38,246,40,245,40,243,37,241,37,239,38,236,37,233,36,231,38,228,39,226,39,220,37,218,35,216,35,214,37,212,38,210,38,206,35,204,31,201,28,199,25,195,23,192,19,188,13,187,10,188,6,192",
	  "='69,8,153,86' ", // babyneo.de

	  // https://lists.w3.org/Archives/Public/public-html/2009Jan/0086.html
	  ",1,2,3,4"
	];
	// Write one table row per sample: the input followed by the output of each parser.
	tests.forEach(function(sample) {
	  var cells = [sample, parseListOfInts(sample), newCoords(sample), newSpecCoords(sample)];
	  var row = '<tr>';
	  cells.forEach(function(cell) {
	    row += '<td>' + esc(cell);
	  });
	  document.writeln(row);
	});
	// Converts `s` to a string and escapes it for insertion into HTML markup.
	//
	// s:       any value; it is stringified with String().
	// returns: the string with &, <, > and " replaced by character references.
	function esc(s) {
	  // The ampersand must be escaped first; otherwise the ampersands introduced by the other
	  // replacements would themselves be escaped again.
	  return String(s)
	    .replace(/&/g, '&amp;')
	    .replace(/</g, '&lt;')
	    .replace(/>/g, '&gt;')
	    .replace(/"/g, '&quot;');
	}
	</script>