Last active
January 14, 2016 13:46
-
-
Save zcorpan/baa697e081a3e1aa5da0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!doctype html>
<meta charset=utf-8>
<title>coords</title>
<style>
table { table-layout:fixed; width:100%; border-collapse:collapse }
td { max-width:25%; overflow:hidden; border:2px solid gray; padding:0.5em; font-family:monospace }
</style>
<table>
<tr><th>test<th>old parser<th>new parser (POC)<th>new parser (new-spec-compliant)
<script>
/**
 * Parses a string into a list of integers, following the step-by-step
 * "rules for parsing a list of integers" state machine (the "old parser").
 *
 * Entries are separated by U+0020 SPACE, U+002C COMMA or U+003B SEMICOLON.
 * Parsing stops early, returning what was collected so far, when an entry
 * contains no digits (as in "1,2,x,4") or when alphabetic garbage follows
 * the start of a number (as in "5shop"; the pending value is dropped).
 *
 * @param {string} input - The string being parsed.
 * @returns {number[]} The integers collected before the end of the input or
 *   before the first entry that aborts parsing.
 */
function parseListOfInts(input) {
  const separator = /^[ ,;]$/;
  const asciiDigit = /^\d$/;
  // Every non-alphanumeric ASCII character that is not a separator. The last
  // range runs from U+007B to U+007F so that "{", "|", "}" and "~" count as
  // punctuation rather than as alphabetic garbage.
  const punctuation =
    /^[\u0001-\u001f\u0021-\u002b\u002d-\u002f\u003a\u003c-\u0040\u005b-\u0060\u007b-\u007f]$/;

  // Pointer into input, initially at the start of the string.
  let pos = 0;
  // The result list.
  const numbers = [];

  start: while (true) {
    // Step past a single separator, if there is one at the current position.
    if (separator.test(input[pos])) {
      pos++;
    }
    // Past the end of the input: done.
    if (input[pos] === undefined) {
      return numbers;
    }
    // Still on a separator: go round again to skip it.
    if (separator.test(input[pos])) {
      continue start;
    }

    // Per-entry state.
    let negated = false;
    let value = 0;
    // True once a digit or a "-" has been seen.
    let started = false;
    // True once a digit has been seen.
    let gotNumber = false;
    // True once the number has ended; everything up to the next separator is
    // then ignored.
    let finished = false;
    // True once alphabetic garbage has been seen before the number started.
    let bogus = false;

    while (true) {
      const c = input[pos];

      if (c === '-') {
        // A "-" after digits ends the number; otherwise it toggles the sign
        // state (a second "-" cancels negation, and garbage before it
        // prevents negation altogether).
        if (gotNumber) {
          finished = true;
        }
        if (!finished) {
          if (started) {
            negated = false;
          } else if (!bogus) {
            negated = true;
          }
          started = true;
        }
      } else if (asciiDigit.test(c)) {
        // Accumulate the digit in base ten unless the number already ended.
        if (!finished) {
          value = value * 10 + Number(c);
          started = true;
          gotNumber = true;
        }
      } else if (separator.test(c)) {
        // End of the entry. An entry without digits aborts the whole parse.
        if (!gotNumber) {
          return numbers;
        }
        if (negated) {
          value = -value;
        }
        numbers.push(value);
        // pos still points at the separator; the outer loop steps past it.
        continue start;
      } else if (punctuation.test(c)) {
        // Punctuation after digits ends the number; before digits it only
        // cancels a pending negation.
        if (gotNumber) {
          finished = true;
        }
        if (!finished) {
          negated = false;
        }
      } else if (!finished) {
        // Any other character (letters, NUL, non-ASCII). Once the number has
        // finished such characters are simply skipped; otherwise they mark
        // the entry as bogus, and abort the parse if the number had started
        // (the pending value is dropped, not appended).
        negated = false;
        bogus = true;
        if (started) {
          return numbers;
        }
      }

      // Advance; keep parsing this entry while characters remain.
      pos++;
      if (input[pos] !== undefined) {
        continue;
      }

      // End of input: flush the pending entry, if it had any digits.
      if (negated) {
        value = -value;
      }
      if (gotNumber) {
        numbers.push(value);
      }
      return numbers;
    }
  }
}
/**
 * Proof-of-concept coords parser: splits the input on runs of whitespace
 * and commas and parses each token as a floating-point number.
 *
 * Within a token, every character other than an ASCII digit, "." or "-" is
 * replaced by a space before parsing, so leading garbage is skipped and the
 * number ends at the first garbage character. Tokens that do not yield a
 * number become 0.
 *
 * @param {string} input - The string being parsed.
 * @returns {number[]} One number per token; empty when the input contains
 *   nothing but separators.
 */
function newCoords(input) {
  // Trim leading separators so the split does not start with an empty token.
  const tokens = input.replace(/^[\s,]+/, '').split(/[\s,]+/);
  // A trailing separator (or an empty input) leaves one empty token at the
  // end; drop it so it is not reported as a spurious 0.
  if (tokens[tokens.length - 1] === '') {
    tokens.pop();
  }
  return tokens.map((token) => {
    // Replace garbage with spaces so parseFloat skips it or stops at it.
    const cleaned = token.replace(/[^\d\.-]/g, ' ');
    // `|| 0` deliberately maps NaN (no parsable number) to 0.
    return parseFloat(cleaned) || 0;
  });
}
/**
 * Collects the run of consecutive characters, starting at `pos`, that each
 * match `regex` when tested individually.
 *
 * @param {string} input - The string being scanned.
 * @param {number} pos - Index at which to start collecting.
 * @param {RegExp} regex - Tested against one character at a time. It should
 *   not carry the `g` or `y` flag, since `test` would then be stateful.
 * @returns {[string, number]} The collected substring and the index of the
 *   first character that was not collected (or the input length).
 */
function collectCharacters(input, pos, regex) {
  let end = pos;
  // Bound the scan explicitly: testing `undefined` would coerce it to the
  // string "undefined", which an unanchored regex can match.
  while (end < input.length && regex.test(input[end])) {
    end++;
  }
  // slice takes an end index; substr would treat `end` as a length.
  return [input.slice(pos, end), end];
}
/**
 * Coords parser written against the proposed new spec text: a list of
 * floating-point numbers separated by whitespace, commas or semicolons.
 *
 * @param {string} input - The string being parsed.
 * @returns {number[]} One number per entry; entries that fail to parse as a
 *   floating-point number contribute 0.
 */
function newSpecCoords(input) {
  // Space characters, U+002C COMMA or U+003B SEMICOLON.
  const delimiter = /^[\s,;]$/;
  // Anything that is not a delimiter, ASCII digit, U+002E FULL STOP or
  // U+002D HYPHEN-MINUS.
  const leadingGarbage = /^[^\s,;\d\.-]/;
  // Anything that is not a delimiter.
  const entryCharacter = /^[^\s,;]$/;

  const result = [];

  // Skip past any leading delimiters.
  let cursor = collectCharacters(input, 0, delimiter)[1];

  // Process one entry per iteration until the cursor runs past the input.
  while (input[cursor] !== undefined) {
    // Skip past leading garbage.
    cursor = collectCharacters(input, cursor, leadingGarbage)[1];

    // The rest of the entry, up to the next delimiter, is the unparsed number.
    const collected = collectCharacters(input, cursor, entryCharacter);
    cursor = collected[1];

    // Parse it as a floating-point number; a parse error counts as zero.
    const parsed = parseFloat(collected[0]);
    result.push(Number.isNaN(parsed) ? 0 : parsed);

    // Skip past the delimiter(s) that follow the entry.
    cursor = collectCharacters(input, cursor, delimiter)[1];
  }

  return result;
}
// Sample coords values fed to all three parsers.
var tests = [
  // a few from webdevdata
  "142,130,140,139,152,139,149,127,",
  "138,10,13.5",
  "594,72,779,72,779,142,594,142,5shop.com.tw/return/ef_return.html",
  "“0,12,625,478\"",
  "137,6 151,1,163,4,235,76,206,76",
  "557,328,705,329,706,517,658,518,656ls/spain/holidays/regions/3/Canary+Islands/Canary+Islands.html",
  "59,46,64,45,65,46,65,48,67,49,69,50,71,52,70,52,69,56,67,58,67,60,61,60,60,65,58,67,59,69,57,70,50,69,48,71,43,69,46,72,,215,6,218,3,220,2,223,1,228,2,234,4,238,6,240,6,244,10,246,14,247,18,250,23,254,27,257,32,259,34,255,33,251,35,250,38,249,38,246,40,245,40,243,37,241,37,239,38,236,37,233,36,231,38,228,39,226,39,220,37,218,35,216,35,214,37,212,38,210,38,206,35,204,31,201,28,199,25,195,23,192,19,188,13,187,10,188,6,192",
  "='69,8,153,86' ", // babyneo.de
  // https://lists.w3.org/Archives/Public/public-html/2009Jan/0086.html
  ",1,2,3,4"
];

// Write one table row per sample: the input followed by each parser's output.
for (const sample of tests) {
  const inputCell = esc(sample);
  const oldParserCell = esc(parseListOfInts(sample));
  const pocCell = esc(newCoords(sample));
  const newSpecCell = esc(newSpecCoords(sample));
  document.writeln('<tr><td>', inputCell, '<td>', oldParserCell, '<td>', pocCell, '<td>', newSpecCell);
}
/**
 * Converts a value to a string and escapes it for insertion into HTML.
 *
 * @param {*} s - The value to escape; it is stringified with `String()`, so
 *   an array of numbers becomes its comma-joined form.
 * @returns {string} The text with `&`, `<`, `>` and `"` replaced by
 *   character references.
 */
function esc(s) {
  return String(s)
    // "&" must be escaped first so the references added below are not
    // escaped a second time.
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;');
}
</script> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment