Created
February 5, 2024 23:07
-
-
Save wchargin/4e9dc599f4a92541d9e192faf654f7a1 to your computer and use it in GitHub Desktop.
sortAsciinumeric, extractNumbers: numeric-aware string sorting
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/** | |
* A comparator that follows elements' natural ordering. Useful for comparing | |
* numbers: `[8, 9, 10].sort()` is broken, but `[8, 9, 10].sort(natural)` works | |
* as expected. Also useful as a "neutral" comparator to give to combinators. | |
*/ | |
function natural(a, b) { | |
if (a > b) return 1; | |
if (a < b) return -1; | |
return 0; | |
} | |
/** | |
* Creates a comparator that yields the reverse order of the given comparator. | |
*/ | |
function rev(cmp = natural) { | |
return (a, b) => -cmp(a, b); | |
} | |
/** | |
* Creates a comparator that passes elements through a key-extraction function | |
* and compares the resulting keys. The extracted keys are compared using the | |
* given comparator (or natural order if none is given). | |
* | |
* For instance, to sort strings by their length, leaving strings of the same | |
* length in the same order relative to each other: | |
* | |
* const byLength = cmp.comparing((s) => s.length); | |
* words.sort(byLength); | |
* | |
* Or, to sort people by their names, locale-sensitively: | |
* | |
* const byName = cmp.comparing((p) => p.name, (a, b) => a.localeCompare(b)); | |
* people.sort(byName); | |
*/ | |
function comparing(f, cmp = natural) { | |
return (a, b) => cmp(f(a), f(b)); | |
} | |
/** | |
* Creates a comparator that tries each of the given comparators in turn, | |
* honoring the first one that gives a nonzero result. For instance, to sort | |
* strings by their length, breaking ties by lexicographical order: | |
* | |
* const byLength = cmp.comparing((s) => s.length); | |
* const byValue = cmp.natural; | |
* const shortlex = first([byLength, byValue]); | |
* words.sort(shortlex); | |
*/ | |
function first(cmps) { | |
return (a, b) => { | |
for (const cmp of cmps) { | |
const result = cmp(a, b); | |
if (result !== 0) return result; | |
} | |
return 0; | |
}; | |
} | |
/** | |
* Creates a comparator that treats `null` and `undefined` values as equal to | |
* each other but greater than all other elements, falling back to `cmp` for | |
* comparisons between non-nullish values. It is therefore guaranteed that | |
* `cmp` will only be called with non-nullish inputs. | |
*/ | |
function nullsLast(cmp = natural) { | |
return (a, b) => { | |
if (a == null && b != null) return 1; | |
if (a != null && b == null) return -1; | |
if (a == null && b == null) return 0; | |
return cmp(a, b); | |
}; | |
} | |
/** | |
* Lifts an comparator of elements to a lexicographic comparator of arrays. The | |
* first index at which the array elements compare unequal, if any, determines | |
* the order of the arrays. If one array is a strict prefix of the other, the | |
* shorter array compares smaller. | |
*/ | |
function array(cmp = natural) { | |
return (xs, ys) => { | |
// Loop invariant: at the start of each iteration, `xs[:i]` and `ys[:i]` | |
// compare equal. (Initially trivially true because `i === 0`.) | |
for (let i = 0; i < xs.length; i++) { | |
if (i >= ys.length) { | |
// `xs[:i]` and `ys` compare equal, but `xs` has more parts, so `xs` | |
// follows `ys`. | |
return 1; | |
} | |
const result = cmp(xs[i], ys[i]); | |
if (result !== 0) return result; | |
} | |
if (xs.length < ys.length) { | |
// `xs` and `ys[:xs.length]` compare equal, but `ys` has more parts, so | |
// `xs` precedes `ys`. | |
return -1; | |
} | |
return 0; | |
}; | |
} | |
module.exports = { | |
natural, | |
rev, | |
comparing, | |
first, | |
nullsLast, | |
array, | |
}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const Cmp = require("./cmp"); | |
describe("util/cmp", () => { | |
describe("natural", () => { | |
it("compares strings naturally", () => { | |
const actual = ["bob", "alice", "cheryl"].sort(Cmp.natural); | |
expect(actual).toEqual(["alice", "bob", "cheryl"]); | |
}); | |
it("compares numbers naturally (not lexicographically)", () => { | |
const actual = [9, 10, 8].sort(Cmp.natural); | |
expect(actual).toEqual([8, 9, 10]); | |
}); | |
}); | |
describe("rev", () => { | |
it("defines a reverse-natural order by default", () => { | |
const actual = [9, 10, 8].sort(Cmp.rev()); | |
expect(actual).toEqual([10, 9, 8]); | |
}); | |
it("reverses the order of a given sub-comparator", () => { | |
const byBidDesc = Cmp.rev(Cmp.comparing((x) => x.bid)); | |
const actual = [{ bid: 9 }, { bid: 10 }, { bid: 8 }].sort(byBidDesc); | |
expect(actual).toEqual([{ bid: 10 }, { bid: 9 }, { bid: 8 }]); | |
}); | |
}); | |
describe("comparing", () => { | |
it("accepts a simple accessor as only argument", () => { | |
const byLength = Cmp.comparing((s) => s.length); | |
const actual = ["alice", "bob", "cheryl"].sort(byLength); | |
expect(actual).toEqual(["bob", "alice", "cheryl"]); | |
}); | |
it("keeps ties stable", () => { | |
const byLength = Cmp.comparing((s) => s.length); | |
const actual = ["alice", "bob", "bab", "beb"].sort(byLength); | |
expect(actual).toEqual(["bob", "bab", "beb", "alice"]); | |
}); | |
it("accepts an accessor and sub-comparator", () => { | |
const byRevLength = Cmp.comparing((s) => s.length, Cmp.rev()); | |
const actual = ["alice", "bob", "cheryl"].sort(byRevLength); | |
expect(actual).toEqual(["cheryl", "alice", "bob"]); | |
}); | |
}); | |
describe("first", () => { | |
it("uses the first nonzero comparator result", () => { | |
const shortlex = Cmp.first([Cmp.comparing((s) => s.length), Cmp.natural]); | |
const actual = ["bob", "bab", "beb", "jo", "alice"].sort(shortlex); | |
expect(actual).toEqual(["jo", "bab", "beb", "bob", "alice"]); | |
}); | |
it("returns zero when all comparators return zero", () => { | |
const byHookOrByCrook = Cmp.first([ | |
Cmp.comparing((x) => x.hook), | |
Cmp.comparing((x) => x.crook), | |
]); | |
const x = { hook: 1, crook: 2, brook: 3 }; | |
const y = { hook: 1, crook: 2, brook: 5 }; | |
expect(byHookOrByCrook(x, y)).toEqual(0); | |
}); | |
}); | |
describe("nullsLast", () => { | |
it("pushes nulls to the end of natural order by default", () => { | |
const actual = [9, null, 10, 8].sort(Cmp.nullsLast()); | |
expect(actual).toEqual([8, 9, 10, null]); | |
}); | |
it("accepts a sub-comparator only invoked on non-nulls", () => { | |
const byLength = Cmp.comparing((s) => s.length); | |
const actual = ["alice", null, "bob"].sort(Cmp.nullsLast(byLength)); | |
expect(actual).toEqual(["bob", "alice", null]); | |
}); | |
}); | |
describe("array", () => { | |
function checkPrecedes(xs, ys, cmp = Cmp.array()) { | |
expect({ xs, ys, result: cmp(xs, ys) }).toEqual({ xs, ys, result: -1 }); | |
expect({ xs, ys, result: cmp(ys, xs) }).toEqual({ xs, ys, result: 1 }); | |
} | |
function checkEqual(xs, ys, cmp = Cmp.array()) { | |
expect({ xs, ys, result: cmp(xs, ys) }).toEqual({ xs, ys, result: 0 }); | |
expect({ xs, ys, result: cmp(ys, xs) }).toEqual({ xs, ys, result: 0 }); | |
} | |
it("compares empty array shorter than any other", () => { | |
checkEqual([], []); | |
checkPrecedes([], [1]); | |
checkPrecedes([], [-1]); | |
}); | |
it("honors the first differing element", () => { | |
checkPrecedes([1, 2, 3], [1, 5, 2]); | |
}); | |
it("honors strict prefixes", () => { | |
checkPrecedes([1, 2, 3], [1, 2, 3, 2, 1]); | |
}); | |
it("honors exact equality", () => { | |
checkEqual([1, 2, 3], [1, 2, 3]); | |
}); | |
it("honors exact comparator equality even on object inequality", () => { | |
checkEqual( | |
[2, 4, 6], | |
[3, 5, 7], | |
Cmp.comparing((x) => Math.floor(x / 2)) | |
); | |
}); | |
}); | |
}); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const RE_UNSIGNED_DECIMAL = /[0-9]+(?:\.[0-9]+)?/; | |
/** | |
* Splits a string into pieces that, when concatenated, form the original | |
* string. Attempts to extract positive and negative integers into their own | |
* groups, without confusing hyphens for minus signs: | |
* | |
* - "1-of-2" maps to ["1", "-of-", "2"] | |
* - "1-2-3" maps to ["1", "-", "2", "-", "3"] | |
* - "from -1 to 1" maps to ["from ", "-1", " to ", "1"] | |
* - "from-1-to--1" maps to ["from-", "1", "-to-", "-1"] | |
* | |
* As a special case, we recognize the form "[digits]x[digits]" as three | |
* tokens: | |
* | |
* - "3x3 grid" maps to ["3", "x", "3", " grid"] | |
* - "Limited-time x2 xp event" maps to ["Limited-time x", "2", " xp event"] | |
*/ | |
function extractNumbers(s /*: string */) /*: string[] */ { | |
const result = []; | |
let i = 0; | |
while (i < s.length) { | |
const num = findNextNumber(s, i); | |
if (num == null) { | |
result.push(s.slice(i)); | |
break; | |
} | |
result.push(s.slice(i, num.index)); | |
result.push(s.slice(num.index, num.index + num.length)); | |
i = num.index + num.length; | |
} | |
return result.filter(Boolean); | |
} | |
// Finds the next possibly signed decimal that's either (a) not preceded by a | |
// word character, or (b) preceded by an "x" (as in "3x3 grid"), unless that | |
// "x" is part of a normal word like "helix" or the start of a "0x..." literal. | |
function findNextNumber(s, i) { | |
while (i < s.length) { | |
const match = s.slice(i).match(RE_UNSIGNED_DECIMAL); | |
if (match == null) break; | |
const matchStart = i + match.index; | |
if (s[matchStart - 1] === "+" || s[matchStart - 1] === "-") { | |
// Prefer interpreting as signed. | |
if (mayBeUnsignedDecimal(s, matchStart - 1)) { | |
return { index: matchStart - 1, length: match[0].length + 1 }; | |
} | |
} | |
if (mayBeUnsignedDecimal(s, matchStart)) { | |
return { index: matchStart, length: match[0].length }; | |
} | |
i += matchStart + match[0].length; | |
} | |
return null; | |
} | |
function mayBeUnsignedDecimal(s, i) { | |
// Numbers may always appear at start of string. | |
if (i === 0) return true; | |
// Numbers may be preceded by non-word characters. | |
if (!s[i - 1].match(/\w/)) return true; | |
// Numbers preceded by a word character other than an "x" are considered part | |
// of the preceding word. | |
if (s[i - 1] !== "x") return false; | |
// An "x" at the start of the string (as in "x2 multiplier") is okay. | |
if (i === 1) return true; | |
// An "x" preceded by a non-digit word character (as in "helix") doesn't | |
// count as a separator. | |
if (s[i - 2].match(/[A-Za-z_]/)) return false; | |
// Otherwise, an "x" is a separator unless it's preceded by a lone "0", which | |
// we interpret as a "0x1234" hex literal. A longer number ending in zero, as | |
// in "10x10 grid", is fine. | |
if (s[i - 2] !== "0") return true; | |
if (i === 2) return false; // "0x" at start of string | |
return s[i - 2].match(/[0-9]/); | |
} | |
module.exports = extractNumbers; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const extractNumbers = require("./extractNumbers"); | |
describe("util/extractNumbers", () => { | |
it("handles numbers offset by spaces", () => { | |
expect(extractNumbers("a 1 b 2 c")).toEqual(["a ", "1", " b ", "2", " c"]); | |
expect(extractNumbers("7 z 8 9")).toEqual(["7", " z ", "8", " ", "9"]); | |
}); | |
it("handles numbers set off by hyphens from words", () => { | |
const input = "the 1-in-100 chance"; | |
const expected = ["the ", "1", "-in-", "100", " chance"]; | |
expect(extractNumbers(input)).toEqual(expected); | |
}); | |
it("handles numbers set off by hyphens from other numbers", () => { | |
const input = "easy as 1-2-3"; | |
const expected = ["easy as ", "1", "-", "2", "-", "3"]; | |
expect(extractNumbers(input)).toEqual(expected); | |
}); | |
it("handles numbers with decimal places", () => { | |
const input = "foo 1.23 bar 4.56.78"; | |
const expected = ["foo ", "1.23", " bar ", "4.56", ".78"]; | |
}); | |
it("handles numbers with signs", () => { | |
const input = "from -1 to 0 to +1 and back"; | |
const expected = ["from ", "-1", " to ", "0", " to ", "+1", " and back"]; | |
expect(extractNumbers(input)).toEqual(expected); | |
}); | |
it("handles numbers with minus signs in hyphenated contexts", () => { | |
const input = "from--1-to-1-and-back"; | |
const expected = ["from-", "-1", "-to-", "1", "-and-back"]; | |
expect(extractNumbers(input)).toEqual(expected); | |
}); | |
it('handles adjacent "x"s', () => { | |
expect(extractNumbers("2x2 grid")).toEqual(["2", "x", "2", " grid"]); | |
expect(extractNumbers("x3 bonus")).toEqual(["x", "3", " bonus"]); | |
expect(extractNumbers("4x factor")).toEqual(["4", "x factor"]); | |
}); | |
it("keeps parts of a 0x-string together", () => { | |
expect(extractNumbers("0xa0b1")).toEqual(["0", "xa0b1"]); | |
expect(extractNumbers("0x9f8e")).toEqual(["0", "x9f8e"]); | |
}); | |
}); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const Cmp = require("./cmp"); | |
const extractNumbers = require("./extractNumbers"); | |
function sortAsciinumeric(xs, key = (s) => s) { | |
const schwartz = xs.map((x) => { | |
const k = key(x); | |
if (typeof k !== "string") | |
throw new Error("key function returned non-string: " + k); | |
return { key: canonicalize(k), value: x }; | |
}); | |
return schwartz.sort((a, b) => cmp(a.key, b.key)).map((x) => x.value); | |
} | |
const GROUP_RE = /^(?:\s+|[0-9.+-]+|[^\s0-9.+-]+)/; | |
const NUMBERS = (() => { | |
const raw = [ | |
["none", "zero"], | |
"one", | |
"two", | |
"three", | |
"four", | |
"five", | |
"six", | |
"seven", | |
"eight", | |
"nine", | |
"ten", | |
"eleven", | |
"twelve", | |
"thirteen", | |
"fourteen", | |
"fifteen", | |
"sixteen", | |
"seventeen", | |
"eighteen", | |
"nineteen", | |
"twenty", | |
]; | |
const result = new Map(); | |
for (let i = 0; i < raw.length; i++) { | |
for (const key of [raw[i]].flat()) { | |
result.set(key, i); | |
} | |
} | |
return result; | |
})(); | |
function canonicalize(s) { | |
const result = []; | |
const tokens = extractNumbers(s).flatMap((group) => { | |
// Split each group on hyphens (and whitespace), unless doing so would | |
// split a negative number from its minus sign. | |
if (Number.isFinite(Number(group))) { | |
return [group]; | |
} else { | |
return group.split(/([\s-]+)/g).filter(Boolean); | |
} | |
}); | |
for (const token of tokens) { | |
if (token.match(/^\s*$/)) continue; | |
const num = Number(NUMBERS.get(token.toLowerCase()) ?? token); | |
result.push(Number.isFinite(num) ? num : token); | |
} | |
return result; | |
} | |
const cmp = Cmp.array((x, y) => { | |
if (typeof x === "number" && typeof y === "string") return -1; | |
if (typeof x === "string" && typeof y === "number") return 1; | |
return Cmp.natural(x, y); | |
}); | |
function cmpAsciinumeric(x, y) { | |
return cmp(canonicalize(x), canonicalize(y)); | |
} | |
module.exports = { sortAsciinumeric, cmpAsciinumeric }; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const { sortAsciinumeric, cmpAsciinumeric } = require("./sortAsciinumeric"); | |
describe("util/sortAsciinumeric", () => { | |
it("handles all-alphabetical strings", () => { | |
const input = ["Cove", "Archipelago", "Glacier", "Dune"]; | |
const output = sortAsciinumeric(input); | |
expect(output).toEqual(["Archipelago", "Cove", "Dune", "Glacier"]); | |
}); | |
it("handles all-numeric strings", () => { | |
const input = ["23", "9", "100"]; | |
const output = sortAsciinumeric(input); | |
expect(output).toEqual(["9", "23", "100"]); | |
}); | |
it("handles mixed alphabetical and numeric strings", () => { | |
const input = ["1 in 23", "1 in 9", "1 in 100"]; | |
const output = sortAsciinumeric(input); | |
expect(output).toEqual(["1 in 9", "1 in 23", "1 in 100"]); | |
}); | |
it("handles mixed alphabetical and numeric hyphenated strings", () => { | |
const input = ["1-in-23", "1-in-9", "1-in-100"]; | |
const output = sortAsciinumeric(input); | |
expect(output).toEqual(["1-in-9", "1-in-23", "1-in-100"]); | |
}); | |
it("treats strings with more parts as larger, all else equal", () => { | |
const input = ["Left-tilt", "Left", "Right-tilt", "Right"]; | |
const output = sortAsciinumeric(input); | |
expect(output).toEqual(["Left", "Left-tilt", "Right", "Right-tilt"]); | |
}); | |
it("handles numeric English strings", () => { | |
const input = [ | |
"Eight flowers", | |
"Five flowers", | |
"Four flowers", | |
"Nine flowers", | |
"None", | |
"One flower", | |
"Seven flowers", | |
"Six flowers", | |
"Ten flowers", | |
"Three flowers", | |
"Two flowers", | |
]; | |
const output = sortAsciinumeric(input); | |
const expected = [ | |
"None", | |
"One flower", | |
"Two flowers", | |
"Three flowers", | |
"Four flowers", | |
"Five flowers", | |
"Six flowers", | |
"Seven flowers", | |
"Eight flowers", | |
"Nine flowers", | |
"Ten flowers", | |
]; | |
expect(output).toEqual(expected); | |
}); | |
it("handles hyphenated numeric English strings", () => { | |
const input = [ | |
"eight-flowers", | |
"five-flowers", | |
"four-flowers", | |
"nine-flowers", | |
"none", | |
"one-flower", | |
"seven-flowers", | |
"six-flowers", | |
"ten-flowers", | |
"three-flowers", | |
"two-flowers", | |
]; | |
const output = sortAsciinumeric(input); | |
const expected = [ | |
"none", | |
"one-flower", | |
"two-flowers", | |
"three-flowers", | |
"four-flowers", | |
"five-flowers", | |
"six-flowers", | |
"seven-flowers", | |
"eight-flowers", | |
"nine-flowers", | |
"ten-flowers", | |
]; | |
expect(output).toEqual(expected); | |
}); | |
it('sorts "None" before categorical values', () => { | |
// This is really a side-effect of treating "None" as a number, but it's | |
// kind of nice, so let's at least document it. | |
const input = ["Contracting", "Expanding", "None"]; | |
const output = sortAsciinumeric(input); | |
expect(output).toEqual(["None", "Contracting", "Expanding"]); | |
}); | |
it("allows projecting keys", () => { | |
const input = [ | |
{ value: "1 in 23", id: "a" }, | |
{ value: "1 in 9", id: "b" }, | |
{ value: "1 in 100", id: "c" }, | |
]; | |
const output = sortAsciinumeric(input, (x) => x.value); | |
expect(output).toEqual([ | |
{ value: "1 in 9", id: "b" }, | |
{ value: "1 in 23", id: "a" }, | |
{ value: "1 in 100", id: "c" }, | |
]); | |
}); | |
it("reports a nice error if the key extractor fails", () => { | |
const input = [ | |
{ value: "1 in 23", id: "a" }, | |
{ notValue: "1 in 100", id: "c" }, | |
]; | |
expect(() => sortAsciinumeric(input, (x) => x.value)).toThrow( | |
"key function returned non-string: undefined" | |
); | |
}); | |
it("supports manually using `cmp`", () => { | |
expect(["four", "two", "three"].sort(cmpAsciinumeric)).toEqual([ | |
"two", | |
"three", | |
"four", | |
]); | |
}); | |
}); |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment