Skip to content

Instantly share code, notes, and snippets.

@wchargin
Created February 5, 2024 23:07
Show Gist options
  • Save wchargin/4e9dc599f4a92541d9e192faf654f7a1 to your computer and use it in GitHub Desktop.
Save wchargin/4e9dc599f4a92541d9e192faf654f7a1 to your computer and use it in GitHub Desktop.
sortAsciinumeric, extractNumbers: numeric-aware string sorting
/**
* A comparator that follows elements' natural ordering. Useful for comparing
* numbers: `[8, 9, 10].sort()` is broken, but `[8, 9, 10].sort(natural)` works
* as expected. Also useful as a "neutral" comparator to give to combinators.
*/
function natural(a, b) {
if (a > b) return 1;
if (a < b) return -1;
return 0;
}
/**
* Creates a comparator that yields the reverse order of the given comparator.
*/
function rev(cmp = natural) {
return (a, b) => -cmp(a, b);
}
/**
* Creates a comparator that passes elements through a key-extraction function
* and compares the resulting keys. The extracted keys are compared using the
* given comparator (or natural order if none is given).
*
* For instance, to sort strings by their length, leaving strings of the same
* length in the same order relative to each other:
*
* const byLength = cmp.comparing((s) => s.length);
* words.sort(byLength);
*
* Or, to sort people by their names, locale-sensitively:
*
* const byName = cmp.comparing((p) => p.name, (a, b) => a.localeCompare(b));
* people.sort(byName);
*/
function comparing(f, cmp = natural) {
return (a, b) => cmp(f(a), f(b));
}
/**
* Creates a comparator that tries each of the given comparators in turn,
* honoring the first one that gives a nonzero result. For instance, to sort
* strings by their length, breaking ties by lexicographical order:
*
* const byLength = cmp.comparing((s) => s.length);
* const byValue = cmp.natural;
* const shortlex = first([byLength, byValue]);
* words.sort(shortlex);
*/
function first(cmps) {
return (a, b) => {
for (const cmp of cmps) {
const result = cmp(a, b);
if (result !== 0) return result;
}
return 0;
};
}
/**
* Creates a comparator that treats `null` and `undefined` values as equal to
* each other but greater than all other elements, falling back to `cmp` for
* comparisons between non-nullish values. It is therefore guaranteed that
* `cmp` will only be called with non-nullish inputs.
*/
function nullsLast(cmp = natural) {
return (a, b) => {
if (a == null && b != null) return 1;
if (a != null && b == null) return -1;
if (a == null && b == null) return 0;
return cmp(a, b);
};
}
/**
* Lifts an comparator of elements to a lexicographic comparator of arrays. The
* first index at which the array elements compare unequal, if any, determines
* the order of the arrays. If one array is a strict prefix of the other, the
* shorter array compares smaller.
*/
function array(cmp = natural) {
return (xs, ys) => {
// Loop invariant: at the start of each iteration, `xs[:i]` and `ys[:i]`
// compare equal. (Initially trivially true because `i === 0`.)
for (let i = 0; i < xs.length; i++) {
if (i >= ys.length) {
// `xs[:i]` and `ys` compare equal, but `xs` has more parts, so `xs`
// follows `ys`.
return 1;
}
const result = cmp(xs[i], ys[i]);
if (result !== 0) return result;
}
if (xs.length < ys.length) {
// `xs` and `ys[:xs.length]` compare equal, but `ys` has more parts, so
// `xs` precedes `ys`.
return -1;
}
return 0;
};
}
module.exports = {
natural,
rev,
comparing,
first,
nullsLast,
array,
};
const Cmp = require("./cmp");
describe("util/cmp", () => {
describe("natural", () => {
it("compares strings naturally", () => {
const actual = ["bob", "alice", "cheryl"].sort(Cmp.natural);
expect(actual).toEqual(["alice", "bob", "cheryl"]);
});
it("compares numbers naturally (not lexicographically)", () => {
const actual = [9, 10, 8].sort(Cmp.natural);
expect(actual).toEqual([8, 9, 10]);
});
});
describe("rev", () => {
it("defines a reverse-natural order by default", () => {
const actual = [9, 10, 8].sort(Cmp.rev());
expect(actual).toEqual([10, 9, 8]);
});
it("reverses the order of a given sub-comparator", () => {
const byBidDesc = Cmp.rev(Cmp.comparing((x) => x.bid));
const actual = [{ bid: 9 }, { bid: 10 }, { bid: 8 }].sort(byBidDesc);
expect(actual).toEqual([{ bid: 10 }, { bid: 9 }, { bid: 8 }]);
});
});
describe("comparing", () => {
it("accepts a simple accessor as only argument", () => {
const byLength = Cmp.comparing((s) => s.length);
const actual = ["alice", "bob", "cheryl"].sort(byLength);
expect(actual).toEqual(["bob", "alice", "cheryl"]);
});
it("keeps ties stable", () => {
const byLength = Cmp.comparing((s) => s.length);
const actual = ["alice", "bob", "bab", "beb"].sort(byLength);
expect(actual).toEqual(["bob", "bab", "beb", "alice"]);
});
it("accepts an accessor and sub-comparator", () => {
const byRevLength = Cmp.comparing((s) => s.length, Cmp.rev());
const actual = ["alice", "bob", "cheryl"].sort(byRevLength);
expect(actual).toEqual(["cheryl", "alice", "bob"]);
});
});
describe("first", () => {
it("uses the first nonzero comparator result", () => {
const shortlex = Cmp.first([Cmp.comparing((s) => s.length), Cmp.natural]);
const actual = ["bob", "bab", "beb", "jo", "alice"].sort(shortlex);
expect(actual).toEqual(["jo", "bab", "beb", "bob", "alice"]);
});
it("returns zero when all comparators return zero", () => {
const byHookOrByCrook = Cmp.first([
Cmp.comparing((x) => x.hook),
Cmp.comparing((x) => x.crook),
]);
const x = { hook: 1, crook: 2, brook: 3 };
const y = { hook: 1, crook: 2, brook: 5 };
expect(byHookOrByCrook(x, y)).toEqual(0);
});
});
describe("nullsLast", () => {
it("pushes nulls to the end of natural order by default", () => {
const actual = [9, null, 10, 8].sort(Cmp.nullsLast());
expect(actual).toEqual([8, 9, 10, null]);
});
it("accepts a sub-comparator only invoked on non-nulls", () => {
const byLength = Cmp.comparing((s) => s.length);
const actual = ["alice", null, "bob"].sort(Cmp.nullsLast(byLength));
expect(actual).toEqual(["bob", "alice", null]);
});
});
describe("array", () => {
function checkPrecedes(xs, ys, cmp = Cmp.array()) {
expect({ xs, ys, result: cmp(xs, ys) }).toEqual({ xs, ys, result: -1 });
expect({ xs, ys, result: cmp(ys, xs) }).toEqual({ xs, ys, result: 1 });
}
function checkEqual(xs, ys, cmp = Cmp.array()) {
expect({ xs, ys, result: cmp(xs, ys) }).toEqual({ xs, ys, result: 0 });
expect({ xs, ys, result: cmp(ys, xs) }).toEqual({ xs, ys, result: 0 });
}
it("compares empty array shorter than any other", () => {
checkEqual([], []);
checkPrecedes([], [1]);
checkPrecedes([], [-1]);
});
it("honors the first differing element", () => {
checkPrecedes([1, 2, 3], [1, 5, 2]);
});
it("honors strict prefixes", () => {
checkPrecedes([1, 2, 3], [1, 2, 3, 2, 1]);
});
it("honors exact equality", () => {
checkEqual([1, 2, 3], [1, 2, 3]);
});
it("honors exact comparator equality even on object inequality", () => {
checkEqual(
[2, 4, 6],
[3, 5, 7],
Cmp.comparing((x) => Math.floor(x / 2))
);
});
});
});
const RE_UNSIGNED_DECIMAL = /[0-9]+(?:\.[0-9]+)?/;
/**
* Splits a string into pieces that, when concatenated, form the original
* string. Attempts to extract positive and negative integers into their own
* groups, without confusing hyphens for minus signs:
*
* - "1-of-2" maps to ["1", "-of-", "2"]
* - "1-2-3" maps to ["1", "-", "2", "-", "3"]
* - "from -1 to 1" maps to ["from ", "-1", " to ", "1"]
* - "from-1-to--1" maps to ["from-", "1", "-to-", "-1"]
*
* As a special case, we recognize the form "[digits]x[digits]" as three
* tokens:
*
* - "3x3 grid" maps to ["3", "x", "3", " grid"]
* - "Limited-time x2 xp event" maps to ["Limited-time x", "2", " xp event"]
*/
function extractNumbers(s /*: string */) /*: string[] */ {
const result = [];
let i = 0;
while (i < s.length) {
const num = findNextNumber(s, i);
if (num == null) {
result.push(s.slice(i));
break;
}
result.push(s.slice(i, num.index));
result.push(s.slice(num.index, num.index + num.length));
i = num.index + num.length;
}
return result.filter(Boolean);
}
// Finds the next possibly signed decimal that's either (a) not preceded by a
// word character, or (b) preceded by an "x" (as in "3x3 grid"), unless that
// "x" is part of a normal word like "helix" or the start of a "0x..." literal.
function findNextNumber(s, i) {
while (i < s.length) {
const match = s.slice(i).match(RE_UNSIGNED_DECIMAL);
if (match == null) break;
const matchStart = i + match.index;
if (s[matchStart - 1] === "+" || s[matchStart - 1] === "-") {
// Prefer interpreting as signed.
if (mayBeUnsignedDecimal(s, matchStart - 1)) {
return { index: matchStart - 1, length: match[0].length + 1 };
}
}
if (mayBeUnsignedDecimal(s, matchStart)) {
return { index: matchStart, length: match[0].length };
}
i += matchStart + match[0].length;
}
return null;
}
function mayBeUnsignedDecimal(s, i) {
// Numbers may always appear at start of string.
if (i === 0) return true;
// Numbers may be preceded by non-word characters.
if (!s[i - 1].match(/\w/)) return true;
// Numbers preceded by a word character other than an "x" are considered part
// of the preceding word.
if (s[i - 1] !== "x") return false;
// An "x" at the start of the string (as in "x2 multiplier") is okay.
if (i === 1) return true;
// An "x" preceded by a non-digit word character (as in "helix") doesn't
// count as a separator.
if (s[i - 2].match(/[A-Za-z_]/)) return false;
// Otherwise, an "x" is a separator unless it's preceded by a lone "0", which
// we interpret as a "0x1234" hex literal. A longer number ending in zero, as
// in "10x10 grid", is fine.
if (s[i - 2] !== "0") return true;
if (i === 2) return false; // "0x" at start of string
return s[i - 2].match(/[0-9]/);
}
module.exports = extractNumbers;
const extractNumbers = require("./extractNumbers");
describe("util/extractNumbers", () => {
it("handles numbers offset by spaces", () => {
expect(extractNumbers("a 1 b 2 c")).toEqual(["a ", "1", " b ", "2", " c"]);
expect(extractNumbers("7 z 8 9")).toEqual(["7", " z ", "8", " ", "9"]);
});
it("handles numbers set off by hyphens from words", () => {
const input = "the 1-in-100 chance";
const expected = ["the ", "1", "-in-", "100", " chance"];
expect(extractNumbers(input)).toEqual(expected);
});
it("handles numbers set off by hyphens from other numbers", () => {
const input = "easy as 1-2-3";
const expected = ["easy as ", "1", "-", "2", "-", "3"];
expect(extractNumbers(input)).toEqual(expected);
});
it("handles numbers with decimal places", () => {
const input = "foo 1.23 bar 4.56.78";
const expected = ["foo ", "1.23", " bar ", "4.56", ".78"];
});
it("handles numbers with signs", () => {
const input = "from -1 to 0 to +1 and back";
const expected = ["from ", "-1", " to ", "0", " to ", "+1", " and back"];
expect(extractNumbers(input)).toEqual(expected);
});
it("handles numbers with minus signs in hyphenated contexts", () => {
const input = "from--1-to-1-and-back";
const expected = ["from-", "-1", "-to-", "1", "-and-back"];
expect(extractNumbers(input)).toEqual(expected);
});
it('handles adjacent "x"s', () => {
expect(extractNumbers("2x2 grid")).toEqual(["2", "x", "2", " grid"]);
expect(extractNumbers("x3 bonus")).toEqual(["x", "3", " bonus"]);
expect(extractNumbers("4x factor")).toEqual(["4", "x factor"]);
});
it("keeps parts of a 0x-string together", () => {
expect(extractNumbers("0xa0b1")).toEqual(["0", "xa0b1"]);
expect(extractNumbers("0x9f8e")).toEqual(["0", "x9f8e"]);
});
});
const Cmp = require("./cmp");
const extractNumbers = require("./extractNumbers");
function sortAsciinumeric(xs, key = (s) => s) {
const schwartz = xs.map((x) => {
const k = key(x);
if (typeof k !== "string")
throw new Error("key function returned non-string: " + k);
return { key: canonicalize(k), value: x };
});
return schwartz.sort((a, b) => cmp(a.key, b.key)).map((x) => x.value);
}
const GROUP_RE = /^(?:\s+|[0-9.+-]+|[^\s0-9.+-]+)/;
const NUMBERS = (() => {
const raw = [
["none", "zero"],
"one",
"two",
"three",
"four",
"five",
"six",
"seven",
"eight",
"nine",
"ten",
"eleven",
"twelve",
"thirteen",
"fourteen",
"fifteen",
"sixteen",
"seventeen",
"eighteen",
"nineteen",
"twenty",
];
const result = new Map();
for (let i = 0; i < raw.length; i++) {
for (const key of [raw[i]].flat()) {
result.set(key, i);
}
}
return result;
})();
function canonicalize(s) {
const result = [];
const tokens = extractNumbers(s).flatMap((group) => {
// Split each group on hyphens (and whitespace), unless doing so would
// split a negative number from its minus sign.
if (Number.isFinite(Number(group))) {
return [group];
} else {
return group.split(/([\s-]+)/g).filter(Boolean);
}
});
for (const token of tokens) {
if (token.match(/^\s*$/)) continue;
const num = Number(NUMBERS.get(token.toLowerCase()) ?? token);
result.push(Number.isFinite(num) ? num : token);
}
return result;
}
const cmp = Cmp.array((x, y) => {
if (typeof x === "number" && typeof y === "string") return -1;
if (typeof x === "string" && typeof y === "number") return 1;
return Cmp.natural(x, y);
});
function cmpAsciinumeric(x, y) {
return cmp(canonicalize(x), canonicalize(y));
}
module.exports = { sortAsciinumeric, cmpAsciinumeric };
const { sortAsciinumeric, cmpAsciinumeric } = require("./sortAsciinumeric");
describe("util/sortAsciinumeric", () => {
it("handles all-alphabetical strings", () => {
const input = ["Cove", "Archipelago", "Glacier", "Dune"];
const output = sortAsciinumeric(input);
expect(output).toEqual(["Archipelago", "Cove", "Dune", "Glacier"]);
});
it("handles all-numeric strings", () => {
const input = ["23", "9", "100"];
const output = sortAsciinumeric(input);
expect(output).toEqual(["9", "23", "100"]);
});
it("handles mixed alphabetical and numeric strings", () => {
const input = ["1 in 23", "1 in 9", "1 in 100"];
const output = sortAsciinumeric(input);
expect(output).toEqual(["1 in 9", "1 in 23", "1 in 100"]);
});
it("handles mixed alphabetical and numeric hyphenated strings", () => {
const input = ["1-in-23", "1-in-9", "1-in-100"];
const output = sortAsciinumeric(input);
expect(output).toEqual(["1-in-9", "1-in-23", "1-in-100"]);
});
it("treats strings with more parts as larger, all else equal", () => {
const input = ["Left-tilt", "Left", "Right-tilt", "Right"];
const output = sortAsciinumeric(input);
expect(output).toEqual(["Left", "Left-tilt", "Right", "Right-tilt"]);
});
it("handles numeric English strings", () => {
const input = [
"Eight flowers",
"Five flowers",
"Four flowers",
"Nine flowers",
"None",
"One flower",
"Seven flowers",
"Six flowers",
"Ten flowers",
"Three flowers",
"Two flowers",
];
const output = sortAsciinumeric(input);
const expected = [
"None",
"One flower",
"Two flowers",
"Three flowers",
"Four flowers",
"Five flowers",
"Six flowers",
"Seven flowers",
"Eight flowers",
"Nine flowers",
"Ten flowers",
];
expect(output).toEqual(expected);
});
it("handles hyphenated numeric English strings", () => {
const input = [
"eight-flowers",
"five-flowers",
"four-flowers",
"nine-flowers",
"none",
"one-flower",
"seven-flowers",
"six-flowers",
"ten-flowers",
"three-flowers",
"two-flowers",
];
const output = sortAsciinumeric(input);
const expected = [
"none",
"one-flower",
"two-flowers",
"three-flowers",
"four-flowers",
"five-flowers",
"six-flowers",
"seven-flowers",
"eight-flowers",
"nine-flowers",
"ten-flowers",
];
expect(output).toEqual(expected);
});
it('sorts "None" before categorical values', () => {
// This is really a side-effect of treating "None" as a number, but it's
// kind of nice, so let's at least document it.
const input = ["Contracting", "Expanding", "None"];
const output = sortAsciinumeric(input);
expect(output).toEqual(["None", "Contracting", "Expanding"]);
});
it("allows projecting keys", () => {
const input = [
{ value: "1 in 23", id: "a" },
{ value: "1 in 9", id: "b" },
{ value: "1 in 100", id: "c" },
];
const output = sortAsciinumeric(input, (x) => x.value);
expect(output).toEqual([
{ value: "1 in 9", id: "b" },
{ value: "1 in 23", id: "a" },
{ value: "1 in 100", id: "c" },
]);
});
it("reports a nice error if the key extractor fails", () => {
const input = [
{ value: "1 in 23", id: "a" },
{ notValue: "1 in 100", id: "c" },
];
expect(() => sortAsciinumeric(input, (x) => x.value)).toThrow(
"key function returned non-string: undefined"
);
});
it("supports manually using `cmp`", () => {
expect(["four", "two", "three"].sort(cmpAsciinumeric)).toEqual([
"two",
"three",
"four",
]);
});
});
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment