Last active
June 10, 2018 04:54
-
-
Save dead-claudia/747a77a220bc8e45db7afbb4459c0208 to your computer and use it in GitHub Desktop.
Unicode test function generator
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env node | |
"use strict" | |
// This generates one or more methods that check for a code point's existence | |
// within a set of code points, but with a few substantial optimizations - I'd | |
// rather not have a very slow or high-memory runtime test. Few more notes: | |
// | |
// - This exports the methods through an ES6 module. If you don't want that, | |
// you'll need to modify the script to change that. | |
// - At runtime, it stores all of its data in the smallest integer size it can | |
// keep the data losslessly uncompressed in. This makes it much smaller to | |
// iterate, and less data within memory. | |
// - At runtime, it iterates a contiguously-located, delta-encoded table in a | |
// very cache-friendly fashion. This keeps the comparison loop pretty simple, | |
// light, and fast, even though it's technically worst-case linear relative to | |
// the number of matching code points. (In theory, it could be every other | |
// code point, but in practice, matching code points are almost always | |
// either clustered or far apart, and that's why this provides such impressive | |
// size gains.) | |
// | |
// Note that this has a dependency on `unicode-11.0.0`. | |
// | |
// Below is the config: | |
// | |
// - `output` is the output file to use. | |
// - `methods` is a mapping of generated method name to the list of
//   property/category tables whose code points that method should accept.
// | |
// To use, you do something like this: | |
// | |
// ```js | |
// import {isIDContinue} from "./generated-categories" | |
// | |
// if (isIDContinue(charCode)) { | |
// doThings() | |
// } | |
// ``` | |
// | |
// Below is the config used if this file is run directly. | |
// General_Category tables whose code points are accepted inside a hashtag.
// Each name is expanded below to a `General_Category/<name>` table path.
const hashtagCategories = [
    "Letter",
    "Mark",
    "Number",
    "Modifier_Symbol",
    "Other_Symbol",
    "Format",
    "Surrogate",
]

// Default options, used when `main` is called without arguments.
//
// - `type`: the intended flavor of the generated file.
//   NOTE(review): confirm which key `main` consults when selecting the flavor.
// - `output`: where to write the generated file; `main` resolves it against
//   this script's directory.
// - `methods`: generated method name -> list of table paths to merge.
const config = {
    type: "module",
    output: "../src/app/util/category-checkers.mjs",
    methods: {
        isValidHashtagChar: hashtagCategories.map(
            category => `General_Category/${category}`
        ),
    },
}
// And here's the meat of the module. | |
const fs = require("fs") | |
const path = require("path") | |
/**
 * Promise wrapper around `stream.write`.
 *
 * @param {{write: Function}} stream - Anything exposing `write(chunk, callback)`.
 * @param {string} str - The chunk to write.
 * @returns {Promise<void>} Resolves once the write callback fires without an
 *   error; rejects with the error the callback received otherwise.
 */
function write(stream, str) {
    return new Promise((resolve, reject) => {
        const onWritten = err => {
            if (err != null) {
                reject(err)
            } else {
                resolve()
            }
        }

        stream.write(str, onWritten)
    })
}
/**
 * Merges the code points of every listed `unicode-11.0.0` table into one set.
 *
 * @param {object} options - Generator options (not read by this function).
 * @param {Iterable<string>} tableList - Table paths relative to the
 *   `unicode-11.0.0` package, e.g. `"General_Category/Letter"`.
 * @returns {Set<number>} A fresh set holding the union of all listed tables.
 */
function loadCodePoints(options, tableList) {
    const merged = new Set()
    const addAll = codes => {
        for (const code of codes) merged.add(code)
    }

    for (const tableName of tableList) {
        // eslint-disable-next-line global-require
        addAll(require(`unicode-11.0.0/${tableName}/code-points`))
    }

    return merged
}
// This is an optimized equivalent of the following: | |
// | |
// ```js | |
// function loadTable(values) { | |
// const table = Array.from(values).sort((a, b) => a - b) | |
// const result = [] | |
// let count = 0 | |
// let prev | |
// | |
// for (let i = 0; i < table.length; i++) { | |
// const child = table[i] | |
// | |
// if (prev != null && child === prev.end + 1) { | |
// prev.end = child | |
// } else { | |
// prev = table[count++] = {start: child, end: child} | |
// } | |
// } | |
// | |
// table.length = count | |
// let acc = 0 | |
// | |
// for (let i = 0; i < table.length; i++) { | |
// const entry = table[i] | |
// const start = entry.start - acc | |
// | |
// acc = entry.end | |
// if (entry.start === entry.end) { | |
// result.push(-start) | |
// } else { | |
// result.push(start + 1)
// result.push(entry.end - entry.start + 1) | |
// } | |
// } | |
// | |
// return result | |
// } | |
// ``` | |
/**
 * Appends the delta-encoded form of `values` to `result`.
 *
 * The code points are sorted ascending and grouped into runs of consecutive
 * values. Each run is stored relative to the end of the previous run (0 for
 * the first run):
 *
 * - a single code point `c` becomes one negative entry, `prevEnd - c`;
 * - a run `start..end` becomes two entries, `start - prevEnd + 1` followed by
 *   the run length `end - start + 1`.
 *
 * Negative entries are therefore always singles and non-negative entries are
 * always the first half of a range pair.
 *
 * @param {number[]} result - Output array; entries are appended and existing
 *   entries are never modified.
 * @param {Iterable<number>} values - Non-negative code points, in any order.
 * @returns {void}
 */
function loadTable(result, values) {
    // Typed-array `sort()` compares numerically, so no comparator is needed.
    const table = Int32Array.from(values).sort()
    let prevEnd = 0
    let accumulating = false

    for (let i = 0; i < table.length; i++) {
        const child = table[i]
        const isFirst = i === 0

        // A repeated value adds nothing to the set.
        if (!isFirst && child === prevEnd) continue

        if (!isFirst && child === prevEnd + 1) {
            // `child` extends the run that ends at `prevEnd`.
            const top = result.length - 1

            if (accumulating) {
                result[top]++
            } else {
                // Promote the single `-delta` entry to a `delta + 1` range
                // start and append the length of the new two-element run.
                result[top] = 1 - result[top]
                result.push(2)
                accumulating = true
            }
        } else if (child === prevEnd) {
            // Only reachable when the very first code point is 0: a delta of
            // zero cannot be stored as a negative single, so emit it as a
            // range of length one instead.
            result.push(1, 1)
            accumulating = true
        } else {
            // Start a new single. The first code point always lands here (or
            // in the branch above), never in the "extend" branch, because
            // `prevEnd === 0` is only a sentinel at that point and
            // `result[result.length - 1]` is not an entry of this table.
            result.push(prevEnd - child)
            accumulating = false
        }

        prevEnd = child
    }
}
/**
 * Picks the narrowest signed typed-array constructor that can hold every
 * entry of `data` without loss.
 *
 * @param {ArrayLike<number> & Iterable<number>} data - Integer table entries.
 * @returns {"Int8Array" | "Int16Array" | "Int32Array"} The constructor name,
 *   `"Int8Array"` for an empty table.
 */
function getView(data) {
    let min = 0
    let max = 0

    for (const value of data) {
        if (value < min) {
            min = value
        } else if (value > max) {
            max = value
        }
    }

    if (min >= -0x80 && max <= 0x7f) return "Int8Array"
    if (min >= -0x8000 && max <= 0x7fff) return "Int16Array"
    return "Int32Array"
}
/**
 * Opens `file` for writing.
 *
 * @param {string} file - Absolute path of the file to create.
 * @returns {Promise<fs.WriteStream>} Resolves with the stream once it emits
 *   "ready"; rejects if it emits "error" first.
 */
function openOutput(file) {
    return new Promise((resolve, reject) => {
        const stream = fs.createWriteStream(file, "utf-8")

        function onError(err) {
            stream.removeListener("ready", onReady)
            reject(err)
        }

        function onReady() {
            stream.removeListener("error", onError)
            resolve(stream)
        }

        stream.once("error", onError)
        stream.once("ready", onReady)
    })
}

/**
 * Renders the part of the generated file shared by every method: the packed
 * table and the `_test(code, i, end)` decoder that scans `_table[i..end)`.
 *
 * The decoder mirrors the encoding produced by `loadTable`:
 *
 * - a negative entry is a single code point, stored as minus its distance
 *   from the end of the previous run;
 * - a non-negative entry is a range start, stored as that distance plus one
 *   and followed by the range length.
 *
 * `code` is kept relative to the end of the last run consumed, so it only
 * ever shrinks; once it drops below zero no later run can match.
 */
function renderRuntime(format, data) {
    const strict = format({esm: "", js: "\"use strict\"\n"})
    const lintPragma = format({
        ts: "/* tslint:disable */",
        js: "/* eslint-disable */",
    })
    const signature = format({
        ts: "(code: number, i: number, end: number): boolean",
        js: "(code, i, end)",
    })

    return [
        `${strict}${lintPragma}`,
        `var _table = new ${getView(data)}([${data}])`,
        `function _test${signature} {`,
        // `i < end` rather than `i !== end`, so a malformed table cannot
        // step over `end` and loop forever.
        "    while (i < end) {",
        "        var entry = _table[i++]",
        "        if (entry < 0) {",
        "            code += entry",
        "            if (code === 0) return true",
        "        } else {",
        "            code -= entry - 1",
        "            if (code < 0) return false",
        "            var length = _table[i++]",
        "            if (code < length) return true",
        "            code -= length - 1",
        "        }",
        "        if (code < 0) return false",
        "    }",
        "    return false",
        "}",
        "",
    ].join("\n")
}

/** Renders one exported wrapper that tests `code` against its table slice. */
function renderMethod(format, method, start, end) {
    const head = format({
        ts: `export function ${method}(code: number): boolean`,
        js: `exports.${method} = function (code)`,
        esm: `export function ${method}(code)`,
    })

    return `${head} {\n    return _test(code | 0, ${start}, ${end})\n}\n`
}

/**
 * Generates the category-checker module described by `options`.
 *
 * @param {object} [options=config]
 * @param {string} [options.type] - `"ts"` emits TypeScript, `"script"` emits
 *   a CommonJS script, anything else emits an ES module. The legacy key
 *   `options.module` is consulted when `type` is absent.
 * @param {string} options.output - Output path, resolved against this
 *   script's directory.
 * @param {Object<string, string[]>} options.methods - Generated method name
 *   to the list of tables it should accept.
 * @returns {Promise<void>} Resolves once the file has been written; rejects
 *   if a table cannot be loaded or the file cannot be written.
 */
async function main(options = config) {
    const outputType = options.type != null ? options.type : options.module
    const format = ({js = "", esm = js, ts = esm} = {}) => {
        switch (outputType) {
        case "ts": return ts
        case "script": return js
        default: return esm
        }
    }

    // Build every table before touching the output file: opening the stream
    // truncates the file, so a table that fails to load must fail first.
    const offsets = []
    const data = []

    for (const method of Object.keys(options.methods)) {
        const offset = data.length

        loadTable(data, loadCodePoints(options, options.methods[method]))
        offsets.push([method, offset, data.length])
    }

    const stream = await openOutput(path.resolve(__dirname, options.output))

    try {
        await write(stream, renderRuntime(format, data))

        for (const [method, start, end] of offsets) {
            await write(stream, renderMethod(format, method, start, end))
        }

        await new Promise((resolve, reject) => {
            stream.end(err => err != null ? reject(err) : resolve())
        })
    } catch (err) {
        // Do not leave the file descriptor open when a write fails.
        stream.destroy()
        throw err
    }
}
// Logs a generator failure and flags the process as failed without cutting
// off pending output.
function reportFailure(err) {
    console.error(err)
    process.exitCode = 1
}

// Programmatic entry point: `require(...)(options)`.
module.exports = main

// Command-line entry point: run with the default config when this file is
// executed directly rather than required.
if (module === require.main) {
    main().catch(reportFailure)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment