Skip to content

Instantly share code, notes, and snippets.

@dead-claudia
Last active June 10, 2018 04:54
Show Gist options
  • Save dead-claudia/747a77a220bc8e45db7afbb4459c0208 to your computer and use it in GitHub Desktop.
Save dead-claudia/747a77a220bc8e45db7afbb4459c0208 to your computer and use it in GitHub Desktop.
Unicode test function generator
#!/usr/bin/env node
"use strict"
// This generates one or more methods that check for a code point's existence
// within a set of code points, but with a few substantial optimizations - I'd
// rather not have a very slow or high-memory runtime test. Few more notes:
//
// - This exports the methods through an ES6 module. If you don't want that,
// you'll need to modify the script to change that.
// - At runtime, it stores all of its data in the smallest integer size it can
// keep the data losslessly uncompressed in. This makes it much smaller to
// iterate, and less data within memory.
// - At runtime, it iterates a contiguously-located, delta-encoded table in a
// very cache-friendly fashion. This keeps the comparison loop pretty simple,
// light, and fast, even though it's technically worst-case linear relative to
// the number of matching code points. (In theory, it could be every other
// code point, but in practice, matching code points are almost always
// either clustered or far apart, and that's why this provides such impressive
// size gains.)
//
// Note that this has a dependency on `unicode-11.0.0`.
//
// Below is the config:
//
// - `output` is the output file to use.
// - `methods` is a mapping of generated method name to its property/category list
//
// To use, you do something like this:
//
// ```js
// import {isIDContinue} from "./generated-categories"
//
// if (isIDContinue(charCode)) {
// doThings()
// }
// ```
//
// Below is the config used if this file is run directly.
// Unicode tables whose code points the default hashtag checker accepts.
const hashtagCategories = [
    "General_Category/Letter",
    "General_Category/Mark",
    "General_Category/Number",
    "General_Category/Modifier_Symbol",
    "General_Category/Other_Symbol",
    "General_Category/Format",
    "General_Category/Surrogate",
]
// Default generator options; `main` falls back to these when it is called
// without an argument.
const config = {
    type: "module",
    // Resolved against this script's directory when the output is opened.
    output: "../src/app/util/category-checkers.mjs",
    // Generated function name -> list of tables it should match.
    methods: {isValidHashtagChar: hashtagCategories},
}
// And here's the meat of the module.
const fs = require("fs")
const path = require("path")
// Promise wrapper around `stream.write`: resolves (with no value) once the
// write callback fires without an error, rejects with the error otherwise.
function write(stream, str) {
    return new Promise((resolve, reject) => {
        const onWritten = err => {
            if (err == null) {
                resolve()
            } else {
                reject(err)
            }
        }
        stream.write(str, onWritten)
    })
}
function loadCodePoints(options, tableList) {
const values = new Set()
for (const name of tableList) {
// eslint-disable-next-line global-require
const list = require(`unicode-11.0.0/${name}/code-points`)
for (const code of list) values.add(code)
}
return values
}
// This is an optimized equivalent of the following:
//
// ```js
// function loadTable(values) {
// const table = Array.from(values).sort((a, b) => a - b)
// const result = []
// let count = 0
// let prev
//
// for (let i = 0; i < table.length; i++) {
// const child = table[i]
//
// if (prev != null && child === prev.end + 1) {
// prev.end = child
// } else {
// prev = table[count++] = {start: child, end: child}
// }
// }
//
// table.length = count
// let acc = 0
//
// for (let i = 0; i < table.length; i++) {
// const entry = table[i]
// const start = entry.start - acc
//
// acc = entry.end
// if (entry.start === entry.end) {
// result.push(-start)
// } else {
// result.push(start - 1)
// result.push(entry.end - entry.start + 1)
// }
// }
//
// return result
// }
// ```
// Appends the delta-encoded form of `values` (an iterable of code points) to
// `result`, in place. Nothing is returned.
//
// Entries are relative to the end of the previous entry (initially 0):
//
// - A lone code point `p` becomes one cell, `-(p - prevEnd)`.
// - A run `start..end` becomes two cells: `start - prevEnd - 1`, then the run
//   length `end - start + 1`.
//
// This is the layout produced by the reference implementation in the comment
// preceding this function. Limitation: code point 0 yields a cell of 0, which
// the sign-based scheme cannot tell apart from a run header.
function loadTable(result, values) {
    // Typed arrays sort numerically by default, so no comparator is needed.
    const sorted = Int32Array.from(values).sort()
    let prevEnd = 0
    // Index in `result` of the current entry's first cell, or -1 before the
    // first entry. Tracking this explicitly keeps a table that starts at code
    // point 1 from "extending" a cell that belongs to an earlier table.
    let headIndex = -1
    let isRun = false
    for (const code of sorted) {
        if (headIndex !== -1 && code === prevEnd + 1) {
            if (isRun) {
                // Already a run: just grow its length cell.
                result[headIndex + 1]++
            } else {
                // Promote the lone entry `-delta` to a run header `delta - 1`
                // and start the length at 2 (the old point plus this one).
                result[headIndex] = -result[headIndex] - 1
                result.push(2)
                isRun = true
            }
        } else {
            headIndex = result.length
            result.push(prevEnd - code)
            isRun = false
        }
        prevEnd = code
    }
}
// Returns the name of the narrowest signed typed-array constructor that can
// hold every cell of `data` without loss: "Int8Array", "Int16Array" or
// "Int32Array". An empty `data` yields "Int8Array".
function getView(data) {
    let view = "Int8Array"
    for (const cell of data) {
        // Cells are signed, so compare against the signed ranges rather than
        // shifting (an unsigned shift makes every negative cell look huge).
        if (cell < -0x8000 || cell > 0x7fff) return "Int32Array"
        if (cell < -0x80 || cell > 0x7f) view = "Int16Array"
    }
    return view
}
// Opens `file` for writing and resolves with the stream once it emits
// "ready"; rejects if the stream emits "error" first.
function openOutput(file) {
    return new Promise((resolve, reject) => {
        const stream = fs.createWriteStream(file, "utf-8")
        function onError(err) {
            stream.removeListener("ready", onReady)
            reject(err)
        }
        function onReady() {
            stream.removeListener("error", onError)
            resolve(stream)
        }
        stream.once("error", onError)
        stream.once("ready", onReady)
    })
}

// Renders the shared part of the generated file: the lint pragma, the packed
// `_table`, and the `_test(code, i, end)` decoder that scans cells `[i, end)`.
//
// The decoder follows the layout of the reference `loadTable` documented in
// this file's comments, tracking `code` relative to the end of the previous
// entry:
//
// - A negative cell `-d` is a lone code point `d` past the previous end, so
//   the code matches when `code - d` is exactly 0 and has been passed when it
//   drops below 0.
// - A non-negative cell `h` is followed by a length `n` and covers the `n`
//   code points starting `h + 1` past the previous end. The code lies before
//   the run when `code - h <= 0`, and inside it when subtracting `n` as well
//   brings it to 0 or below.
function renderRuntime(format, data) {
    const prologue = format({esm: "", js: "\"use strict\"\n"})
    const lintPragma = format({
        ts: "/* tslint:disable */",
        js: "/* eslint-disable */",
    })
    const signature = format({
        ts: "(code: number, i: number, end: number): boolean",
        js: "(code, i, end)",
    })
    return [
        `${prologue}${lintPragma}`,
        `var _table = new ${getView(data)}([${data}])`,
        `function _test${signature} {`,
        "while (i !== end) {",
        "var head = _table[i++]",
        "if (head < 0) {",
        "code += head",
        "if (code <= 0) return code === 0",
        "} else {",
        "code -= head",
        "if (code <= 0) return false",
        "code -= _table[i++]",
        "if (code <= 0) return true",
        "}",
        "}",
        "return false",
        "}",
        "",
    ].join("\n")
}

// Renders one exported checker that delegates to `_test` over the table
// slice `[start, end)`.
function renderMethod(format, method, start, end) {
    const declaration = format({
        ts: `export function ${method}(code: number): boolean`,
        js: `exports.${method} = function (code)`,
        esm: `export function ${method}(code)`,
    })
    return `${declaration} {\nreturn _test(code | 0, ${start}, ${end})\n}\n`
}

// Generates the checker module described by `options` (defaults to `config`)
// and writes it to `options.output`, resolved against this script's
// directory. Resolves once the output stream has been ended; rejects if a
// table cannot be loaded or the file cannot be written.
//
// The output flavour is "ts", "script", or anything else for an ES module.
// It is read from `options.module` when present and from `options.type`
// otherwise; the bundled config sets `type`, which is presumably the intended
// key.
async function main(options = config) {
    const flavour = options.module != null ? options.module : options.type
    // Picks the variant for the selected flavour; `esm` falls back to `js`
    // and `ts` falls back to `esm` when a variant is omitted.
    const format = ({js = "", esm = js, ts = esm} = {}) => {
        switch (flavour) {
        case "ts": return ts
        case "script": return js
        default: return esm
        }
    }

    // Build every table before touching the output file, so a failure while
    // loading code points does not truncate a previously generated module.
    const offsets = []
    const data = []
    for (const method of Object.keys(options.methods)) {
        const offset = data.length
        loadTable(data, loadCodePoints(options, options.methods[method]))
        offsets.push([method, offset, data.length])
    }

    const stream = await openOutput(path.resolve(__dirname, options.output))
    try {
        await write(stream, renderRuntime(format, data))
        for (const [method, start, end] of offsets) {
            await write(stream, renderMethod(format, method, start, end))
        }
    } catch (err) {
        // Release the file handle instead of leaking a half-written stream.
        stream.destroy()
        throw err
    }
    await new Promise((resolve, reject) => {
        stream.end(err => err != null ? reject(err) : resolve())
    })
}
module.exports = main
// When this file is the entry point, run the generator with its default
// options. A failure is printed and reported through the exit code.
if (require.main === module) {
    const reportFailure = err => {
        console.error(err)
        process.exitCode = 1
    }
    main().catch(reportFailure)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment