Last active
October 24, 2017 18:57
-
-
Save glebm/2749c75b4fc4fed4dc5911925bb8f8b9 to your computer and use it in GitHub Desktop.
A quick hack to generate a codepoint set presence condition (JS)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const regenerate = require('regenerate');

// SETS lists the code-point collections that main() unions together.
// Exactly one of the three definitions below should be active at a time;
// the commented-out ones are alternative predicates kept for reference.
// (Presumably each required module exports an array of code points —
// confirm against the unicode-10.0.0 package.)

/* char-graphic? */
const SETS = ['Letter', 'Mark', 'Number', 'Punctuation', 'Symbol'].map((category) => {
  return require(`unicode-10.0.0/General_Category/${category}/code-points.js`);
});

/* char-blank? */
// const SETS = [['\t']].concat(['Space_Separator'].map(category =>
//   require(`unicode-10.0.0/General_Category/${category}/code-points.js`)));

/* char-whitespace? */
// const SETS = ['White_Space'].map(property =>
//   require(`unicode-10.0.0/Binary_Property/${property}/code-points.js`))
/**
 * Entry point: unions every entry of SETS into one regenerate set and
 * prints the generated membership condition to stdout.
 */
function main() {
  const combined = regenerate();
  SETS.forEach((codePoints) => {
    combined.add(codePoints);
  });

  const codepoints = combined.valueOf();
  console.log(toIf(codepoints));
  //console.log(toRacketRanges(codepoints));
}
/** Name of the variable that the generated condition tests. */
const VAR_NAME = 'c';
/**
 * Renders an integer as the shorter of its decimal and `0x…` hexadecimal
 * source-code spellings. On a tie the hexadecimal form is chosen.
 *
 * The sign is emitted in front of the `0x` prefix so that negative values
 * never produce the invalid literal `0x-…`.
 *
 * @param {number} a - Integer to render.
 * @returns {string} Numeric literal text for `a`.
 */
function shortestInt(a) {
  const sign = a < 0 ? '-' : '';
  const magnitude = Math.abs(a);
  const decimal = `${sign}${magnitude.toString()}`;
  const hex = `${sign}0x${magnitude.toString(16).toUpperCase()}`;
  return decimal.length < hex.length ? decimal : hex;
}
/**
 * Builds the condition text for one inclusive range [a, b].
 *
 * A single-value range becomes an equality test; otherwise the range is
 * expressed with strict comparisons against a - 1 and b + 1.
 *
 * @param {number} a - First value of the range (inclusive).
 * @param {number} b - Last value of the range (inclusive).
 * @returns {string} Condition text over VAR_NAME.
 */
function printOneRange(a, b) {
  if (a === b) {
    return `${VAR_NAME} === ${shortestInt(a)}`;
  }
  const below = shortestInt(a - 1);
  const above = shortestInt(b + 1);
  return `${VAR_NAME} > ${below} && ${VAR_NAME} < ${above}`;
}
/**
 * Builds the condition text that excludes the given code points.
 *
 * When no two of the code points are adjacent, the result is a chain of
 * `!==` tests; otherwise the code points are grouped into ranges and the
 * result is the negation of that range condition.
 *
 * The input is sorted numerically on a copy: the default Array sort
 * compares elements as strings (so 10 would sort before 9), and sorting
 * in place would mutate the caller's array.
 *
 * @param {number[]} cps - Code points to exclude.
 * @returns {string} Condition text over VAR_NAME.
 */
function printExcepts(cps) {
  const sorted = [...cps].sort((a, b) => a - b);
  const ranges = genRanges(sorted);
  if (ranges.length === sorted.length) {
    // Every range is a single value, so individual inequality tests are simplest.
    return sorted.map(cp => `${VAR_NAME} !== ${shortestInt(cp)}`).join(' && ');
  }
  return `!(${printRanges(ranges)})`;
}
/**
 * Groups a list of code points into inclusive [first, last] ranges.
 *
 * A value extends the current range only when it is exactly one greater
 * than the value before it; anything else closes the range and starts a
 * new one. Input is therefore expected in ascending order (the callers in
 * this file pass sorted lists).
 *
 * @param {number[]} codepoints - Code points to group.
 * @returns {Array<[number, number]>} Ranges in input order; empty for empty input.
 */
function genRanges(codepoints) {
  const result = [];
  if (codepoints.length === 0) {
    // Without this guard an empty list would yield [[undefined, undefined]].
    return result;
  }

  let start = codepoints[0];
  let prev = start;
  for (let i = 1; i < codepoints.length; i++) {
    const v = codepoints[i];
    if (v !== prev + 1) {
      result.push([start, prev]);
      start = v;
    }
    prev = v;
  }
  // Close the range that was still open when the input ran out.
  result.push([start, prev]);
  return result;
}
/**
 * Builds the full condition text for a list of inclusive ranges, joined
 * with `||`.
 *
 * Neighbouring ranges separated by a small gap are merged into one wider
 * range plus an exclusion clause for the missing values. A gap of one is
 * always merged; a gap of two is merged only when both neighbouring ranges
 * span more than one value.
 *
 * @param {Array<[number, number]>} ranges - Inclusive ranges, ascending.
 * @returns {string} Condition text over VAR_NAME; `false` for an empty list.
 */
function printRanges(ranges) {
  if (ranges.length === 0) {
    // An empty set matches nothing; previously this case threw a TypeError.
    return 'false';
  }

  const result = [];
  let excepts = [];
  let start = null;

  // Emits the pending merged run, which ends at `range`, and resets the state.
  const flush = (range) => {
    if (excepts.length === 0) {
      result.push(printOneRange(range[0], range[1]));
    } else {
      result.push(`(${printOneRange(start, range[1])} && ${printExcepts(excepts)})`);
    }
    excepts = [];
    start = null;
  };

  let prevRange = ranges[0];
  for (let i = 1; i < ranges.length; i++) {
    const r = ranges[i];
    const numExcepts = r[0] - prevRange[1] - 1;
    const bothSpanSeveral = r[1] !== r[0] && prevRange[0] !== prevRange[1];
    if (numExcepts === 1 || (numExcepts === 2 && bothSpanSeveral)) {
      // Merge: remember where the run began and record the skipped values.
      if (start === null) {
        start = prevRange[0];
      }
      for (let j = 1; j <= numExcepts; j++) {
        excepts.push(prevRange[1] + j);
      }
    } else {
      flush(prevRange);
    }
    prevRange = r;
  }
  // prevRange is now the last range; emit whatever run it terminates.
  flush(prevRange);
  return result.join(' || ');
}
/**
 * Converts a list of code points into condition text by grouping them
 * into ranges and printing those ranges.
 *
 * @param {number[]} codepoints - Code points to test for.
 * @returns {string} Condition text over VAR_NAME.
 */
function toIf(codepoints) {
  const ranges = genRanges(codepoints);
  return printRanges(ranges);
}
/**
 * Renders code points as space-separated items: a bare number for a
 * single-value range, `(range from to)` for a wider one.
 *
 * NOTE(review): the pairs from genRanges are inclusive on both ends. If the
 * output is fed to Racket's built-in `range`, whose end is exclusive, the
 * last value of each range would be dropped — confirm the intended consumer.
 *
 * @param {number[]} codepoints - Code points to render.
 * @returns {string} Space-separated range items.
 */
function toRacketRanges(codepoints) {
  const items = [];
  for (const [from, to] of genRanges(codepoints)) {
    if (from === to) {
      items.push(from);
    } else {
      items.push(`(range ${from} ${to})`);
    }
  }
  return items.join(' ');
}
// Run the generator when the script is executed.
main();
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Output of the script above for char-whitespace?
c > 8 && c < 14 || c === 32 || c === 133 || c === 160 || c === 5760 ||
c > 8191 && c < 8203 || c > 8231 && c < 8234 || c === 8239 ||
c === 8287 || c === 12288
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment