Skip to content

Instantly share code, notes, and snippets.

@vojtagrec
Forked from johan/README.md
Created February 17, 2014 12:06
Show Gist options
  • Save vojtagrec/9049436 to your computer and use it in GitHub Desktop.
Save vojtagrec/9049436 to your computer and use it in GitHub Desktop.

Requirements

You need the ttx binary from FontTools to dump the cmap table of a font into an XML file (font.ttx):

> sudo apt-get install fonttools

HOWTO

> ttx -t cmap sorren.eot
Dumping "sorren.eot" to "sorren.ttx"...
Dumping 'cmap' table...
> ln -s sorren.ttx sorren.xml

Open sorren.xml in Chrome. Open devtools, paste the ttx_to_regexp.js code into the Console tab and hit return.

Next, type fontRange() to get the regexp covering all codepoints known to this font (platformID 0 is Unicode). If you're going to paste it elsewhere, you might as well type copy(fontRange()) and avoid any copy-paste errors:

> fontRange()
[\x00\x0d -~\xa0\xad\u2000-\u200a\u2010-\u2014\u202f\u205f\ue000]

Repeat above for all fonts you are interested in.

TODO

To become an international superhero, fork this gist and make a shell-runnable node.js application, font-to-regexp.js, that takes your font file(s) on the command line, invokes ttx on each of them for you, loads the result with jsdom, runs fontRange on it and prints the regexp to stdout, instead of doing the above steps manually. Oh, and brag about it in the comments here, of course, so other people find it too!

/**
 * Builds a regexp character class for the font dump loaded in the current
 * document.
 *
 * Selects every `code` attribute found under the cmap subtables whose
 * platformID is "0", converts each attribute value to a number, and passes
 * the resulting list to regexpify().
 *
 * @returns {string} whatever regexpify() returns for the collected codepoints.
 */
function fontRange() {
  var codeAttributes = $x('//cmap/*[@platformID="0"]/*/@code');
  var codepoints = codeAttributes.map(function (attribute) {
    // Number() is used for the conversion, so hex-prefixed ("0x41") and
    // decimal attribute values are both accepted.
    return Number(attribute.nodeValue);
  });
  return regexpify(codepoints);
}
/**
 * Builds the source text of a regexp character class that matches exactly the
 * given codepoints, collapsing runs of consecutive codepoints into ranges.
 *
 * - The input array is not modified; it is copied before sorting.
 * - Duplicate codepoints are ignored, so merging several cmap subtables does
 *   not produce redundant or overlapping entries.
 * - An empty input yields "[]", a class that matches nothing.
 * - Codepoints above 0xFFFF are written as `\u{...}`; a regexp built from such
 *   output must be created with the `u` flag.
 *
 * @param {number[]} codepoints integer codepoints, in any order.
 * @returns {string} character class source, e.g. "[\\x00 -~\\u2000-\\u200a]".
 */
function regexpify(codepoints) {
  // Renders one codepoint as it should appear inside a character class.
  function character(code) {
    switch (code) {
      // Characters with a special meaning inside [...] must be escaped.
      case 45: return '\\-';
      case 92: return '\\\\';
      case 93: return '\\]';
      case 94: return '\\^';
      case 173: return '\\xad'; // soft hyphen looks like dash; avoid confusion
      default:
        // Printable ASCII and Latin-1 (above the no-break space) stay literal.
        if ((code >= 0x20 && code <= 0x7e) ||
            (code >= 0xa1 && code <= 0xff)) {
          return String.fromCharCode(code);
        }
        var hex = code.toString(16);
        if (code < 0x10) return '\\x0' + hex;
        if (code < 0x100) return '\\x' + hex;
        if (code < 0x1000) return '\\u0' + hex;
        if (code < 0x10000) return '\\u' + hex;
        // "\u" takes exactly four hex digits, so anything beyond the BMP
        // needs the braced form to avoid being read as "\uXXXX" + a literal.
        return '\\u{' + hex + '}';
    }
  }

  // Renders an inclusive [first, last] pair as "a", "ab" or "a-b".
  function rangeify(range) {
    var first = range[0], last = range[1];
    if (last === first) return character(first);
    // A two-element run is no shorter as a range, so list both characters.
    if (last === first + 1) return character(first) + character(last);
    return character(first) + '-' + character(last);
  }

  var sorted = codepoints.concat().sort(function asc(a, b) { return a - b; });
  if (sorted.length === 0) return '[]';

  var ranges = [];
  var start = sorted[0];
  var prev = start;
  for (var i = 1; i < sorted.length; i++) {
    var code = sorted[i];
    if (code === prev) continue; // duplicate of the codepoint just seen
    if (code !== prev + 1) {
      // Gap found: close the current run and start a new one.
      ranges.push([start, prev]);
      start = code;
    }
    prev = code;
  }
  ranges.push([start, prev]);

  return '[' + ranges.map(rangeify).join('') + ']';
}
/**
 * Evaluates an XPath expression and returns the result as a plain value.
 *
 * @param {string} xpath the XPath expression to evaluate.
 * @param {Object} [root] context node for the evaluation. If it has an
 *   `evaluate` method it is also used as the evaluating document; otherwise
 *   its `ownerDocument` is. When omitted, the global `document` is used for
 *   both.
 * @returns {string|number|boolean|Array} the string, number or boolean value
 *   for results of those types; otherwise an array of every item yielded by
 *   the result's iterator.
 */
function $x(xpath, root) {
  var doc;
  if (!root) {
    doc = document;
  } else if (root.evaluate) {
    doc = root;
  } else {
    doc = root.ownerDocument;
  }

  // Result type 0 lets the evaluator choose the natural type for the expression.
  var got = doc.evaluate(xpath, root || doc, null, 0, null);
  var type = got.resultType;

  if (type === got.STRING_TYPE) return got.stringValue;
  if (type === got.NUMBER_TYPE) return got.numberValue;
  if (type === got.BOOLEAN_TYPE) return got.booleanValue;

  // Anything else is drained through the iterator into an array.
  var nodes = [];
  for (var node = got.iterateNext(); node; node = got.iterateNext()) {
    nodes.push(node);
  }
  return nodes;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment