Skip to content

Instantly share code, notes, and snippets.

@ochafik
Last active February 7, 2018 22:00
Show Gist options
  • Select an option

  • Save ochafik/3f2b86c80e99f7f01b34a8cad40dd86f to your computer and use it in GitHub Desktop.

Select an option

Save ochafik/3f2b86c80e99f7f01b34a8cad40dd86f to your computer and use it in GitHub Desktop.
Confusables Unicode characters demo
const {parseString} = require('xml2js');
const child_process = require('child_process');
const fs = require('fs')
/**
 * Collects, per glyph reference ID, the distinct character values that are
 * mapped to that glyph, so characters sharing a glyph can be reported as
 * equivalent.
 */
class CollisionMap {
  constructor() {
    // glyph reference ID -> array of distinct character values
    // (stored without a leading "0x").
    this.charsByGlyphRefID = {};
  }

  /**
   * Records that `charValue` maps to glyph `glyphRefID`.
   * Both strings are trimmed and a leading "0x" on the character value is
   * dropped; a character already recorded for the glyph is not added twice.
   */
  addCharMapping(charValue, glyphRefID) {
    let hex = charValue.trim();
    const glyphKey = glyphRefID.trim();
    if (hex.startsWith('0x')) {
      hex = hex.substring(2);
    }
    if (!this.charsByGlyphRefID[glyphKey]) {
      this.charsByGlyphRefID[glyphKey] = [];
    }
    const bucket = this.charsByGlyphRefID[glyphKey];
    if (!bucket.includes(hex)) {
      bucket.push(hex);
    }
  }

  /**
   * For every glyph with more than one character, declares the first recorded
   * character equivalent to each of the others via
   * `charClasses.assertEquivalent(first, other)`.
   *
   * @returns {number} how many equivalences were declared.
   */
  detectCollisions(charClasses) {
    let declared = 0;
    Object.keys(this.charsByGlyphRefID).forEach((glyphKey) => {
      const group = this.charsByGlyphRefID[glyphKey];
      if (group.length <= 1) {
        return;
      }
      const first = group[0];
      const rest = group.slice(1);
      rest.forEach((other) => {
        charClasses.assertEquivalent(first, other);
        declared += 1;
      });
    });
    return declared;
  }
}
/**
 * Renders a hex code point string for display as "<character> (<hex>)".
 */
function formatCodePoint(c) {
  const glyph = decode(c);
  return `${glyph} (${c})`;
}
/**
 * Converts a hexadecimal code point string (e.g. "41") into the character it
 * denotes.
 */
function decode(hexCodePoint) {
  const codePoint = Number.parseInt(hexCodePoint, 16);
  return String.fromCodePoint(codePoint);
}
// Directories whose font files are scanned by the script below.
const dirs = ['/Library/Fonts', '/System/Library/Fonts'];
/**
 * Equivalence classes over characters identified by hex code point strings.
 *
 * `roots` maps every known character to the root of its class; `classes` maps
 * each current root to the sorted list of its members. The root of a class is
 * its smallest member by code point value.
 */
class CharClasses {
  constructor() {
    // root -> sorted array of members (only current roots appear as keys).
    this.classes = {};
    // member -> root of the class it currently belongs to.
    this.roots = {};
  }

  /**
   * Orders two hex code point strings by numeric value.
   *
   * Plain string comparison would put "1F600" before "41"; parsing the hex
   * value first gives real code point order. Strings with the same (or an
   * unparsable) value fall back to string order so the result is a total
   * order.
   */
  static compareCodePoints(a, b) {
    const delta = parseInt(a, 16) - parseInt(b, 16);
    // `delta` is falsy for both 0 and NaN, which is exactly when the
    // string tie-break is needed.
    if (delta) return delta;
    if (a === b) return 0;
    return a < b ? -1 : 1;
  }

  /**
   * Returns the root of the class containing `c`, registering `c` as a new
   * single-member class when it has not been seen before.
   */
  getRoot(c) {
    let root = this.roots[c];
    if (root === undefined) {
      this.roots[c] = root = c;
      this.classes[c] = [c];
    }
    return root;
  }

  /**
   * Declares `a` and `b` equivalent, merging their classes when they differ.
   * The merged class is keyed by the smaller of the two roots and its members
   * are kept in code point order.
   */
  assertEquivalent(a, b) {
    const rootA = this.getRoot(a);
    const rootB = this.getRoot(b);
    if (rootA === rootB) return;

    const merged = [...this.classes[rootA], ...this.classes[rootB]];
    merged.sort(CharClasses.compareCodePoints);
    delete this.classes[rootA];
    delete this.classes[rootB];

    const newRoot =
      CharClasses.compareCodePoints(rootA, rootB) <= 0 ? rootA : rootB;
    for (const sibling of merged) {
      this.roots[sibling] = newRoot;
    }
    this.classes[newRoot] = merged;
  }

  /**
   * Logs one tab-indented line per class, listing its members formatted by
   * `formatCodePoint`, with classes ordered by the code point of their root.
   */
  print() {
    const roots = Object.keys(this.classes).sort(CharClasses.compareCodePoints);
    for (const root of roots) {
      console.log(`\t${this.classes[root].map(formatCodePoint).join(', ')}`);
    }
  }
}
// Font-file base name -> name passed to `ftxdumperfuser --font-name` for
// collection files (.ttc / .dfont). Base names not listed here are used with
// their spaces removed.
const FONT_NAME_OVERRIDES = new Map([
  ['Athelas', 'Athelas Regular'],
  ['Bodoni 72 OS', 'BodoniSvtyTwoOSITCTT-Book'],
  ['Bodoni 72', 'BodoniSvtyTwoITCTT-Book'],
  ['ChalkboardSE', 'ChalkboardSE-Regular'],
  ['Charter', 'Charter-Roman'],
  ['Corsiva', 'CorsivaHebrew'],
  ['EuphemiaCAS', 'EuphemiaUCAS'],
  ['Futura', 'Futura-Medium'],
  ['Hoefler Text', 'Hoefler Text'],
  ['InaiMathi-MN', 'InaiMathi'],
  ['Iowan Old Style', 'Iowan Old Style Roman'],
  ['ITFDevanagari', 'ITFDevanagari-Book'],
  ['Kefa', 'Kefa-Regular'],
  ['Marion', 'Marion-Regular'],
  ['PTMono', 'PTMono-Regular'],
  ['PTSans', 'PTSans-Regular'],
]);

/** Returns the `--font-name` value to use for the font file named `item`. */
function resolveFontName(item) {
  const baseName = item.split('.')[0];
  return FONT_NAME_OVERRIDES.get(baseName) || baseName.replace(/ /g, '');
}

/** Runs `ftxdumperfuser -t cmap` on `file` and returns its raw output. */
function dumpCmap(file, item) {
  const args = ['-t', 'cmap'];
  // Collection files are dumped with an explicit font name.
  if (item.indexOf('.ttc') > 0 || item.indexOf('.dfont') > 0) {
    args.push('--font-name', resolveFontName(item));
  }
  args.push(file);
  // execFileSync passes the arguments without going through a shell, so
  // quotes, `$` or backticks in a font file name cannot break the command.
  return child_process.execFileSync('ftxdumperfuser', args);
}

/**
 * Parses a cmap dump and, for each Unicode subtable, feeds the characters that
 * share a glyph into `charClasses`, logging the number of collisions found.
 */
function recordCollisions(cmapXml, charClasses) {
  parseString(cmapXml, (err, result) => {
    if (err) {
      // Report the parse failure instead of dereferencing a missing result.
      console.log(err);
      return;
    }
    const table = result && result['ft:FTFontTable_cmap'];
    const subTables = (table && table['ft:cmapSubtable']) || [];
    for (const subTable of subTables) {
      const isUnicode =
        subTable.$.platformName == 'Unicode' ||
        subTable.$.scriptName == 'Unicode';
      if (!isUnicode) continue;

      const map = new CollisionMap();
      for (const m of subTable['ft:map'] || []) {
        // Mappings for the replacement character U+FFFD are ignored.
        if (m.$.charValue == '0xFFFD') continue;
        map.addCharMapping(m.$.charValue, m.$.glyphRefID);
      }
      const count = map.detectCollisions(charClasses);
      console.log(`\tcollisions: ${count}`);
    }
  });
}

// Scan every file in the font directories, accumulate the characters that
// share a glyph in at least one font, then print the resulting classes.
const classes = new CharClasses();
for (const dir of dirs) {
  let items;
  try {
    items = fs.readdirSync(dir);
  } catch (e) {
    // An unreadable or missing directory should not abort the whole scan.
    console.log(e);
    continue;
  }
  for (const item of items) {
    console.log(item);
    const file = `${dir}/${item}`;
    try {
      recordCollisions(dumpCmap(file, item), classes);
    } catch (e) {
      // Best effort: log the failure for this file and move on to the next.
      console.log(e);
    }
  }
}
classes.print();
{
"name": "confusion",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "",
"license": "ISC",
"dependencies": {
"opentype.js": "^0.7.3",
"xml2js": "^0.4.19"
}
}
<html>
<head>
<meta charset="UTF-8">
<script>
// Matches combining marks that are stripped when removing accents.
// BMP ranges generated with: https://apps.timwhitlock.info/js/regex#
// The supplementary-plane ranges are written as \u{...} code points inside the
// class: with the `u` flag a lone lead-surrogate escape followed by a class of
// trail surrogates never matches a real surrogate pair in the input.
const nonSpacingMarksRx = /[\u0300-\u036f\u0483-\u0489\u0591-\u05bd\u05bf\u05c1-\u05c2\u05c4-\u05c5\u05c7\u0610-\u061a\u064b-\u065e\u0670\u06d6-\u06dc\u06de-\u06e4\u06e7-\u06e8\u06ea-\u06ed\u0711\u0730-\u074a\u07a6-\u07b0\u07eb-\u07f3\u0901-\u0902\u093c\u0941-\u0948\u094d\u0951-\u0954\u0962-\u0963\u0981\u09bc\u09c1-\u09c4\u09cd\u09e2-\u09e3\u0a01-\u0a02\u0a3c\u0a41-\u0a42\u0a47-\u0a48\u0a4b-\u0a4d\u0a51\u0a70-\u0a71\u0a75\u0a81-\u0a82\u0abc\u0ac1-\u0ac5\u0ac7-\u0ac8\u0acd\u0ae2-\u0ae3\u0b01\u0b3c\u0b3f\u0b41-\u0b44\u0b4d\u0b56\u0b62-\u0b63\u0b82\u0bc0\u0bcd\u0c3e-\u0c40\u0c46-\u0c48\u0c4a-\u0c4d\u0c55-\u0c56\u0c62-\u0c63\u0cbc\u0cbf\u0cc6\u0ccc-\u0ccd\u0ce2-\u0ce3\u0d41-\u0d44\u0d4d\u0d62-\u0d63\u0dca\u0dd2-\u0dd4\u0dd6\u0e31\u0e34-\u0e3a\u0e47-\u0e4e\u0eb1\u0eb4-\u0eb9\u0ebb-\u0ebc\u0ec8-\u0ecd\u0f18-\u0f19\u0f35\u0f37\u0f39\u0f71-\u0f7e\u0f80-\u0f84\u0f86-\u0f87\u0f90-\u0f97\u0f99-\u0fbc\u0fc6\u102d-\u1030\u1032-\u1037\u1039-\u103a\u103d-\u103e\u1058-\u1059\u105e-\u1060\u1071-\u1074\u1082\u1085-\u1086\u108d\u135f\u1712-\u1714\u1732-\u1734\u1752-\u1753\u1772-\u1773\u17b7-\u17bd\u17c6\u17c9-\u17d3\u17dd\u180b-\u180d\u18a9\u1920-\u1922\u1927-\u1928\u1932\u1939-\u193b\u1a17-\u1a18\u1b00-\u1b03\u1b34\u1b36-\u1b3a\u1b3c\u1b42\u1b6b-\u1b73\u1b80-\u1b81\u1ba2-\u1ba5\u1ba8-\u1ba9\u1c2c-\u1c33\u1c36-\u1c37\u1dc0-\u1de6\u1dfe-\u1dff\u20d0-\u20f0\u2de0-\u2dff\u302a-\u302f\u3099-\u309a\ua66f-\ua672\ua67c-\ua67d\ua802\ua806\ua80b\ua825-\ua826\ua8c4\ua926-\ua92d\ua947-\ua951\uaa29-\uaa2e\uaa31-\uaa32\uaa35-\uaa36\uaa43\uaa4c\ufb1e\ufe00-\ufe0f\ufe20-\ufe26\u{101fd}\u{10a01}-\u{10a03}\u{10a05}-\u{10a06}\u{10a0c}-\u{10a0f}\u{10a38}-\u{10a3a}\u{10a3f}\u{1d167}-\u{1d169}\u{1d17b}-\u{1d182}\u{1d185}-\u{1d18b}\u{1d1aa}-\u{1d1ad}\u{1d242}-\u{1d244}\u{e0100}-\u{e01ef}]/ug;
// Lookup table: confusable character -> replacement text.
let confusables = {
}
/**
 * Registers confusable characters in the `confusables` table.
 *
 * @param {string} from - characters to replace; the i-th character of `from`
 *     is mapped to the i-th character of `to`.
 * @param {string} to - replacement characters, aligned with `from`.
 * @param {Object<string, string>} mapping - extra entries (for example
 *     multi-character replacements) copied into the table first, so an entry
 *     for the same character in `from` takes precedence.
 */
function addConfusables(from, to, mapping) {
  Object.assign(confusables, mapping);
  // Split by code point rather than by UTF-16 unit so characters outside the
  // BMP stay whole and the two strings remain aligned.
  const sources = Array.from(from);
  const targets = Array.from(to);
  sources.forEach((source, i) => {
    confusables[source] = targets[i];
  });
}
// Hand-written look-alike tables. In each call the first string lists the
// source characters and the second string, aligned character by character,
// lists the Latin character each one is replaced with; the object holds
// replacements that are longer than one character.
//
// Cyrillic alphabet
// TTF Fonts: http://scripts.sil.org/cms/scripts/page.php?site_id=nrsi&id=iws-chapter08
// https://en.wikipedia.org/wiki/Cyrillic_script
// https://www.lexilogos.com/keyboard/russian.htm
// Problem: Russian speakers might naturally search for the romanization, while
// others will look for similar-looking letters (ambiguities).
addConfusables(
'АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЬьЭэЯяІіѢѣѲѳѴѵ',
'AaBbBBrrAAEeËëXx33NNÑÑKKrrMMHHOonnPpCcTTyyOoXxUuYyWwWwbbbbEeRRIibbOoVv',
{
'Ы': 'bl',
'ы': 'bl',
'Ю': 'IO',
'ю': 'IO',
});
// Greek alphabet
// https://www.lexilogos.com/keyboard/greek_modern.htm
// example: HλLF-LIFE
addConfusables(
'ΑαΆάΒβΓγΔδΕεΈέΖζΗηΉήΘθΙιϊΊίΐΚκΛλΜμΝνΞξΟοΌόΠπΡρΣσςΤτΥυϋΎύΰΦφΧχΨψΩωΏώ·',
'AaAaBBryDdEeEeZZHnHnOOIliIiiKkAAMuNvEEOoOonrPpEocTtYvvYvvOqXxWwOwOw.',
{});
/**
 * Replaces every character of `s` that has an entry in the `confusables`
 * table with its replacement text; other characters are kept unchanged.
 *
 * @param {string} s - text to process.
 * @returns {string} the text with confusable characters replaced.
 */
function deconfuse(s) {
  if (!confusables) return s;
  let result = '';
  // Iterate by code point, not by UTF-16 unit: table keys built with
  // String.fromCodePoint can lie outside the BMP, and a lookup by single
  // UTF-16 unit could never match them.
  for (const c of s) {
    const r = confusables[c];
    result += r == null ? c : r;
  }
  return result;
}
/**
 * Strips accents from `s`: decomposes the text (NFD), removes every character
 * matched by `nonSpacingMarksRx`, then recomposes what is left (NFC).
 * Normalization forms: http://unicode.org/reports/tr15/
 */
function deaccentuate(s) {
  const decomposed = s.normalize('NFD');
  const withoutMarks = decomposed.replace(nonSpacingMarksRx, '');
  return withoutMarks.normalize('NFC');
}
function normalize(s) {
const deconfused = deaccentuate(deconfuse(deaccentuate(s)));
//return text.normalize('NFKD').normalize();
return deconfused.normalize('NFKC').toLowerCase();
}
// On page load: wire the text input to the normalized output, show a sample,
// then extend the confusables table from confusables.txt.
window.addEventListener('load', () => {
  const input = document.getElementById('input');
  const result = document.getElementById('result');

  // Re-renders the normalized form of the current input value.
  function update() {
    result.innerText = normalize(input.value);
  }

  // Converts a hexadecimal code point string into the character it denotes.
  function decode(hexCodePoint) {
    return String.fromCodePoint(parseInt(hexCodePoint, 16));
  }

  input.addEventListener('keypress', update);
  input.addEventListener('input', update);
  input.value = 'hipopótamo maçã pólen ñ poção água língüa Хорошо, ладно, или "ок" (окэй) а́том бато́н ежеви́ка жук';
  update();

  // Data file: https://www.unicode.org/Public/security/10.0.0/confusables.txt
  const xhr = new XMLHttpRequest();
  xhr.open('GET', 'confusables.txt', true);
  xhr.onload = () => {
    if (xhr.readyState !== xhr.DONE || xhr.status !== 200) return;

    for (const rawLine of xhr.responseText.split(/[\r\n]+/g)) {
      const line = rawLine.trim();
      if (line === '' || line.startsWith('#')) continue;
      // Only the first two ';'-separated fields are used: the source code
      // point and the whitespace-separated replacement code points.
      const [from, to] = line.split(';').map(s => s.trim());
      // Skip lines without both fields instead of aborting the whole load.
      if (!from || !to) continue;
      confusables[decode(from)] = to.split(/\s+/g).map(decode).join('');
    }

    // The table changed after the first render, so refresh the output.
    update();
  };
  xhr.send(null);
});
</script>
</head>
<body>
<input type="text" id="input" size="100">
<div id="result"></div>
</body>
</html>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment