-
-
Save ochafik/3f2b86c80e99f7f01b34a8cad40dd86f to your computer and use it in GitHub Desktop.
Confusables Unicode characters demo
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| const {parseString} = require('xml2js'); | |
| const child_process = require('child_process'); | |
| const fs = require('fs') | |
/**
 * Groups the code points of one cmap subtable by the glyph they map to, so
 * that code points sharing a glyph (i.e. rendering identically in that font)
 * can be reported as collisions.
 */
class CollisionMap {
  constructor() {
    // Glyph ID -> array of hex code point strings (without any "0x" prefix).
    // Null prototype so that an ID such as "constructor" or "toString" can
    // never resolve to an inherited Object.prototype member.
    this.charsByGlyphRefID = Object.create(null);
  }

  /**
   * Records that a code point is rendered with the given glyph.
   * Duplicate (code point, glyph) pairs are stored only once.
   *
   * @param {string} charValue Hex code point, optionally "0x"-prefixed and
   *     surrounded by whitespace.
   * @param {string} glyphRefID Glyph identifier, optionally surrounded by
   *     whitespace.
   */
  addCharMapping(charValue, glyphRefID) {
    let hex = charValue.trim();
    const id = glyphRefID.trim();
    if (/^0x/i.test(hex)) hex = hex.substring(2);

    let chars = this.charsByGlyphRefID[id];
    if (chars === undefined) {
      chars = [];
      this.charsByGlyphRefID[id] = chars;
    }
    if (!chars.includes(hex)) chars.push(hex);
  }

  /**
   * Reports every glyph shared by more than one code point: the first code
   * point recorded for the glyph is declared equivalent to each of the others.
   *
   * @param {{assertEquivalent: function(string, string)}} charClasses
   *     Receiver of the equivalences.
   * @returns {number} Number of equivalences reported.
   */
  detectCollisions(charClasses) {
    let count = 0;
    for (const chars of Object.values(this.charsByGlyphRefID)) {
      if (chars.length < 2) continue;
      const [first, ...others] = chars;
      for (const other of others) {
        charClasses.assertEquivalent(first, other);
        count++;
      }
    }
    return count;
  }
}
/**
 * Renders a hex code point as the character itself followed by the hex value
 * in parentheses.
 *
 * @param {string} c Hex code point.
 * @returns {string} Human-readable label.
 */
function formatCodePoint(c) {
  const glyph = decode(c);
  return `${glyph} (${c})`;
}
/**
 * Converts a hexadecimal code point into the corresponding string.
 *
 * @param {string} hexCodePoint Code point written in base 16.
 * @returns {string} The character for that code point.
 */
function decode(hexCodePoint) {
  const codePoint = parseInt(hexCodePoint, 16);
  return String.fromCodePoint(codePoint);
}
// Directories whose entries are scanned for font files.
const dirs = ['/Library/Fonts', '/System/Library/Fonts'];
/**
 * Equivalence classes of code points (hex strings). Each class is stored
 * under its root, which is the member with the smallest code point.
 */
class CharClasses {
  constructor() {
    // root -> sorted members of the class. Null prototype: keys are data.
    this.classes = Object.create(null);
    // member -> root of the class it currently belongs to.
    this.roots = Object.create(null);
  }

  /**
   * Orders two hex code point strings by numeric value. Plain string
   * comparison would put '10000' before 'FF'. Falls back to string order
   * when the values are numerically equal or not parseable as hex.
   *
   * @param {string} a
   * @param {string} b
   * @returns {number} Negative, zero or positive, as for Array.prototype.sort.
   */
  static compareCodePoints(a, b) {
    const delta = Number.parseInt(a, 16) - Number.parseInt(b, 16);
    if (Number.isNaN(delta) || delta === 0) {
      if (a < b) return -1;
      return a > b ? 1 : 0;
    }
    return delta;
  }

  /**
   * Returns the root of the class containing `c`, registering `c` as a
   * singleton class first if it has not been seen before.
   *
   * @param {string} c Hex code point.
   * @returns {string} Root of the class.
   */
  getRoot(c) {
    let root = this.roots[c];
    if (root === undefined) {
      root = c;
      this.roots[c] = root;
      this.classes[c] = [c];
    }
    return root;
  }

  /**
   * Merges the classes of `a` and `b`. No-op when they already share a root.
   *
   * @param {string} a Hex code point.
   * @param {string} b Hex code point.
   */
  assertEquivalent(a, b) {
    const rootA = this.getRoot(a);
    const rootB = this.getRoot(b);
    if (rootA === rootB) return;

    const merged = [...this.classes[rootA], ...this.classes[rootB]];
    merged.sort(CharClasses.compareCodePoints);
    delete this.classes[rootA];
    delete this.classes[rootB];

    const newRoot =
        CharClasses.compareCodePoints(rootA, rootB) <= 0 ? rootA : rootB;
    for (const sibling of merged) {
      this.roots[sibling] = newRoot;
    }
    this.classes[newRoot] = merged;
  }

  /** Logs every class on its own tab-indented line. */
  print() {
    for (const root of Object.keys(this.classes)) {
      console.log(`\t${this.classes[root].map(formatCodePoint).join(', ')}`);
    }
  }
}
// Accumulates, across every scanned font, the classes of code points that
// share a glyph in at least one font.
const classes = new CharClasses();

// Font file base name -> font name passed to `ftxdumperfuser --font-name`
// for .ttc/.dfont files. Base names not listed here are used with their
// spaces removed.
const fontNameOverrides = new Map([
  ['Athelas', 'Athelas Regular'],
  ['Bodoni 72 OS', 'BodoniSvtyTwoOSITCTT-Book'],
  ['Bodoni 72', 'BodoniSvtyTwoITCTT-Book'],
  ['ChalkboardSE', 'ChalkboardSE-Regular'],
  ['Charter', 'Charter-Roman'],
  ['Corsiva', 'CorsivaHebrew'],
  ['EuphemiaCAS', 'EuphemiaUCAS'],
  ['Futura', 'Futura-Medium'],
  ['Hoefler Text', 'Hoefler Text'],
  ['InaiMathi-MN', 'InaiMathi'],
  ['Iowan Old Style', 'Iowan Old Style Roman'],
  ['ITFDevanagari', 'ITFDevanagari-Book'],
  ['Kefa', 'Kefa-Regular'],
  ['Marion', 'Marion-Regular'],
  ['PTMono', 'PTMono-Regular'],
  ['PTSans', 'PTSans-Regular'],
]);

for (const dir of dirs) {
  let items;
  try {
    items = fs.readdirSync(dir);
  } catch (e) {
    // A missing or unreadable directory should not abort the whole scan.
    console.log(e);
    continue;
  }

  for (const item of items) {
    console.log(item);
    const file = `${dir}/${item}`;
    try {
      const baseName = item.split('.')[0];
      const fontName =
          fontNameOverrides.get(baseName) || baseName.replace(/ /g, '');
      const needsFontName =
          item.indexOf('.ttc') > 0 || item.indexOf('.dfont') > 0;
      // Arguments are passed as an array (no shell), so file and font names
      // containing quotes, `$` or backticks cannot alter the command.
      const args = needsFontName
          ? ['-t', 'cmap', '--font-name', fontName, file]
          : ['-t', 'cmap', file];
      const out = child_process.execFileSync('ftxdumperfuser', args);

      parseString(out, (err, result) => {
        if (err) {
          console.log(err);
          return;
        }
        const table = result && result['ft:FTFontTable_cmap'];
        const subTables = (table && table['ft:cmapSubtable']) || [];
        for (const subTable of subTables) {
          const isUnicode = subTable.$.platformName === 'Unicode' ||
              subTable.$.scriptName === 'Unicode';
          if (!isUnicode) continue;

          const map = new CollisionMap();
          for (const m of subTable['ft:map'] || []) {
            // Skip U+FFFD entries so they are never reported as collisions.
            if (m.$.charValue === '0xFFFD') continue;
            map.addCharMapping(m.$.charValue, m.$.glyphRefID);
          }
          const count = map.detectCollisions(classes);
          console.log(`\tcollisions: ${count}`);
        }
      });
    } catch (e) {
      // Best effort: log the failure for this entry and move on to the next.
      console.log(e);
    }
  }
}
classes.print();
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "name": "confusion", | |
| "version": "1.0.0", | |
| "description": "", | |
| "main": "index.js", | |
| "scripts": { | |
| "test": "echo \"Error: no test specified\" && exit 1" | |
| }, | |
| "author": "", | |
| "license": "ISC", | |
| "dependencies": { | |
| "opentype.js": "^0.7.3", | |
| "xml2js": "^0.4.19" | |
| } | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <html> | |
| <head> | |
| <meta charset="UTF-8"> | |
| <script> | |
| // Generated with: https://apps.timwhitlock.info/js/regex# | |
// Matches non-spacing (combining) marks. Supplementary-plane marks are written
// as \u{...} code point escapes inside the class: with the `u` flag a lone
// lead-surrogate escape followed by a class of trail surrogates never matches
// a well-formed surrogate pair.
const nonSpacingMarksRx = /[\u0300-\u036f\u0483-\u0489\u0591-\u05bd\u05bf\u05c1-\u05c2\u05c4-\u05c5\u05c7\u0610-\u061a\u064b-\u065e\u0670\u06d6-\u06dc\u06de-\u06e4\u06e7-\u06e8\u06ea-\u06ed\u0711\u0730-\u074a\u07a6-\u07b0\u07eb-\u07f3\u0901-\u0902\u093c\u0941-\u0948\u094d\u0951-\u0954\u0962-\u0963\u0981\u09bc\u09c1-\u09c4\u09cd\u09e2-\u09e3\u0a01-\u0a02\u0a3c\u0a41-\u0a42\u0a47-\u0a48\u0a4b-\u0a4d\u0a51\u0a70-\u0a71\u0a75\u0a81-\u0a82\u0abc\u0ac1-\u0ac5\u0ac7-\u0ac8\u0acd\u0ae2-\u0ae3\u0b01\u0b3c\u0b3f\u0b41-\u0b44\u0b4d\u0b56\u0b62-\u0b63\u0b82\u0bc0\u0bcd\u0c3e-\u0c40\u0c46-\u0c48\u0c4a-\u0c4d\u0c55-\u0c56\u0c62-\u0c63\u0cbc\u0cbf\u0cc6\u0ccc-\u0ccd\u0ce2-\u0ce3\u0d41-\u0d44\u0d4d\u0d62-\u0d63\u0dca\u0dd2-\u0dd4\u0dd6\u0e31\u0e34-\u0e3a\u0e47-\u0e4e\u0eb1\u0eb4-\u0eb9\u0ebb-\u0ebc\u0ec8-\u0ecd\u0f18-\u0f19\u0f35\u0f37\u0f39\u0f71-\u0f7e\u0f80-\u0f84\u0f86-\u0f87\u0f90-\u0f97\u0f99-\u0fbc\u0fc6\u102d-\u1030\u1032-\u1037\u1039-\u103a\u103d-\u103e\u1058-\u1059\u105e-\u1060\u1071-\u1074\u1082\u1085-\u1086\u108d\u135f\u1712-\u1714\u1732-\u1734\u1752-\u1753\u1772-\u1773\u17b7-\u17bd\u17c6\u17c9-\u17d3\u17dd\u180b-\u180d\u18a9\u1920-\u1922\u1927-\u1928\u1932\u1939-\u193b\u1a17-\u1a18\u1b00-\u1b03\u1b34\u1b36-\u1b3a\u1b3c\u1b42\u1b6b-\u1b73\u1b80-\u1b81\u1ba2-\u1ba5\u1ba8-\u1ba9\u1c2c-\u1c33\u1c36-\u1c37\u1dc0-\u1de6\u1dfe-\u1dff\u20d0-\u20f0\u2de0-\u2dff\u302a-\u302f\u3099-\u309a\ua66f-\ua672\ua67c-\ua67d\ua802\ua806\ua80b\ua825-\ua826\ua8c4\ua926-\ua92d\ua947-\ua951\uaa29-\uaa2e\uaa31-\uaa32\uaa35-\uaa36\uaa43\uaa4c\ufb1e\ufe00-\ufe0f\ufe20-\ufe26\u{101fd}\u{10a01}-\u{10a03}\u{10a05}-\u{10a06}\u{10a0c}-\u{10a0f}\u{10a38}-\u{10a3a}\u{10a3f}\u{1d167}-\u{1d169}\u{1d17b}-\u{1d182}\u{1d185}-\u{1d18b}\u{1d1aa}-\u{1d1ad}\u{1d242}-\u{1d244}\u{e0100}-\u{e01ef}]/gu;
// Look-alike character -> replacement string. Declared with `let` and
// null-checked by deconfuse().
let confusables = {};

/**
 * Registers confusable characters in `confusables`.
 *
 * @param {string} from Characters to replace.
 * @param {string} to Replacement characters; the i-th character of `to`
 *     replaces the i-th character of `from`.
 * @param {Object<string, string>=} mapping Extra entries, typically for
 *     characters whose replacement is longer than one character.
 */
function addConfusables(from, to, mapping) {
  Object.assign(confusables, mapping);
  // Spread iterates by code point, so characters outside the BMP are kept
  // whole instead of being split into two surrogate halves.
  const sources = [...from];
  const targets = [...to];
  sources.forEach((source, i) => {
    const target = targets[i];
    // A shorter `to` must not overwrite existing entries with undefined.
    if (target !== undefined) confusables[source] = target;
  });
}
| // TTF Fonts: http://scripts.sil.org/cms/scripts/page.php?site_id=nrsi&id=iws-chapter08 | |
| // https://en.wikipedia.org/wiki/Cyrillic_script | |
| // https://www.lexilogos.com/keyboard/russian.htm | |
| // Problem: russophones might naturally search for romanization, while others will look for similar-ish letters (ambiguities) | |
// Cyrillic letters, each paired by position with the Latin character in the
// second string. Letters that need a two-character replacement are given in
// the explicit mapping.
addConfusables(
    'АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЬьЭэЯяІіѢѣѲѳѴѵ',
    'AaBbBBrrAAEeËëXx33NNÑÑKKrrMMHHOonnPpCcTTyyOoXxUuYyWwWwbbbbEeRRIibbOoVv',
    {'Ы': 'bl', 'ы': 'bl', 'Ю': 'IO', 'ю': 'IO'});
| // Greek alphabet | |
| // https://www.lexilogos.com/keyboard/greek_modern.htm | |
| // example: HλLF-LIFE | |
// Greek letters (plus the ano teleia), each paired by position with the
// Latin character in the second string. No multi-character replacements.
addConfusables(
    'ΑαΆάΒβΓγΔδΕεΈέΖζΗηΉήΘθΙιϊΊίΐΚκΛλΜμΝνΞξΟοΌόΠπΡρΣσςΤτΥυϋΎύΰΦφΧχΨψΩωΏώ·',
    'AaAaBBryDdEeEeZZHnHnOOIliIiiKkAAMuNvEEOoOonrPpEocTtYvvYvvOqXxWwOwOw.',
    {});
/**
 * Replaces every character of `s` that has an entry in `confusables` with
 * its replacement string; other characters are copied unchanged.
 *
 * @param {string} s Text to process.
 * @returns {string} Text with confusable characters replaced.
 */
function deconfuse(s) {
  if (!confusables) return s;
  let result = '';
  // for...of walks the string by code point, so keys made of a surrogate
  // pair (characters outside the BMP) are looked up as a whole.
  for (const c of s) {
    const r = confusables[c];
    result += r == null ? c : r;
  }
  return result;
}
/**
 * Removes accents: decomposes the text (NFD), drops everything matched by
 * `nonSpacingMarksRx`, then recomposes what is left (NFC).
 * See http://unicode.org/reports/tr15/
 *
 * @param {string} s Text to process.
 * @returns {string} Text without the matched marks.
 */
function deaccentuate(s) {
  const decomposed = s.normalize('NFD');
  const withoutMarks = decomposed.replace(nonSpacingMarksRx, '');
  return withoutMarks.normalize('NFC');
}
/**
 * Produces the comparison form of a string: accents are stripped, confusable
 * characters replaced, accents stripped again, and the result is
 * NFKC-normalized and lower-cased.
 *
 * @param {string} s Text to normalize.
 * @returns {string} Normalized text.
 */
function normalize(s) {
  const unaccented = deaccentuate(s);
  const replaced = deconfuse(unaccented);
  const cleaned = deaccentuate(replaced);
  return cleaned.normalize('NFKC').toLowerCase();
}
// On page load: wire the input box to the result area, show a sample text,
// then extend `confusables` from confusables.txt.
window.addEventListener('load', () => {
  const input = document.getElementById('input');
  const result = document.getElementById('result');

  // Re-renders the normalized form of the current input.
  const update = () => {
    result.innerText = normalize(input.value);
  };

  // Hex code point -> string.
  const decode = (hexCodePoint) =>
      String.fromCodePoint(parseInt(hexCodePoint, 16));

  input.addEventListener('keypress', update);
  input.addEventListener('input', update);
  input.value = 'hipopótamo maçã pólen ñ poção água língüa Хорошо, ладно, или "ок" (окэй) а́том бато́н ежеви́ка жук';
  update();

  // https://www.unicode.org/Public/security/10.0.0/confusables.txt
  const xhr = new XMLHttpRequest();
  xhr.open('GET', 'confusables.txt', true);
  xhr.onload = () => {
    if (xhr.readyState !== xhr.DONE || xhr.status !== 200) return;

    for (const rawLine of xhr.responseText.split(/[\r\n]+/g)) {
      const line = rawLine.trim();
      if (line === '' || line.startsWith('#')) continue;
      // Fields are ';'-separated: source code point, then the
      // space-separated target code points.
      const [from, to] = line.split(';').map(s => s.trim());
      // Skip malformed lines instead of aborting the whole load.
      if (!from || !to) continue;
      confusables[decode(from)] = to.split(/\s+/g).map(decode).join('');
    }
    // The table just grew: refresh the result already on screen.
    update();
  };
  xhr.send(null);
});
| </script> | |
| </head> | |
| <body> | |
| <input type="text" id="input" size="100"> | |
| <div id="result"></div> | |
| </body> | |
| </html> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment