Skip to content

Instantly share code, notes, and snippets.

@JLHwung
Last active March 19, 2021 16:00
Show Gist options
  • Save JLHwung/f5bdfc252ad55e58e2a21e83f7152f47 to your computer and use it in GitHub Desktop.
Save JLHwung/f5bdfc252ad55e58e2a21e83f7152f47 to your computer and use it in GitHub Desktop.

Workflow:

Sort current dictionary in the order of Block > Cangjie code

node ./sort-dict.mjs /path/to/cangjie5.dict.yaml

Merge Cangjie5.txt of https://github.com/Jackchows/Cangjie5 to cangjie5.dict.yaml

node ./merge-cangjie5-txt.mjs /path/to/Cangjie5.txt /path/to/cangjie5.dict.yaml

Check whether cangjie5.dict.yaml covers all the CJK Unified Ideographs

node ./check-coverage.mjs /path/to/Cangjie5.txt /path/to/cangjie5.dict.yaml
import fs from "fs";
import path from "path";
// Two paths are required: process.argv[2] is read as the Cangjie5 TXT file and
// process.argv[3] as the dictionary further down in this script. Fail early
// with the usage text instead of crashing later inside path.resolve(undefined).
if (process.argv.length <= 3) {
  throw new Error(
    `Usage: node ./check-coverage.mjs path/to/cangjie5.txt path/to/cangjie5.dict.yaml`
  );
}
/**
 * Inclusive [first, last] code point ranges that the coverage check walks.
 *
 * The labels follow the Unicode CJK Unified Ideographs block names; the end
 * points look like the last assigned code point of each block as of
 * Unicode 13.0 -- confirm against the current charts before extending.
 */
const ranges = [
  [0x3400, 0x4DBF], // Extension A
  [0x4E00, 0x9FFC], // CJK Unified Ideographs (basic block)
  [0x20000, 0x2A6DD], // Extension B
  [0x2A700, 0x2B734], // Extension C
  [0x2B740, 0x2B81D], // Extension D
  [0x2B820, 0x2CEA1], // Extension E
  [0x2CEB0, 0x2EBE0], // Extension F
  [0x30000, 0x3134A], // Extension G
];
/**
 * Collect every character that has an entry in the dictionary text.
 *
 * Only the part after the last "..." is scanned. A line counts as an entry
 * when it starts with a single code point, followed by whitespace and a
 * lowercase a-z code; the code itself is not needed here.
 *
 * Lines that are not entries (blank lines, a lone "\r" left over from CRLF
 * files, comments, ...) are skipped instead of aborting the whole run.
 *
 * @param {string} cangjie5 - Full text of the dictionary file.
 * @returns {Set<string>} The characters that have at least one entry.
 */
function generateCoveredSet(cangjie5) {
  const sections = cangjie5.split("...");
  const body = sections[sections.length - 1];
  // The `u` flag makes `.` match a whole code point, so supplementary-plane
  // ideographs are captured as one character rather than half a surrogate pair.
  const CJMatchers = /^(?<character>.)\s+(?<cjcode>[a-z]+)/u;
  const covered = new Set();
  for (const line of body.split("\n")) {
    const match = CJMatchers.exec(line);
    if (match === null) continue;
    covered.add(match.groups.character);
  }
  return covered;
}
/**
 * Print one Markdown table row for every code point in the given ranges that
 * is missing from `covered`.
 *
 * Rows have the shape `| U+XXXX | character | suggested code |`, matching the
 * three-column "Codepoint | Character | Suggested mapping" header that this
 * script prints before calling the function. When `cjMap` has no suggestion
 * the last cell is left empty rather than showing "undefined".
 *
 * @param {Set<string>} covered - Characters that already have an entry.
 * @param {Map<string, string>} cjMap - Suggested code per character.
 * @param {Array<[number, number]>} [codepointRanges=ranges] - Inclusive
 *   [first, last] code point ranges to check; defaults to the module-level
 *   `ranges` table.
 * @returns {void}
 */
function checkCoverage(covered, cjMap, codepointRanges = ranges) {
  for (const [first, last] of codepointRanges) {
    for (let cp = first; cp <= last; cp++) {
      const ch = String.fromCodePoint(cp);
      if (covered.has(ch)) continue;
      const codepoint = `U+${cp.toString(16).toUpperCase()}`;
      const suggestion = cjMap.has(ch) ? cjMap.get(ch) : "";
      console.log(`| ${codepoint} | ${ch} | ${suggestion} |`);
    }
  }
}
/**
 * Build a character -> code map from the Cangjie5 TXT text.
 *
 * Only the part after the last line of 30 dashes is scanned. An entry is kept
 * unless its code starts with "x" or its line contains the "[u]" marker.
 * When a character appears on several kept lines, the last one wins.
 *
 * Lines that are not entries (blank lines, a lone "\r" left over from CRLF
 * files, ...) are skipped instead of aborting the whole run.
 *
 * @param {string} cangjie5 - Full text of the Cangjie5 TXT file.
 * @returns {Map<string, string>} Suggested code for each kept character.
 */
function readTXT(cangjie5) {
  const sections = cangjie5.split("------------------------------");
  const body = sections[sections.length - 1];
  // The `u` flag makes `.` match a whole code point, including ideographs
  // outside the Basic Multilingual Plane.
  const CJMatchers = /^(?<character>.)\s+(?<cjcode>[a-z]+)/u;
  const cjMap = new Map();
  for (const line of body.split("\n")) {
    const match = CJMatchers.exec(line);
    if (match === null) continue;
    const { cjcode, character } = match.groups;
    if (cjcode.startsWith("x") || line.includes("[u]")) continue;
    cjMap.set(character, cjcode);
  }
  return cjMap;
}
// Entry point: first CLI argument is the Cangjie5 TXT file, second is the
// dictionary whose coverage is being checked.
const [txtPath, dictPath] = [process.argv[2], process.argv[3]].map((p) =>
  path.resolve(p)
);

// Markdown table header for the report.
console.log("| Codepoint | Character | Suggested mapping |");
console.log("| --- | --- | --- |");

// Read the dictionary first, then the TXT, then report what is missing.
const dictText = fs.readFileSync(dictPath, { encoding: "utf-8" });
const coveredCharacters = generateCoveredSet(dictText);
const txtText = fs.readFileSync(txtPath, { encoding: "utf-8" });
const suggestedCodes = readTXT(txtText);
checkCoverage(coveredCharacters, suggestedCodes);
import fs from "fs";
import path from "path";
// Both the source TXT path and the target dictionary path must be supplied
// (node + script + 2 arguments = at least 4 argv entries).
if (process.argv.length < 4) {
  const usage = `Usage: node ./merge-cangjie5-txt.mjs path/to/cangjie5.txt path/to/cangjie5.dict.yaml`;
  throw new Error(usage);
}
// Module-level record of everything merged so far. mergeTXT appends to both on
// every call; the returned text is built from a per-call list instead, so a
// second call does not re-emit entries from an earlier one.
const extracted = [];
const addedCharacters = new Set();

/**
 * Append selected entries from the Cangjie5 TXT text to the dictionary text.
 *
 * Only the part of the TXT after the last line of 30 dashes is scanned. An
 * entry is appended as "character<TAB>code" when its code point lies in
 * U+2B820..U+2EBE0 or U+30000..U+3134F, its code does not start with "x", and
 * its line does not contain the "[u]" marker.
 *
 * Lines that are not entries (blank lines, a lone "\r" from CRLF files, ...)
 * are skipped instead of aborting. If `dict` does not end with a newline one
 * is inserted first so the first new entry starts on its own line, and when
 * nothing qualifies `dict` is returned unchanged.
 *
 * @param {string} cangjie5 - Full text of the Cangjie5 TXT file.
 * @param {string} dict - Current text of the dictionary file.
 * @returns {string} The dictionary text with the new entries appended.
 */
function mergeTXT(cangjie5, dict) {
  const sections = cangjie5.split("------------------------------");
  const body = sections[sections.length - 1];
  // The `u` flag makes `.` match a whole code point; every target range here
  // is outside the Basic Multilingual Plane.
  const CJMatchers = /^(?<character>.)\s+(?<cjcode>[a-z]+)/u;
  const additions = [];

  for (const line of body.split("\n")) {
    const match = CJMatchers.exec(line);
    if (match === null) continue;
    const { cjcode, character } = match.groups;
    const codepoint = character.codePointAt(0);
    const inTargetRange =
      (codepoint >= 0x2b820 && codepoint <= 0x2ebe0) ||
      (codepoint >= 0x30000 && codepoint <= 0x3134f);
    if (!inTargetRange || cjcode.startsWith("x") || line.includes("[u]")) {
      continue;
    }
    const entry = `${character}\t${cjcode}`;
    additions.push(entry);
    extracted.push(entry);
    addedCharacters.add(character);
  }

  if (additions.length === 0) {
    return dict;
  }
  const separator = dict === "" || dict.endsWith("\n") ? "" : "\n";
  return dict + separator + additions.join("\n") + "\n";
}
// Entry point: first CLI argument is the Cangjie5 TXT file, second is the
// dictionary that gets rewritten in place with the merged result.
const [txtPath, dictPath] = [process.argv[2], process.argv[3]].map((p) =>
  path.resolve(p)
);
const txtText = fs.readFileSync(txtPath, { encoding: "utf-8" });
const dictText = fs.readFileSync(dictPath, { encoding: "utf-8" });
fs.writeFileSync(dictPath, mergeTXT(txtText, dictText));
import fs from "fs";
import path from "path";
// The dictionary path (process.argv[2]) is required. Abort with the usage text
// rather than printing it and then crashing inside path.resolve(undefined).
if (process.argv.length <= 2) {
  throw new Error(`Usage: node ./sort-dict.mjs path/to/cangjie5.dict.yaml`);
}
/**
 * Map a character to the rank of the group it is sorted under.
 *
 * U+3400..U+4DBF (Ext. A) ranks 1, everything else up to U+FFFF ranks 0
 * (Basic + Misc), and higher code points are ranked by the first upper bound
 * they fit under. The two tiers ending at U+2B73F and U+2B81F share rank 3
 * because Ext. C and Ext. D entries are mixed in the dictionary.
 *
 * @param {string} ch - String whose first code point is classified.
 * @returns {number} The rank (0, 1, 2, 3, 5, 6 or 7).
 * @throws {Error} When the code point is above U+3134F.
 */
function block(ch) {
  const cp = ch.codePointAt(0);

  // Ext. A sits inside the BMP, so it has to be singled out before the
  // generic "<= U+FFFF" tier below.
  if (cp >= 0x3400 && cp <= 0x4dbf) {
    return 1;
  }

  // [inclusive upper bound, rank], in ascending order of bound.
  const tiers = [
    [0xffff, 0],
    [0x2a6df, 2],
    [0x2b73f, 3],
    [0x2b81f, 3],
    [0x2ceaf, 5],
    [0x2ebef, 6],
    [0x3134f, 7],
  ];
  const tier = tiers.find(([upperBound]) => cp <= upperBound);
  if (tier === undefined) {
    throw new Error(`Unsupported character: U+${cp.toString(16)} ${ch}`);
  }
  return tier[1];
}
/**
 * Three-way comparison using the `<` and `>` operators.
 *
 * @param {*} x - Left operand.
 * @param {*} y - Right operand.
 * @returns {number} -1 when x < y, 1 when x > y, otherwise 0.
 */
function sign(x, y) {
  return x < y ? -1 : x > y ? 1 : 0;
}
/**
 * Order two dictionary entries: first by the rank `block` gives their
 * characters, then by code.
 *
 * Entries whose characters both rank 0 always compare equal, because that
 * part of the dictionary is sorted by hand and must keep its order.
 *
 * @param {string} cj1 - Code of the first entry.
 * @param {string} cj2 - Code of the second entry.
 * @param {string} ch1 - Character of the first entry.
 * @param {string} ch2 - Character of the second entry.
 * @returns {number} Negative, zero or positive, as for Array.prototype.sort.
 */
function compare(cj1, cj2, ch1, ch2) {
  const rank1 = block(ch1);
  const rank2 = block(ch2);

  const byRank = sign(rank1, rank2);
  if (byRank !== 0) {
    return byRank;
  }
  // Same rank from here on; rank 0 keeps its manual order.
  if (rank1 === 0 && rank2 === 0) {
    return 0;
  }
  return sign(cj1, cj2);
}
/**
 * Sort the entry lines that follow the first "..." in a dictionary text.
 *
 * Everything up to and including the first "..." is copied through untouched.
 * After it, each line that starts with a single code point, whitespace and a
 * lowercase a-z code is an entry. Entries are stably sorted with
 * `compareEntries`; every other line (blank lines, comments, ...) stays at its
 * original line index, so running the function on its own output returns the
 * same text.
 *
 * Entries that `compareEntries` rejects by throwing are reported on stderr and
 * likewise left in place, so one odd character does not abort the sort or
 * corrupt the ordering of the rest.
 *
 * @param {string} text - Full text of the dictionary file.
 * @param {(cj1: string, cj2: string, ch1: string, ch2: string) => number}
 *   [compareEntries=compare] - Ordering of two entries given their codes and
 *   characters; defaults to the module-level `compare`.
 * @returns {string} The text with its entry lines sorted.
 * @throws {Error} When the text contains no "..." separator.
 */
function sortTxt(text, compareEntries = compare) {
  const MARKER = "...";
  const markerAt = text.indexOf(MARKER);
  if (markerAt === -1) {
    throw new Error(
      `Cannot sort: no "${MARKER}" separating the preface from the entries`
    );
  }
  const prefaces = text.slice(0, markerAt);
  // Keep everything after the first marker, so a later "..." loses no data.
  const lines = text.slice(markerAt + MARKER.length).split("\n");
  const CJMatchers = /^(?<character>.)\s+(?<cjcode>[a-z]+)/u;

  // Parse each line once. `slots` remembers which line indexes hold sortable
  // entries; the sorted entries are written back into exactly those indexes.
  const entries = [];
  const slots = [];
  lines.forEach((line, index) => {
    const match = CJMatchers.exec(line);
    if (match === null) {
      if (line.trim() !== "") {
        console.error(`Not an entry, left in place: ${line}`);
      }
      return;
    }
    const { character, cjcode } = match.groups;
    try {
      // Self-comparison probes whether the comparator accepts this entry,
      // so the real sort below never sees one it would throw on.
      compareEntries(cjcode, cjcode, character, character);
    } catch (e) {
      console.error(e);
      console.error(`Cannot order this entry, left in place: ${line}`);
      return;
    }
    entries.push({ line, character, cjcode });
    slots.push(index);
  });

  // Array.prototype.sort is stable, so entries that compare equal keep their
  // existing relative order.
  entries.sort((a, b) =>
    compareEntries(a.cjcode, b.cjcode, a.character, b.character)
  );
  slots.forEach((slot, i) => {
    lines[slot] = entries[i].line;
  });

  // The newline after the marker is still part of `lines`, so it is not
  // added again here (re-adding it would grow a blank line on every run).
  return prefaces + MARKER + lines.join("\n");
}
// Entry point: sort the dictionary named on the command line and write the
// result back to the same file.
const txtPath = path.resolve(process.argv[2]);
const sortedText = sortTxt(fs.readFileSync(txtPath, { encoding: "utf-8" }));
fs.writeFileSync(txtPath, sortedText);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment