Skip to content

Instantly share code, notes, and snippets.

@bpevs
Last active October 18, 2024 04:22
Show Gist options
  • Save bpevs/2722f720e01b41a60139050f7ba6694f to your computer and use it in GitHub Desktop.
Save bpevs/2722f720e01b41a60139050f7ba6694f to your computer and use it in GitHub Desktop.
CC-CEDICT File to JSON
/**
* @description
* This is a simplified version of cedict2json.js by Kevin Yang, modified to
* run with Deno using only std dependencies.
*
* @reference https://github.com/kevb34ns/CEDICT2JSON/blob/master/cedict2json.js
* @reference https://www.mdbg.net/chinese/dictionary?page=cc-cedict
*
* CC-CEDICT is licensed under the Creative Commons Attribution-Share
* Alike 3.0 License (https://creativecommons.org/licenses/by-sa/3.0/).
* You must give proper attribution and license any changes or
* improvements to the data under the same license.
*/
import { TextLineStream } from "jsr:@std/streams/text-line-stream";
import { toTransformStream } from "jsr:@std/streams/to-transform-stream";
/** One dictionary record, as produced by `parseEntry` from a single input line. */
interface Entry {
/** Text before the first space of the line. */
traditional: string;
/** Text between the first and second space of the line. */
simplified: string;
/** Contents of the first `[...]` group on the line, brackets excluded. */
pinyin: string;
/** The slash-delimited definitions of the line, with the slashes removed. */
definitions: string[];
}
// Stream the CC-CEDICT dump: bytes -> decoded text -> individual lines -> parsed entries.
const dictionaryFile = await Deno.open("cedict_ts.u8");
const entryStream = dictionaryFile.readable
  .pipeThrough(new TextDecoderStream())
  .pipeThrough(new TextLineStream())
  .pipeThrough(
    toTransformStream(async function* (lines) {
      for await (const rawLine of lines) {
        // Lines whose first non-whitespace character is "#" are comments.
        if (rawLine.trim().startsWith("#")) continue;

        // parseEntry returns null for lines it cannot parse; those are dropped.
        const parsed = parseEntry(rawLine);
        if (parsed === null) continue;

        yield parsed;
      }
    }),
  );

// Collect every parsed entry in memory, then write them out as tab-indented JSON.
const entryArray: Entry[] = await Array.fromAsync(entryStream);
const serialized = JSON.stringify(entryArray, null, "\t");
await Deno.writeTextFile("cedict.json", serialized);
/**
 * Parses one line of a CC-CEDICT file into an {@link Entry}.
 *
 * Expected line shape: `Traditional Simplified [pin1 yin1] /gloss 1/gloss 2/`.
 *
 * @param entry A single line of the dictionary file.
 * @returns The parsed entry, or `null` (after logging the offending line) when
 * the line lacks the two space-separated headwords, the bracketed pinyin, or
 * at least one non-empty definition.
 */
function parseEntry(entry: string): Entry | null {
  const firstSpace = entry.indexOf(" ");
  const secondSpace = entry.indexOf(" ", firstSpace + 1);
  const bracketsMatch = entry.match(/\[(.*?)\]/);

  // Definitions are looked up only after the pinyin group, so a slash in the
  // headwords or pinyin can never be mistaken for a definition delimiter.
  const afterPinyin = bracketsMatch === null
    ? 0
    : (bracketsMatch.index ?? 0) + bracketsMatch[0].length;
  const firstSlash = entry.indexOf("/", afterPinyin);
  const lastSlash = entry.lastIndexOf("/");

  // Adjacent definitions share a single "/" delimiter. A global
  // `/\/(.*?)\//g` match consumes that shared slash and therefore skips every
  // second definition; splitting the delimited span keeps all of them.
  const definitions = firstSlash >= 0 && lastSlash > firstSlash
    ? entry
      .substring(firstSlash + 1, lastSlash)
      .split("/")
      .filter((def) => def !== "")
    : [];

  if (
    firstSpace <= 0 ||
    secondSpace <= 0 ||
    bracketsMatch === null ||
    definitions.length === 0
  ) {
    console.log("Invalid entry: " + entry);
    return null;
  }

  return {
    traditional: entry.substring(0, firstSpace),
    simplified: entry.substring(firstSpace + 1, secondSpace),
    pinyin: bracketsMatch[1],
    definitions,
  };
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment