Skip to content

Instantly share code, notes, and snippets.

@bpevs
Created October 18, 2024 04:22
Show Gist options
  • Save bpevs/5c3fdfdda4758aaa7698a55864d5d36c to your computer and use it in GitHub Desktop.
Save bpevs/5c3fdfdda4758aaa7698a55864d5d36c to your computer and use it in GitHub Desktop.
Parse Dict
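/**
* @description
* A port of parse-dictd to Deno that depends only on the Deno standard
* library (`@std/streams`). It reads a dictd database (a gzip-compressed
* `.dict.dz` body plus its `.index` file) and writes the extracted entries
* to `dict.json`.
*
* @reference https://github.com/nvdnkpr/parse-dictd
*/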
import { TextLineStream } from "jsr:@std/streams/text-line-stream";
import { toTransformStream } from "jsr:@std/streams/to-transform-stream";
// Alphabet of the base-64 numbers stored in the index file: a symbol's
// position in this string is its digit value ("A" = 0 … "/" = 63).
const az = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
// Symbol → digit value lookup used by decode(). The explicit Record type lets
// string-keyed reads and writes pass strict type checking.
const codes: Record<string, number> = {};
for (let i = 0; i < az.length; i++) codes[az.charAt(i)] = i;
// Dictionary body: gzip-compressed on disk, so inflate it before decoding the
// bytes to text.
const dzFile = await Deno.open("spa-eng/spa-eng.dict.dz");
const dzUtf8Stream = dzFile.readable
  .pipeThrough(new DecompressionStream("gzip"))
  .pipeThrough(new TextDecoderStream());

// Index file: stored uncompressed, so it only needs decoding to text.
const indexFile = await Deno.open("spa-eng/spa-eng.index");
const indexUtf8Stream = indexFile.readable.pipeThrough(new TextDecoderStream());

// Build the word → translations table and persist it as tab-indented JSON.
const r = await parse(dzUtf8Stream, indexUtf8Stream);
const serialized = JSON.stringify(r, null, "\t");
await Deno.writeTextFile("dict.json", serialized);
/**
 * Builds a `headword → translations` table from a dictd database.
 *
 * The index stream is read first to learn at which byte offset of the
 * dictionary body each headword's entry starts. The dictionary body is then
 * scanned line by line while tracking the running byte offset; whenever a
 * line starts at an indexed offset, that line is cleaned up and stored under
 * the corresponding headword.
 *
 * @param dstream Decoded text of the (already decompressed) dictionary body.
 * @param istream Decoded text of the index file: tab-separated lines whose
 *   first field is the headword and whose second field is the base-64 encoded
 *   byte offset. Lines that do not have exactly three fields are ignored.
 * @returns An object mapping each headword to the comma-separated parts of
 *   the line found at its offset. If several index lines share an offset, the
 *   last one wins.
 */
async function parse(
  dstream: ReadableStream<string>,
  istream: ReadableStream<string>,
): Promise<Record<string, string[]>> {
  // Byte offset into the dictionary body → headword starting at that offset.
  const offsets = new Map<number, string>();

  // Pass 1: consume the whole index before touching the dictionary body.
  const indexLines = istream.pipeThrough(new TextLineStream());
  for await (const indexLine of indexLines) {
    const fields = indexLine.trim().split("\t");
    if (fields.length !== 3) continue;
    const offset = decode(fields[1]);
    // decode() yields NaN for symbols outside its alphabet; such an offset
    // can never match a real position, so drop the line.
    if (Number.isNaN(offset)) continue;
    offsets.set(offset, fields[0]);
  }

  // Reused for every line instead of allocating a new encoder each time.
  const encoder = new TextEncoder();
  // Byte offset of the start of the line currently being processed.
  let pos = 0;

  // Pass 2: walk the dictionary body and emit one record per indexed line.
  const entryStream = dstream
    .pipeThrough(new TextLineStream())
    .pipeThrough(
      toTransformStream(async function* (src: ReadableStream<string>) {
        for await (const line of src) {
          const word = offsets.get(pos);
          if (word !== undefined) {
            // Drop everything outside printable ASCII, then a leading
            // "word:" style prefix, and split the remainder on commas.
            const to = line.trim()
              .replace(/[^\x20-\x7E]/g, "")
              .replace(/^\w+:/, "")
              .trim().split(",");
            yield { from: word, to };
          }
          // TextLineStream strips the line terminator, so the byte it
          // occupied has to be added back or every later offset drifts.
          // NOTE(review): assumes single-byte "\n" terminators; a CRLF file
          // would need +2 here.
          pos += encoder.encode(line).length + 1;
        }
      }),
    );

  const results: Record<string, string[]> = {};
  for await (const { from, to } of entryStream) {
    results[from] = to;
  }
  return results;
}
/**
 * Decodes a base-64 number as used in the index file's offset field.
 *
 * Symbols are looked up in the module-level `codes` table and combined
 * most-significant digit first.
 *
 * @param s The encoded number, e.g. `"BA"`.
 * @returns The decoded value (`0` for an empty string), or `NaN` when `s`
 *   contains a symbol that is not in `codes`.
 */
function decode(s: string): number {
  let index = 0;
  for (const symbol of s) {
    const digit = codes[symbol];
    // Unknown symbol: there is no meaningful value, so report NaN explicitly.
    if (digit === undefined) return NaN;
    // Horner's rule: shift the accumulated value one base-64 place, then add
    // the new digit. Avoids a Math.pow call per symbol.
    index = index * 64 + digit;
  }
  return index;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment