bpevs · October 18, 2024 04:22
diff --git a/parse-dict.ts b/parse-dict.ts
 /**
 * @description
 * This is a modified version of parse-dictd to be used with Deno, using only
 * std dependencies.
 *
 * @reference https://github.com/nvdnkpr/parse-dictd
 */
 import { TextLineStream } from "jsr:@std/streams/text-line-stream";
 import { toTransformStream } from "jsr:@std/streams/to-transform-stream";

 // Digit alphabet used by `decode`: the character at position i stands for the
 // digit value i.
 const az = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
 // Reverse lookup (character -> digit value). The explicit annotation lets the
 // indexed reads and writes type-check under `noImplicitAny`.
 const codes: Record<string, number> = {};
 for (let i = 0; i < az.length; i++) codes[az.charAt(i)] = i;

 // Dictionary body: open the file, gunzip it, and decode the bytes to text.
 const dictFile = await Deno.open("spa-eng/spa-eng.dict.dz");
 const dzUtf8Stream = dictFile.readable
   .pipeThrough(new DecompressionStream("gzip"))
   .pipeThrough(new TextDecoderStream());

 // Index: stored uncompressed, so it only needs decoding to text.
 const indexFile = await Deno.open("spa-eng/spa-eng.index");
 const indexUtf8Stream = indexFile.readable.pipeThrough(new TextDecoderStream());

 // Parse both streams and write the result as tab-indented JSON.
 const r = await parse(dzUtf8Stream, indexUtf8Stream);
 const json = JSON.stringify(r, null, "\t");
 await Deno.writeTextFile("dict.json", json);

 /**
  * Build a headword -> translations map from a dictionary stream and its index.
  *
  * The index is consumed first: every line that has exactly three
  * tab-separated fields registers its first field (the headword) under the
  * number obtained by passing its second field to `decode`. The dictionary is
  * then scanned line by line while tracking the byte offset at which each line
  * starts; a line whose offset was registered by the index is cleaned up and
  * split on commas.
  *
  * @param dstream Decoded text of the dictionary body.
  * @param istream Decoded text of the index.
  * @returns An object mapping each matched headword to the comma-separated
  *   pieces of the line found at its offset.
  */
 async function parse(
   dstream: ReadableStream<string>,
   istream: ReadableStream<string>,
 ): Promise<Record<string, string[]>> {
   // Byte offset of a dictionary line -> headword announced by the index.
   const offsets = new Map<number, string>();

   for await (const entry of istream.pipeThrough(new TextLineStream())) {
     const fields = entry.trim().split("\t");
     if (fields.length !== 3) continue; // not a "word<TAB>offset<TAB>x" line
     offsets.set(decode(fields[1]), fields[0]);
   }

   const results: Record<string, string[]> = {};
   const encoder = new TextEncoder(); // hoisted: one encoder for every line
   let pos = 0;

   for await (const line of dstream.pipeThrough(new TextLineStream())) {
     const word = offsets.get(pos);
     if (word !== undefined) {
       results[word] = line.trim()
         .replace(/[^\x20-\x7E]/g, "") // drop everything but printable ASCII
         .replace(/^\w+:/, "") // drop a leading "label:" prefix
         .trim()
         .split(",");
     }

     // TextLineStream removes the line terminator from each chunk, so it has to
     // be added back or every later offset drifts by one byte per line.
     // Assumes "\n" terminators; a "\r\n" file would need 2 here.
     pos += encoder.encode(line).length + 1;
   }

   return results;
 }

 /**
  * Interpret `s` as a big-endian base-64 number, taking each character's digit
  * value from the module-level `codes` table.
  *
  * @param s Digits, most significant first.
  * @returns The decoded number; `0` for an empty string, and `NaN` when `s`
  *   contains a character that has no entry in `codes`.
  */
 function decode(s: string): number {
   // Viewed through an index signature so a missing digit is visible to the
   // type checker as `undefined`.
   const table: Record<string, number | undefined> = codes;
   let value = 0;
   for (const ch of s) {
     const digit = table[ch];
     if (digit === undefined) return NaN;
     // Horner's rule: shift the accumulated digits up one place, add the next.
     value = value * 64 + digit;
   }
   return value;
 }
	/**
	* @description
	* This is a modified version of parse-dictd to be used with Deno, using only
	* std dependencies.
	*
	* @reference https://github.com/nvdnkpr/parse-dictd
	*/
	import { TextLineStream } from "jsr:@std/streams/text-line-stream";
	import { toTransformStream } from "jsr:@std/streams/to-transform-stream";

	// Digit alphabet used by `decode`: the character at position i stands for the
	// digit value i.
	const az = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
	// Reverse lookup (character -> digit value). The explicit annotation lets the
	// indexed reads and writes type-check under `noImplicitAny`.
	const codes: Record<string, number> = {};
	for (let i = 0; i < az.length; i++) codes[az.charAt(i)] = i;

	// Dictionary body: open the file, gunzip it, and decode the bytes to text.
	const dictFile = await Deno.open("spa-eng/spa-eng.dict.dz");
	const dzUtf8Stream = dictFile.readable
		.pipeThrough(new DecompressionStream("gzip"))
		.pipeThrough(new TextDecoderStream());

	// Index: stored uncompressed, so it only needs decoding to text.
	const indexFile = await Deno.open("spa-eng/spa-eng.index");
	const indexUtf8Stream = indexFile.readable.pipeThrough(new TextDecoderStream());

	// Parse both streams and write the result as tab-indented JSON.
	const r = await parse(dzUtf8Stream, indexUtf8Stream);
	const json = JSON.stringify(r, null, "\t");
	await Deno.writeTextFile("dict.json", json);

	/**
	 * Build a headword -> translations map from a dictionary stream and its index.
	 *
	 * The index is consumed first: every line that has exactly three
	 * tab-separated fields registers its first field (the headword) under the
	 * number obtained by passing its second field to `decode`. The dictionary is
	 * then scanned line by line while tracking the byte offset at which each line
	 * starts; a line whose offset was registered by the index is cleaned up and
	 * split on commas.
	 *
	 * @param dstream Decoded text of the dictionary body.
	 * @param istream Decoded text of the index.
	 * @returns An object mapping each matched headword to the comma-separated
	 *   pieces of the line found at its offset.
	 */
	async function parse(
		dstream: ReadableStream<string>,
		istream: ReadableStream<string>,
	): Promise<Record<string, string[]>> {
		// Byte offset of a dictionary line -> headword announced by the index.
		const offsets = new Map<number, string>();

		for await (const entry of istream.pipeThrough(new TextLineStream())) {
			const fields = entry.trim().split("\t");
			if (fields.length !== 3) continue; // not a "word<TAB>offset<TAB>x" line
			offsets.set(decode(fields[1]), fields[0]);
		}

		const results: Record<string, string[]> = {};
		const encoder = new TextEncoder(); // hoisted: one encoder for every line
		let pos = 0;

		for await (const line of dstream.pipeThrough(new TextLineStream())) {
			const word = offsets.get(pos);
			if (word !== undefined) {
				results[word] = line.trim()
					.replace(/[^\x20-\x7E]/g, "") // drop everything but printable ASCII
					.replace(/^\w+:/, "") // drop a leading "label:" prefix
					.trim()
					.split(",");
			}

			// TextLineStream removes the line terminator from each chunk, so it has to
			// be added back or every later offset drifts by one byte per line.
			// Assumes "\n" terminators; a "\r\n" file would need 2 here.
			pos += encoder.encode(line).length + 1;
		}

		return results;
	}

	/**
	 * Interpret `s` as a big-endian base-64 number, taking each character's digit
	 * value from the module-level `codes` table.
	 *
	 * @param s Digits, most significant first.
	 * @returns The decoded number; `0` for an empty string, and `NaN` when `s`
	 *   contains a character that has no entry in `codes`.
	 */
	function decode(s: string): number {
		// Viewed through an index signature so a missing digit is visible to the
		// type checker as `undefined`.
		const table: Record<string, number | undefined> = codes;
		let value = 0;
		for (const ch of s) {
			const digit = table[ch];
			if (digit === undefined) return NaN;
			// Horner's rule: shift the accumulated digits up one place, add the next.
			value = value * 64 + digit;
		}
		return value;
	}