// uform3 perf test
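// Benchmarks UForm text/image embedding with onnxruntime-web: encodes a text
// prompt and a batch of images, times the resize/flatten/upload/inference
// steps, then reports the text-image cosine similarity.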
import {
  Tensor,
  TextEncoder,
  TextProcessor,
  ImageEncoder,
  env,
} from "./uform-encoder.js";
import imageUrl from "../assets/images/monalisa.png";
const model = "fp32"; // "v3", "fp16" or "fp32"
const inputType = model === "v3" ? "int32" : "int64";
let provider = "webgpu"; // "webgpu" or "cpu"
if (provider !== "webgpu" && model === "fp16") {
  console.warn(
    "WebGPU is required for the fp16 model. Switching to the WebGPU provider."
  );
  provider = "webgpu";
}
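// ONNX model paths for each variant; the directory names suggest the fp16
// export targets GPU execution and the fp32 export targets CPU.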
const modalityPaths = {
  v3: {
    image_encoder: "uform3-image-text-english-small/image_encoder.onnx",
    text_encoder: "uform3-image-text-english-small/text_encoder.onnx",
  },
  fp16: {
    text_encoder: "uform-vl-english-small-gpu-fp16/text_encoder.onnx",
    image_encoder: "uform-vl-english-small-gpu-fp16/image_encoder.onnx",
  },
  fp32: {
    text_encoder: "uform-vl-english-small-cpu-fp32/text_encoder.onnx",
    image_encoder: "uform-vl-english-small-cpu-fp32/image_encoder.onnx",
  },
}[model];
const paths = {
  modalityPaths,
  tokenizerPath: "uform3-image-text-english-small/tokenizer.json",
};
const config = await (
  await fetch("uform3-image-text-english-small/config.json")
).json();
const textProcessor = new TextProcessor(config, paths.tokenizerPath);
await textProcessor.init();
const processedTexts = await textProcessor.process(["mona lisa"]);
const textEncoder = new TextEncoder(
  paths.modalityPaths.text_encoder,
  inputType
);
await textEncoder.init({
  // preferredOutputLocation: "gpu-buffer",
  executionProviders: [provider],
});
const textOutput = await textEncoder.encode(processedTexts);
const textEmbed = await textOutput.embeddings.getData();
await textEncoder.dispose();
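// Release the text session before the image pass so only one encoder holds
// session resources at a time.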
const image = await loadImage(imageUrl);
const imageInputs = Array(16)
  .fill()
  .map(() => image); // simulate multiple images
const imageCount = imageInputs.length;
const imageEncoder = new ImageEncoder(
  config,
  paths.modalityPaths.image_encoder
);
await imageEncoder.init({
  // preferredOutputLocation: "gpu-buffer",
  executionProviders: [provider],
});
const device = env.webgpu.device;
console.log("WebGPU device:", device);
console.time("resize");
const imageDatas = imageEncoder.getResizedImageDatas(imageInputs);
console.timeEnd("resize");
console.time("flatten");
const { count, data } = imageEncoder.flattenImageDatas(imageDatas);
console.timeEnd("flatten");
console.time("tensor");
let tensor;
if (provider === "webgpu") {
  const inputGpuBuffer = await createGPUBuffer({
    device,
    array: new Float32Array(data),
  });
  tensor = Tensor.fromGpuBuffer(inputGpuBuffer, {
    dataType: "float32",
    dims: [count, 3, imageEncoder.imageSize, imageEncoder.imageSize],
  });
} else {
  tensor = new Tensor("float32", data, [
    count,
    3,
    imageEncoder.imageSize,
    imageEncoder.imageSize,
  ]);
}
console.timeEnd("tensor");
console.time("encode1");
const imageOutput = await imageEncoder.encodeFromTensor(tensor);
console.timeEnd("encode1");
// const imageOutput2 = await imageEncoder.encode(imageDatas);
console.time("read");
const f32 = await imageOutput.getData();
console.timeEnd("read");
// console.log(textOutput);
console.log("Cosine Similarity:", cos_sim(textEmbed, f32));
async function loadImage(src) {
  return new Promise((resolve, reject) => {
    const img = new Image();
    img.crossOrigin = "anonymous";
    img.onload = () => resolve(img);
    img.onerror = (err) => reject(err);
    img.src = src;
  });
}
function cos_sim(a, b) {
  const n = a.length; // assume a.length === b.length
  let dot = 0.0;
  let magA = 0.0;
  let magB = 0.0;
  // a plain for-loop keeps this allocation-free and fast for vectors this size
  for (let i = 0; i < n; ++i) {
    const ai = a[i];
    const bi = b[i];
    dot += ai * bi;
    magA += ai * ai;
    magB += bi * bi;
  }
  // dot product divided by the product of the vector norms
  return dot / Math.sqrt(magA * magB);
}
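// Sanity check: cos_sim([1, 0], [1, 0]) === 1 and cos_sim([1, 0], [0, 1]) === 0.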
async function createGPUBuffer({ device, array: floatArray }) {
  // 1. Allocate a GPUBuffer with COPY_DST (so we can upload) and STORAGE (or whichever usage you need)
  const bufferSize = floatArray.byteLength; // N * 4 bytes
  const gpuBuffer = device.createBuffer({
    size: bufferSize,
    usage:
      GPUBufferUsage.STORAGE |
      GPUBufferUsage.COPY_DST |
      GPUBufferUsage.COPY_SRC,
    mappedAtCreation: false,
  });
  // 2. Upload the data via the queue
  device.queue.writeBuffer(
    /* buffer */ gpuBuffer,
    /* bufferOffset */ 0,
    /* data */ floatArray.buffer, // underlying ArrayBuffer
    /* dataOffset */ floatArray.byteOffset,
    /* size */ floatArray.byteLength
  );
  return gpuBuffer;
}
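// ---------------------------------------------------------------------------
// uform-encoder.js: the module imported at the top of the perf test.
// ---------------------------------------------------------------------------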
// Alternative for node.js
// import * as ONNX from 'onnxruntime-node';
// For web:
import * as ONNX from "onnxruntime-web";
import { PreTrainedTokenizer } from "@huggingface/transformers";
const { env, Tensor, InferenceSession } = ONNX;
export { env, Tensor, InferenceSession };
async function grabJSON(url) {
  const resp = await fetch(url);
  return resp.json();
}
export class TextProcessor {
  constructor(config, tokenizerPath) {
    this.config = config;
    this.tokenizerPath = tokenizerPath;
    this.maxSeqLen = 0;
    this.padTokenIdx = 0;
    this.tokenizer = null;
  }
  async init() {
    let config = this.config;
    if (config.text_encoder !== undefined) {
      config = config.text_encoder;
    }
    this.maxSeqLen = config.max_position_embeddings;
    this.padTokenIdx = config.padding_idx;
    const tokenizerConfig = await grabJSON(this.tokenizerPath);
    this.tokenizer = new PreTrainedTokenizer(tokenizerConfig, config);
    this.tokenizer.model_max_length = this.maxSeqLen;
    this.tokenizer.pad_token_id = this.padTokenIdx;
  }
  async process(texts) {
    const encoded = await this.tokenizer(texts, {
      add_special_tokens: true,
      padding: "max_length",
      max_length: this.maxSeqLen,
      truncation: true,
    });
    return {
      input_ids: encoded.input_ids,
      attention_mask: encoded.attention_mask,
    };
  }
}
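// Usage sketch (mirrors the perf test above; variable names are illustrative):
//   const tp = new TextProcessor(config, "uform3-image-text-english-small/tokenizer.json");
//   await tp.init();
//   const { input_ids, attention_mask } = await tp.process(["mona lisa"]);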
export class TextEncoder {
  constructor(modelPath, inputType = "int32") {
    this.modelPath = modelPath;
    this.session = null;
    this.inputType = inputType; // 'int32' or 'int64'
  }
  async init(opts = {}) {
    this.session = await InferenceSession.create(this.modelPath, opts);
  }
  async dispose() {
    if (this.session) {
      await this.session
        .release()
        .catch((error) => console.error("Failed to release session", error));
      this.session = null;
    }
  }
  async encode(inputs) {
    if (!this.session) {
      throw new Error("Session is not initialized.");
    }
    // Helper to validate Int32Array input or convert BigInt64Array (and other
    // array types) to Int32Array
    function ensureInt32Array(data) {
      if (data instanceof Int32Array) {
        return data; // use as-is if already Int32Array
      }
      if (data instanceof BigInt64Array) {
        // convert BigInt64Array to Int32Array, ensuring all values are in range
        return new Int32Array(
          Array.from(data).map((bigInt) => {
            if (bigInt > 2147483647n || bigInt < -2147483648n) {
              throw new Error("Value out of range for Int32.");
            }
            return Number(bigInt); // convert BigInt to Number
          })
        );
      }
      // also handle plain arrays and other typed arrays
      if (
        Array.isArray(data) ||
        data instanceof Uint32Array ||
        data instanceof Uint8Array
      ) {
        return new Int32Array(data); // convert directly
      }
      throw new Error("Unsupported data type for tensor conversion.");
    }
    // Prepare tensor data
    const inputIDsData =
      this.inputType === "int32"
        ? ensureInt32Array(inputs.input_ids.data)
        : inputs.input_ids.data;
    const attentionMaskData = ensureInt32Array(inputs.attention_mask.data);
    const inputIDs = new Tensor(
      this.inputType,
      inputIDsData,
      inputs.input_ids.dims
    );
    const attentionMask = new Tensor(
      "int32",
      attentionMaskData,
      inputs.attention_mask.dims
    );
    // Run model inference, fetching only the "embeddings" output
    return this.session.run(
      {
        input_ids: inputIDs,
        attention_mask: attentionMask,
      },
      ["embeddings"]
    );
  }
}
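// Usage sketch (mirrors the perf test above; `tp` is the TextProcessor sketch):
//   const te = new TextEncoder(paths.modalityPaths.text_encoder, inputType);
//   await te.init({ executionProviders: ["webgpu"] });
//   const { embeddings } = await te.encode(await tp.process(["mona lisa"]));
//   await te.dispose();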
function toValidRGBAImageDatas(images, imageSize) {
  if (!Array.isArray(images)) {
    images = [images];
  }
  return images.map((image) => {
    if (image.width == null || image.height == null || image.data == null) {
      throw new Error("Image must have width, height, and data properties.");
    }
    if (image.width !== imageSize || image.height !== imageSize) {
      throw new Error(
        `Image dimensions must be ${imageSize}x${imageSize}, but got ${image.width}x${image.height}.`
      );
    }
    const channelSize = imageSize * imageSize;
    const pixelCount = image.data.length / 4;
    if (pixelCount !== channelSize) {
      throw new Error(
        `Image pixel count must be ${channelSize}, but got ${pixelCount}.`
      );
    }
    return image.data;
  });
}
export class ImageEncoder {
  constructor(config, modelPath) {
    if (config.image_encoder !== undefined) {
      config = config.image_encoder;
    }
    this.modelPath = modelPath;
    this.session = null;
    this.imageSize = config.image_size;
    this.normalizationMeans = config.normalization_means;
    this.normalizationDeviations = config.normalization_deviations;
    this.imageMean = new Float32Array(this.normalizationMeans);
    this.imageStd = new Float32Array(this.normalizationDeviations);
  }
  getResizedImageDatas(images) {
    if (!Array.isArray(images)) {
      images = [images];
    }
    const size = this.imageSize;
    const offscreen =
      typeof OffscreenCanvas !== "undefined"
        ? new OffscreenCanvas(size, size)
        : (() => {
            const cnv = document.createElement("canvas");
            cnv.width = size;
            cnv.height = size;
            return cnv;
          })();
    const ctx = offscreen.getContext("2d", {
      willReadFrequently: true,
    });
    return images.map((src) => {
      // clear any previous drawing
      ctx.clearRect(0, 0, size, size);
      // draw src into the offscreen canvas, scaling to fit
      ctx.drawImage(src, 0, 0, size, size);
      // pull back the pixel data
      return ctx.getImageData(0, 0, size, size);
    });
  }
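  // Note: drawImage(src, 0, 0, size, size) stretches the source to a square,
  // so non-square inputs are distorted rather than cropped or letterboxed.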
  flattenImageDatas(images) {
    const imageDatas = toValidRGBAImageDatas(images, this.imageSize);
    const N = imageDatas.length;
    const H = this.imageSize,
      W = this.imageSize;
    const C = 3;
    const pixelCount = H * W;
    // allocate one big [N * C * H * W] buffer
    const result = new Float32Array(N * C * pixelCount);
    // for each image in the batch…
    for (let n = 0; n < N; n++) {
      const buffer = imageDatas[n];
      // compute the starting offset for this image
      const base = n * C * pixelCount;
      // reorder HWC -> CHW and normalize; the alpha channel is dropped
      for (let p = 0; p < pixelCount; p++) {
        const r = buffer[p * 4 + 0] / 0xff;
        const g = buffer[p * 4 + 1] / 0xff;
        const b = buffer[p * 4 + 2] / 0xff;
        result[base + p] = (r - this.imageMean[0]) / this.imageStd[0];
        result[base + pixelCount + p] =
          (g - this.imageMean[1]) / this.imageStd[1];
        result[base + 2 * pixelCount + p] =
          (b - this.imageMean[2]) / this.imageStd[2];
      }
    }
    return { count: N, data: result };
  }
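  // Layout example (N = 2, imageSize = 2): the output buffer is
  //   [img0 R x4, img0 G x4, img0 B x4, img1 R x4, img1 G x4, img1 B x4],
  // i.e. NCHW order, matching the [count, 3, imageSize, imageSize] tensor dims.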
  encode(images) {
    const { count: N, data } = this.flattenImageDatas(images);
    const C = 3;
    const H = this.imageSize,
      W = this.imageSize;
    // create an ONNX tensor with shape [N, 3, H, W]
    const imagesTensor = new Tensor("float32", data, [N, C, H, W]);
    // Run model inference
    return this.encodeFromTensor(imagesTensor);
  }
  async encodeFromTensor(tensor) {
    if (!this.session) {
      throw new Error("Session is not initialized.");
    }
    const key = this.session.inputNames[0];
    const opts = {};
    opts[key] = tensor;
    // fetch only the last output (the embeddings)
    const output =
      this.session.outputNames[this.session.outputNames.length - 1];
    const res = await this.session.run(opts, [output]);
    return res[output];
  }
  /**
   * Initializes the ONNX session with the pre-trained model.
   */
  async init(opts = {}) {
    this.session = await InferenceSession.create(this.modelPath, opts);
  }
  /**
   * Releases the ONNX session resources.
   */
  async dispose() {
    if (this.session) {
      await this.session
        .release()
        .catch((error) => console.error("Failed to release session", error));
      this.session = null;
    }
  }
}
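// Usage sketch (mirrors the perf test above; variable names are illustrative):
//   const ie = new ImageEncoder(config, paths.modalityPaths.image_encoder);
//   await ie.init({ executionProviders: ["webgpu"] });
//   const embeddings = await ie.encode(ie.getResizedImageDatas([img]));
//   const f32 = await embeddings.getData(); // Float32Array of embeddings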