// uform3 perf test
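// Benchmarks UForm text/image embedding with onnxruntime-web: encodes a text
// prompt and a batch of images, times the resize/flatten/upload/inference
// steps, then reports the text-image cosine similarity.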
import {
  Tensor,
  TextEncoder,
  TextProcessor,
  ImageEncoder,
  env,
} from "./uform-encoder.js";
import imageUrl from "../assets/images/monalisa.png";
const model = "fp32"; // "v3", "fp16" or "fp32"
const inputType = model === "v3" ? "int32" : "int64";
let provider = "webgpu"; // "webgpu" or "cpu"
if (provider !== "webgpu" && model === "fp16") {
  console.warn(
    "WebGPU is required for the fp16 model. Switching to the WebGPU provider."
  );
  provider = "webgpu";
}
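// ONNX model paths for each variant; the directory names suggest the fp16
// export targets GPU execution and the fp32 export targets CPU.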
const modalityPaths = {
  v3: {
    image_encoder: "uform3-image-text-english-small/image_encoder.onnx",
    text_encoder: "uform3-image-text-english-small/text_encoder.onnx",
  },
  fp16: {
    text_encoder: "uform-vl-english-small-gpu-fp16/text_encoder.onnx",
    image_encoder: "uform-vl-english-small-gpu-fp16/image_encoder.onnx",
  },
  fp32: {
    text_encoder: "uform-vl-english-small-cpu-fp32/text_encoder.onnx",
    image_encoder: "uform-vl-english-small-cpu-fp32/image_encoder.onnx",
  },
}[model];
const paths = {
  modalityPaths,
  tokenizerPath: "uform3-image-text-english-small/tokenizer.json",
};
const config = await (
  await fetch("uform3-image-text-english-small/config.json")
).json();
const textProcessor = new TextProcessor(config, paths.tokenizerPath);
await textProcessor.init();
const processedTexts = await textProcessor.process(["mona lisa"]);
const textEncoder = new TextEncoder(
  paths.modalityPaths.text_encoder,
  inputType
);
await textEncoder.init({
  // preferredOutputLocation: "gpu-buffer",
  executionProviders: [provider],
});
const textOutput = await textEncoder.encode(processedTexts);
const textEmbed = await textOutput.embeddings.getData();
await textEncoder.dispose();
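// Release the text session before the image pass so only one encoder holds
// session resources at a time.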
const image = await loadImage(imageUrl);
const imageInputs = Array(16)
  .fill()
  .map(() => image); // simulate multiple images
const imageCount = imageInputs.length;
const imageEncoder = new ImageEncoder(
  config,
  paths.modalityPaths.image_encoder
);
await imageEncoder.init({
  // preferredOutputLocation: "gpu-buffer",
  executionProviders: [provider],
});
const device = env.webgpu.device;
console.log("WebGPU device:", device);
console.time("resize");
const imageDatas = imageEncoder.getResizedImageDatas(imageInputs);
console.timeEnd("resize");
console.time("flatten");
const { count, data } = imageEncoder.flattenImageDatas(imageDatas);
console.timeEnd("flatten");
console.time("tensor");
let tensor;
if (provider === "webgpu") {
  const inputGpuBuffer = await createGPUBuffer({
    device,
    array: new Float32Array(data),
  });
  tensor = Tensor.fromGpuBuffer(inputGpuBuffer, {
    dataType: "float32",
    dims: [count, 3, imageEncoder.imageSize, imageEncoder.imageSize],
  });
} else {
  tensor = new Tensor("float32", data, [
    count,
    3,
    imageEncoder.imageSize,
    imageEncoder.imageSize,
  ]);
}
console.timeEnd("tensor");
console.time("encode1");
const imageOutput = await imageEncoder.encodeFromTensor(tensor);
console.timeEnd("encode1");
// const imageOutput2 = await imageEncoder.encode(imageDatas);
console.time("read");
const f32 = await imageOutput.getData();
console.timeEnd("read");
// console.log(textOutput);
console.log("Cosine Similarity:", cos_sim(textEmbed, f32));
async function loadImage(src) {
  return new Promise((resolve, reject) => {
    const img = new Image();
    img.crossOrigin = "anonymous";
    img.onload = () => resolve(img);
    img.onerror = (err) => reject(err);
    img.src = src;
  });
}
function cos_sim(a, b) {
  const n = a.length; // assume a.length === b.length
  let dot = 0.0;
  let magA = 0.0;
  let magB = 0.0;
  // a plain for-loop keeps this allocation-free and fast for vectors this size
  for (let i = 0; i < n; ++i) {
    const ai = a[i];
    const bi = b[i];
    dot += ai * bi;
    magA += ai * ai;
    magB += bi * bi;
  }
  // dot product divided by the product of the vector norms
  return dot / Math.sqrt(magA * magB);
}
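// Sanity check: cos_sim([1, 0], [1, 0]) === 1 and cos_sim([1, 0], [0, 1]) === 0.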
async function createGPUBuffer({ device, array: floatArray }) {
  // 1. Allocate a GPUBuffer with COPY_DST (so we can upload) and STORAGE (or whichever usage you need)
  const bufferSize = floatArray.byteLength; // N * 4 bytes
  const gpuBuffer = device.createBuffer({
    size: bufferSize,
    usage:
      GPUBufferUsage.STORAGE |
      GPUBufferUsage.COPY_DST |
      GPUBufferUsage.COPY_SRC,
    mappedAtCreation: false,
  });
  // 2. Upload the data via the queue
  device.queue.writeBuffer(
    /* buffer */ gpuBuffer,
    /* bufferOffset */ 0,
    /* data */ floatArray.buffer, // underlying ArrayBuffer
    /* dataOffset */ floatArray.byteOffset,
    /* size */ floatArray.byteLength
  );
  return gpuBuffer;
}
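// ---------------------------------------------------------------------------
// uform-encoder.js: the module imported at the top of the perf test.
// ---------------------------------------------------------------------------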
// Alternative for node.js
// import * as ONNX from 'onnxruntime-node';
// For web:
import * as ONNX from "onnxruntime-web";
import { PreTrainedTokenizer } from "@huggingface/transformers";
const { env, Tensor, InferenceSession } = ONNX;
export { env, Tensor, InferenceSession };
async function grabJSON(url) {
  const resp = await fetch(url);
  return resp.json();
}
export class TextProcessor {
  constructor(config, tokenizerPath) {
    this.config = config;
    this.tokenizerPath = tokenizerPath;
    this.maxSeqLen = 0;
    this.padTokenIdx = 0;
    this.tokenizer = null;
  }
  async init() {
    let config = this.config;
    if (config.text_encoder !== undefined) {
      config = config.text_encoder;
    }
    this.maxSeqLen = config.max_position_embeddings;
    this.padTokenIdx = config.padding_idx;
    const tokenizerConfig = await grabJSON(this.tokenizerPath);
    this.tokenizer = new PreTrainedTokenizer(tokenizerConfig, config);
    this.tokenizer.model_max_length = this.maxSeqLen;
    this.tokenizer.pad_token_id = this.padTokenIdx;
  }
  async process(texts) {
    const encoded = await this.tokenizer(texts, {
      add_special_tokens: true,
      padding: "max_length",
      max_length: this.maxSeqLen,
      truncation: true,
    });
    return {
      input_ids: encoded.input_ids,
      attention_mask: encoded.attention_mask,
    };
  }
}
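// Usage sketch (mirrors the perf test above; variable names are illustrative):
//   const tp = new TextProcessor(config, "uform3-image-text-english-small/tokenizer.json");
//   await tp.init();
//   const { input_ids, attention_mask } = await tp.process(["mona lisa"]);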
export class TextEncoder {
  constructor(modelPath, inputType = "int32") {
    this.modelPath = modelPath;
    this.session = null;
    this.inputType = inputType; // 'int32' or 'int64'
  }
  async init(opts = {}) {
    this.session = await InferenceSession.create(this.modelPath, opts);
  }
  async dispose() {
    if (this.session) {
      await this.session
        .release()
        .catch((error) => console.error("Failed to release session", error));
      this.session = null;
    }
  }
  async encode(inputs) {
    if (!this.session) {
      throw new Error("Session is not initialized.");
    }
    // Helper to validate Int32Array input or convert BigInt64Array (and other
    // array types) to Int32Array
    function ensureInt32Array(data) {
      if (data instanceof Int32Array) {
        return data; // use as-is if already Int32Array
      }
      if (data instanceof BigInt64Array) {
        // convert BigInt64Array to Int32Array, ensuring all values are in range
        return new Int32Array(
          Array.from(data).map((bigInt) => {
            if (bigInt > 2147483647n || bigInt < -2147483648n) {
              throw new Error("Value out of range for Int32.");
            }
            return Number(bigInt); // convert BigInt to Number
          })
        );
      }
      // also handle plain arrays and other typed arrays
      if (
        Array.isArray(data) ||
        data instanceof Uint32Array ||
        data instanceof Uint8Array
      ) {
        return new Int32Array(data); // convert directly
      }
      throw new Error("Unsupported data type for tensor conversion.");
    }
    // Prepare tensor data
    const inputIDsData =
      this.inputType === "int32"
        ? ensureInt32Array(inputs.input_ids.data)
        : inputs.input_ids.data;
    const attentionMaskData = ensureInt32Array(inputs.attention_mask.data);
    const inputIDs = new Tensor(
      this.inputType,
      inputIDsData,
      inputs.input_ids.dims
    );
    const attentionMask = new Tensor(
      "int32",
      attentionMaskData,
      inputs.attention_mask.dims
    );
    // Run model inference, fetching only the "embeddings" output
    return this.session.run(
      {
        input_ids: inputIDs,
        attention_mask: attentionMask,
      },
      ["embeddings"]
    );
  }
}
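// Usage sketch (mirrors the perf test above; `tp` is the TextProcessor sketch):
//   const te = new TextEncoder(paths.modalityPaths.text_encoder, inputType);
//   await te.init({ executionProviders: ["webgpu"] });
//   const { embeddings } = await te.encode(await tp.process(["mona lisa"]));
//   await te.dispose();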
function toValidRGBAImageDatas(images, imageSize) {
  if (!Array.isArray(images)) {
    images = [images];
  }
  return images.map((image) => {
    if (image.width == null || image.height == null || image.data == null) {
      throw new Error("Image must have width, height, and data properties.");
    }
    if (image.width !== imageSize || image.height !== imageSize) {
      throw new Error(
        `Image dimensions must be ${imageSize}x${imageSize}, but got ${image.width}x${image.height}.`
      );
    }
    const channelSize = imageSize * imageSize;
    const pixelCount = image.data.length / 4;
    if (pixelCount !== channelSize) {
      throw new Error(
        `Image pixel count must be ${channelSize}, but got ${pixelCount}.`
      );
    }
    return image.data;
  });
}
export class ImageEncoder {
  constructor(config, modelPath) {
    if (config.image_encoder !== undefined) {
      config = config.image_encoder;
    }
    this.modelPath = modelPath;
    this.session = null;
    this.imageSize = config.image_size;
    this.normalizationMeans = config.normalization_means;
    this.normalizationDeviations = config.normalization_deviations;
    this.imageMean = new Float32Array(this.normalizationMeans);
    this.imageStd = new Float32Array(this.normalizationDeviations);
  }
  getResizedImageDatas(images) {
    if (!Array.isArray(images)) {
      images = [images];
    }
    const size = this.imageSize;
    const offscreen =
      typeof OffscreenCanvas !== "undefined"
        ? new OffscreenCanvas(size, size)
        : (() => {
            const cnv = document.createElement("canvas");
            cnv.width = size;
            cnv.height = size;
            return cnv;
          })();
    const ctx = offscreen.getContext("2d", {
      willReadFrequently: true,
    });
    return images.map((src) => {
      // clear any previous drawing
      ctx.clearRect(0, 0, size, size);
      // draw src into the offscreen canvas, scaling to fit
      ctx.drawImage(src, 0, 0, size, size);
      // pull back the pixel data
      return ctx.getImageData(0, 0, size, size);
    });
  }
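  // Note: drawImage(src, 0, 0, size, size) stretches the source to a square,
  // so non-square inputs are distorted rather than cropped or letterboxed.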
  flattenImageDatas(images) {
    const imageDatas = toValidRGBAImageDatas(images, this.imageSize);
    const N = imageDatas.length;
    const H = this.imageSize,
      W = this.imageSize;
    const C = 3;
    const pixelCount = H * W;
    // allocate one big [N * C * H * W] buffer
    const result = new Float32Array(N * C * pixelCount);
    // for each image in the batch…
    for (let n = 0; n < N; n++) {
      const buffer = imageDatas[n];
      // compute the starting offset for this image
      const base = n * C * pixelCount;
      // reorder HWC -> CHW and normalize; the alpha channel is dropped
      for (let p = 0; p < pixelCount; p++) {
        const r = buffer[p * 4 + 0] / 0xff;
        const g = buffer[p * 4 + 1] / 0xff;
        const b = buffer[p * 4 + 2] / 0xff;
        result[base + p] = (r - this.imageMean[0]) / this.imageStd[0];
        result[base + pixelCount + p] =
          (g - this.imageMean[1]) / this.imageStd[1];
        result[base + 2 * pixelCount + p] =
          (b - this.imageMean[2]) / this.imageStd[2];
      }
    }
    return { count: N, data: result };
  }
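  // Layout example (N = 2, imageSize = 2): the output buffer is
  //   [img0 R x4, img0 G x4, img0 B x4, img1 R x4, img1 G x4, img1 B x4],
  // i.e. NCHW order, matching the [count, 3, imageSize, imageSize] tensor dims.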
  encode(images) {
    const { count: N, data } = this.flattenImageDatas(images);
    const C = 3;
    const H = this.imageSize,
      W = this.imageSize;
    // create an ONNX tensor with shape [N, 3, H, W]
    const imagesTensor = new Tensor("float32", data, [N, C, H, W]);
    // Run model inference
    return this.encodeFromTensor(imagesTensor);
  }
  async encodeFromTensor(tensor) {
    if (!this.session) {
      throw new Error("Session is not initialized.");
    }
    const key = this.session.inputNames[0];
    const opts = {};
    opts[key] = tensor;
    // fetch only the last output (the embeddings)
    const output =
      this.session.outputNames[this.session.outputNames.length - 1];
    const res = await this.session.run(opts, [output]);
    return res[output];
  }
  /**
   * Initializes the ONNX session with the pre-trained model.
   */
  async init(opts = {}) {
    this.session = await InferenceSession.create(this.modelPath, opts);
  }
  /**
   * Releases the ONNX session resources.
   */
  async dispose() {
    if (this.session) {
      await this.session
        .release()
        .catch((error) => console.error("Failed to release session", error));
      this.session = null;
    }
  }
}
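// Usage sketch (mirrors the perf test above; variable names are illustrative):
//   const ie = new ImageEncoder(config, paths.modalityPaths.image_encoder);
//   await ie.init({ executionProviders: ["webgpu"] });
//   const embeddings = await ie.encode(ie.getResizedImageDatas([img]));
//   const f32 = await embeddings.getData(); // Float32Array of embeddings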