uform3 perf test
import {
  Tensor,
  TextEncoder,
  TextProcessor,
  ImageEncoder,
  env,
} from "./uform-encoder.js";
import imageUrl from "../assets/images/monalisa.png";

const model = "fp32"; // "v3", "fp16" or "fp32"
const inputType = model === "v3" ? "int32" : "int64";

let provider = "webgpu"; // "webgpu" or "cpu"
if (provider !== "webgpu" && model === "fp16") {
  console.warn(
    "WebGPU is required for the fp16 model. Switching to the WebGPU provider."
  );
  provider = "webgpu";
}
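// ONNX model paths for each variant. These assume the exported models have
// been downloaded next to the page; adjust to wherever your copies live.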
const modalityPaths = {
  v3: {
    image_encoder: "uform3-image-text-english-small/image_encoder.onnx",
    text_encoder: "uform3-image-text-english-small/text_encoder.onnx",
  },
  fp16: {
    text_encoder: "uform-vl-english-small-gpu-fp16/text_encoder.onnx",
    image_encoder: "uform-vl-english-small-gpu-fp16/image_encoder.onnx",
  },
  fp32: {
    text_encoder: "uform-vl-english-small-cpu-fp32/text_encoder.onnx",
    image_encoder: "uform-vl-english-small-cpu-fp32/image_encoder.onnx",
  },
}[model];

const paths = {
  modalityPaths,
  tokenizerPath: "uform3-image-text-english-small/tokenizer.json",
};

const config = await (
  await fetch("uform3-image-text-english-small/config.json")
).json();
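// Text pipeline: tokenize the query, run the text encoder once, read the
// embedding back to the CPU, then release the session so its memory is
// freed before the image benchmark runs.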
const textProcessor = new TextProcessor(config, paths.tokenizerPath);
await textProcessor.init();
const processedTexts = await textProcessor.process(["mona lisa"]);

const textEncoder = new TextEncoder(
  paths.modalityPaths.text_encoder,
  inputType
);
await textEncoder.init({
  // preferredOutputLocation: "gpu-buffer",
  executionProviders: [provider],
});
const textOutput = await textEncoder.encode(processedTexts);
const textEmbed = await textOutput.embeddings.getData();
await textEncoder.dispose();
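// Image pipeline: duplicate one image 16 times to simulate a batched encode.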
const image = await loadImage(imageUrl);
const imageInputs = Array(16)
  .fill()
  .map(() => image); // simulate multiple images

const imageEncoder = new ImageEncoder(
  config,
  paths.modalityPaths.image_encoder
);
await imageEncoder.init({
  // preferredOutputLocation: "gpu-buffer",
  executionProviders: [provider],
});

// populated by onnxruntime-web once a WebGPU session has been created
const device = env.webgpu.device;
console.log("WebGPU device:", device);
console.time("resize"); | |
const imageDatas = imageEncoder.getResizedImageDatas(imageInputs); | |
console.timeEnd("resize"); | |
console.time("flatten"); | |
const { count, data } = imageEncoder.flattenImageDatas(imageDatas); | |
console.timeEnd("flatten"); | |
console.time("tensor"); | |
let tensor; | |
if (provider === "webgpu") { | |
const inputGpuBuffer = await createGPUBuffer({ | |
device, | |
array: new Float32Array(data), | |
}); | |
tensor = Tensor.fromGpuBuffer(inputGpuBuffer, { | |
dataType: "float32", | |
dims: [count, 3, imageEncoder.imageSize, imageEncoder.imageSize], | |
}); | |
} else { | |
tensor = new Tensor("float32", data, [ | |
count, | |
3, | |
imageEncoder.imageSize, | |
imageEncoder.imageSize, | |
]); | |
} | |
console.timeEnd("tensor"); | |
console.time("encode1"); | |
const imageOutput = await imageEncoder.encodeFromTensor(tensor); | |
console.timeEnd("encode1"); | |
// const imageOutput2 = await imageEncoder.encode(datas); | |
console.time("read"); | |
const f32 = await imageOutput.getData(); | |
console.timeEnd("read"); | |
// console.log(textOutput); | |
console.log("Cosine Similarity:", cos_sim(textEmbed, f32)); | |
async function loadImage(src) {
  return new Promise((resolve, reject) => {
    const img = new Image();
    img.crossOrigin = "anonymous";
    img.onload = () => resolve(img);
    img.onerror = (err) => reject(err);
    img.src = src;
  });
}
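/**
 * Cosine similarity: dot(a, b) / (|a| * |b|), computed in a single pass.
 * Both inputs are assumed to be equal-length Float32Arrays.
 */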
function cos_sim(a, b) {
  const n = a.length; // assume a.length === b.length
  let dot = 0.0;
  let magA = 0.0;
  let magB = 0.0;
  // plain for-loop is fastest in V8 for this size
  for (let i = 0; i < n; ++i) {
    const ai = a[i];
    const bi = b[i];
    dot += ai * bi;
    magA += ai * ai;
    magB += bi * bi;
  }
  // sqrt of product of sums of squares
  return dot / Math.sqrt(magA * magB);
}
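/**
 * Allocates a GPUBuffer and uploads a Float32Array into it. WebGPU buffers
 * are tied to the device that created them, so this must be the same
 * GPUDevice that onnxruntime-web's WebGPU backend uses (env.webgpu.device),
 * or Tensor.fromGpuBuffer cannot consume the result.
 */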
async function createGPUBuffer({ device, array: floatArray }) {
  // 1. Allocate a GPUBuffer with COPY_DST (so we can upload) and STORAGE
  //    (or whichever usage you need)
  const bufferSize = floatArray.byteLength; // N * 4 bytes
  const gpuBuffer = device.createBuffer({
    size: bufferSize,
    usage:
      GPUBufferUsage.STORAGE |
      GPUBufferUsage.COPY_DST |
      GPUBufferUsage.COPY_SRC,
    mappedAtCreation: false,
  });
  // 2. Upload the data via the queue
  device.queue.writeBuffer(
    /* buffer */ gpuBuffer,
    /* bufferOffset */ 0,
    /* data */ floatArray.buffer, // underlying ArrayBuffer
    /* dataOffset */ floatArray.byteOffset,
    /* size */ floatArray.byteLength
  );
  return gpuBuffer;
}
uform-encoder.js
// Alternative for node.js:
// import * as ONNX from 'onnxruntime-node';
// For web:
import * as ONNX from "onnxruntime-web";
import { PreTrainedTokenizer } from "@huggingface/transformers";

const { env, Tensor, InferenceSession } = ONNX;
export { env, Tensor, InferenceSession };

async function grabJSON(url) {
  const resp = await fetch(url);
  return resp.json();
}
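/**
 * Wraps a Hugging Face tokenizer with the padding/truncation settings the
 * UForm text encoder expects: fixed-length sequences, padded to
 * max_position_embeddings with the model's pad token.
 */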
export class TextProcessor {
  constructor(config, tokenizerPath) {
    this.config = config;
    this.tokenizerPath = tokenizerPath;
    this.maxSeqLen = 0;
    this.padTokenIdx = 0;
    this.tokenizer = null;
  }

  async init() {
    let config = this.config;
    if (config.text_encoder !== undefined) {
      config = config.text_encoder;
    }
    this.maxSeqLen = config.max_position_embeddings;
    this.padTokenIdx = config.padding_idx;
    const tokenizerConfig = await grabJSON(this.tokenizerPath);
    this.tokenizer = new PreTrainedTokenizer(tokenizerConfig, config);
    this.tokenizer.model_max_length = this.maxSeqLen;
    this.tokenizer.pad_token_id = this.padTokenIdx;
  }

  async process(texts) {
    const encoded = await this.tokenizer(texts, {
      add_special_tokens: true,
      padding: "max_length",
      max_length: this.maxSeqLen,
      truncation: true,
    });
    return {
      input_ids: encoded.input_ids,
      attention_mask: encoded.attention_mask,
    };
  }
}
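/**
 * Thin wrapper around an ONNX Runtime InferenceSession for the text encoder.
 * inputType selects whether input_ids are fed as int32 (the uform3 export)
 * or int64 (the fp16/fp32 exports).
 */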
export class TextEncoder {
  constructor(modelPath, inputType = "int32") {
    this.modelPath = modelPath;
    this.session = null;
    this.inputType = inputType; // 'int32' or 'int64'
  }

  async init(opts = {}) {
    this.session = await InferenceSession.create(this.modelPath, opts);
  }

  async dispose() {
    if (this.session) {
      await this.session
        .release()
        .catch((error) => console.error("Failed to release session", error));
      this.session = null;
    }
  }

  async encode(inputs) {
    if (!this.session) {
      throw new Error("Session is not initialized.");
    }
    // Helper to convert BigInt64Array to Int32Array, or validate an
    // existing Int32Array
    function ensureInt32Array(data) {
      if (data instanceof Int32Array) {
        return data; // use as-is if already Int32Array
      }
      if (data instanceof BigInt64Array) {
        // Convert BigInt64Array to Int32Array, ensuring all values are in range
        return new Int32Array(
          Array.from(data).map((bigInt) => {
            if (bigInt > 2147483647n || bigInt < -2147483648n) {
              throw new Error("Value out of range for Int32.");
            }
            return Number(bigInt); // convert BigInt to Number
          })
        );
      }
      // Also handle plain Arrays and other integer typed arrays
      if (
        Array.isArray(data) ||
        data instanceof Uint32Array ||
        data instanceof Uint8Array
      ) {
        return new Int32Array(data); // convert directly
      }
      throw new Error("Unsupported data type for tensor conversion.");
    }

    // Prepare tensor data
    const inputIDsData =
      this.inputType === "int32"
        ? ensureInt32Array(inputs.input_ids.data)
        : inputs.input_ids.data;
    const attentionMaskData = ensureInt32Array(inputs.attention_mask.data);
    const inputIDs = new Tensor(
      this.inputType,
      inputIDsData,
      inputs.input_ids.dims
    );
    const attentionMask = new Tensor(
      "int32",
      attentionMaskData,
      inputs.attention_mask.dims
    );

    // Run model inference
    return this.session.run(
      {
        input_ids: inputIDs,
        attention_mask: attentionMask,
      },
      ["embeddings"]
    );
  }
}
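/**
 * Validates that each input looks like an ImageData of the expected square
 * size and returns the raw RGBA byte arrays.
 */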
function toValidRGBAImageDatas(images, imageSize) {
  if (!Array.isArray(images)) {
    images = [images];
  }
  return images.map((image) => {
    if (image.width == null || image.height == null || image.data == null) {
      throw new Error("Image must have width, height, and data properties.");
    }
    if (image.width !== imageSize || image.height !== imageSize) {
      throw new Error(
        `Image dimensions must be ${imageSize}x${imageSize}, but got ${image.width}x${image.height}.`
      );
    }
    const channelSize = imageSize * imageSize;
    const pixelCount = image.data.length / 4;
    if (pixelCount !== channelSize) {
      throw new Error(
        `Image pixel count must be ${channelSize}, but got ${pixelCount}.`
      );
    }
    return image.data;
  });
}
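/**
 * Image preprocessing and encoding. Resizes inputs on a canvas, converts
 * RGBA/HWC pixels into a normalized float CHW batch
 * ((value / 255 - mean) / std per channel), and runs the ONNX image encoder.
 */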
export class ImageEncoder {
  constructor(config, modelPath) {
    if (config.image_encoder !== undefined) {
      config = config.image_encoder;
    }
    this.modelPath = modelPath;
    this.imageSize = config.image_size;
    this.normalizationMeans = config.normalization_means;
    this.normalizationDeviations = config.normalization_deviations;
    this.imageMean = new Float32Array(this.normalizationMeans);
    this.imageStd = new Float32Array(this.normalizationDeviations);
  }

  getResizedImageDatas(images) {
    if (!Array.isArray(images)) {
      images = [images];
    }
    const size = this.imageSize;
    // prefer OffscreenCanvas; fall back to a DOM canvas where unavailable
    const offscreen =
      typeof OffscreenCanvas !== "undefined"
        ? new OffscreenCanvas(size, size)
        : (() => {
            const cnv = document.createElement("canvas");
            cnv.width = size;
            cnv.height = size;
            return cnv;
          })();
    const ctx = offscreen.getContext("2d", {
      willReadFrequently: true,
    });
    return images.map((src) => {
      // clear any previous drawing
      ctx.clearRect(0, 0, size, size);
      // draw src into the offscreen canvas, scaling to fit
      ctx.drawImage(src, 0, 0, size, size);
      // pull back the pixel data
      return ctx.getImageData(0, 0, size, size);
    });
  }
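  /**
   * Flattens a batch of RGBA ImageDatas into one normalized Float32Array in
   * [N, C, H, W] order; the alpha channel is dropped.
   */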
  flattenImageDatas(images) {
    const imageDatas = toValidRGBAImageDatas(images, this.imageSize);
    const N = imageDatas.length;
    const H = this.imageSize,
      W = this.imageSize;
    const C = 3;
    const pixelCount = H * W;
    // allocate one big [N * C * H * W] buffer
    const result = new Float32Array(N * C * pixelCount);
    // for each image in the batch…
    for (let n = 0; n < N; n++) {
      const buffer = imageDatas[n];
      // compute the starting offset for this image
      const base = n * C * pixelCount;
      // reorder HWC -> CHW and normalize
      for (let p = 0; p < pixelCount; p++) {
        const r = buffer[p * 4 + 0] / 0xff;
        const g = buffer[p * 4 + 1] / 0xff;
        const b = buffer[p * 4 + 2] / 0xff;
        result[base + p] = (r - this.imageMean[0]) / this.imageStd[0];
        result[base + pixelCount + p] =
          (g - this.imageMean[1]) / this.imageStd[1];
        result[base + 2 * pixelCount + p] =
          (b - this.imageMean[2]) / this.imageStd[2];
      }
    }
    return { count: N, data: result };
  }

  encode(images) {
    const { count: N, data } = this.flattenImageDatas(images);
    const C = 3;
    const H = this.imageSize,
      W = this.imageSize;
    // shape is [N, 3, H, W]
    // Create ONNX Tensor
    const imagesTensor = new Tensor("float32", data, [N, C, H, W]);
    // Run model inference
    return this.encodeFromTensor(imagesTensor);
  }

  async encodeFromTensor(tensor) {
    // guard against use before init(), mirroring TextEncoder.encode
    if (!this.session) {
      throw new Error("Session is not initialized.");
    }
    const key = this.session.inputNames[0];
    const opts = {};
    opts[key] = tensor;
    const output =
      this.session.outputNames[this.session.outputNames.length - 1];
    const res = await this.session.run(opts, [output]);
    return res[output];
  }

  /**
   * Initializes the ONNX session with the pre-trained model.
   */
  async init(opts = {}) {
    this.session = await InferenceSession.create(this.modelPath, opts);
  }

  /**
   * Releases the ONNX session resources.
   */
  async dispose() {
    if (this.session) {
      await this.session
        .release()
        .catch((error) => console.error("Failed to release session", error));
      this.session = null;
    }
  }
}