Skip to content

Instantly share code, notes, and snippets.

@benvanik
benvanik / external-transients.md
Last active October 24, 2025 07:59
External transient storage design doc

External Transients Implementation Plan

Overview

Enable users to provide buffers for transient memory allocation in their functions, with generated query functions to calculate required sizes. This supports the kernel JIT use case where applications need control over transient allocations.

Motivation: We are building a kernel JIT on top of IREE: users provide IR of their linalg ops, we compile it into dispatches, and our host code schedules it with transient allocation. Users need to control transient memory ahead of time, so we provide size query functions and let them pass storage buffers to functions (making zero allocations in steady state).


Phase 0: Foundation - ABI & HAL Layer ✅ COMPLETED

@benvanik
benvanik / 0-input.mlir
Last active June 2, 2025 19:11
PR20855 IR examples
// tools/test/iree-run-module-multi.mlir
func.func public @multi_device_mul(
// Input argument is resident on device_a (tooling default to first device).
%input_a: tensor<4xf32> {iree.abi.affinity = #hal.device.promise<@device_a>}
) -> (
// Output result is expected to be on device_a (though not required).
tensor<4xf32> {iree.abi.affinity = #hal.device.promise<@device_a>}
) {
// Compute on device_a (input is there).
@benvanik
benvanik / build_kernels_c.sh
Last active November 13, 2024 16:19
hsa prototypes
set -x
~/src/iree-build/llvm-project/bin/clang \
-x c -std=c23 \
-target amdgcn-amd-amdhsa -march=gfx1100 \
-nogpulib \
-fgpu-rdc \
-fno-short-wchar \
-fno-ident \
-Xclang -finclude-default-header \
@benvanik
benvanik / methods.mlir
Last active September 14, 2022 14:35
vmvx notes
// VM import declarations for vmvx 2D elementwise kernels. Each op takes two
// strided 2D operands (lhs/rhs) and writes a strided 2D output; every operand
// is described by a (buffer, i64 byte offset, (i64, i64) stride pair) triple,
// with a trailing (i64, i64) tuple giving the 2D iteration sizes.
// f32 add: out = lhs + rhs (element-wise over the sizes extent).
vm.import @vmvx.add.2d.f32(%lhs_buffer : !vm.buffer, %lhs_offset : i64, %lhs_strides : tuple<i64, i64>, %rhs_buffer : !vm.buffer, %rhs_offset : i64, %rhs_strides : tuple<i64, i64>, %out_buffer : !vm.buffer, %out_offset : i64, %out_strides : tuple<i64, i64>, %sizes : tuple<i64, i64>)
// i32 add: same signature as the f32 variant, integer element type.
vm.import @vmvx.add.2d.i32(%lhs_buffer : !vm.buffer, %lhs_offset : i64, %lhs_strides : tuple<i64, i64>, %rhs_buffer : !vm.buffer, %rhs_offset : i64, %rhs_strides : tuple<i64, i64>, %out_buffer : !vm.buffer, %out_offset : i64, %out_strides : tuple<i64, i64>, %sizes : tuple<i64, i64>)
// i32 bitwise-and: same calling convention as the arithmetic ops above.
vm.import @vmvx.and.2d.i32(%lhs_buffer : !vm.buffer, %lhs_offset : i64, %lhs_strides : tuple<i64, i64>, %rhs_buffer : !vm.buffer, %rhs_offset : i64, %rhs_strides : tuple<i64, i64>, %out_buffer : !vm.buffer, %out_offset : i64, %out_strides : tuple<i64, i64>, %sizes : tuple<i64, i64>)
vm.import @vmvx.div.2d.f32(%lhs_buffer : !vm.buffer, %lhs_offset : i64, %lhs_strides : tuple<i64, i64>, %rhs_buffer : !vm.buffer, %rhs_offset : i64, %rhs_st
@benvanik
benvanik / unidirectional_lstm.mlir
Created June 15, 2022 19:51
unidirectional_lstm.mlir
This file has been truncated, but you can view the full file.
// -----// IR Dump After TopLevelSCFToCFG //----- //
// LSTM forward-loop condition: returns (%arg0 < %arg1) as a tensor<i1>.
// Only %arg0 and %arg1 (both scalar i64 tensors — presumably the loop counter
// and trip count; verify against the loop body caller) are read; the remaining
// arguments are loop-carried state threaded through unused.
func.func private @ForwardLoopCond_gFAnjWGSoLs__.167(%arg0: tensor<i64>, %arg1: tensor<i64>, %arg2: tensor<40xf32>, %arg3: tensor<i64>, %arg4: tensor<74x40xf32>, %arg5: tensor<i64>, %arg6: tensor<1x10xf32>, %arg7: tensor<1x10xf32>, %arg8: tensor<5x1x64xf32>, %arg9: tensor<5x1x1xf32>, %arg10: tensor<5x1x1xf32>, %arg11: tensor<5xi64>, %arg12: tensor<5x1x10xf32>, %arg13: tensor<5x1x10xf32>) -> tensor<i1> {
  // Signed less-than comparison; yields a scalar boolean tensor.
  %0 = "mhlo.compare"(%arg0, %arg1) {comparison_direction = #mhlo<"comparison_direction LT">} : (tensor<i64>, tensor<i64>) -> tensor<i1>
  return %0 : tensor<i1>
}
// -----// IR Dump After MHLOToMHLOPreprocessing //----- //
func.func private @ForwardLoopCond_gFAnjWGSoLs__.167(%arg0: tensor<i64>, %arg1: tensor<i64>, %arg2: tensor<40xf32>, %arg3: tensor<i64>, %arg4: tensor<74x40xf32>, %arg5: tensor<i64>, %arg6: tensor<1x10xf32>, %arg7: tensor<1x10xf32>, %arg8: tensor<5x1x64xf32>, %arg9: tensor<5x1x1xf32>, %arg10: tensor<5x1x1xf32>, %arg11: tensor<5xi64>, %arg
@benvanik
benvanik / launch.json
Created June 15, 2022 19:41
simple_mul.mlir
{
"name": "(gdb) iree-compile",
"type": "cppdbg",
"request": "launch",
"preLaunchTask": "build-iree-compile",
"program": "${command:cmake.buildDirectory}/tools/iree-compile",
"args": [
// "-iree-vm-bytecode-module-output-format=annotated-mlir-text",
"-iree-vm-bytecode-source-listing=${workspaceFolder}/../iree-tmp/vm.mlir",
"-iree-vm-emit-polyglot-zip=true",
@benvanik
benvanik / api_interfaces_cc.h
Created January 6, 2021 00:12
WIP api_interfaces_cc.h example for #iree/4369
// Copyright 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
@benvanik
benvanik / 0-static-input.mlir
Last active December 15, 2020 23:53
tiled dispatch
// Simple dispatch of static shapes.
func @staticShapeDispatch(%arg0 : tensor<8x4xf32>) -> tensor<4x8xf32> {
%x = constant 100 : index
%y = constant 50 : index
// %x, %y here are the workgroup counts along a 2D grid to dispatch; backends turn them into 3D XYZ.
%0 = flow.dispatch.workgroups[%x, %y](%arg0) : (tensor<8x4xf32>) -> (tensor<4x8xf32>) = (
// I/O are modeled in the region as ref arguments that have some special ops available.
%arg : !flow.dispatch.input<8x4xf32>, %ret : !flow.dispatch.output<4x8xf32>
) {
// Loads a tensor from an input; can be tiled with offsets/sizes/strides.
// RUN: iree-opt -allow-unregistered-dialect -split-input-file %s | iree-opt -allow-unregistered-dialect -split-input-file | IreeFileCheck %s
func @workgroups(%arg0 : tensor<?x4xf32>, %arg1 : index) -> tensor<4x?xf32> {
%x = constant 100 : index
%y = constant 50 : index
%0 = flow.dispatch.workgroups[%x, %y](%arg0, %arg1) : (tensor<?x4xf32>, index) -> (tensor<4x?xf32>) =
(%arg0_capture : !flow.dispatch.input<?x4xf32>, %arg1_capture : index, %ret0 : !flow.dispatch.output<4x?xf32>) {
// Query symbolic workgroup info:
@benvanik
benvanik / add.mlir
Last active November 23, 2020 12:53
TFLite -> IREE synthesis example
//===----------------------------------------------------------------------===//
// b = add(a, a)
//===----------------------------------------------------------------------===//
// NOTE: this represents what our tflite import flow should produce; the _
// prefixed functions are all synthesized by us. We use the VM dialect in here
// now because std has no list and other stuff. In a real flow we may have a
// iree_tflite dialect that has pseudo ops for these things that then plug into
// the VM conversion interface, or maybe we just emit them as-is at input