Skip to content

Instantly share code, notes, and snippets.

➜ pingpong python generate_8192_inputs.py
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/numpy/core/__init__.py", line 22, in <module>
from . import multiarray
File "/usr/lib/python3/dist-packages/numpy/core/multiarray.py", line 12, in <module>
from . import overrides
File "/usr/lib/python3/dist-packages/numpy/core/overrides.py", line 7, in <module>
from numpy.core._multiarray_umath import (
ModuleNotFoundError: No module named 'numpy.core._multiarray_umath'
➜ pingpong export ROCPROF_ATT_LIBRARY_PATH=$PWD/att-decoder-v3-3.0.0-Linux/opt/rocm/lib
➜ pingpong ~/therock/bin/rocprofv3 -i ~/pingpong/att.json -d traces -- ~/iree-build/tools/testing/e2e/iree-e2e-matmul-test --device=hip \
--module=tmp/dispatch.vmfb \
--module=tmp/calls.vmfb \
--acceptable_fp_delta=1e-02
*** Aborted at 1746043263 (unix time) try "date -d @1746043263" if you are using GNU date ***
PC: @ 0x7d411c73bf8a std::ostream::sentry::sentry(std::ostream&)
*** SIGSEGV (@0xffffffffffffffe8) received by PID 3817537 (TID 0x7d411c8e1000) from PID 18446744073709551592; stack trace: ***
@ 0x7d411c299ee8 (unknown)
@bjacob
bjacob / calls.mlir
Created April 25, 2025 16:03
Test setup for FP8 pingpong after Llama dispatch
builtin.module @calls attributes {
} {
func.func private @matmul_test.generate_random_matrix(%device: !hal.device, %dim0: i64, %dim1: i64, %element_type: i32, %seed: i32) -> !hal.buffer_view
func.func private @matmul_test.check_matmul_results(%device: !hal.device, %m: i64, %k: i64, %n: i64, %transpose_rhs: i32, %lhs: !hal.buffer_view, %rhs: !hal.buffer_view, %acc: !hal.buffer_view, %actual_result: !hal.buffer_view)
func.func private @module.matmul(%lhs: !hal.buffer_view, %rhs: !hal.buffer_view) -> !hal.buffer_view
func.func @matmul() attributes {
// -----// IR Dump After LoweringConfigInterpreterPass (iree-codegen-lowering-config-interpreter) //----- //
func.func @matmul_dispatch_0_matmul_like_Dx256x4096x4096_f8E4M3FNUZxf8E4M3FNUZxf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [512, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = true>, llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}}>} {
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.constant.load layout(<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
%1 = hal.interface.constant.load layout(<constants = 2, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnl
+ __global__ __launch_bounds__(256) static void run(const void *A_data,
+ const void *B_data,
+ void *C_data, int M_outer, int N_outer,
+ int K_outer) {
+ int total_tiles = M_outer * N_outer;
+ int cu = blockIdx.x;
+ int tile_start = total_tiles * cu / CUs;
+ int tile_end = total_tiles * (cu + 1) / CUs;
+ int m_outer = tile_start / N_outer;
+ int n_outer = tile_start - m_outer * N_outer;
This file has been truncated, but you can view the full file.
// -----// IR Dump After CheckVHLOStableHloMixUsage (iree-check-vhlostablehlo-mix-usage) //----- //
module {
func.func @sort3D() {
%0 = util.unfoldable_constant dense<[[[1, 2, 3, 4], [4, 3, 2, 1]]]> : tensor<1x2x4xi32>
%1 = "stablehlo.sort"(%0) <{dimension = 2 : i64, is_stable = false}> ({
^bb0(%arg0: tensor<i32>, %arg1: tensor<i32>):
%2 = stablehlo.compare LT, %arg0, %arg1 : (tensor<i32>, tensor<i32>) -> tensor<i1>
stablehlo.return %2 : tensor<i1>
}) : (tensor<1x2x4xi32>) -> tensor<1x2x4xi32>
check.expect_eq_const(%1, dense<[[[1, 2, 3, 4], [1, 2, 3, 4]]]> : tensor<1x2x4xi32>) : tensor<1x2x4xi32>
@bjacob
bjacob / sort3D.rocmasm
Created March 24, 2025 16:04
sort3D asm
.amdgcn_target "amdgcn-amd-amdhsa--gfx942"
.amdhsa_code_object_version 5
.text
.globl _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store
.p2align 8
.type _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store,@function
_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store:
s_load_dwordx2 s[2:3], s[0:1], 0x0
s_waitcnt lgkmcnt(0)
s_branch .LBB0_0
diff --git a/tests/e2e/math/math_ops_llvm-cpu.json b/tests/e2e/math/math_ops_rocm.json
index 57e8c422a7..fc4a6c964a 100644
--- a/tests/e2e/math/math_ops_llvm-cpu.json
+++ b/tests/e2e/math/math_ops_rocm.json
@@ -8,8 +8,8 @@
{
"op": "acos",
"type": "f16",
- "atol": 1.0e-02,
- "rtol": 1.0e-02
::mlir::ParseResult ExpectAlmostEqConstOp::parse(::mlir::OpAsmParser &parser, ::mlir::OperationState &result) {
::llvm::SmallVector<::mlir::OpAsmParser::UnresolvedOperand, 4> deviceOperands;
::llvm::SMLoc deviceOperandsLoc;
(void)deviceOperandsLoc;
::mlir::OpAsmParser::UnresolvedOperand lhsRawOperand{};
::llvm::ArrayRef<::mlir::OpAsmParser::UnresolvedOperand> lhsOperands(&lhsRawOperand, 1); ::llvm::SMLoc lhsOperandsLoc;
(void)lhsOperandsLoc;
::mlir::ElementsAttr valueAttr;
::mlir::FloatAttr toleranceAttr;
::mlir::Type lhsRawType{};
// -----// IR Dump After CSE (cse) //----- //
func.func @__builtin_fill_i64(%arg0: !util.buffer, %arg1: !util.buffer, %arg2: !util.list<!util.buffer>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32, %arg9: i32, %arg10: i32, %arg11: i32) attributes {translation_info = #iree_codegen.translation_info<pipeline = VMVXDefault>} {
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c4 = arith.constant 4 : index
%c0 = arith.constant 0 : index
%c32_i64 = arith.constant 32 : i64
%c1 = arith.constant 1 : index
%buffer_size = util.buffer.size %arg1 : !util.buffer
%0 = util.buffer.load %arg1[%c0 for %c4] : !util.buffer{%buffer_size} -> i32