Skip to content

Instantly share code, notes, and snippets.

@pashu123
Created October 14, 2022 17:39
Show Gist options
  • Save pashu123/cab92ca3ec60194dc39e9a9548dd2798 to your computer and use it in GitHub Desktop.
stable_diff_f16_elided.mlir:1730:12: error: failed to legalize operation 'vector.transfer_read' that was explicitly marked illegal
%261 = linalg.generic {indexing_maps = [#map16, #map16, #map16], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_770, %209 : tensor<2x4096x320xf16>, tensor<2x4096x320xf16>) outs(%156 : tensor<2x4096x320xf16>) {
^
stable_diff_f16_elided.mlir:25:3: note: called from
func.func @forward(%arg0: tensor<2x4x64x64xf16>, %arg1: tensor<1xf16>, %arg2: tensor<2x77x768xf16>) -> tensor<2x4x64x64xf16> {
^
stable_diff_f16_elided.mlir:1730:12: note: see current operation: %710 = "vector.transfer_read"(%58, %709, %45) {in_bounds = [true], operand_segment_sizes = array<i32: 1, 1, 1, 0>, permutation_map = affine_map<(d0) -> (d0)>} : (memref<320xf16>, index, f16) -> vector<16xf16> loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%261 = linalg.generic {indexing_maps = [#map16, #map16, #map16], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_770, %209 : tensor<2x4096x320xf16>, tensor<2x4096x320xf16>) outs(%156 : tensor<2x4096x320xf16>) {
^
stable_diff_f16_elided.mlir:1730:12: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>
%261 = linalg.generic {indexing_maps = [#map16, #map16, #map16], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_770, %209 : tensor<2x4096x320xf16>, tensor<2x4096x320xf16>) outs(%156 : tensor<2x4096x320xf16>) {
^
stable_diff_f16_elided.mlir:25:3: note: called from
func.func @forward(%arg0: tensor<2x4x64x64xf16>, %arg1: tensor<1xf16>, %arg2: tensor<2x77x768xf16>) -> tensor<2x4x64x64xf16> {
^
stable_diff_f16_elided.mlir:1730:12: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg0: !hal.device loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3)), %arg1: index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3)), %arg2: index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))):
%0 = "arith.constant"() {value = 20 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%1 = "arith.constant"() {value = 512 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%2 = "arith.constant"() {value = 1 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
"hal.return"(%0, %1, %2) : (index, index, index) -> () loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
}) {layout = #hal.pipeline.layout<push_constants = 3, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_77_matmul_8192x320x320", translation_info = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize>, workgroup_size = [32 : index, 1 : index, 1 : index]} : () -> () loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
"builtin.module"() ({
"func.func"() ({
%0 = "arith.constant"() {value = 16 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%1 = "arith.constant"() {value = 32 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%2 = "arith.constant"() {value = 48 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%3 = "arith.constant"() {value = 64 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%4 = "arith.constant"() {value = 80 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%5 = "arith.constant"() {value = 96 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%6 = "arith.constant"() {value = 112 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%7 = "arith.constant"() {value = 128 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%8 = "arith.constant"() {value = 144 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%9 = "arith.constant"() {value = 160 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%10 = "arith.constant"() {value = 176 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%11 = "arith.constant"() {value = 192 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%12 = "arith.constant"() {value = 208 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%13 = "arith.constant"() {value = 224 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%14 = "arith.constant"() {value = 240 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%15 = "arith.constant"() {value = 256 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%16 = "arith.constant"() {value = 272 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%17 = "arith.constant"() {value = 288 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%18 = "arith.constant"() {value = 304 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%19 = "arith.constant"() {value = dense<0.000000e+00> : vector<16x16xf16>} : () -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1722:12 at "stable_diff_f16_elided.mlir":25:3))
%20 = "arith.constant"() {value = 0 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1732:15 at "stable_diff_f16_elided.mlir":25:3))
%21 = "arith.constant"() {value = 8192 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%22 = "arith.constant"() {value = 320 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%23 = "arith.constant"() {value = 0.000000e+00 : f16} : () -> f16 loc(callsite("stable_diff_f16_elided.mlir":721:16 at "stable_diff_f16_elided.mlir":25:3))
%24 = "arith.constant"() {value = 299442560 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":40:15 at "stable_diff_f16_elided.mlir":25:3))
%25 = "arith.constant"() {value = 30389056 : index} : () -> index loc(fused[callsite("stable_diff_f16_elided.mlir":1721:22 at "stable_diff_f16_elided.mlir":25:3), callsite("stable_diff_f16_elided.mlir":1717:12 at "stable_diff_f16_elided.mlir":25:3), callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3)])
%26 = "hal.interface.constant.load"() {index = 0 : index, values = [5242880 : i32, 21869376 : i32, 25146176 : i32, 32429376 : i32]} : () -> i32 loc(fused[callsite("stable_diff_f16_elided.mlir":1721:22 at "stable_diff_f16_elided.mlir":25:3), callsite("stable_diff_f16_elided.mlir":1717:12 at "stable_diff_f16_elided.mlir":25:3), callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3)])
%27 = "hal.interface.constant.load"() {index = 1 : index, values = [0 : i32, 10564416 : i32, 25146176 : i32]} : () -> i32 loc(fused[callsite("stable_diff_f16_elided.mlir":1721:22 at "stable_diff_f16_elided.mlir":25:3), callsite("stable_diff_f16_elided.mlir":1717:12 at "stable_diff_f16_elided.mlir":25:3), callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3)])
%28 = "hal.interface.constant.load"() {index = 2 : index, values = [5321536 : i32, 10485760 : i32, 16626496 : i32, 19903296 : i32, 21869376 : i32]} : () -> i32 loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%29 = "arith.index_castui"(%26) : (i32) -> index loc(fused[callsite("stable_diff_f16_elided.mlir":1721:22 at "stable_diff_f16_elided.mlir":25:3), callsite("stable_diff_f16_elided.mlir":1717:12 at "stable_diff_f16_elided.mlir":25:3), callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3)])
%30 = "arith.index_castui"(%27) : (i32) -> index loc(fused[callsite("stable_diff_f16_elided.mlir":1721:22 at "stable_diff_f16_elided.mlir":25:3), callsite("stable_diff_f16_elided.mlir":1717:12 at "stable_diff_f16_elided.mlir":25:3), callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3)])
%31 = "arith.index_castui"(%28) : (i32) -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%32 = "hal.interface.binding.subspan"(%29) {alignment = 64 : index, binding = 0 : index, descriptor_type = 7 : i32, operand_segment_sizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<8192x320xf16> loc(callsite("stable_diff_f16_elided.mlir":1721:22 at "stable_diff_f16_elided.mlir":25:3))
"memref.assume_alignment"(%32) {alignment = 64 : i32} : (memref<8192x320xf16>) -> () loc(callsite("stable_diff_f16_elided.mlir":1721:22 at "stable_diff_f16_elided.mlir":25:3))
%33 = "hal.interface.binding.subspan"(%25) {alignment = 64 : index, binding = 0 : index, descriptor_type = 7 : i32, operand_segment_sizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<320x320xf16> loc(callsite("stable_diff_f16_elided.mlir":1717:12 at "stable_diff_f16_elided.mlir":25:3))
"memref.assume_alignment"(%33) {alignment = 64 : i32} : (memref<320x320xf16>) -> () loc(callsite("stable_diff_f16_elided.mlir":1717:12 at "stable_diff_f16_elided.mlir":25:3))
%34 = "hal.interface.binding.subspan"(%24) {alignment = 64 : index, binding = 1 : index, descriptor_type = 7 : i32, operand_segment_sizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<320xf16> loc(callsite("stable_diff_f16_elided.mlir":40:15 at "stable_diff_f16_elided.mlir":25:3))
"memref.assume_alignment"(%34) {alignment = 64 : i32} : (memref<320xf16>) -> () loc(callsite("stable_diff_f16_elided.mlir":40:15 at "stable_diff_f16_elided.mlir":25:3))
%35 = "hal.interface.binding.subspan"(%30) {alignment = 64 : index, binding = 0 : index, descriptor_type = 7 : i32, operand_segment_sizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<8192x320xf16> loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
"memref.assume_alignment"(%35) {alignment = 64 : i32} : (memref<8192x320xf16>) -> () loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%36 = "hal.interface.binding.subspan"(%31) {alignment = 64 : index, binding = 2 : index, descriptor_type = 7 : i32, operand_segment_sizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<8192x320xf16> loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
"memref.assume_alignment"(%36) {alignment = 64 : i32} : (memref<8192x320xf16>) -> () loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%37 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%38 = "hal.interface.workgroup.count"() {dimension = 0 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%39 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%40 = "hal.interface.workgroup.count"() {dimension = 1 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%41 = "affine.apply"(%39) {map = affine_map<()[s0] -> (s0 * 16)>} : (index) -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%42 = "affine.apply"(%40) {map = affine_map<()[s0] -> (s0 * 16)>} : (index) -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%43 = "affine.apply"(%37) {map = affine_map<()[s0] -> (s0 * 16)>} : (index) -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%44 = "affine.apply"(%38) {map = affine_map<()[s0] -> (s0 * 16)>} : (index) -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
"scf.for"(%41, %21, %42) ({
^bb0(%arg0: index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))):
"scf.for"(%43, %22, %44) ({
^bb0(%arg1: index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))):
%45 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%46 = "vector.transfer_read"(%32, %45, %20, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%47 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%48 = "vector.transfer_read"(%32, %47, %0, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%49 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%50 = "vector.transfer_read"(%32, %49, %1, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%51 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%52 = "vector.transfer_read"(%32, %51, %2, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%53 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%54 = "vector.transfer_read"(%32, %53, %3, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%55 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%56 = "vector.transfer_read"(%32, %55, %4, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%57 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%58 = "vector.transfer_read"(%32, %57, %5, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%59 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%60 = "vector.transfer_read"(%32, %59, %6, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%61 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%62 = "vector.transfer_read"(%32, %61, %7, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%63 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%64 = "vector.transfer_read"(%32, %63, %8, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%65 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%66 = "vector.transfer_read"(%32, %65, %9, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%67 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%68 = "vector.transfer_read"(%32, %67, %10, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%69 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%70 = "vector.transfer_read"(%32, %69, %11, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%71 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%72 = "vector.transfer_read"(%32, %71, %12, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%73 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%74 = "vector.transfer_read"(%32, %73, %13, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%75 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%76 = "vector.transfer_read"(%32, %75, %14, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%77 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%78 = "vector.transfer_read"(%32, %77, %15, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%79 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%80 = "vector.transfer_read"(%32, %79, %16, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%81 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%82 = "vector.transfer_read"(%32, %81, %17, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%83 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%84 = "vector.transfer_read"(%32, %83, %18, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%85 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%86 = "vector.transfer_read"(%33, %20, %85, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%87 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%88 = "vector.transfer_read"(%33, %0, %87, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%89 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%90 = "vector.transfer_read"(%33, %1, %89, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%91 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%92 = "vector.transfer_read"(%33, %2, %91, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%93 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%94 = "vector.transfer_read"(%33, %3, %93, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%95 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%96 = "vector.transfer_read"(%33, %4, %95, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%97 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%98 = "vector.transfer_read"(%33, %5, %97, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%99 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%100 = "vector.transfer_read"(%33, %6, %99, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%101 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%102 = "vector.transfer_read"(%33, %7, %101, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%103 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%104 = "vector.transfer_read"(%33, %8, %103, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%105 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%106 = "vector.transfer_read"(%33, %9, %105, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%107 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%108 = "vector.transfer_read"(%33, %10, %107, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%109 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%110 = "vector.transfer_read"(%33, %11, %109, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%111 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%112 = "vector.transfer_read"(%33, %12, %111, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%113 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%114 = "vector.transfer_read"(%33, %13, %113, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%115 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%116 = "vector.transfer_read"(%33, %14, %115, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%117 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%118 = "vector.transfer_read"(%33, %15, %117, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%119 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%120 = "vector.transfer_read"(%33, %16, %119, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%121 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%122 = "vector.transfer_read"(%33, %17, %121, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%123 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%124 = "vector.transfer_read"(%33, %18, %123, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%125 = "vector.contract"(%46, %86, %19) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%126 = "vector.contract"(%48, %88, %125) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%127 = "vector.contract"(%50, %90, %126) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%128 = "vector.contract"(%52, %92, %127) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%129 = "vector.contract"(%54, %94, %128) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%130 = "vector.contract"(%56, %96, %129) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%131 = "vector.contract"(%58, %98, %130) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%132 = "vector.contract"(%60, %100, %131) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%133 = "vector.contract"(%62, %102, %132) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%134 = "vector.contract"(%64, %104, %133) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%135 = "vector.contract"(%66, %106, %134) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%136 = "vector.contract"(%68, %108, %135) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%137 = "vector.contract"(%70, %110, %136) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%138 = "vector.contract"(%72, %112, %137) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%139 = "vector.contract"(%74, %114, %138) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%140 = "vector.contract"(%76, %116, %139) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%141 = "vector.contract"(%78, %118, %140) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%142 = "vector.contract"(%80, %120, %141) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%143 = "vector.contract"(%82, %122, %142) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%144 = "vector.contract"(%84, %124, %143) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%145 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%146 = "vector.transfer_read"(%34, %145, %23) {in_bounds = [true], operand_segment_sizes = array<i32: 1, 1, 1, 0>, permutation_map = affine_map<(d0) -> (d0)>} : (memref<320xf16>, index, f16) -> vector<16xf16> loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%147 = "vector.broadcast"(%146) : (vector<16xf16>) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%148 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%149 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%150 = "vector.transfer_read"(%35, %148, %149, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%151 = "arith.addf"(%147, %144) : (vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1726:15 at "stable_diff_f16_elided.mlir":25:3))
%152 = "arith.addf"(%151, %150) : (vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1732:15 at "stable_diff_f16_elided.mlir":25:3))
%153 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%154 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
"vector.transfer_write"(%152, %36, %153, %154) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (vector<16x16xf16>, memref<8192x320xf16>, index, index) -> () loc(callsite("stable_diff_f16_elided.mlir":1732:15 at "stable_diff_f16_elided.mlir":25:3))
"scf.yield"() : () -> () loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
}) : (index, index, index) -> () loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
"scf.yield"() : () -> () loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
}) : (index, index, index) -> () loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
"func.return"() : () -> () loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
}) {function_type = () -> (), sym_name = "forward_dispatch_77_matmul_8192x320x320"} : () -> () loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
}) : () -> () loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
"hal.executable.variant_end"() : () -> () loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
}) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%261 = linalg.generic {indexing_maps = [#map16, #map16, #map16], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_770, %209 : tensor<2x4096x320xf16>, tensor<2x4096x320xf16>) outs(%156 : tensor<2x4096x320xf16>) {
^
stable_diff_f16_elided.mlir:1730:12: error: failed to serialize executables
%261 = linalg.generic {indexing_maps = [#map16, #map16, #map16], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_770, %209 : tensor<2x4096x320xf16>, tensor<2x4096x320xf16>) outs(%156 : tensor<2x4096x320xf16>) {
^
stable_diff_f16_elided.mlir:25:3: note: called from
func.func @forward(%arg0: tensor<2x4x64x64xf16>, %arg1: tensor<1xf16>, %arg2: tensor<2x77x768xf16>) -> tensor<2x4x64x64xf16> {
^
stable_diff_f16_elided.mlir:1730:12: note: see current operation:
"hal.executable"() ({
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg0: !hal.device loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3)), %arg1: index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3)), %arg2: index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))):
%0 = "arith.constant"() {value = 20 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%1 = "arith.constant"() {value = 512 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%2 = "arith.constant"() {value = 1 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
"hal.return"(%0, %1, %2) : (index, index, index) -> () loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
}) {layout = #hal.pipeline.layout<push_constants = 3, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>, ordinal = 0 : index, sym_name = "forward_dispatch_77_matmul_8192x320x320", translation_info = #iree_codegen.translation_info<SPIRVCooperativeMatrixVectorize>, workgroup_size = [32 : index, 1 : index, 1 : index]} : () -> () loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
"builtin.module"() ({
"func.func"() ({
%0 = "arith.constant"() {value = 16 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%1 = "arith.constant"() {value = 32 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%2 = "arith.constant"() {value = 48 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%3 = "arith.constant"() {value = 64 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%4 = "arith.constant"() {value = 80 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%5 = "arith.constant"() {value = 96 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%6 = "arith.constant"() {value = 112 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%7 = "arith.constant"() {value = 128 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%8 = "arith.constant"() {value = 144 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%9 = "arith.constant"() {value = 160 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%10 = "arith.constant"() {value = 176 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%11 = "arith.constant"() {value = 192 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%12 = "arith.constant"() {value = 208 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%13 = "arith.constant"() {value = 224 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%14 = "arith.constant"() {value = 240 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%15 = "arith.constant"() {value = 256 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%16 = "arith.constant"() {value = 272 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%17 = "arith.constant"() {value = 288 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%18 = "arith.constant"() {value = 304 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%19 = "arith.constant"() {value = dense<0.000000e+00> : vector<16x16xf16>} : () -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1722:12 at "stable_diff_f16_elided.mlir":25:3))
%20 = "arith.constant"() {value = 0 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1732:15 at "stable_diff_f16_elided.mlir":25:3))
%21 = "arith.constant"() {value = 8192 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%22 = "arith.constant"() {value = 320 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%23 = "arith.constant"() {value = 0.000000e+00 : f16} : () -> f16 loc(callsite("stable_diff_f16_elided.mlir":721:16 at "stable_diff_f16_elided.mlir":25:3))
%24 = "arith.constant"() {value = 299442560 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":40:15 at "stable_diff_f16_elided.mlir":25:3))
%25 = "arith.constant"() {value = 30389056 : index} : () -> index loc(fused[callsite("stable_diff_f16_elided.mlir":1721:22 at "stable_diff_f16_elided.mlir":25:3), callsite("stable_diff_f16_elided.mlir":1717:12 at "stable_diff_f16_elided.mlir":25:3), callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3)])
%26 = "hal.interface.constant.load"() {index = 0 : index, values = [5242880 : i32, 21869376 : i32, 25146176 : i32, 32429376 : i32]} : () -> i32 loc(fused[callsite("stable_diff_f16_elided.mlir":1721:22 at "stable_diff_f16_elided.mlir":25:3), callsite("stable_diff_f16_elided.mlir":1717:12 at "stable_diff_f16_elided.mlir":25:3), callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3)])
%27 = "hal.interface.constant.load"() {index = 1 : index, values = [0 : i32, 10564416 : i32, 25146176 : i32]} : () -> i32 loc(fused[callsite("stable_diff_f16_elided.mlir":1721:22 at "stable_diff_f16_elided.mlir":25:3), callsite("stable_diff_f16_elided.mlir":1717:12 at "stable_diff_f16_elided.mlir":25:3), callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3)])
%28 = "hal.interface.constant.load"() {index = 2 : index, values = [5321536 : i32, 10485760 : i32, 16626496 : i32, 19903296 : i32, 21869376 : i32]} : () -> i32 loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%29 = "arith.index_castui"(%26) : (i32) -> index loc(fused[callsite("stable_diff_f16_elided.mlir":1721:22 at "stable_diff_f16_elided.mlir":25:3), callsite("stable_diff_f16_elided.mlir":1717:12 at "stable_diff_f16_elided.mlir":25:3), callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3)])
%30 = "arith.index_castui"(%27) : (i32) -> index loc(fused[callsite("stable_diff_f16_elided.mlir":1721:22 at "stable_diff_f16_elided.mlir":25:3), callsite("stable_diff_f16_elided.mlir":1717:12 at "stable_diff_f16_elided.mlir":25:3), callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3)])
%31 = "arith.index_castui"(%28) : (i32) -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%32 = "hal.interface.binding.subspan"(%29) {alignment = 64 : index, binding = 0 : index, descriptor_type = 7 : i32, operand_segment_sizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<8192x320xf16> loc(callsite("stable_diff_f16_elided.mlir":1721:22 at "stable_diff_f16_elided.mlir":25:3))
"memref.assume_alignment"(%32) {alignment = 64 : i32} : (memref<8192x320xf16>) -> () loc(callsite("stable_diff_f16_elided.mlir":1721:22 at "stable_diff_f16_elided.mlir":25:3))
%33 = "hal.interface.binding.subspan"(%25) {alignment = 64 : index, binding = 0 : index, descriptor_type = 7 : i32, operand_segment_sizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<320x320xf16> loc(callsite("stable_diff_f16_elided.mlir":1717:12 at "stable_diff_f16_elided.mlir":25:3))
"memref.assume_alignment"(%33) {alignment = 64 : i32} : (memref<320x320xf16>) -> () loc(callsite("stable_diff_f16_elided.mlir":1717:12 at "stable_diff_f16_elided.mlir":25:3))
%34 = "hal.interface.binding.subspan"(%24) {alignment = 64 : index, binding = 1 : index, descriptor_type = 7 : i32, operand_segment_sizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<320xf16> loc(callsite("stable_diff_f16_elided.mlir":40:15 at "stable_diff_f16_elided.mlir":25:3))
"memref.assume_alignment"(%34) {alignment = 64 : i32} : (memref<320xf16>) -> () loc(callsite("stable_diff_f16_elided.mlir":40:15 at "stable_diff_f16_elided.mlir":25:3))
%35 = "hal.interface.binding.subspan"(%30) {alignment = 64 : index, binding = 0 : index, descriptor_type = 7 : i32, operand_segment_sizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<8192x320xf16> loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
"memref.assume_alignment"(%35) {alignment = 64 : i32} : (memref<8192x320xf16>) -> () loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%36 = "hal.interface.binding.subspan"(%31) {alignment = 64 : index, binding = 2 : index, descriptor_type = 7 : i32, operand_segment_sizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<8192x320xf16> loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
"memref.assume_alignment"(%36) {alignment = 64 : i32} : (memref<8192x320xf16>) -> () loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%37 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%38 = "hal.interface.workgroup.count"() {dimension = 0 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%39 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%40 = "hal.interface.workgroup.count"() {dimension = 1 : index} : () -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%41 = "affine.apply"(%39) {map = affine_map<()[s0] -> (s0 * 16)>} : (index) -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%42 = "affine.apply"(%40) {map = affine_map<()[s0] -> (s0 * 16)>} : (index) -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%43 = "affine.apply"(%37) {map = affine_map<()[s0] -> (s0 * 16)>} : (index) -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%44 = "affine.apply"(%38) {map = affine_map<()[s0] -> (s0 * 16)>} : (index) -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
"scf.for"(%41, %21, %42) ({
^bb0(%arg0: index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))):
"scf.for"(%43, %22, %44) ({
^bb0(%arg1: index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))):
%45 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%46 = "vector.transfer_read"(%32, %45, %20, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%47 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%48 = "vector.transfer_read"(%32, %47, %0, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%49 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%50 = "vector.transfer_read"(%32, %49, %1, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%51 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%52 = "vector.transfer_read"(%32, %51, %2, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%53 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%54 = "vector.transfer_read"(%32, %53, %3, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%55 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%56 = "vector.transfer_read"(%32, %55, %4, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%57 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%58 = "vector.transfer_read"(%32, %57, %5, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%59 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%60 = "vector.transfer_read"(%32, %59, %6, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%61 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%62 = "vector.transfer_read"(%32, %61, %7, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%63 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%64 = "vector.transfer_read"(%32, %63, %8, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%65 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%66 = "vector.transfer_read"(%32, %65, %9, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%67 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%68 = "vector.transfer_read"(%32, %67, %10, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%69 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%70 = "vector.transfer_read"(%32, %69, %11, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%71 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%72 = "vector.transfer_read"(%32, %71, %12, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%73 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%74 = "vector.transfer_read"(%32, %73, %13, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%75 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%76 = "vector.transfer_read"(%32, %75, %14, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%77 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%78 = "vector.transfer_read"(%32, %77, %15, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%79 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%80 = "vector.transfer_read"(%32, %79, %16, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%81 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%82 = "vector.transfer_read"(%32, %81, %17, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%83 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%84 = "vector.transfer_read"(%32, %83, %18, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%85 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%86 = "vector.transfer_read"(%33, %20, %85, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%87 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%88 = "vector.transfer_read"(%33, %0, %87, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%89 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%90 = "vector.transfer_read"(%33, %1, %89, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%91 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%92 = "vector.transfer_read"(%33, %2, %91, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%93 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%94 = "vector.transfer_read"(%33, %3, %93, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%95 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%96 = "vector.transfer_read"(%33, %4, %95, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%97 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%98 = "vector.transfer_read"(%33, %5, %97, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%99 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%100 = "vector.transfer_read"(%33, %6, %99, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%101 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%102 = "vector.transfer_read"(%33, %7, %101, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%103 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%104 = "vector.transfer_read"(%33, %8, %103, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%105 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%106 = "vector.transfer_read"(%33, %9, %105, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%107 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%108 = "vector.transfer_read"(%33, %10, %107, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%109 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%110 = "vector.transfer_read"(%33, %11, %109, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%111 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%112 = "vector.transfer_read"(%33, %12, %111, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%113 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%114 = "vector.transfer_read"(%33, %13, %113, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%115 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%116 = "vector.transfer_read"(%33, %14, %115, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%117 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%118 = "vector.transfer_read"(%33, %15, %117, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%119 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%120 = "vector.transfer_read"(%33, %16, %119, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%121 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%122 = "vector.transfer_read"(%33, %17, %121, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%123 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%124 = "vector.transfer_read"(%33, %18, %123, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<320x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1723:12 at "stable_diff_f16_elided.mlir":25:3))
%125 = "vector.contract"(%46, %86, %19) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%126 = "vector.contract"(%48, %88, %125) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%127 = "vector.contract"(%50, %90, %126) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%128 = "vector.contract"(%52, %92, %127) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%129 = "vector.contract"(%54, %94, %128) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%130 = "vector.contract"(%56, %96, %129) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%131 = "vector.contract"(%58, %98, %130) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%132 = "vector.contract"(%60, %100, %131) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%133 = "vector.contract"(%62, %102, %132) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%134 = "vector.contract"(%64, %104, %133) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%135 = "vector.contract"(%66, %106, %134) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%136 = "vector.contract"(%68, %108, %135) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%137 = "vector.contract"(%70, %110, %136) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%138 = "vector.contract"(%72, %112, %137) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%139 = "vector.contract"(%74, %114, %138) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%140 = "vector.contract"(%76, %116, %139) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%141 = "vector.contract"(%78, %118, %140) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%142 = "vector.contract"(%80, %120, %141) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%143 = "vector.contract"(%82, %122, %142) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%144 = "vector.contract"(%84, %124, %143) {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = [#vector.iterator_type<parallel>, #vector.iterator_type<parallel>, #vector.iterator_type<reduction>], kind = #vector.kind<add>} : (vector<16x16xf16>, vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite(unknown at "stable_diff_f16_elided.mlir":25:3))
%145 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%146 = "vector.transfer_read"(%34, %145, %23) {in_bounds = [true], operand_segment_sizes = array<i32: 1, 1, 1, 0>, permutation_map = affine_map<(d0) -> (d0)>} : (memref<320xf16>, index, f16) -> vector<16xf16> loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%147 = "vector.broadcast"(%146) : (vector<16xf16>) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%148 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%149 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%150 = "vector.transfer_read"(%35, %148, %149, %23) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 2, 1, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (memref<8192x320xf16>, index, index, f16) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%151 = "arith.addf"(%147, %144) : (vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1726:15 at "stable_diff_f16_elided.mlir":25:3))
%152 = "arith.addf"(%151, %150) : (vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16> loc(callsite("stable_diff_f16_elided.mlir":1732:15 at "stable_diff_f16_elided.mlir":25:3))
%153 = "affine.apply"(%20, %arg0) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%154 = "affine.apply"(%20, %arg1) {map = affine_map<(d0)[s0] -> (d0 + s0)>} : (index, index) -> index loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
"vector.transfer_write"(%152, %36, %153, %154) {in_bounds = [true, true], operand_segment_sizes = array<i32: 1, 1, 2, 0>, permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : (vector<16x16xf16>, memref<8192x320xf16>, index, index) -> () loc(callsite("stable_diff_f16_elided.mlir":1732:15 at "stable_diff_f16_elided.mlir":25:3))
"scf.yield"() : () -> () loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
}) : (index, index, index) -> () loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
"scf.yield"() : () -> () loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
}) : (index, index, index) -> () loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
"func.return"() : () -> () loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
}) {function_type = () -> (), sym_name = "forward_dispatch_77_matmul_8192x320x320"} : () -> () loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
}) : () -> () loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
"hal.executable.variant_end"() : () -> () loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
}) {sym_name = "vulkan_spirv_fb", target = #hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>} : () -> () loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
"hal.executable_end"() : () -> () loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
}) {sym_name = "forward_dispatch_77", sym_visibility = "private"} : () -> () loc(callsite("stable_diff_f16_elided.mlir":1730:12 at "stable_diff_f16_elided.mlir":25:3))
%261 = linalg.generic {indexing_maps = [#map16, #map16, #map16], iterator_types = ["parallel", "parallel", "parallel"]} ins(%expanded_770, %209 : tensor<2x4096x320xf16>, tensor<2x4096x320xf16>) outs(%156 : tensor<2x4096x320xf16>) {
^
compilation failed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment