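// IREE dispatch benchmark modules for the Vulkan/SPIR-V backend (NVIDIA discrete GPU target).
// Two standalone HAL modules follow: forward_dispatch_0 (row-wise max reduction followed by
// exp(x - max)) and forward_dispatch_1 (row-wise sum reduction followed by division). Together
// they correspond to a numerically stable softmax over the innermost 9216 dimension of a
// tensor<10x9216x9216xf32>.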
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>], legacy_sync}>]} {
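// Dispatch 0: fused max reduction and elementwise exp(x - rowmax) over tensor<10x9216x9216xf32>.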
hal.executable private @forward_dispatch_0 {
hal.executable.variant public @vulkan_spirv_fb, target = <"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}> {
hal.executable.export public @forward_dispatch_0_generic_10x9216x9216 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @forward_dispatch_0_generic_10x9216x9216() {
%c0 = arith.constant 0 : index
%cst = arith.constant -3.40282347E+38 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<10x9216x9216xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<10x9216x9216xf32>>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 9216, 9216], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x9216x9216xf32>> -> tensor<10x9216x9216xf32>
%3 = tensor.empty() : tensor<10x9216x9216xf32>
%4 = tensor.empty() : tensor<10x9216xf32>
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<10x9216xf32>) -> tensor<10x9216xf32>
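// Reduce along d2: running maximum per (d0, d1) row, seeded with -FLT_MAX.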
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<10x9216x9216xf32>) outs(%5 : tensor<10x9216xf32>) {
^bb0(%in: f32, %out: f32):
%8 = arith.maxf %in, %out : f32
linalg.yield %8 : f32
} -> tensor<10x9216xf32>
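// Elementwise: subtract the row maximum and exponentiate.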
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2, %6 : tensor<10x9216x9216xf32>, tensor<10x9216xf32>) outs(%3 : tensor<10x9216x9216xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%8 = arith.subf %in, %in_0 : f32
%9 = math.exp %8 : f32
linalg.yield %9 : f32
} -> tensor<10x9216x9216xf32>
flow.dispatch.tensor.store %7, %1, offsets = [0, 0, 0], sizes = [10, 9216, 9216], strides = [1, 1, 1] : tensor<10x9216x9216xf32> -> !flow.dispatch.tensor<writeonly:tensor<10x9216x9216xf32>>
return
}
}
}
}
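// Device-local scratch buffer holding both the dispatch input and output tensors
// (2 x 3397386240 bytes = 6794772480 bytes total).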
util.global private mutable @forward_dispatch_0_vulkan_spirv_fb_forward_dispatch_0_generic_10x9216x9216_10x9216x9216_buffer : !hal.buffer
util.initializer {
%c6794772480 = arith.constant 6794772480 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c6794772480}
util.global.store %buffer, @forward_dispatch_0_vulkan_spirv_fb_forward_dispatch_0_generic_10x9216x9216_10x9216x9216_buffer : !hal.buffer
util.initializer.return
}
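// Benchmark entry point: records %arg0 back-to-back dispatches (separated by full execution
// barriers) into a one-shot command buffer, submits it once, and blocks on the signal fence.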
func.func @forward_dispatch_0_vulkan_spirv_fb_forward_dispatch_0_generic_10x9216x9216_10x9216x9216(%arg0: i32) attributes {iree.abi.stub, iree.reflection = {iree.benchmark = "dispatch"}} {
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
%c9216 = arith.constant 9216 : index
%c10 = arith.constant 10 : index
%c1 = arith.constant 1 : index
%c3397386240 = arith.constant 3397386240 : index
%c0 = arith.constant 0 : index
%0 = arith.index_cast %arg0 : i32 to index
%device = hal.ex.shared_device : !hal.device
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Dispatch) : !hal.command_buffer
%pipeline_layout = hal.pipeline_layout.lookup device(%device : !hal.device) layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) : !hal.pipeline_layout
%forward_dispatch_0_vulkan_spirv_fb_forward_dispatch_0_generic_10x9216x9216_10x9216x9216_buffer = util.global.load @forward_dispatch_0_vulkan_spirv_fb_forward_dispatch_0_generic_10x9216x9216_10x9216x9216_buffer : !hal.buffer
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%forward_dispatch_0_vulkan_spirv_fb_forward_dispatch_0_generic_10x9216x9216_10x9216x9216_buffer : !hal.buffer)[%c0, %c3397386240],
%c1 = (%forward_dispatch_0_vulkan_spirv_fb_forward_dispatch_0_generic_10x9216x9216_10x9216x9216_buffer : !hal.buffer)[%c3397386240, %c3397386240]
])
%workgroup_x, %workgroup_y, %workgroup_z = hal.executable.calculate_workgroups device(%device : !hal.device) target(@forward_dispatch_0::@vulkan_spirv_fb::@forward_dispatch_0_generic_10x9216x9216) workload([%c10, %c9216, %c9216]) : index, index, index
scf.for %arg1 = %c0 to %0 step %c1 {
hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@forward_dispatch_0::@vulkan_spirv_fb::@forward_dispatch_0_generic_10x9216x9216) workgroups([%workgroup_x, %workgroup_y, %workgroup_z])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
}
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%1 = util.null : !hal.fence
%2 = hal.timeline.advance : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%1) signal(%2) commands([%cmd])
%status = hal.fence.await until([%2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
return
}
}
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>], legacy_sync}>]} {
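// Dispatch 1: fused sum reduction and elementwise division by the row sum (softmax normalization).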
hal.executable private @forward_dispatch_1 {
hal.executable.variant public @vulkan_spirv_fb, target = <"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}> {
hal.executable.export public @forward_dispatch_1_generic_10x9216x9216 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @forward_dispatch_1_generic_10x9216x9216() {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<10x9216x9216xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<10x9216x9216xf32>>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 9216, 9216], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x9216x9216xf32>> -> tensor<10x9216x9216xf32>
%3 = tensor.empty() : tensor<10x9216x9216xf32>
%4 = tensor.empty() : tensor<10x9216xf32>
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<10x9216xf32>) -> tensor<10x9216xf32>
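// Reduce along d2: per-row sum, seeded with 0.0.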
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<10x9216x9216xf32>) outs(%5 : tensor<10x9216xf32>) {
^bb0(%in: f32, %out: f32):
%8 = arith.addf %in, %out : f32
linalg.yield %8 : f32
} -> tensor<10x9216xf32>
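// Elementwise: divide each element by its row sum.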
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2, %6 : tensor<10x9216x9216xf32>, tensor<10x9216xf32>) outs(%3 : tensor<10x9216x9216xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%8 = arith.divf %in, %in_0 : f32
linalg.yield %8 : f32
} -> tensor<10x9216x9216xf32>
flow.dispatch.tensor.store %7, %1, offsets = [0, 0, 0], sizes = [10, 9216, 9216], strides = [1, 1, 1] : tensor<10x9216x9216xf32> -> !flow.dispatch.tensor<writeonly:tensor<10x9216x9216xf32>>
return
}
}
}
}
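// Device-local scratch buffer for dispatch 1 (same layout as above: input at offset 0,
// output at offset 3397386240).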
util.global private mutable @forward_dispatch_1_vulkan_spirv_fb_forward_dispatch_1_generic_10x9216x9216_10x9216x9216_buffer : !hal.buffer
util.initializer {
%c6794772480 = arith.constant 6794772480 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c6794772480}
util.global.store %buffer, @forward_dispatch_1_vulkan_spirv_fb_forward_dispatch_1_generic_10x9216x9216_10x9216x9216_buffer : !hal.buffer
util.initializer.return
}
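// Benchmark entry point for dispatch 1; identical recording and submission pattern as dispatch 0.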
func.func @forward_dispatch_1_vulkan_spirv_fb_forward_dispatch_1_generic_10x9216x9216_10x9216x9216(%arg0: i32) attributes {iree.abi.stub, iree.reflection = {iree.benchmark = "dispatch"}} {
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
%c9216 = arith.constant 9216 : index
%c10 = arith.constant 10 : index
%c1 = arith.constant 1 : index
%c3397386240 = arith.constant 3397386240 : index
%c0 = arith.constant 0 : index
%0 = arith.index_cast %arg0 : i32 to index
%device = hal.ex.shared_device : !hal.device
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Dispatch) : !hal.command_buffer
%pipeline_layout = hal.pipeline_layout.lookup device(%device : !hal.device) layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) : !hal.pipeline_layout
%forward_dispatch_1_vulkan_spirv_fb_forward_dispatch_1_generic_10x9216x9216_10x9216x9216_buffer = util.global.load @forward_dispatch_1_vulkan_spirv_fb_forward_dispatch_1_generic_10x9216x9216_10x9216x9216_buffer : !hal.buffer
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%forward_dispatch_1_vulkan_spirv_fb_forward_dispatch_1_generic_10x9216x9216_10x9216x9216_buffer : !hal.buffer)[%c0, %c3397386240],
%c1 = (%forward_dispatch_1_vulkan_spirv_fb_forward_dispatch_1_generic_10x9216x9216_10x9216x9216_buffer : !hal.buffer)[%c3397386240, %c3397386240]
])
%workgroup_x, %workgroup_y, %workgroup_z = hal.executable.calculate_workgroups device(%device : !hal.device) target(@forward_dispatch_1::@vulkan_spirv_fb::@forward_dispatch_1_generic_10x9216x9216) workload([%c10, %c9216, %c9216]) : index, index, index
scf.for %arg1 = %c0 to %0 step %c1 {
hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@forward_dispatch_1::@vulkan_spirv_fb::@forward_dispatch_1_generic_10x9216x9216) workgroups([%workgroup_x, %workgroup_y, %workgroup_z])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
}
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%1 = util.null : !hal.fence
%2 = hal.timeline.advance : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%1) signal(%2) commands([%cmd])
%status = hal.fence.await until([%2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
return
}
}