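// IREE dispatch benchmark modules for the Vulkan/SPIR-V backend (NVIDIA discrete GPU target).
// Two standalone HAL modules follow: forward_dispatch_0 (row-wise max reduction followed by
// exp(x - max)) and forward_dispatch_1 (row-wise sum reduction followed by division). Together
// they correspond to a numerically stable softmax over the innermost 9216 dimension of a
// tensor<10x9216x9216xf32>.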
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>], legacy_sync}>]} {
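// Dispatch 0: fused max reduction and elementwise exp(x - rowmax) over tensor<10x9216x9216xf32>.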
hal.executable private @forward_dispatch_0 {
hal.executable.variant public @vulkan_spirv_fb, target = <"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}> {
hal.executable.export public @forward_dispatch_0_generic_10x9216x9216 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @forward_dispatch_0_generic_10x9216x9216() {
%c0 = arith.constant 0 : index
%cst = arith.constant -3.40282347E+38 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<10x9216x9216xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<10x9216x9216xf32>>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 9216, 9216], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x9216x9216xf32>> -> tensor<10x9216x9216xf32>
%3 = tensor.empty() : tensor<10x9216x9216xf32>
%4 = tensor.empty() : tensor<10x9216xf32>
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<10x9216xf32>) -> tensor<10x9216xf32>
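// Reduce along d2: running maximum per (d0, d1) row, seeded with -FLT_MAX.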
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<10x9216x9216xf32>) outs(%5 : tensor<10x9216xf32>) {
^bb0(%in: f32, %out: f32):
%8 = arith.maxf %in, %out : f32
linalg.yield %8 : f32
} -> tensor<10x9216xf32>
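// Elementwise: subtract the row maximum and exponentiate.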
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2, %6 : tensor<10x9216x9216xf32>, tensor<10x9216xf32>) outs(%3 : tensor<10x9216x9216xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%8 = arith.subf %in, %in_0 : f32
%9 = math.exp %8 : f32
linalg.yield %9 : f32
} -> tensor<10x9216x9216xf32>
flow.dispatch.tensor.store %7, %1, offsets = [0, 0, 0], sizes = [10, 9216, 9216], strides = [1, 1, 1] : tensor<10x9216x9216xf32> -> !flow.dispatch.tensor<writeonly:tensor<10x9216x9216xf32>>
return
}
}
}
}
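// Device-local scratch buffer holding both the dispatch input and output tensors
// (2 x 3397386240 bytes = 6794772480 bytes total).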
util.global private mutable @forward_dispatch_0_vulkan_spirv_fb_forward_dispatch_0_generic_10x9216x9216_10x9216x9216_buffer : !hal.buffer
util.initializer {
%c6794772480 = arith.constant 6794772480 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c6794772480}
util.global.store %buffer, @forward_dispatch_0_vulkan_spirv_fb_forward_dispatch_0_generic_10x9216x9216_10x9216x9216_buffer : !hal.buffer
util.initializer.return
}
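// Benchmark entry point: records %arg0 back-to-back dispatches (separated by full execution
// barriers) into a one-shot command buffer, submits it once, and blocks on the signal fence.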
func.func @forward_dispatch_0_vulkan_spirv_fb_forward_dispatch_0_generic_10x9216x9216_10x9216x9216(%arg0: i32) attributes {iree.abi.stub, iree.reflection = {iree.benchmark = "dispatch"}} {
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
%c9216 = arith.constant 9216 : index
%c10 = arith.constant 10 : index
%c1 = arith.constant 1 : index
%c3397386240 = arith.constant 3397386240 : index
%c0 = arith.constant 0 : index
%0 = arith.index_cast %arg0 : i32 to index
%device = hal.ex.shared_device : !hal.device
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Dispatch) : !hal.command_buffer
%pipeline_layout = hal.pipeline_layout.lookup device(%device : !hal.device) layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) : !hal.pipeline_layout
%forward_dispatch_0_vulkan_spirv_fb_forward_dispatch_0_generic_10x9216x9216_10x9216x9216_buffer = util.global.load @forward_dispatch_0_vulkan_spirv_fb_forward_dispatch_0_generic_10x9216x9216_10x9216x9216_buffer : !hal.buffer
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%forward_dispatch_0_vulkan_spirv_fb_forward_dispatch_0_generic_10x9216x9216_10x9216x9216_buffer : !hal.buffer)[%c0, %c3397386240],
%c1 = (%forward_dispatch_0_vulkan_spirv_fb_forward_dispatch_0_generic_10x9216x9216_10x9216x9216_buffer : !hal.buffer)[%c3397386240, %c3397386240]
])
%workgroup_x, %workgroup_y, %workgroup_z = hal.executable.calculate_workgroups device(%device : !hal.device) target(@forward_dispatch_0::@vulkan_spirv_fb::@forward_dispatch_0_generic_10x9216x9216) workload([%c10, %c9216, %c9216]) : index, index, index
scf.for %arg1 = %c0 to %0 step %c1 {
hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@forward_dispatch_0::@vulkan_spirv_fb::@forward_dispatch_0_generic_10x9216x9216) workgroups([%workgroup_x, %workgroup_y, %workgroup_z])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
}
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%1 = util.null : !hal.fence
%2 = hal.timeline.advance : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%1) signal(%2) commands([%cmd])
%status = hal.fence.await until([%2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
return
}
}
module attributes {hal.device.targets = [#hal.device.target<"vulkan", {executable_targets = [#hal.executable.target<"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}>], legacy_sync}>]} {
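// Dispatch 1: fused sum reduction and elementwise division by the row sum (softmax normalization).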
hal.executable private @forward_dispatch_1 {
hal.executable.variant public @vulkan_spirv_fb, target = <"vulkan", "vulkan-spirv-fb", {spirv.target_env = #spirv.target_env<#spirv.vce<v1.6, [Shader, Float64, Float16, Int64, Int16, Int8, StorageBuffer16BitAccess, StorageUniform16, StoragePushConstant16, StorageBuffer8BitAccess, UniformAndStorageBuffer8BitAccess, StoragePushConstant8, GroupNonUniform, GroupNonUniformVote, GroupNonUniformArithmetic, GroupNonUniformBallot, GroupNonUniformShuffle, GroupNonUniformShuffleRelative, GroupNonUniformClustered, GroupNonUniformQuad, VariablePointers, VariablePointersStorageBuffer, CooperativeMatrixNV], [SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_storage_buffer_storage_class, SPV_KHR_variable_pointers, SPV_NV_cooperative_matrix]>, api=Vulkan, NVIDIA:DiscreteGPU, #spirv.resource_limits<max_compute_shared_memory_size = 49152, max_compute_workgroup_invocations = 1024, max_compute_workgroup_size = [1024, 1024, 64], min_subgroup_size = 32, max_subgroup_size = 32, cooperative_matrix_properties_nv = [#spirv.coop_matrix_props<m_size = 8, n_size = 8, k_size = 32, a_type = i8, b_type = i8, c_type = i32, result_type = i32, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f16, result_type = f16, scope = <Subgroup>>, #spirv.coop_matrix_props<m_size = 16, n_size = 16, k_size = 16, a_type = f16, b_type = f16, c_type = f32, result_type = f32, scope = <Subgroup>>]>>}> {
hal.executable.export public @forward_dispatch_1_generic_10x9216x9216 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @forward_dispatch_1_generic_10x9216x9216() {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<10x9216x9216xf32>>
%1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) alignment(64) offset(%c0) : !flow.dispatch.tensor<writeonly:tensor<10x9216x9216xf32>>
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [10, 9216, 9216], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<10x9216x9216xf32>> -> tensor<10x9216x9216xf32>
%3 = tensor.empty() : tensor<10x9216x9216xf32>
%4 = tensor.empty() : tensor<10x9216xf32>
%5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<10x9216xf32>) -> tensor<10x9216xf32>
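// Reduce along d2: per-row sum, seeded with 0.0.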
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%2 : tensor<10x9216x9216xf32>) outs(%5 : tensor<10x9216xf32>) {
^bb0(%in: f32, %out: f32):
%8 = arith.addf %in, %out : f32
linalg.yield %8 : f32
} -> tensor<10x9216xf32>
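// Elementwise: divide each element by its row sum.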
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2, %6 : tensor<10x9216x9216xf32>, tensor<10x9216xf32>) outs(%3 : tensor<10x9216x9216xf32>) {
^bb0(%in: f32, %in_0: f32, %out: f32):
%8 = arith.divf %in, %in_0 : f32
linalg.yield %8 : f32
} -> tensor<10x9216x9216xf32>
flow.dispatch.tensor.store %7, %1, offsets = [0, 0, 0], sizes = [10, 9216, 9216], strides = [1, 1, 1] : tensor<10x9216x9216xf32> -> !flow.dispatch.tensor<writeonly:tensor<10x9216x9216xf32>>
return
}
}
}
}
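// Device-local scratch buffer for dispatch 1 (same layout as above: input at offset 0,
// output at offset 3397386240).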
util.global private mutable @forward_dispatch_1_vulkan_spirv_fb_forward_dispatch_1_generic_10x9216x9216_10x9216x9216_buffer : !hal.buffer
util.initializer {
%c6794772480 = arith.constant 6794772480 : index
%device = hal.ex.shared_device : !hal.device
%allocator = hal.device.allocator<%device : !hal.device> : !hal.allocator
%buffer = hal.allocator.allocate<%allocator : !hal.allocator> type("DeviceVisible|DeviceLocal") usage("TransferSource|TransferTarget|Transfer|DispatchStorageRead|DispatchStorageWrite|DispatchStorage") : !hal.buffer{%c6794772480}
util.global.store %buffer, @forward_dispatch_1_vulkan_spirv_fb_forward_dispatch_1_generic_10x9216x9216_10x9216x9216_buffer : !hal.buffer
util.initializer.return
}
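// Benchmark entry point for dispatch 1; identical recording and submission pattern as dispatch 0.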
func.func @forward_dispatch_1_vulkan_spirv_fb_forward_dispatch_1_generic_10x9216x9216_10x9216x9216(%arg0: i32) attributes {iree.abi.stub, iree.reflection = {iree.benchmark = "dispatch"}} {
%c-1_i32 = arith.constant -1 : i32
%c-1_i64 = arith.constant -1 : i64
%c9216 = arith.constant 9216 : index
%c10 = arith.constant 10 : index
%c1 = arith.constant 1 : index
%c3397386240 = arith.constant 3397386240 : index
%c0 = arith.constant 0 : index
%0 = arith.index_cast %arg0 : i32 to index
%device = hal.ex.shared_device : !hal.device
%cmd = hal.command_buffer.create device(%device : !hal.device) mode("OneShot|AllowInlineExecution") categories(Dispatch) : !hal.command_buffer
%pipeline_layout = hal.pipeline_layout.lookup device(%device : !hal.device) layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer>]>]>) : !hal.pipeline_layout
%forward_dispatch_1_vulkan_spirv_fb_forward_dispatch_1_generic_10x9216x9216_10x9216x9216_buffer = util.global.load @forward_dispatch_1_vulkan_spirv_fb_forward_dispatch_1_generic_10x9216x9216_10x9216x9216_buffer : !hal.buffer
hal.command_buffer.push_descriptor_set<%cmd : !hal.command_buffer> layout(%pipeline_layout : !hal.pipeline_layout)[%c0] bindings([
%c0 = (%forward_dispatch_1_vulkan_spirv_fb_forward_dispatch_1_generic_10x9216x9216_10x9216x9216_buffer : !hal.buffer)[%c0, %c3397386240],
%c1 = (%forward_dispatch_1_vulkan_spirv_fb_forward_dispatch_1_generic_10x9216x9216_10x9216x9216_buffer : !hal.buffer)[%c3397386240, %c3397386240]
])
%workgroup_x, %workgroup_y, %workgroup_z = hal.executable.calculate_workgroups device(%device : !hal.device) target(@forward_dispatch_1::@vulkan_spirv_fb::@forward_dispatch_1_generic_10x9216x9216) workload([%c10, %c9216, %c9216]) : index, index, index
scf.for %arg1 = %c0 to %0 step %c1 {
hal.command_buffer.dispatch.symbol<%cmd : !hal.command_buffer> target(@forward_dispatch_1::@vulkan_spirv_fb::@forward_dispatch_1_generic_10x9216x9216) workgroups([%workgroup_x, %workgroup_y, %workgroup_z])
hal.command_buffer.execution_barrier<%cmd : !hal.command_buffer> source("Dispatch|CommandRetire") target("CommandIssue|Dispatch") flags("None")
}
hal.command_buffer.finalize<%cmd : !hal.command_buffer>
%1 = util.null : !hal.fence
%2 = hal.timeline.advance : !hal.fence
hal.device.queue.execute<%device : !hal.device> affinity(%c-1_i64) wait(%1) signal(%2) commands([%cmd])
%status = hal.fence.await until([%2]) timeout_millis(%c-1_i32) : i32
util.status.check_ok %status, "failed to wait on timepoint"
return
}
}