Skip to content

Instantly share code, notes, and snippets.

@marcusbuffett
Created July 23, 2025 11:22
Show Gist options
  • Save marcusbuffett/013e9baab93e210a622a9e408c38e49c to your computer and use it in GitHub Desktop.
Save marcusbuffett/013e9baab93e210a622a9e408c38e49c to your computer and use it in GitHub Desktop.
This file has been truncated, but you can view the full file.
validate checksum key=ReduceAutotuneKey - ElemInput: Float(F32), ElemOutput: Float(F32), ElemAcc: Float(F32), PotentialLineSize: 4, AxisIsContiguous: true, ReduceAxisShape: 4096, ReduceCount: 64, checksum=a14a9dbdfd906e71ae310b4829971734
00d104e8138e13e2dd8e06
0129d16,
),
info: Some (
(
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
),
),
mode: Some (
Unchecked,
),
type_name: "burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32,
cubecl_cuda::runtime::CudaRuntime>",
}
source:
```cpp
#include <cuda_runtime.h>
// Short integer aliases used by the declarations and kernel that follow.
// Unsigned family.
using uint   = unsigned int;
using uint8  = unsigned char;
using uint16 = unsigned short;
using uint32 = unsigned int;
using uint64 = unsigned long long int;
// Signed family.
using int8  = signed char;
using int16 = signed short;
using int32 = signed int;
using int64 = signed long long int;
// Bundle of eight floats, aligned to 32 bytes.
// slice_assign_kernel uses it as the element type of buffer_0 and buffer_1
// and copies one whole float_8 per thread.
struct __align__(32) float_8 {
    float i_0, i_1, i_2, i_3;  // lanes 0..3
    float i_4, i_5, i_6, i_7;  // lanes 4..7
};
// Twelve uint32 scalar arguments, passed to the kernel by value.
// As read by slice_assign_kernel:
//   x[0..7]  - four (shift, mask) pairs, pair k at x[2*k] / x[2*k + 1];
//              the running index is masked with x[2*k + 1] and shifted
//              right by x[2*k].
//   x[8..11] - one addend per pair (x[8 + k]); it is added to the masked
//              value only when computing the buffer_0 index.
struct scalars_uint32_st {
uint32 x[12];
};
// Ten uint values, passed to the kernel by value.
// As read by slice_assign_kernel:
//   x[3] - upper bound on the linear thread index; threads at or above it
//          return immediately.
//   x[8] - base position (after subtracting 10) in `info` of the four
//          multipliers used for the buffer_0 index.
//   x[9] - base position (after subtracting 10) in `info` of the four
//          multipliers used for the buffer_1 index.
// The remaining entries are not read by that kernel.
struct metadata_st {
uint x[10];
};
// Copies one float_8 per thread from buffer_1 into buffer_0.
//
// Each thread flattens its 3-D launch position into a linear index, scales
// it by 8, and peels that value apart with four (shift, mask) pairs taken
// from scalars_uint32 (the compilation arguments logged with this kernel
// list four FastDivmod PowerOfTwo entries, so these are presumably
// power-of-two div/mod steps - confirm against the generator).
// Every masked piece is multiplied by a value read from `info` and summed
// into a source index and a destination index; the destination side first
// adds the per-axis addend scalars_uint32.x[8 + axis].
//
// Parameters:
//   buffer_0       - destination, written at (destination index / 8).
//   buffer_1       - source, read at (source index / 8).
//   info           - uint32 table holding the per-axis multipliers
//                    (presumably strides - TODO confirm).
//   static_info    - x[3] bounds the linear thread index; x[8] and x[9]
//                    locate the destination and source multipliers in `info`.
//   scalars_uint32 - shift/mask pairs in x[0..7], addends in x[8..11].
//
// All index arithmetic is unsigned 32-bit.
extern "C" __global__ void slice_assign_kernel(
    float_8* buffer_0,
    const float_8* __restrict__ buffer_1,
    const uint32* __restrict__ info,
    const __grid_constant__ metadata_st static_info,
    const __grid_constant__ scalars_uint32_st scalars_uint32) {
    // Flatten the 3-D launch position into one linear thread index.
    const uint32 gx = blockIdx.x * blockDim.x + threadIdx.x;
    const uint32 gy = blockIdx.y * blockDim.y + threadIdx.y;
    const uint32 gz = blockIdx.z * blockDim.z + threadIdx.z;
    const uint32 row_pitch = gridDim.x * blockDim.x;
    const uint32 thread_id =
        (gz * row_pitch * gridDim.y * blockDim.y) + (gy * row_pitch) + gx;

    // Threads at or beyond the bound in static_info.x[3] do nothing.
    if (thread_id >= static_info.x[3]) {
        return;
    }

    uint32 remaining = thread_id * uint32(8);  // value still to be decomposed
    uint32 dst_elem = uint32(0);               // accumulated buffer_0 index
    uint32 src_elem = uint32(0);               // accumulated buffer_1 index

    // Walk the four axes from last (3) to first (0).
    #pragma unroll
    for (int axis = 3; axis >= 0; --axis) {
        const uint32 axis_u = uint32(axis);
        const uint32 piece = remaining & scalars_uint32.x[2 * axis + 1];

        // The "- 10" is applied in unsigned arithmetic to the index itself.
        const uint32 src_mult = info[static_info.x[9] + axis_u - 10];
        src_elem += piece * src_mult;

        const uint32 dst_mult = info[static_info.x[8] + axis_u - 10];
        dst_elem += (piece + scalars_uint32.x[8 + axis]) * dst_mult;

        remaining >>= scalars_uint32.x[2 * axis];
    }

    // Both accumulated indices are divided by 8 to address whole float_8s.
    buffer_0[dst_elem / uint32(8)] = buffer_1[src_elem / uint32(8)];
}
```
[END_KERNEL_COMPILATION]
| 222.102099ms | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 49.851µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 24.161µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 22.77µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 22.2µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 22.95µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 24.25µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 38.721µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 25.531µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 22.011µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 28.321µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 21.79µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 21.97µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 21.99µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 24.38µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 22.73µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 23.12µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 21.421µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 23.251µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 22.101µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 21.041µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 21.831µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 22.101µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 23.2µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 21.41µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 36.79µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 22.82µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 24.191µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 21.471µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 37.041µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 21.621µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 23.51µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 21.62µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 22.18µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 22.6µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 23.34µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 23.5µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 22.59µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 23.57µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 22.49µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 22.53µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 22.09µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 21.8µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 21.81µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 24.711µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 23.891µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 22.08µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 22.86µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 23.72µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 21.61µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 21.35µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 20.57µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 21.21µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 23.05µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
[START_KERNEL_COMPILATION]
name: burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<
f32,
cubecl_cuda::runtime::CudaRuntime,
>
cube_dim: (16, 16, 1)
info: KernelId {
type_id: TypeId (
0xbb7da037008a4eaa774030a150129d16,
),
info: Some (
(
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
),
),
mode: Some (
Unchecked,
),
type_name: "burn_cubecl: : kernel: : index: : slice_assign: : slice_assign_kernel: : SliceAssignKernel<f32,
cubecl_cuda: : runtime: : CudaRuntime>",
}
source:
```cpp
#include <cuda_runtime.h>
// Integer aliases used throughout the generated kernel source below.
// The numeric suffix names the intended bit width of each alias; `uint` and
// `uint32` are both plain `unsigned int`.
typedef unsigned int uint;
typedef unsigned char uint8;
typedef unsigned short uint16;
typedef unsigned int uint32;
typedef unsigned long long int uint64;
typedef signed char int8;
typedef signed short int16;
typedef signed int int32;
typedef signed long long int int64;
// Pack of eight floats (fields i_0 .. i_7) declared with 32-byte alignment.
// It is the element type of both data buffers of slice_assign_kernel, so each
// buffer index addresses eight floats at once.
struct __align__(32) float_8 {
float i_0;
float i_1;
float i_2;
float i_3;
float i_4;
float i_5;
float i_6;
float i_7;
};
// Six uint32 scalar arguments handed to the kernel by value.
// slice_assign_kernel uses x[0]/x[2] as right-shift amounts, x[1]/x[3] as
// bit masks, and x[4]/x[5] as values added to a coordinate before it is
// scaled into the buffer_0 offset.
// NOTE(review): presumably the shift/mask pairs are the power-of-two
// fast-divmod arguments and x[4]/x[5] are slice start offsets - confirm
// against the code generator.
struct scalars_uint32_st {
uint32 x[6];
};
// Ten words of launch metadata handed to the kernel by value.
// slice_assign_kernel reads x[3] as the upper bound on the flat thread index
// and x[8] / x[9] as base positions used (after subtracting 10) to index the
// `info` array.
// NOTE(review): the meaning of the remaining slots is not visible here, and
// the subtracted 10 presumably matches this array's length - confirm against
// the code generator.
struct metadata_st {
uint x[10];
};
// Slice-assign kernel: every in-range thread copies exactly one float_8 from
// buffer_1 (read-only) into buffer_0.
//
// Parameters
//   buffer_0       destination buffer, written once per active thread.
//   buffer_1       source buffer, read once per active thread.
//   info           table of uint32 values; the entries at
//                  static_info.x[8] - 10 + axis and static_info.x[9] - 10 + axis
//                  are used as per-axis multipliers for the buffer_0 and
//                  buffer_1 offsets respectively.
//                  NOTE(review): presumably these are tensor strides - confirm.
//   static_info    metadata words; x[3] bounds the flat thread index, x[8] and
//                  x[9] locate the multipliers inside `info`.
//   scalars_uint32 x[0..3] are shift/mask values, x[4] and x[5] are added to
//                  the axis-0 / axis-1 coordinate on the buffer_0 side only.
//                  NOTE(review): presumably slice start offsets - confirm.
//
// The thread's scalar index (flat thread index * 8) is decomposed over two
// axes, innermost (axis 1) first, with a mask giving the coordinate and a
// shift giving what is left for the next axis. Both accumulated offsets are
// divided by 8 to obtain float_8 indices.
extern "C" __global__ void slice_assign_kernel (
float_8* buffer_0, const float_8* __restrict__ buffer_1, const uint32* __restrict__ info, const __grid_constant__ metadata_st static_info, const __grid_constant__ scalars_uint32_st scalars_uint32
) {
    // Linearise the 3-D launch position: x varies fastest, then y, then z.
    const uint32 pos_x = blockIdx.x * blockDim.x + threadIdx.x;
    const uint32 pos_y = blockIdx.y * blockDim.y + threadIdx.y;
    const uint32 pos_z = blockIdx.z * blockDim.z + threadIdx.z;
    const uint32 span_x = gridDim.x * blockDim.x;
    const uint32 span_y = gridDim.y * blockDim.y;
    const uint32 thread_id = (pos_z * span_x * span_y) + (pos_y * span_x) + pos_x;

    // Threads at or beyond the bound stored in static_info.x[3] have no work.
    if (thread_id >= static_info.x[3]) {
        return;
    }

    // Positions in `info` from which the per-axis multipliers are read.
    const uint32 dst_base = static_info.x[8];
    const uint32 src_base = static_info.x[9];

    // Scalar index covered by this thread: eight floats per float_8.
    uint32 remaining = thread_id * 8u;

    // Axis 1 (innermost): the mask yields the coordinate, the shift yields the
    // part still to be decomposed. Only the buffer_0 side gets the extra
    // scalars_uint32.x[5] added to its coordinate.
    const uint32 coord_1 = remaining & scalars_uint32.x[3];
    remaining >>= scalars_uint32.x[2];
    uint32 src_offset = coord_1 * info[src_base + 1u - 10u];
    uint32 dst_offset = (coord_1 + scalars_uint32.x[5]) * info[dst_base + 1u - 10u];

    // Axis 0 (outermost): same scheme with scalars_uint32.x[1] as the mask and
    // scalars_uint32.x[4] as the buffer_0-side addend. Nothing is decomposed
    // after this axis, so the leftover quotient is not computed.
    const uint32 coord_0 = remaining & scalars_uint32.x[1];
    src_offset += coord_0 * info[src_base - 10u];
    dst_offset += (coord_0 + scalars_uint32.x[4]) * info[dst_base - 10u];

    // Offsets are in scalar units; divide by 8 to index whole float_8 packs.
    buffer_0[dst_offset / 8u] = buffer_1[src_offset / 8u];
}
```
[END_KERNEL_COMPILATION]
|| 124.293029ms | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 49.831µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 30.331µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 29.12µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 28.021µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 32.12µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 31.401µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 28.081µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 27.561µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 27.361µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 27.72µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 27.76µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 27.15µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 27.421µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 28.841µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 27.961µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 27.88µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 27.63µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 27.57µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 27.861µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 27.661µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 26.741µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 29.15µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 29.461µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 29.141µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 28.021µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 27.72µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 27.97µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 27.901µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 27.591µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 28.511µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 28.78µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 27.17µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 27.59µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 28.191µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 27.301µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 27.771µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 27.54µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 37.57µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 29.011µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 28.491µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 28.181µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 28.03µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 28.7µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 28.361µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 28.701µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 27.891µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 27.84µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 27.58µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 28.22µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 27.661µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 25.771µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 24.971µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 25.171µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
[START_KERNEL_COMPILATION]
name: burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<
i32,
cubecl_cuda::runtime::CudaRuntime,
>
cube_dim: (16, 16, 1)
info: KernelId {
type_id: TypeId (
0x69f09211cfcda605c3b12b9ee6e6f05a,
),
info: Some (
(
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
),
),
mode: Some (
Unchecked,
),
type_name: "burn_cubecl: : kernel: : index: : slice_assign: : slice_assign_kernel: : SliceAssignKernel<i32,
cubecl_cuda: : runtime: : CudaRuntime>",
}
source:
```cpp
#include <cuda_runtime.h>
// Short aliases for the built-in integer types; the generated kernel below
// refers to them by these names (e.g. `uint32`, `int32`). The numeric suffix
// in each alias names the intended bit width of the underlying type.
typedef unsigned int uint;
typedef unsigned char uint8;
typedef unsigned short uint16;
typedef unsigned int uint32;
typedef unsigned long long int uint64;
typedef signed char int8;
typedef signed short int16;
typedef signed int int32;
typedef signed long long int int64;
// Pack of six uint32 scalar arguments, passed by value to slice_assign_kernel
// as its `scalars_uint32` parameter. As read by that kernel:
//   x[0], x[1] : right-shift amount and bit mask applied for dimension 0
//   x[2], x[3] : right-shift amount and bit mask applied for dimension 1
//   x[4], x[5] : values added to the dimension-0 / dimension-1 coordinate
//                before it is used to index buffer_0
//                (presumably the slice start offsets -- confirm with the caller)
struct scalars_uint32_st {
uint32 x[6];
};
// Ten uint words passed by value to slice_assign_kernel as `static_info`.
// The kernel only reads three of them:
//   x[3] : upper bound for the flattened thread index; threads at or past it exit
//   x[8] : base position (biased by 10) into `info` for the per-dimension
//          multipliers applied to buffer_0 coordinates
//   x[9] : same, for buffer_1 coordinates
// NOTE(review): the constant 10 subtracted in the kernel matches this array's
// length, so `info` presumably holds the metadata words that follow these ten --
// confirm against the code generator.
struct metadata_st {
uint x[10];
};
/*
 * slice_assign_kernel -- int32 variant, one element per thread.
 *
 * Every in-range thread copies exactly one int32 from `buffer_1` to `buffer_0`.
 * The flattened thread id is decomposed into a 2-D coordinate, innermost
 * dimension first, using one shift/mask pair per dimension (the launch log
 * records these as FastDivmod "PowerOfTwo" arguments). For each dimension the
 * coordinate is multiplied by a per-dimension factor read from `info` and
 * accumulated into a linear index -- once for the source (`buffer_1`) and once,
 * after adding an offset from `scalars_uint32`, for the destination
 * (`buffer_0`).
 *
 * Parameters:
 *   buffer_0       destination buffer (written)
 *   buffer_1       source buffer (read-only)
 *   info           table of uint32 words holding the per-dimension multipliers
 *                  (presumably tensor strides -- confirm with the code generator)
 *   static_info    x[3] = thread-index bound; x[8] / x[9] = base positions
 *                  (biased by 10) into `info` for buffer_0 / buffer_1
 *   scalars_uint32 x[0..3] = shift/mask pairs, x[4..5] = destination offsets
 *
 * Launch layout: the surrounding log compiles this with a 16x16x1 block.
 * No bounds checks are made on the computed buffer indices.
 */
extern "C" __global__ void slice_assign_kernel (
int32* buffer_0, const int32* __restrict__ buffer_1, const uint32* __restrict__ info, const __grid_constant__ metadata_st static_info, const __grid_constant__ scalars_uint32_st scalars_uint32
) {
    // Position of this thread within the whole launch grid, per axis.
    const uint32 gx = blockIdx.x * blockDim.x + threadIdx.x;
    const uint32 gy = blockIdx.y * blockDim.y + threadIdx.y;
    const uint32 gz = blockIdx.z * blockDim.z + threadIdx.z;

    // Flatten (gx, gy, gz) into a single linear id, x varying fastest.
    const uint32 row_threads = gridDim.x * blockDim.x;
    const uint32 plane_threads = row_threads * gridDim.y * blockDim.y;
    const uint32 idxGlobal = (gz * plane_threads) + (gy * row_threads) + gx;

    // Surplus threads past the bound in static_info.x[3] have nothing to do.
    if (idxGlobal >= static_info.x[uint32(3)]) {
        return;
    }

    uint32 remaining = idxGlobal;   // part of the linear id not yet decoded
    uint32 dst_index = uint32(0);   // linear position in buffer_0
    uint32 src_index = uint32(0);   // linear position in buffer_1

    // ---- dimension 1 (decoded first) ----
    // mask -> coordinate in this dimension, shift -> what is left for dimension 0
    const uint32 coord_1 = remaining & scalars_uint32.x[3];
    remaining = remaining >> scalars_uint32.x[2];
    const uint32 src_mul_1 = info[static_info.x[uint32(9)] + uint32(1) - 10];
    src_index = src_index + coord_1 * src_mul_1;
    const uint32 dst_mul_1 = info[static_info.x[uint32(8)] + uint32(1) - 10];
    dst_index = dst_index + (coord_1 + scalars_uint32.x[5]) * dst_mul_1;

    // ---- dimension 0 ----
    const uint32 coord_0 = remaining & scalars_uint32.x[1];
    remaining = remaining >> scalars_uint32.x[0];   // leftover is not used further
    const uint32 src_mul_0 = info[static_info.x[uint32(9)] + uint32(0) - 10];
    src_index = src_index + coord_0 * src_mul_0;
    const uint32 dst_mul_0 = info[static_info.x[uint32(8)] + uint32(0) - 10];
    dst_index = dst_index + (coord_0 + scalars_uint32.x[4]) * dst_mul_0;

    // One element per thread: read from the source, write to the destination.
    const int32 value = buffer_1[src_index];
    buffer_0[dst_index] = value;
}
```
[END_KERNEL_COMPILATION]
|| 130.528105ms | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 37.121µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.061µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 21.92µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.61µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 23.041µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 23.151µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.281µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.871µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 23.26µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.68µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.53µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.3µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.35µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.01µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.75µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.98µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.771µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.671µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.281µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.551µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.241µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 39.271µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.18µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.55µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 22.99µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 23.1µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.95µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.8µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.551µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.531µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 23.851µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 20.411µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 21.261µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 23.01µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.261µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.661µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.201µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.191µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.571µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.02µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 27.36µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.981µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.671µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.941µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.791µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.4µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.98µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.41µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.71µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.14µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.351µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.051µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.811µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 29.901µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 29.21µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.381µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.9µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 27.431µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 27.051µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.04µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.94µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.47µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.92µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.69µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.821µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.751µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.781µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.021µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.591µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.521µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.67µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.44µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.671µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.731µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.171µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.061µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.711µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.201µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.54µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.9µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.14µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.07µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.64µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.09µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.521µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.921µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.561µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.061µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.651µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 41.991µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.54µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.99µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.841µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.151µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.291µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.101µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.911µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.971µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.191µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.43µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.04µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.71µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.11µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.93µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.041µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.611µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 23.891µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 33.121µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 27.37µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 26.771µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 26.2µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 24.34µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 26.221µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 25.771µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 24.941µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 24.701µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 23.331µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 27.271µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 26.081µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 24.831µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 24.84µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 23.25µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 29.861µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 24.28µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 21.08µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 21.371µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.661µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 21.041µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 24.721µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 21.04µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.21µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.93µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.3µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.791µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.791µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.271µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 21.591µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 21.24µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 21.24µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 21.16µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 21.59µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 21.18µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.881µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.771µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.191µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 23.931µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 21.281µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 21.701µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.061µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 21.32µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 21.23µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.89µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.73µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 21.081µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.921µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 21.181µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 21.341µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.8µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.64µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.6µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 21.74µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
[START_KERNEL_COMPILATION]
name: burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<
i32,
cubecl_cuda::runtime::CudaRuntime,
>
cube_dim: (16, 16, 1)
info: KernelId {
type_id: TypeId (
0x69f09211cfcda605c3b12b9ee6e6f05a,
),
info: Some (
(
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
),
),
mode: Some (
Unchecked,
),
type_name: "burn_cubecl: : kernel: : index: : slice_assign: : slice_assign_kernel: : SliceAssignKernel<i32,
cubecl_cuda: : runtime: : CudaRuntime>",
}
source:
```cpp
#include <cuda_runtime.h>
typedef unsigned int uint;
typedef unsigned char uint8;
typedef unsigned short uint16;
typedef unsigned int uint32;
typedef unsigned long long int uint64;
typedef signed char int8;
typedef signed short int16;
typedef signed int int32;
typedef signed long long int int64;
// Pack of seven uint32 scalar kernel arguments, passed by value to the kernel.
// slice_assign_kernel reads all seven entries: x[0]/x[1] as a shift/mask pair,
// x[2]/x[3]/x[4] as divisor/multiplier/shift of a multiply-high division, and
// x[5]/x[6] as additive per-coordinate offsets.
struct scalars_uint32_st {
uint32 x[7];
};
// Pack of ten uint metadata values, passed by value to the kernel.
// slice_assign_kernel reads only x[3] (bounds-check limit) and x[8]/x[9]
// (base positions used when indexing the `info` buffer).
struct metadata_st {
uint x[10];
};
// Generated kernel: each thread copies one int32 element from buffer_1 into
// buffer_0. The flat thread index is split into two coordinates; each
// coordinate is multiplied by a per-coordinate factor read from `info` to form
// the read offset (into buffer_1) and, after adding a scalar offset, the write
// offset (into buffer_0).
//
// Parameters:
//   buffer_0       - destination buffer (written at one computed offset per thread).
//   buffer_1       - source buffer (read-only, one element read per thread).
//   info           - read-only uint32 table holding the per-coordinate multipliers
//                    (presumably tensor strides - TODO confirm against the launcher).
//   static_info    - metadata words; x[3] is the thread-count limit, x[8]/x[9]
//                    locate the multipliers for buffer_0/buffer_1 inside `info`.
//   scalars_uint32 - division constants and per-coordinate offsets (see below).
//
// NOTE(review): there is no bounds check on the computed offsets; only the
// flat thread index is checked against static_info.x[3].
extern "C" __global__ void slice_assign_kernel (
int32* buffer_0, const int32* __restrict__ buffer_1, const uint32* __restrict__ info, const __grid_constant__ metadata_st static_info, const __grid_constant__ scalars_uint32_st scalars_uint32
) {
// Per-axis global thread coordinates.
uint3 absoluteIdx = make_uint3(
blockIdx.x * blockDim.x + threadIdx.x,
blockIdx.y * blockDim.y + threadIdx.y,
blockIdx.z * blockDim.z + threadIdx.z
);
// Flatten the 3D coordinates into a single linear index (x fastest, then y, then z).
// All arithmetic is 32-bit unsigned.
uint32 idxGlobal = (absoluteIdx.z * gridDim.x * blockDim.x * gridDim.y * blockDim.y) + (absoluteIdx.y * gridDim.x * blockDim.x) + absoluteIdx.x;
// l_mut_3: remaining linear index still to be decomposed.
// l_mut_4: accumulated offset into buffer_0 (destination).
// l_mut_5: accumulated offset into buffer_1 (source).
uint32 l_mut_3;
uint32 l_mut_4;
uint32 l_mut_5;
// Threads whose flat index is at or beyond static_info.x[3] do nothing.
// (presumably x[3] is the number of elements to copy - TODO confirm)
const uint32 l_0 = static_info.x[uint32(3)];
const bool l_1 = idxGlobal >= l_0;
if (l_1) {
return;}
// Multiplication by 1 leaves the index unchanged
// (presumably the vectorisation factor, which is 1 for this variant).
const uint32 l_2 = idxGlobal * uint32(1);
l_mut_3 = l_2;
l_mut_4 = uint32(0);
l_mut_5 = uint32(0);
// --- Stage 1: split off the first coordinate with a multiply-high division. ---
// quotient  l_8  = (umulhi(n, x[3]) + n) >> x[4]
// remainder l_10 = n - quotient * x[2]
// NOTE(review): the 32-bit sum l_6 + l_mut_3 wraps if it exceeds 2^32 - 1;
// presumably callers keep the index small enough - verify.
const uint32 l_6 = __umulhi(l_mut_3, scalars_uint32.x[3]);
const uint32 l_7 = l_6 + l_mut_3;
const uint32 l_8 = l_7 >> scalars_uint32.x[4];
const uint32 l_9 = l_8 * scalars_uint32.x[2];
const uint32 l_10 = l_mut_3 - l_9;
// Destination coordinate is the source coordinate shifted by x[6]
// (presumably the slice start along this dimension - TODO confirm).
const uint32 l_11 = l_10 + scalars_uint32.x[6];
// Source-side multiplier for this coordinate, located via static_info.x[9].
const uint32 l_12 = info[static_info.x[uint32(9)] + uint32(1) - 10];
const uint32 l_13 = l_10 * l_12;
l_mut_5 = l_mut_5 + l_13;
// Destination-side multiplier for this coordinate, located via static_info.x[8].
const uint32 l_14 = info[static_info.x[uint32(8)] + uint32(1) - 10];
const uint32 l_15 = l_11 * l_14;
l_mut_4 = l_mut_4 + l_15;
// Continue decomposing the quotient.
l_mut_3 = l_8;
// --- Stage 2: split off the second coordinate with shift/mask. ---
// quotient l_16 = n >> x[0], remainder l_17 = n & x[1].
const uint32 l_16 = l_mut_3 >> scalars_uint32.x[0];
const uint32 l_17 = l_mut_3 & scalars_uint32.x[1];
// Destination coordinate is the source coordinate shifted by x[5].
const uint32 l_18 = l_17 + scalars_uint32.x[5];
const uint32 l_19 = info[static_info.x[uint32(9)] + uint32(0) - 10];
const uint32 l_20 = l_17 * l_19;
l_mut_5 = l_mut_5 + l_20;
const uint32 l_21 = info[static_info.x[uint32(8)] + uint32(0) - 10];
const uint32 l_22 = l_18 * l_21;
l_mut_4 = l_mut_4 + l_22;
// This final quotient is stored but never read again in this kernel.
l_mut_3 = l_16;
// Division by 1 leaves both offsets unchanged.
const uint32 l_23 = l_mut_4 / uint32(1);
const uint32 l_24 = l_mut_5 / uint32(1);
// Copy one element: buffer_0[destination offset] = buffer_1[source offset].
const int32 l_25 = buffer_1[l_24];
buffer_0[l_23] = l_25;
}
```
[END_KERNEL_COMPILATION]| | 132.590506ms | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 37.431µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.13µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 23.89µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 23.191µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 23.211µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.26µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 22.18µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 21.34µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 22.65µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 23.75µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.62µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.66µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.861µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.861µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 23.911µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 22.831µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 20.271µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 20.081µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 22.161µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 23.561µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.381µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.92µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.23µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 39.33µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 23.631µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 19.53µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 20µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 19.32µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 19.47µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 19.24µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 19.29µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 19.141µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 19.331µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 20.551µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 20.28µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 22.53µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 23.16µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.981µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.081µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 23.901µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.561µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.531µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.68µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.8µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.121µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.261µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.151µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.591µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.651µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.68µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.17µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.57µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 22.73µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
[START_KERNEL_COMPILATION]
name: burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<
f32,
cubecl_cuda::runtime::CudaRuntime,
>
cube_dim: (16, 16, 1)
info: KernelId {
type_id: TypeId (
0xbb7da037008a4eaa774030a150129d16,
),
info: Some (
(
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
),
),
mode: Some (
Unchecked,
),
type_name: "burn_cubecl: : kernel: : index: : slice_assign: : slice_assign_kernel: : SliceAssignKernel<f32,
cubecl_cuda: : runtime: : CudaRuntime>",
}
source:
```cpp
#include <cuda_runtime.h>
typedef unsigned int uint;
typedef unsigned char uint8;
typedef unsigned short uint16;
typedef unsigned int uint32;
typedef unsigned long long int uint64;
typedef signed char int8;
typedef signed short int16;
typedef signed int int32;
typedef signed long long int int64;
// Bundle of seven 32-bit unsigned scalars handed to slice_assign_kernel by value.
// How the kernel declared after this struct reads the slots:
//   x[0] - right-shift amount applied to the quotient of the first division
//   x[1] - bit mask AND-ed with that same quotient
//   x[2] - value multiplied by the quotient to recover the remainder
//   x[3] - second operand of __umulhi when forming the quotient
//   x[4] - right-shift amount that finishes the quotient
//   x[5] - added to the masked coordinate before the buffer_0 offset is formed
//   x[6] - added to the remainder coordinate before the buffer_0 offset is formed
// NOTE(review): the kernel id printed above this source lists PowerOfTwo{shift, mask},
// Fast{divisor, multiplier, shift_right} and a two-entry sequence; the slot order
// presumably follows that listing - TODO confirm against the host-side packing.
struct scalars_uint32_st {
uint32 x[7];
};
// Ten unsigned words handed to slice_assign_kernel by value.
// The kernel declared after this struct reads only three of them:
//   x[3] - exclusive upper bound on the linear thread index (threads at or past it return)
//   x[8] - base position used to index `info` for the values multiplied into the buffer_0 offset
//   x[9] - base position used to index `info` for the values multiplied into the buffer_1 offset
// Both base positions are used as `x[k] + axis - 10`.
// NOTE(review): the constant 10 presumably equals the length of this array, i.e. `info`
// logically continues the metadata after these ten words - TODO confirm.
struct metadata_st {
uint x[10];
};
// slice_assign_kernel (float elements)
//
// Each thread copies exactly one float from buffer_1 into buffer_0.
//
// The linear thread index is split into two coordinates:
//   * inner coordinate: remainder of a division carried out with __umulhi, an add and a
//     shift (scalars_uint32.x[3], x[4]) followed by a multiply-subtract with x[2];
//   * outer coordinate: the quotient of that division masked with scalars_uint32.x[1].
// The buffer_1 offset is the sum of coordinate * info[...] products taken at base
// static_info.x[9]; the buffer_0 offset uses base static_info.x[8] and first adds
// scalars_uint32.x[6] (inner) / scalars_uint32.x[5] (outer) to the coordinates.
//
// Threads whose linear index is >= static_info.x[3] return without touching memory.
//
// NOTE(review): the info entries are presumably per-axis strides and x[5]/x[6] the slice
// start offsets - TODO confirm against the host launcher.
// NOTE(review): all index math is 32-bit unsigned and wraps silently; this assumes the
// host keeps element counts and offsets within range - TODO confirm.
extern "C" __global__ void slice_assign_kernel (
float* buffer_0, const float* __restrict__ buffer_1, const uint32* __restrict__ info, const __grid_constant__ metadata_st static_info, const __grid_constant__ scalars_uint32_st scalars_uint32
) {
    // Position of this thread along each launch dimension.
    const uint32 gx = blockIdx.x * blockDim.x + threadIdx.x;
    const uint32 gy = blockIdx.y * blockDim.y + threadIdx.y;
    const uint32 gz = blockIdx.z * blockDim.z + threadIdx.z;

    // Flatten (gx, gy, gz) into one linear index over the whole launch.
    const uint32 rowWidth = gridDim.x * blockDim.x;
    const uint32 planeSize = rowWidth * gridDim.y * blockDim.y;
    const uint32 idxGlobal = gz * planeSize + gy * rowWidth + gx;

    // Surplus threads have nothing to copy.
    if (idxGlobal >= static_info.x[3]) {
        return;
    }

    // Inner axis: quotient via multiply-high + add + shift, remainder via multiply-subtract.
    const uint32 quotient = (__umulhi(idxGlobal, scalars_uint32.x[3]) + idxGlobal) >> scalars_uint32.x[4];
    const uint32 innerPos = idxGlobal - quotient * scalars_uint32.x[2];

    // Outer axis: mask gives the coordinate; the shifted rest is computed but not consumed.
    const uint32 outerPos = quotient & scalars_uint32.x[1];
    const uint32 leftover = quotient >> scalars_uint32.x[0];
    (void)leftover;

    // Base positions inside `info` for the two sets of per-axis multipliers.
    const uint32 srcBase = static_info.x[9] - 10u;
    const uint32 dstBase = static_info.x[8] - 10u;

    // Source offset uses the raw coordinates.
    const uint32 srcOffset = innerPos * info[srcBase + 1u] + outerPos * info[srcBase];

    // Destination offset shifts each coordinate by its scalar before scaling.
    const uint32 dstInner = innerPos + scalars_uint32.x[6];
    const uint32 dstOuter = outerPos + scalars_uint32.x[5];
    const uint32 dstOffset = dstInner * info[dstBase + 1u] + dstOuter * info[dstBase];

    buffer_0[dstOffset] = buffer_1[srcOffset];
}
```
[END_KERNEL_COMPILATION]
|| 139.990504ms | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 49.761µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 22.221µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 40.38µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 22.99µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 21.43µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 22.32µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 22.751µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.041µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 22.751µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 22.591µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.401µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 21.481µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 21.381µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 23.011µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 21.98µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 21.31µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 23.35µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 22.13µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 22.37µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 21.91µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 22.48µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 20.68µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.361µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.411µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 22.041µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 22.021µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 21.951µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 21.621µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 20.911µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 21.121µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 22.971µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 20.711µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 21.28µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 21.33µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 20.99µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 21.55µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 21.75µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 21.2µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 21.62µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 23.03µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 22.04µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 22.25µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 40.52µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 23.801µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 23.161µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 24.341µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 22.341µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 21.251µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 20.791µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 22.311µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 21.031µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 21.741µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 21.371µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 41.171µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.88µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.17µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.87µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.47µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 19.361µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.63µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.37µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 21.58µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.49µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.51µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 19.47µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 24.2µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 19.86µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.261µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.671µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 19.51µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.88µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 19.31µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.33µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 19.38µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 19.441µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.871µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.981µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 19.341µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.39µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.56µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 19.49µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 20.06µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.921µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 32.821µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 19.861µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 19.831µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.43µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 19.42µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.59µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.9µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.711µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.511µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.161µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 19.04µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.34µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.45µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.16µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.791µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.661µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 19.09µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 19.15µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.59µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.12µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.541µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.881µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 19.5µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 19.23µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 32.73µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.27µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.27µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.06µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.951µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.371µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.39µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.55µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.08µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.911µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.2µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.67µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.84µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.18µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.87µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.021µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.75µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.57µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.8µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.481µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 27.831µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 21.041µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.64µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.98µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.331µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.151µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.62µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.08µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.86µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.851µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.261µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.87µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.2µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.1µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.151µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.29µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.07µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.85µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.441µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.251µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.42µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.09µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.66µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.941µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 30.461µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.411µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.83µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.53µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.84µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.121µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.491µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.04µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.42µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1| 93.232µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 31.07µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 31.731µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 31.121µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 32.551µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 29.331µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 48.131µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 29.061µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 28.511µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.761µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 32.01µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 31.001µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 27.941µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 30.761µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 29µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 27.57µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 29.311µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.7µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 32.49µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 30.011µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.181µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.761µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 30.3µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.491µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 27.111µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 28.021µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 23.92µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.12µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 27.08µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 28.33µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 29.531µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.071µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 27.811µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 27.87µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 27.79µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.91µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 29.361µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 29.681µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 46.84µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 30.551µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 29.88µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 29.47µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 31.461µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 27.111µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 27.031µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 27.08µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 28.55µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.2µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.391µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 29.141µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.891µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.721µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 29.19µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 27.41µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.731µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.991µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.85µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.7µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.481µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.82µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.44µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.541µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.31µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.8µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.591µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.86µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 40.521µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.9µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.09µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.351µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 28.34µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.32µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.4µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.551µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.35µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.34µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.141µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.701µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.82µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.091µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.63µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.14µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.08µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.541µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.56µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.28µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.381µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 18.25µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.07µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.541µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.51µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.34µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.471µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.13µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.49µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.391µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.16µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.42µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.89µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 26.21µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 18.03µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.551µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.41µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.71µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.031µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.27µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.06µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.28µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 17.991µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.38µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 17.91µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.34µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.421µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.28µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.18µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.371µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.531µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.59µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.04µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.711µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.81µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.2µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 28.23µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.15µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.221µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.35µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 17.67µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.091µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.131µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.95µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.2µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 23.651µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.87µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.74µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 17.83µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 17.971µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.13µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.31µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 17.591µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.601µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 17.53µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.18µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 17.781µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.28µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 17.86µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.881µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.331µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.63µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 27.51µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.36µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.731µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.41µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.2µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.15µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 17.691µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.51µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.311µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.05µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 17.95µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 22.8µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.48µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.14µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.101µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.25µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.79µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.15µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.811µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.94µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.73µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.201µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.991µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 27.06µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.23µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.89µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.091µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.96µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.21µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.601µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.65µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.79µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.751µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.811µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.71µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.56µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.211µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.88µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.96µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.811µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.12µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.78µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.951µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.391µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.38µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.36µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.251µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.04µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.05µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.701µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.94µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 22.26µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.96µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.231µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.29µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.35µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.2µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.201µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.48µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.08µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.011µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.11µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.84µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 18.461µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.141µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 23.35µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.91µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.58µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.301µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.421µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.23µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.821µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.25µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 308.776µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.96µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 18.131µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.58µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.42µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.13µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.33µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.47µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.7µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.621µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.73µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.85µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.531µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.5µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.88µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.37µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.621µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.46µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.09µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.931µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.96µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.75µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.33µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 19.39µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.74µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.651µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.661µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.77µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.43µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.361µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.56µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.49µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.5µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.361µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.46µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.7µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.161µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.91µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.54µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.211µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.761µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.85µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.89µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.841µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.09µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.68µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 54.681µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.2µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 19.99µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.98µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.86µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.42µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.52µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.72µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.381µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.72µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.46µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.9µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.241µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.551µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.79µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.171µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.581µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.55µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.44µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 16.811µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.261µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.58µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.52µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 16.99µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 29.741µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.711µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.95µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.08µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.451µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.18µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.2µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.24µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.501µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.13µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.68µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.211µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.591µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.34µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.89µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.411µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.52µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.45µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.81µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.121µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.281µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.92µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 18.03µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.851µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.58µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.53µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 17.83µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 29.461µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 24.25µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.51µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.361µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.44µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.32µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.231µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.351µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.22µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.22µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.441µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.37µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.92µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.491µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.47µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.16µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.251µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.211µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.25µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.27µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.301µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 17.9µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.34µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.111µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.27µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.95µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.25µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.321µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.25µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.1µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.351µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.26µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.17µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 17.55µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.141µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.17µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.07µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 17.901µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.19µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.3µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 17.791µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 17.96µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.28µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.411µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.22µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.24µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.431µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 17.84µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 17.97µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: | 42.044302ms | cubecl_std::tensor::contiguous::into_contiguous_kernel::IntoContiguousKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
1,
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
StridedLayoutCompilationArg: : None,
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (11, 10, 1)
| 39.321µs | cubecl_reduce::launch::reduce_kernel::ReduceKernel<f32, f32, f32, cubecl_reduce::instructions::mixed::ReduceFn, cubecl_reduce::args::TensorArgs, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 32,
y: 8,
z: 1,
},
ReduceParams {
shared: Some (
8,
),
use_planes: true,
line_size_input: 4,
line_size_output: 1,
line_mode: Parallel,
bound_checks: true,
bound_checks_inner: Mask,
},
Mean,
TensorCompilationArg {
inplace: None,
vectorisation: Some (
4,
),
},
(),
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (64, 1, 1)
ce: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.95µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.74µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 33.571µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 20.341µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 22.33µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.22µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.501µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.64µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 18.11µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.171µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 18.27µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.8µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.791µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.78µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.54µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.301µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.73µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.69µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.651µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.39µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 18.12µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.221µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.98µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.49µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.791µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.44µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.65µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.951µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 28.731µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 18.31µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.66µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.031µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 75.542µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 18.54µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.87µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.751µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 18.34µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.53µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.701µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.15µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 18.19µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.551µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.79µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.58µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 18.171µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.57µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 18.15µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.191µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 18.07µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.77µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.671µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.42µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.75µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.441µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 18.62µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.98µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 25.161µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 18.14µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 19.16µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 18.02µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.871µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.27µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.24µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.191µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.88µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.99µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.771µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.86µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.82µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.821µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.62µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.88µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.871µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.47µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.72µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.371µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.84µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.04µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.271µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.41µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.92µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.061µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.2µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.43µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 28.821µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 18.09µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.49µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.131µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 18.17µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.96µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.001µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.35µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.82µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.421µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.52µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.49µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.041µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 18.34µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.25µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.521µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 18.48µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 18µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.331µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.97µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.61µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.841µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.32µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.15µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.971µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.77µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.58µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.471µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 33.021µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 17.901µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.3µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.08µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.651µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.201µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.02µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.54µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.721µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.17µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.62µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.991µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.7µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.7µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.421µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.311µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.49µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 17.92µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.291µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.06µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.29µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.321µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 17.98µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.31µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.581µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 17.78µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.57µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.351µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.811µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 19.45µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.59µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 17.841µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.751µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 17.86µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.161µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.251µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.89µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.2µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.691µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.4µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.72µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.021µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.41µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.3µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.751µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.09µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.61µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.061µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.72µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.22µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.411µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.14µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.47µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 18.381µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (2, 1, 1)
| 21.691µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 18.19µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 18.23µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.38µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.291µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.02µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.75µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.851µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.451µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.26µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.54µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.101µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.89µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.09µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.151µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.84µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.16µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.571µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.98µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.62µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.331µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.27µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.97µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.471µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.53µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.49µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.391µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.23µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.331µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 18.401µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 18.5µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.4µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.041µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.72µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.65µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.821µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.501µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.44µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.94µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.171µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.36µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.23µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.651µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.42µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.14µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.451µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.24µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.35µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.211µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.16µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.71µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.961µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.95µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<i32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 20.73µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.891µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.17µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.57µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 21.97µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.881µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.4µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.46µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.121µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.85µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.04µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.891µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.05µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.11µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.901µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.26µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.89µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.471µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.02µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.34µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.481µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.55µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.32µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.281µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.2µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.3µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.201µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 18.32µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.28µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.741µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.11µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.23µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.201µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.73µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.89µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.92µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.291µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.95µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.96µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.151µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.96µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.961µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.18µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.1µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.361µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.11µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.23µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 16.951µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.21µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.05µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.431µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.26µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 17.62µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : Fast {
divisor: (),
multiplier: (),
shift_right: (),
},
],
Sequence [
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
1,
),
},
) CubeCount (1, 1, 1)
| 40.291µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 25.87µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 28.91µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 26.141µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 25.2µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 24.66µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 23.98µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 23.9µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 24.63µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 24.3µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 23.95µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 25.831µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 23.321µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 24.841µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 25.031µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 23.061µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 22.881µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 22.701µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 23.111µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 22.431µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 23.291µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 22.881µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 23.68µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
x: 16,
y: 16,
z: 1,
},
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
Sequence [
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
FastDivmodCompilationArg: : PowerOfTwo {
shift: (),
mask: (),
},
],
Sequence [
(),
(),
(),
(),
],
TensorCompilationArg {
inplace: None,
vectorisation: Some (
8,
),
},
) CubeCount (1, 1, 1)
| 22.76µs | burn_cubecl::kernel::index::slice_assign::slice_assign_kernel::SliceAssignKernel<f32, cubecl_cuda::runtime::CudaRuntime>: (
CubeDim {
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment