Created
March 14, 2025 14:07
-
-
Save FL33TW00D/b3caede9601f6c5c52dee63b621b7475 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
OP CODE | OP TYPE | GLOBAL CALL COUNT | DEVICE ID | ATTRIBUTES | MATH FIDELITY | CORE COUNT | PARALLELIZATION STRATEGY | HOST START TS | HOST END TS | HOST DURATION [ns] | DEVICE FW START CYCLE | DEVICE FW END CYCLE | OP TO OP LATENCY [ns] | DEVICE FW DURATION [ns] | DEVICE KERNEL DURATION [ns] | DEVICE KERNEL DURATION PER CORE MIN [ns] | DEVICE KERNEL DURATION PER CORE MAX [ns] | DEVICE KERNEL DURATION PER CORE AVG [ns] | DEVICE KERNEL FIRST TO LAST START [ns] | DEVICE BRISC KERNEL DURATION [ns] | DEVICE NCRISC KERNEL DURATION [ns] | DEVICE TRISC0 KERNEL DURATION [ns] | DEVICE TRISC1 KERNEL DURATION [ns] | DEVICE TRISC2 KERNEL DURATION [ns] | DEVICE ERISC KERNEL DURATION [ns] | DEVICE COMPUTE CB WAIT FRONT [ns] | DEVICE COMPUTE CB RESERVE BACK [ns] | INPUT_0_W | INPUT_0_Z | INPUT_0_Y | INPUT_0_X | INPUT_0_LAYOUT | INPUT_0_DATATYPE | INPUT_0_MEMORY | INPUT_1_W | INPUT_1_Z | INPUT_1_Y | INPUT_1_X | INPUT_1_LAYOUT | INPUT_1_DATATYPE | INPUT_1_MEMORY | INPUT_2_W | INPUT_2_Z | INPUT_2_Y | INPUT_2_X | INPUT_2_LAYOUT | INPUT_2_DATATYPE | INPUT_2_MEMORY | INPUT_3_W | INPUT_3_Z | INPUT_3_Y | INPUT_3_X | INPUT_3_LAYOUT | INPUT_3_DATATYPE | INPUT_3_MEMORY | OUTPUT_0_W | OUTPUT_0_Z | OUTPUT_0_Y | OUTPUT_0_X | OUTPUT_0_LAYOUT | OUTPUT_0_DATATYPE | OUTPUT_0_MEMORY | OUTPUT_1_W | OUTPUT_1_Z | OUTPUT_1_Y | OUTPUT_1_X | OUTPUT_1_LAYOUT | OUTPUT_1_DATATYPE | OUTPUT_1_MEMORY | OUTPUT_2_W | OUTPUT_2_Z | OUTPUT_2_Y | OUTPUT_2_X | OUTPUT_2_LAYOUT | OUTPUT_2_DATATYPE | OUTPUT_2_MEMORY | METAL TRACE ID | METAL TRACE REPLAY SESSION ID | COMPUTE KERNEL SOURCE | COMPUTE KERNEL HASH | DATA MOVEMENT KERNEL SOURCE | DATA MOVEMENT KERNEL HASH | BRISC MAX KERNEL SIZE [B] | NCRISC MAX KERNEL SIZE [B] | TRISC 0 MAX KERNEL SIZE [B] | TRISC 1 MAX KERNEL SIZE [B] | TRISC 2 MAX KERNEL SIZE [B] | ERISC MAX KERNEL SIZE [B] | PM IDEAL [ns] | PM COMPUTE [ns] | PM BANDWIDTH [ns] | PM REQ I BW | PM REQ O BW | CompileProgram_TT_HOST_FUNC [ns] | HWCommandQueue_write_buffer_TT_HOST_FUNC [ns] | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Embeddings | tt_dnn_device | 1 | 0 | {'embeddings_type': 'EmbeddingsType::PADDED'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'pad_token': '1'; 'tilized': 'true'} | HiFi4 | 56 | 11525792262 | 12077256151 | 551463889 | 4652784950082 | 4652785021676 | 0 | 71594 | 69677 | 25328 | 69562 | 50259 | 719 | 69657 | 43289 | 45558 | 45484 | 45707 | 8 | 1 | 1 | 384 | ROW_MAJOR | UINT32 | DEV_0_L1_INTERLEAVED | 1 | 1 | 50265 | 1024 | ROW_MAJOR | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 8 | 1 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/data_movement/tilize/device/kernels/compute/tilize.cpp'; 'ttnn/cpp/ttnn/operations/data_movement/tilize/device/kernels/compute/tilize.cpp'] | ['tilize/17927064194417795862/'; 'tilize/1955606672871876418/'] | ['ttnn/cpp/ttnn/operations/embedding/device/kernels/dataflow/embeddings_tilize.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['embeddings_tilize/9293793899976367391/'; 'writer_unary_interleaved_start_id/12513060238449668381/'] | 712 | 1312 | 1172 | 668 | 1556 | 0 | 371600 | 1 | 371600 | [0.033067815005779266; 277.0256042480469] | [16.930721282958984] | 550578291 | 14380 | |||||||||||||||||||||||||||||||||||
Embeddings | tt_dnn_device | 2 | 0 | {'embeddings_type': 'EmbeddingsType::GENERIC'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'pad_token': 'std::nullopt'; 'tilized': 'true'} | HiFi4 | 56 | 12077425693 | 12594462065 | 517036372 | 4653294717567 | 4653295016608 | 509697904 | 299041 | 297018 | 30011 | 296968 | 177917 | 610 | 297018 | 283191 | 285425 | 285368 | 285583 | 8 | 1 | 1 | 384 | ROW_MAJOR | UINT32 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1 | 1024 | ROW_MAJOR | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 8 | 1 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/data_movement/tilize/device/kernels/compute/tilize.cpp'; 'ttnn/cpp/ttnn/operations/data_movement/tilize/device/kernels/compute/tilize.cpp'] | ['tilize/5795839852233331467/'; 'tilize/5310988429227515762/'] | ['ttnn/cpp/ttnn/operations/embedding/device/kernels/dataflow/embeddings_tilize.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['embeddings_tilize/13875115944824425364/'; 'writer_unary_interleaved_start_id/3927395697778282413/'] | 712 | 1068 | 1172 | 668 | 1556 | 0 | 7 | 1 | 7 | [1755.4285888671875; 292.5714416503906] | [898779.4375] | 516381656 | 33810 | |||||||||||||||||||||||||||||||||||
BinaryDeviceOperation | tt_dnn_device | 3 | 0 | {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} | HiFi4 | 56 | 12594627056 | 13093377525 | 498750469 | 4653786383365 | 4653786431916 | 491367415 | 48551 | 47896 | 42176 | 47828 | 44967 | 516 | 47592 | 47047 | 47218 | 46466 | 46910 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] | ['eltwise_binary_kernel/5705211043871595155/'] | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] | 712 | 932 | 1312 | 716 | 1588 | 0 | 1456 | 1456 | 1 | [4321.05517578125; 4321.05517578125] | [4321.05517578125] | 498069652 | 34600 | |||||||||||||||||||||||||||||||||||
Embeddings | tt_dnn_device | 4 | 0 | {'embeddings_type': 'EmbeddingsType::GENERIC'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'pad_token': 'std::nullopt'; 'tilized': 'true'} | HiFi4 | 56 | 13093527596 | 13094712817 | 1185221 | 4653787698671 | 4653787764340 | 1268767 | 65669 | 63662 | 21993 | 63539 | 47695 | 617 | 63662 | 36226 | 38478 | 38425 | 38640 | 8 | 1 | 1 | 384 | ROW_MAJOR | UINT32 | DEV_0_L1_INTERLEAVED | 1 | 1 | 514 | 1024 | ROW_MAJOR | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 8 | 1 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/data_movement/tilize/device/kernels/compute/tilize.cpp'; 'ttnn/cpp/ttnn/operations/data_movement/tilize/device/kernels/compute/tilize.cpp'] | ['tilize/5795839852233331467/'; 'tilize/5310988429227515762/'] | ['ttnn/cpp/ttnn/operations/embedding/device/kernels/dataflow/embeddings_tilize.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['embeddings_tilize/13875115944824425364/'; 'writer_unary_interleaved_start_id/3927395697778282413/'] | 712 | 1068 | 1172 | 668 | 1556 | 0 | 3799 | 1 | 3799 | [3.2345354557037354; 277.09185791015625] | [1656.0821533203125] | 670176 | 3250 | |||||||||||||||||||||||||||||||||||
BinaryDeviceOperation | tt_dnn_device | 5 | 0 | {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} | HiFi4 | 56 | 13094795548 | 13094841269 | 45721 | 4653787954547 | 4653788003022 | 190858 | 48475 | 47821 | 41312 | 47749 | 44883 | 496 | 47524 | 46968 | 47134 | 46352 | 46806 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] | ['eltwise_binary_kernel/5705211043871595155/'] | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] | 712 | 932 | 1312 | 716 | 1588 | 0 | 1456 | 1456 | 1 | [4321.05517578125; 4321.05517578125] | [4321.05517578125] | 340 | ||||||||||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 6 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-12'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 13094919309 | 13967384018 | 872464709 | 4654647675029 | 4654647769195 | 859672692 | 94166 | 93487 | 54968 | 93407 | 77461 | 1064 | 92643 | 67317 | 91314 | 91766 | 91751 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/17805569453407806092/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] | 740 | 1852 | 4300 | 6312 | 6956 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 871547090 | 15640 | ||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 7 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 13967587580 | 14611713730 | 644126150 | 4655282603361 | 4655283077856 | 634836596 | 474495 | 472036 | 457921 | 471802 | 467503 | 622 | 472016 | 395654 | 446500 | 446639 | 446523 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] | 2856 | 1328 | 2340 | 2224 | 2876 | 0 | 73728 | 73728 | 22710 | [85.33333587646484; 85.33333587646484] | [128.0] | 643129050 | 34141 | ||||||||||||||||||||||||||||
NlpCreateHeadsDeviceOperation | tt_dnn_device | 8 | 0 | {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} | HiFi4 | 56 | 14611835521 | 15120918458 | 509082937 | 4655784417753 | 4655784495315 | 501341946 | 77562 | 75537 | 34535 | 75345 | 62029 | 993 | 75510 | 74611 | 63374 | 62655 | 63227 | 8 | 1 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] | ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] | ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] | 1356 | 1248 | 1080 | 652 | 1496 | 0 | 1 | 1 | 1 | [] | [] | 508283029 | 14820 | ||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 9 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 15121073289 | 15719694976 | 598621687 | 4656374547519 | 4656374678343 | 590054183 | 130824 | 128845 | 51220 | 128667 | 97481 | 628 | 128845 | 81740 | 122799 | 122927 | 123024 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] | 1632 | 1032 | 2384 | 2024 | 3232 | 0 | 9216 | 9216 | 1 | [341.3333435058594; 341.3333435058594] | [2048.0] | 597801429 | 15490 | |||||||||||||||||||||||||||||||||||
Softmax | tt_dnn_device | 10 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} | HiFi4 | 56 | 15719815897 | 16514774820 | 794958923 | 4657158131646 | 4657158344226 | 783453971 | 212580 | 211915 | 197182 | 211866 | 204381 | 692 | 211410 | 198004 | 210589 | 211062 | 210988 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 1 | 32 | 384 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] | ['softmax/14128336124697917683/'] | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] | ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] | 1128 | 1576 | 3344 | 2468 | 3164 | 0 | 1 | 1 | 1 | [18874368.0] | [18874368.0] | 794152965 | 14770 | |||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 11 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 16514902141 | 17139900439 | 624998298 | 4657774159751 | 4657774260532 | 615817492 | 100781 | 98804 | 60730 | 98609 | 75794 | 645 | 98804 | 95440 | 97387 | 97525 | 97744 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] | 1632 | 1032 | 2616 | 2004 | 3544 | 0 | 9216 | 9216 | 1 | [2048.0; 341.3333435058594] | [341.3333435058594] | 624206290 | 14420 | |||||||||||||||||||||||||||||||||||
ConcatenateHeads | tt_dnn_device | 12 | 0 | {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} | 56 | 17140025360 | 17479133602 | 339108242 | 4658108514090 | 4658108540152 | 334254417 | 26062 | 25216 | 11777 | 25152 | 20938 | 220 | 25170 | 24221 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | [] | [] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] | 720 | 904 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | [3145728.0] | [3145728.0] | 338519706 | 14390 | ||||||||||||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 13 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 17479272753 | 18098134863 | 618862110 | 4658718494612 | 4658718585790 | 609956899 | 91178 | 88725 | 66536 | 88643 | 75810 | 599 | 88725 | 52433 | 62845 | 62639 | 63240 | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] | 2540 | 1324 | 2304 | 2220 | 4552 | 0 | 24576 | 24576 | 7570 | [128.0; 85.33333587646484] | [256.0] | 617908611 | 33670 | ||||||||||||||||||||||||||||
BinaryDeviceOperation | tt_dnn_device | 14 | 0 | {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} | HiFi4 | 56 | 18098263904 | 18098285384 | 21480 | 4658718833836 | 4658718882095 | 248709 | 48259 | 47561 | 41056 | 47532 | 44875 | 458 | 47268 | 46725 | 46901 | 46107 | 46588 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] | ['eltwise_binary_kernel/5705211043871595155/'] | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] | 712 | 932 | 1312 | 716 | 1588 | 0 | 1456 | 1456 | 1 | [4321.05517578125; 4321.05517578125] | [4321.05517578125] | 410 | ||||||||||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 15 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 18098351425 | 18099303794 | 952369 | 4658719762002 | 4658719857126 | 880631 | 95124 | 94445 | 58010 | 94371 | 78162 | 1117 | 93572 | 68336 | 92281 | 92724 | 92696 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/17805569453407806092/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] | 740 | 1852 | 4300 | 6312 | 6956 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 379304 | 2900 | ||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 16 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} | HiFi2 | 56 | 18099468646 | 18785022099 | 685553453 | 4659395324444 | 4659395837153 | 675469750 | 512709 | 510259 | 507480 | 510207 | 509108 | 615 | 510259 | 430270 | 508301 | 509442 | 509453 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] | 2904 | 1320 | 3116 | 2804 | 3288 | 0 | 196608 | 196608 | 30281 | [32.0; 42.66666793823242] | [64.0] | 684436553 | 20641 | ||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 17 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 18785172190 | 19428655654 | 643483464 | 4660029590021 | 4660029842769 | 633755317 | 252748 | 250303 | 227091 | 250076 | 236456 | 597 | 250302 | 213825 | 224969 | 224773 | 225375 | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 4096 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] | 2488 | 1300 | 2324 | 2224 | 4556 | 0 | 98304 | 98304 | 30281 | [128.0; 85.33333587646484] | [64.0] | 642518804 | 14381 | ||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 18 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 19428799475 | 20386345992 | 957546517 | 4660973470648 | 4660973593844 | 943628577 | 123196 | 122514 | 62411 | 122486 | 101981 | 1058 | 121671 | 93181 | 120287 | 120740 | 120674 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/16918772478690167680/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] | 740 | 2164 | 4776 | 7472 | 7912 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 956630049 | 15681 | |||||||||||||||||||||
Matmul | tt_dnn_device | 19 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20386592755 | 20386637365 | 44610 | 4660973904640 | 4660974379847 | 313230 | 475207 | 472756 | 458469 | 472485 | 467917 | 604 | 472756 | 396661 | 447064 | 447185 | 447090 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] | 2856 | 1328 | 2340 | 2224 | 2876 | 0 | 73728 | 73728 | 22710 | [85.33333587646484; 85.33333587646484] | [128.0] | 490 | |||||||||||||||||||||||||||||
NlpCreateHeadsDeviceOperation | tt_dnn_device | 20 | 0 | {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} | HiFi4 | 56 | 20386692746 | 20386704716 | 11970 | 4660974367061 | 4660974454722 | 642 | 87661 | 74246 | 34840 | 74183 | 61686 | 2052 | 74245 | 73242 | 61919 | 61187 | 61640 | 8 | 1 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] | ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] | ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] | 1356 | 1248 | 1080 | 652 | 1496 | 0 | 1 | 1 | 1 | [] | [] | 310 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 21 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20386754026 | 20386764876 | 10850 | 4660974416798 | 4660974583975 | 692 | 167177 | 128561 | 53043 | 128554 | 97761 | 1745 | 128561 | 82934 | 122296 | 122456 | 122546 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] | 1632 | 1032 | 2384 | 2024 | 3232 | 0 | 9216 | 9216 | 1 | [341.3333435058594; 341.3333435058594] | [2048.0] | 60 | ||||||||||||||||||||||||||||||||||||
Softmax | tt_dnn_device | 22 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} | HiFi4 | 56 | 20386782267 | 20386840837 | 58570 | 4660974509977 | 4660974799334 | 647 | 289357 | 214716 | 197584 | 214596 | 206779 | 1917 | 214692 | 201237 | 213719 | 213810 | 214094 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 1 | 32 | 384 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] | ['softmax/14128336124697917683/'] | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] | ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] | 1128 | 1576 | 3344 | 2468 | 3164 | 0 | 1 | 1 | 1 | [18874368.0] | [18874368.0] | 40 | ||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 23 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20386874527 | 20386884388 | 9861 | 4660974783690 | 4660974905574 | 605 | 121884 | 105623 | 52495 | 105584 | 79165 | 1809 | 105623 | 102148 | 104058 | 104186 | 104405 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] | 1632 | 1032 | 2616 | 2004 | 3544 | 0 | 9216 | 9216 | 1 | [2048.0; 341.3333435058594] | [341.3333435058594] | 50 | ||||||||||||||||||||||||||||||||||||
ConcatenateHeads | tt_dnn_device | 24 | 0 | {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} | 56 | 20386898778 | 20386909428 | 10650 | 4660974853971 | 4660974931692 | 710 | 77721 | 25421 | 11737 | 24959 | 20911 | 1559 | 25420 | 24799 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | [] | [] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] | 720 | 904 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | [3145728.0] | [3145728.0] | 100 | |||||||||||||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 25 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20386925848 | 20386957758 | 31910 | 4660974919064 | 4660975021765 | 708 | 102701 | 89343 | 66756 | 89251 | 75800 | 1288 | 89343 | 52354 | 62667 | 62465 | 63100 | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] | 2540 | 1324 | 2304 | 2220 | 4552 | 0 | 24576 | 24576 | 7570 | [128.0; 85.33333587646484] | [256.0] | 40 | |||||||||||||||||||||||||||||
BinaryDeviceOperation | tt_dnn_device | 26 | 0 | {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} | HiFi4 | 56 | 20386984178 | 20386999179 | 15001 | 4660975000641 | 4660975073578 | 658 | 72937 | 51170 | 40956 | 51090 | 47128 | 1682 | 51169 | 50550 | 50624 | 49686 | 50294 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] | ['eltwise_binary_kernel/5705211043871595155/'] | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] | 712 | 932 | 1312 | 716 | 1588 | 0 | 1456 | 1456 | 1 | [4321.05517578125; 4321.05517578125] | [4321.05517578125] | 40 | ||||||||||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 27 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20387070369 | 20387104190 | 33821 | 4660975064834 | 4660975166242 | 671 | 101408 | 91998 | 47716 | 91733 | 75791 | 2217 | 91992 | 66981 | 90528 | 90266 | 90887 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/17805569453407806092/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] | 740 | 1852 | 4300 | 6312 | 6956 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 160 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 28 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} | HiFi2 | 56 | 20387165820 | 20387179470 | 13650 | 4660975123394 | 4660975676896 | 630 | 553502 | 510007 | 507249 | 509858 | 508892 | 1771 | 510005 | 430394 | 507894 | 509053 | 509060 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] | 2904 | 1320 | 3116 | 2804 | 3288 | 0 | 196608 | 196608 | 30281 | [32.0; 42.66666793823242] | [64.0] | 70 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 29 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20387219181 | 20387229361 | 10180 | 4660975676300 | 4660975929366 | 735 | 253066 | 251737 | 228729 | 250842 | 237558 | 1693 | 251736 | 216183 | 226990 | 226784 | 227388 | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 4096 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] | 2488 | 1300 | 2324 | 2224 | 4556 | 0 | 98304 | 98304 | 30281 | [128.0; 85.33333587646484] | [64.0] | 300 | |||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 30 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20387244501 | 20387291461 | 46960 | 4660975907828 | 4660976049872 | 628 | 142044 | 119892 | 56679 | 119788 | 96858 | 2296 | 119888 | 91467 | 118495 | 118244 | 118840 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/16918772478690167680/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] | 740 | 2164 | 4776 | 7472 | 7912 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 130 | ||||||||||||||||||||||
Matmul | tt_dnn_device | 31 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20387317292 | 20387324582 | 7290 | 4660975988087 | 4660976524346 | 625 | 536259 | 473830 | 458768 | 473733 | 469560 | 1811 | 473828 | 397498 | 448206 | 448347 | 448249 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] | 2856 | 1328 | 2340 | 2224 | 2876 | 0 | 73728 | 73728 | 22710 | [85.33333587646484; 85.33333587646484] | [128.0] | 60 | |||||||||||||||||||||||||||||
NlpCreateHeadsDeviceOperation | tt_dnn_device | 32 | 0 | {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} | HiFi4 | 56 | 20387340252 | 20387345152 | 4900 | 4660976510763 | 4660976600369 | 665 | 89606 | 75359 | 35083 | 75300 | 62022 | 2030 | 75342 | 74347 | 62730 | 62101 | 62690 | 8 | 1 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] | ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] | ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] | 1356 | 1248 | 1080 | 652 | 1496 | 0 | 1 | 1 | 1 | [] | [] | 40 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 33 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20387444353 | 20387469153 | 24800 | 4660976561579 | 4660976728375 | 678 | 166796 | 127329 | 54205 | 127320 | 96436 | 1779 | 127329 | 83928 | 120338 | 120484 | 120589 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] | 1632 | 1032 | 2384 | 2024 | 3232 | 0 | 9216 | 9216 | 1 | [341.3333435058594; 341.3333435058594] | [2048.0] | 60 | ||||||||||||||||||||||||||||||||||||
Softmax | tt_dnn_device | 34 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} | HiFi4 | 56 | 20387479553 | 20387487463 | 7910 | 4660976656778 | 4660976943750 | 676 | 286972 | 214715 | 197663 | 214589 | 206705 | 1911 | 214700 | 201201 | 213715 | 213836 | 214098 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 1 | 32 | 384 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] | ['softmax/14128336124697917683/'] | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] | ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] | 1128 | 1576 | 3344 | 2468 | 3164 | 0 | 1 | 1 | 1 | [18874368.0] | [18874368.0] | 50 | ||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 35 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20387498303 | 20387502493 | 4190 | 4660976928158 | 4660977045801 | 616 | 117643 | 101429 | 58143 | 101390 | 78096 | 1775 | 101429 | 97912 | 99824 | 99975 | 100196 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] | 1632 | 1032 | 2616 | 2004 | 3544 | 0 | 9216 | 9216 | 1 | [2048.0; 341.3333435058594] | [341.3333435058594] | 40 | ||||||||||||||||||||||||||||||||||||
ConcatenateHeads | tt_dnn_device | 36 | 0 | {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} | 56 | 20387511833 | 20387515934 | 4101 | 4660977004046 | 4660977072453 | 722 | 68407 | 25939 | 11762 | 25882 | 21119 | 1532 | 25938 | 25015 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | [] | [] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] | 720 | 904 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | [3145728.0] | [3145728.0] | 101 | |||||||||||||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 37 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20387551844 | 20387557064 | 5220 | 4660977059289 | 4660977162421 | 653 | 103132 | 89299 | 66834 | 89215 | 76030 | 1335 | 89299 | 52401 | 62770 | 62565 | 63174 | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] | 2540 | 1324 | 2304 | 2220 | 4552 | 0 | 24576 | 24576 | 7570 | [128.0; 85.33333587646484] | [256.0] | 50 | |||||||||||||||||||||||||||||
BinaryDeviceOperation | tt_dnn_device | 38 | 0 | {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} | HiFi4 | 56 | 20387617244 | 20387623915 | 6671 | 4660977141428 | 4660977211795 | 634 | 70367 | 48754 | 41475 | 48700 | 45979 | 1710 | 48748 | 48129 | 48210 | 47269 | 47815 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] | ['eltwise_binary_kernel/5705211043871595155/'] | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] | 712 | 932 | 1312 | 716 | 1588 | 0 | 1456 | 1456 | 1 | [4321.05517578125; 4321.05517578125] | [4321.05517578125] | 60 | ||||||||||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 39 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20387659265 | 20387666135 | 6870 | 4660977205981 | 4660977303423 | 698 | 97442 | 90928 | 48761 | 89830 | 74877 | 2188 | 90928 | 65950 | 89481 | 89231 | 89831 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/17805569453407806092/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] | 740 | 1852 | 4300 | 6312 | 6956 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 50 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 40 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} | HiFi2 | 56 | 20387694355 | 20387742176 | 47821 | 4660977262732 | 4660977812959 | 601 | 550227 | 508917 | 506036 | 508800 | 507564 | 1784 | 508915 | 429310 | 506925 | 508091 | 508098 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] | 2904 | 1320 | 3116 | 2804 | 3288 | 0 | 196608 | 196608 | 30281 | [32.0; 42.66666793823242] | [64.0] | 160 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 41 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20387796406 | 20387802316 | 5910 | 4660977811577 | 4660978065447 | 734 | 253870 | 251763 | 228804 | 251466 | 238007 | 1692 | 251763 | 216221 | 227065 | 226854 | 227484 | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 4096 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] | 2488 | 1300 | 2324 | 2224 | 4556 | 0 | 98304 | 98304 | 30281 | [128.0; 85.33333587646484] | [64.0] | 80 | |||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 42 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20387816516 | 20387823026 | 6510 | 4660978044063 | 4660978190475 | 628 | 146412 | 124407 | 61382 | 124299 | 102339 | 2295 | 124407 | 95879 | 122958 | 122704 | 123282 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/16918772478690167680/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] | 740 | 2164 | 4776 | 7472 | 7912 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 40 | ||||||||||||||||||||||
Matmul | tt_dnn_device | 43 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20387847647 | 20387876537 | 28890 | 4660978128871 | 4660978664382 | 632 | 535511 | 473264 | 459486 | 473167 | 468104 | 1772 | 473263 | 397261 | 447782 | 447927 | 447818 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] | 2856 | 1328 | 2340 | 2224 | 2876 | 0 | 73728 | 73728 | 22710 | [85.33333587646484; 85.33333587646484] | [128.0] | 40 | |||||||||||||||||||||||||||||
NlpCreateHeadsDeviceOperation | tt_dnn_device | 44 | 0 | {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} | HiFi4 | 56 | 20387932317 | 20387956508 | 24191 | 4660978652096 | 4660978740204 | 656 | 88108 | 75172 | 35031 | 75046 | 61819 | 2032 | 75153 | 74436 | 62872 | 62265 | 62843 | 8 | 1 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] | ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] | ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] | 1356 | 1248 | 1080 | 652 | 1496 | 0 | 1 | 1 | 1 | [] | [] | 80 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 45 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20387973258 | 20387978718 | 5460 | 4660978701533 | 4660978871243 | 602 | 169710 | 130437 | 55035 | 130428 | 97557 | 1833 | 130437 | 86057 | 123836 | 123993 | 124088 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] | 1632 | 1032 | 2384 | 2024 | 3232 | 0 | 9216 | 9216 | 1 | [341.3333435058594; 341.3333435058594] | [2048.0] | 50 | ||||||||||||||||||||||||||||||||||||
Softmax | tt_dnn_device | 46 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} | HiFi4 | 56 | 20387987988 | 20387995738 | 7750 | 4660978797371 | 4660979083714 | 669 | 286343 | 211811 | 196892 | 211686 | 204022 | 1922 | 211810 | 198304 | 210835 | 210955 | 211211 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 1 | 32 | 384 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] | ['softmax/14128336124697917683/'] | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] | ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] | 1128 | 1576 | 3344 | 2468 | 3164 | 0 | 1 | 1 | 1 | [18874368.0] | [18874368.0] | 40 | ||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 47 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20388006118 | 20388010098 | 3980 | 4660979070191 | 4660979190519 | 610 | 120328 | 106185 | 68491 | 106124 | 83969 | 1794 | 106185 | 102606 | 104358 | 104499 | 104726 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] | 1632 | 1032 | 2616 | 2004 | 3544 | 0 | 9216 | 9216 | 1 | [2048.0; 341.3333435058594] | [341.3333435058594] | 40 | ||||||||||||||||||||||||||||||||||||
ConcatenateHeads | tt_dnn_device | 48 | 0 | {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} | 56 | 20388040559 | 20388044709 | 4150 | 4660979154348 | 4660979216353 | 705 | 62005 | 25138 | 11977 | 25038 | 20892 | 1572 | 25137 | 24502 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | [] | [] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] | 720 | 904 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | [3145728.0] | [3145728.0] | 120 | |||||||||||||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 49 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20388095489 | 20388100479 | 4990 | 4660979204239 | 4660979305490 | 688 | 101251 | 88433 | 66740 | 88297 | 75970 | 1294 | 88433 | 52435 | 62847 | 62656 | 63285 | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] | 2540 | 1324 | 2304 | 2220 | 4552 | 0 | 24576 | 24576 | 7570 | [128.0; 85.33333587646484] | [256.0] | 60 | |||||||||||||||||||||||||||||
BinaryDeviceOperation | tt_dnn_device | 50 | 0 | {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} | HiFi4 | 56 | 20388134029 | 20388160500 | 26471 | 4660979285436 | 4660979356402 | 633 | 70966 | 50262 | 40326 | 50156 | 46157 | 1700 | 50250 | 49646 | 49726 | 48870 | 49453 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] | ['eltwise_binary_kernel/5705211043871595155/'] | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] | 712 | 932 | 1312 | 716 | 1588 | 0 | 1456 | 1456 | 1 | [4321.05517578125; 4321.05517578125] | [4321.05517578125] | 80 | ||||||||||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 51 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20388175400 | 20388181620 | 6220 | 4660979347946 | 4660979447819 | 648 | 99873 | 90799 | 42410 | 90673 | 74560 | 2297 | 90799 | 65854 | 89358 | 89090 | 89699 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/17805569453407806092/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] | 740 | 1852 | 4300 | 6312 | 6956 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 50 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 52 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} | HiFi2 | 56 | 20388228210 | 20388255621 | 27411 | 4660979400888 | 4660979958524 | 627 | 557636 | 510070 | 507201 | 509984 | 508619 | 1775 | 510068 | 429477 | 508144 | 509301 | 509330 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] | 2904 | 1320 | 3116 | 2804 | 3288 | 0 | 196608 | 196608 | 30281 | [32.0; 42.66666793823242] | [64.0] | 50 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 53 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20388271521 | 20388276761 | 5240 | 4660979957336 | 4660980208724 | 627 | 251388 | 249568 | 227197 | 248842 | 236094 | 1745 | 249558 | 214639 | 225154 | 224958 | 225582 | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 4096 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] | 2488 | 1300 | 2324 | 2224 | 4556 | 0 | 98304 | 98304 | 30281 | [128.0; 85.33333587646484] | [64.0] | 60 | |||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 54 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20388311301 | 20388318021 | 6720 | 4660980187816 | 4660980333644 | 633 | 145828 | 124304 | 64538 | 124222 | 102573 | 2353 | 124304 | 95349 | 122372 | 122124 | 122716 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/16918772478690167680/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] | 740 | 2164 | 4776 | 7472 | 7912 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 80 | ||||||||||||||||||||||
Matmul | tt_dnn_device | 55 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20388381732 | 20388410112 | 28380 | 4660980275325 | 4660980806564 | 659 | 531239 | 472251 | 458825 | 472151 | 468165 | 1770 | 472251 | 396715 | 447141 | 447275 | 447176 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] | 2856 | 1328 | 2340 | 2224 | 2876 | 0 | 73728 | 73728 | 22710 | [85.33333587646484; 85.33333587646484] | [128.0] | 40 | |||||||||||||||||||||||||||||
NlpCreateHeadsDeviceOperation | tt_dnn_device | 56 | 0 | {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} | HiFi4 | 56 | 20388444982 | 20388449932 | 4950 | 4660980794614 | 4660980882103 | 641 | 87489 | 74908 | 34655 | 74847 | 62033 | 2034 | 74907 | 73975 | 61926 | 61341 | 62309 | 8 | 1 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] | ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] | ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] | 1356 | 1248 | 1080 | 652 | 1496 | 0 | 1 | 1 | 1 | [] | [] | 120 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 57 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20388466673 | 20388471443 | 4770 | 4660980843328 | 4660981012763 | 696 | 169435 | 129954 | 51939 | 129951 | 96440 | 1723 | 129952 | 86169 | 122645 | 122772 | 122876 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] | 1632 | 1032 | 2384 | 2024 | 3232 | 0 | 9216 | 9216 | 1 | [341.3333435058594; 341.3333435058594] | [2048.0] | 120 | ||||||||||||||||||||||||||||||||||||
Softmax | tt_dnn_device | 58 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} | HiFi4 | 56 | 20388480203 | 20388508553 | 28350 | 4660980936262 | 4660981226963 | 648 | 290701 | 213563 | 197379 | 213452 | 205676 | 1954 | 213546 | 200035 | 212529 | 212661 | 212924 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 1 | 32 | 384 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] | ['softmax/14128336124697917683/'] | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] | ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] | 1128 | 1576 | 3344 | 2468 | 3164 | 0 | 1 | 1 | 1 | [18874368.0] | [18874368.0] | 90 | ||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 59 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20388519343 | 20388523513 | 4170 | 4660981212274 | 4660981329060 | 620 | 116786 | 101472 | 60434 | 101435 | 78674 | 1771 | 101472 | 97951 | 99875 | 100008 | 100238 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] | 1632 | 1032 | 2616 | 2004 | 3544 | 0 | 9216 | 9216 | 1 | [2048.0; 341.3333435058594] | [341.3333435058594] | 30 | ||||||||||||||||||||||||||||||||||||
ConcatenateHeads | tt_dnn_device | 60 | 0 | {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} | 56 | 20388553263 | 20388581274 | 28011 | 4660981289564 | 4660981354931 | 714 | 65367 | 25166 | 11671 | 25094 | 20814 | 1538 | 25165 | 24033 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | [] | [] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] | 720 | 904 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | [3145728.0] | [3145728.0] | 100 | |||||||||||||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 61 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20388614654 | 20388620194 | 5540 | 4660981342481 | 4660981444233 | 658 | 101752 | 88624 | 66888 | 88540 | 75682 | 1326 | 88624 | 52131 | 62391 | 62218 | 62832 | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] | 2540 | 1324 | 2304 | 2220 | 4552 | 0 | 24576 | 24576 | 7570 | [128.0; 85.33333587646484] | [256.0] | 50 | |||||||||||||||||||||||||||||
BinaryDeviceOperation | tt_dnn_device | 62 | 0 | {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} | HiFi4 | 56 | 20388654304 | 20388660724 | 6420 | 4660981424004 | 4660981495063 | 643 | 71059 | 50203 | 40845 | 50101 | 46050 | 1687 | 50203 | 49582 | 49670 | 48741 | 49371 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] | ['eltwise_binary_kernel/5705211043871595155/'] | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] | 712 | 932 | 1312 | 716 | 1588 | 0 | 1456 | 1456 | 1 | [4321.05517578125; 4321.05517578125] | [4321.05517578125] | 60 | ||||||||||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 63 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20388674405 | 20388702185 | 27780 | 4660981487255 | 4660981588078 | 635 | 100823 | 92381 | 45871 | 91763 | 75190 | 2301 | 92381 | 67187 | 90533 | 90260 | 90865 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/17805569453407806092/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] | 740 | 1852 | 4300 | 6312 | 6956 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 40 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 64 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} | HiFi2 | 56 | 20388747845 | 20388755235 | 7390 | 4660981543044 | 4660982097842 | 612 | 554798 | 509138 | 505778 | 509051 | 507689 | 1781 | 509136 | 428858 | 506961 | 508130 | 508137 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] | 2904 | 1320 | 3116 | 2804 | 3288 | 0 | 196608 | 196608 | 30281 | [32.0; 42.66666793823242] | [64.0] | 80 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 65 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20388809436 | 20388899677 | 90241 | 4660982096462 | 4660982349205 | 680 | 252743 | 250686 | 228944 | 249930 | 237342 | 1702 | 250686 | 215600 | 226300 | 226093 | 226702 | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 4096 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] | 2488 | 1300 | 2324 | 2224 | 4556 | 0 | 98304 | 98304 | 30281 | [128.0; 85.33333587646484] | [64.0] | 170 | |||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 66 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20388915747 | 20388922237 | 6490 | 4660982328952 | 4660982469439 | 620 | 140487 | 119624 | 57007 | 119524 | 94186 | 2275 | 119624 | 91084 | 118139 | 117884 | 118470 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/16918772478690167680/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] | 740 | 2164 | 4776 | 7472 | 7912 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 40 | ||||||||||||||||||||||
Matmul | tt_dnn_device | 67 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20388987007 | 20388993888 | 6881 | 4660982408253 | 4660982942342 | 633 | 534089 | 472251 | 460002 | 472118 | 468601 | 1776 | 472251 | 396225 | 446543 | 446680 | 446580 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] | 2856 | 1328 | 2340 | 2224 | 2876 | 0 | 73728 | 73728 | 22710 | [85.33333587646484; 85.33333587646484] | [128.0] | 40 | |||||||||||||||||||||||||||||
NlpCreateHeadsDeviceOperation | tt_dnn_device | 68 | 0 | {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} | HiFi4 | 56 | 20389029538 | 20389034638 | 5100 | 4660982931506 | 4660983018487 | 701 | 86981 | 75447 | 34862 | 75383 | 62041 | 2039 | 75426 | 74824 | 62601 | 61895 | 63128 | 8 | 1 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] | ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] | ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] | 1356 | 1248 | 1080 | 652 | 1496 | 0 | 1 | 1 | 1 | [] | [] | 40 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 69 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20389070568 | 20389077448 | 6880 | 4660982979455 | 4660983148467 | 686 | 169012 | 129299 | 55029 | 129290 | 96673 | 1757 | 129299 | 82954 | 122362 | 122505 | 122611 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] | 1632 | 1032 | 2384 | 2024 | 3232 | 0 | 9216 | 9216 | 1 | [341.3333435058594; 341.3333435058594] | [2048.0] | 90 | ||||||||||||||||||||||||||||||||||||
Softmax | tt_dnn_device | 70 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} | HiFi4 | 56 | 20389107379 | 20389135829 | 28450 | 4660983075742 | 4660983362208 | 637 | 286466 | 213114 | 196836 | 212987 | 204845 | 1929 | 213114 | 199568 | 212013 | 212132 | 212409 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 1 | 32 | 384 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] | ['softmax/14128336124697917683/'] | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] | ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] | 1128 | 1576 | 3344 | 2468 | 3164 | 0 | 1 | 1 | 1 | [18874368.0] | [18874368.0] | 100 | ||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 71 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20389145779 | 20389150839 | 5060 | 4660983347398 | 4660983468665 | 604 | 121267 | 105844 | 57564 | 105804 | 79666 | 1805 | 105844 | 102381 | 104341 | 104462 | 104699 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] | 1632 | 1032 | 2616 | 2004 | 3544 | 0 | 9216 | 9216 | 1 | [2048.0; 341.3333435058594] | [341.3333435058594] | 50 | ||||||||||||||||||||||||||||||||||||
ConcatenateHeads | tt_dnn_device | 72 | 0 | {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} | 56 | 20389159979 | 20389164579 | 4600 | 4660983421942 | 4660983494677 | 625 | 72735 | 25393 | 11892 | 25295 | 20994 | 1640 | 25392 | 24771 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | [] | [] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] | 720 | 904 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | [3145728.0] | [3145728.0] | 40 | |||||||||||||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 73 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20389215890 | 20389220960 | 5070 | 4660983482258 | 4660983584387 | 635 | 102129 | 89062 | 65942 | 88963 | 75758 | 1331 | 89061 | 52167 | 62631 | 62435 | 63046 | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] | 2540 | 1324 | 2304 | 2220 | 4552 | 0 | 24576 | 24576 | 7570 | [128.0; 85.33333587646484] | [256.0] | 100 | |||||||||||||||||||||||||||||
BinaryDeviceOperation | tt_dnn_device | 74 | 0 | {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} | HiFi4 | 56 | 20389254070 | 20389260970 | 6900 | 4660983563395 | 4660983634249 | 634 | 70854 | 49242 | 40609 | 49134 | 45936 | 1681 | 49242 | 48614 | 48696 | 47770 | 48359 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] | ['eltwise_binary_kernel/5705211043871595155/'] | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] | 712 | 932 | 1312 | 716 | 1588 | 0 | 1456 | 1456 | 1 | [4321.05517578125; 4321.05517578125] | [4321.05517578125] | 60 | ||||||||||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 75 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20389274550 | 20389281330 | 6780 | 4660983627084 | 4660983727034 | 633 | 99950 | 92154 | 43598 | 91804 | 76082 | 2254 | 92149 | 67058 | 90819 | 90563 | 91176 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/17805569453407806092/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] | 740 | 1852 | 4300 | 6312 | 6956 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 100 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 76 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} | HiFi2 | 56 | 20389347441 | 20389354681 | 7240 | 4660983679892 | 4660984238776 | 638 | 558884 | 511090 | 507773 | 511024 | 509436 | 1747 | 511090 | 431117 | 509244 | 510397 | 510397 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] | 2904 | 1320 | 3116 | 2804 | 3288 | 0 | 196608 | 196608 | 30281 | [32.0; 42.66666793823242] | [64.0] | 70 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 77 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20389389701 | 20389394911 | 5210 | 4660984236992 | 4660984490209 | 646 | 253217 | 250789 | 228219 | 250676 | 237222 | 1738 | 250789 | 215747 | 226461 | 226251 | 226881 | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 4096 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] | 2488 | 1300 | 2324 | 2224 | 4556 | 0 | 98304 | 98304 | 30281 | [128.0; 85.33333587646484] | [64.0] | 100 | |||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 78 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20389446422 | 20389452992 | 6570 | 4660984469416 | 4660984613515 | 629 | 144099 | 122685 | 59971 | 122539 | 102099 | 2309 | 122679 | 94139 | 121230 | 120981 | 121564 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/16918772478690167680/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] | 740 | 2164 | 4776 | 7472 | 7912 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 50 | ||||||||||||||||||||||
Matmul | tt_dnn_device | 79 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20389487342 | 20389534733 | 47391 | 4660984552215 | 4660985085890 | 607 | 533675 | 471754 | 458598 | 471656 | 467511 | 1820 | 471753 | 396223 | 446748 | 446902 | 446795 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] | 2856 | 1328 | 2340 | 2224 | 2876 | 0 | 73728 | 73728 | 22710 | [85.33333587646484; 85.33333587646484] | [128.0] | 70 | |||||||||||||||||||||||||||||
NlpCreateHeadsDeviceOperation | tt_dnn_device | 80 | 0 | {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} | HiFi4 | 56 | 20389551533 | 20389556453 | 4920 | 4660985074147 | 4660985160912 | 654 | 86765 | 74382 | 34737 | 74328 | 61769 | 2018 | 74374 | 73598 | 62184 | 61593 | 62020 | 8 | 1 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] | ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] | ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] | 1356 | 1248 | 1080 | 652 | 1496 | 0 | 1 | 1 | 1 | [] | [] | 40 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 81 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20389621834 | 20389628314 | 6480 | 4660985122731 | 4660985290072 | 699 | 167341 | 128461 | 53896 | 128454 | 97350 | 1707 | 128461 | 86306 | 122195 | 122345 | 122449 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] | 1632 | 1032 | 2384 | 2024 | 3232 | 0 | 9216 | 9216 | 1 | [341.3333435058594; 341.3333435058594] | [2048.0] | 120 | ||||||||||||||||||||||||||||||||||||
Softmax | tt_dnn_device | 82 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} | HiFi4 | 56 | 20389638214 | 20389645624 | 7410 | 4660985217034 | 4660985502913 | 639 | 285879 | 212209 | 196718 | 212080 | 203925 | 1926 | 212206 | 198666 | 211145 | 211261 | 211522 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 1 | 32 | 384 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] | ['softmax/14128336124697917683/'] | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] | ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] | 1128 | 1576 | 3344 | 2468 | 3164 | 0 | 1 | 1 | 1 | [18874368.0] | [18874368.0] | 40 | ||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 83 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20389676804 | 20389700604 | 23800 | 4660985488902 | 4660985610423 | 615 | 121521 | 106888 | 59914 | 106844 | 81953 | 1788 | 106888 | 103306 | 105212 | 105358 | 105582 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] | 1632 | 1032 | 2616 | 2004 | 3544 | 0 | 9216 | 9216 | 1 | [2048.0; 341.3333435058594] | [341.3333435058594] | 60 | ||||||||||||||||||||||||||||||||||||
ConcatenateHeads | tt_dnn_device | 84 | 0 | {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} | 56 | 20389710144 | 20389714134 | 3990 | 4660985565445 | 4660985636303 | 719 | 70858 | 25164 | 11799 | 25072 | 20959 | 1532 | 25163 | 24221 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | [] | [] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] | 720 | 904 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | [3145728.0] | [3145728.0] | 50 | |||||||||||||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 85 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20389748285 | 20389753805 | 5520 | 4660985623981 | 4660985725977 | 632 | 101996 | 89030 | 66195 | 88947 | 76024 | 1377 | 89030 | 52330 | 62919 | 62703 | 63323 | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] | 2540 | 1324 | 2304 | 2220 | 4552 | 0 | 24576 | 24576 | 7570 | [128.0; 85.33333587646484] | [256.0] | 50 | |||||||||||||||||||||||||||||
BinaryDeviceOperation | tt_dnn_device | 86 | 0 | {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} | HiFi4 | 56 | 20389787985 | 20389813465 | 25480 | 4660985705215 | 4660985775534 | 641 | 70319 | 48898 | 41443 | 48814 | 45508 | 1720 | 48898 | 48275 | 48352 | 47408 | 48027 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] | ['eltwise_binary_kernel/5705211043871595155/'] | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] | 712 | 932 | 1312 | 716 | 1588 | 0 | 1456 | 1456 | 1 | [4321.05517578125; 4321.05517578125] | [4321.05517578125] | 60 | ||||||||||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 87 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20389829035 | 20389835186 | 6151 | 4660985769540 | 4660985866094 | 670 | 96554 | 89918 | 41771 | 89781 | 72331 | 2221 | 89918 | 65058 | 88556 | 88299 | 88912 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/17805569453407806092/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] | 740 | 1852 | 4300 | 6312 | 6956 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 40 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 88 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} | HiFi2 | 56 | 20389901736 | 20389909656 | 7920 | 4660985819363 | 4660986377028 | 643 | 557665 | 510271 | 507249 | 510184 | 508765 | 1789 | 510269 | 429992 | 508279 | 509433 | 509458 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] | 2904 | 1320 | 3116 | 2804 | 3288 | 0 | 196608 | 196608 | 30281 | [32.0; 42.66666793823242] | [64.0] | 70 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 89 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20389944607 | 20389950417 | 5810 | 4660986375823 | 4660986629027 | 659 | 253204 | 251347 | 228056 | 251139 | 236919 | 1741 | 251347 | 215228 | 225971 | 225761 | 226410 | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 4096 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] | 2488 | 1300 | 2324 | 2224 | 4556 | 0 | 98304 | 98304 | 30281 | [128.0; 85.33333587646484] | [64.0] | 120 | |||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 90 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20390003247 | 20390010117 | 6870 | 4660986607193 | 4660986752419 | 654 | 145226 | 122750 | 59686 | 122606 | 101149 | 2271 | 122743 | 94199 | 121264 | 121020 | 121608 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/16918772478690167680/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] | 740 | 2164 | 4776 | 7472 | 7912 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 60 | ||||||||||||||||||||||
Matmul | tt_dnn_device | 91 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20390033367 | 20390039537 | 6170 | 4660986690776 | 4660987225004 | 609 | 534228 | 471967 | 458287 | 471875 | 467591 | 1784 | 471965 | 395954 | 446305 | 446446 | 446350 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] | 2856 | 1328 | 2340 | 2224 | 2876 | 0 | 73728 | 73728 | 22710 | [85.33333587646484; 85.33333587646484] | [128.0] | 40 | |||||||||||||||||||||||||||||
NlpCreateHeadsDeviceOperation | tt_dnn_device | 92 | 0 | {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} | HiFi4 | 56 | 20390079098 | 20390084428 | 5330 | 4660987212805 | 4660987302253 | 643 | 89448 | 76619 | 34919 | 76568 | 62348 | 2037 | 76618 | 75624 | 63824 | 63102 | 63532 | 8 | 1 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] | ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] | ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] | 1356 | 1248 | 1080 | 652 | 1496 | 0 | 1 | 1 | 1 | [] | [] | 40 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 93 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20390141238 | 20390166779 | 25541 | 4660987262015 | 4660987431151 | 678 | 169136 | 128210 | 51203 | 128199 | 96887 | 1727 | 128210 | 80857 | 121126 | 121292 | 121387 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] | 1632 | 1032 | 2384 | 2024 | 3232 | 0 | 9216 | 9216 | 1 | [341.3333435058594; 341.3333435058594] | [2048.0] | 110 | ||||||||||||||||||||||||||||||||||||
Softmax | tt_dnn_device | 94 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} | HiFi4 | 56 | 20390196889 | 20390204899 | 8010 | 4660987355666 | 4660987645342 | 658 | 289676 | 213540 | 197046 | 213421 | 205484 | 1951 | 213539 | 200107 | 212516 | 212630 | 212916 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 1 | 32 | 384 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] | ['softmax/14128336124697917683/'] | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] | ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] | 1128 | 1576 | 3344 | 2468 | 3164 | 0 | 1 | 1 | 1 | [18874368.0] | [18874368.0] | 50 | ||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 95 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20390214789 | 20390218899 | 4110 | 4660987630349 | 4660987751489 | 625 | 121140 | 105521 | 64690 | 105494 | 82683 | 1801 | 105521 | 102099 | 104027 | 104175 | 104395 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] | 1632 | 1032 | 2616 | 2004 | 3544 | 0 | 9216 | 9216 | 1 | [2048.0; 341.3333435058594] | [341.3333435058594] | 40 | ||||||||||||||||||||||||||||||||||||
ConcatenateHeads | tt_dnn_device | 96 | 0 | {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} | 56 | 20390227849 | 20390231819 | 3970 | 4660987712178 | 4660987777383 | 730 | 65205 | 25168 | 11709 | 25086 | 20832 | 1521 | 25167 | 24359 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | [] | [] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] | 720 | 904 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | [3145728.0] | [3145728.0] | 40 | |||||||||||||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 97 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20390244789 | 20390249679 | 4890 | 4660987764972 | 4660987867307 | 659 | 102335 | 89250 | 67796 | 89160 | 76012 | 1320 | 89250 | 52123 | 62602 | 62414 | 63030 | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] | 2540 | 1324 | 2304 | 2220 | 4552 | 0 | 24576 | 24576 | 7570 | [128.0; 85.33333587646484] | [256.0] | 50 | |||||||||||||||||||||||||||||
BinaryDeviceOperation | tt_dnn_device | 98 | 0 | {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} | HiFi4 | 56 | 20390285700 | 20390311630 | 25930 | 4660987847531 | 4660987917109 | 616 | 69578 | 49200 | 39871 | 49096 | 45925 | 1730 | 49187 | 48498 | 48575 | 47651 | 48233 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] | ['eltwise_binary_kernel/5705211043871595155/'] | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] | 712 | 932 | 1312 | 716 | 1588 | 0 | 1456 | 1456 | 1 | [4321.05517578125; 4321.05517578125] | [4321.05517578125] | 110 | ||||||||||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 99 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20390374181 | 20390381441 | 7260 | 4660987909238 | 4660988009776 | 607 | 100538 | 92063 | 43572 | 91926 | 75538 | 2312 | 92063 | 67098 | 90658 | 90390 | 91008 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/17805569453407806092/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] | 740 | 1852 | 4300 | 6312 | 6956 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 50 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 100 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} | HiFi2 | 56 | 20390407241 | 20390414081 | 6840 | 4660987962696 | 4660988521041 | 590 | 558345 | 510656 | 507980 | 510605 | 509534 | 1799 | 510656 | 431030 | 508517 | 509679 | 509669 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] | 2904 | 1320 | 3116 | 2804 | 3288 | 0 | 196608 | 196608 | 30281 | [32.0; 42.66666793823242] | [64.0] | 40 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 101 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20390429171 | 20390456441 | 27270 | 4660988520439 | 4660988773370 | 722 | 252931 | 251613 | 229133 | 250818 | 237552 | 1680 | 251568 | 215272 | 226369 | 226156 | 226771 | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 4096 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] | 2488 | 1300 | 2324 | 2224 | 4556 | 0 | 98304 | 98304 | 30281 | [128.0; 85.33333587646484] | [64.0] | 40 | |||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 102 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20390470192 | 20390496982 | 26790 | 4660988752369 | 4660988897316 | 625 | 144947 | 123332 | 67222 | 123218 | 100258 | 2282 | 123325 | 94889 | 121881 | 121622 | 122213 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/16918772478690167680/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] | 740 | 2164 | 4776 | 7472 | 7912 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 110 | ||||||||||||||||||||||
Matmul | tt_dnn_device | 103 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20390585233 | 20390592733 | 7500 | 4660988842647 | 4660989369048 | 612 | 526401 | 471101 | 458293 | 471004 | 468255 | 1819 | 471101 | 395251 | 445903 | 446029 | 445932 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] | 2856 | 1328 | 2340 | 2224 | 2876 | 0 | 73728 | 73728 | 22710 | [85.33333587646484; 85.33333587646484] | [128.0] | 90 | |||||||||||||||||||||||||||||
NlpCreateHeadsDeviceOperation | tt_dnn_device | 104 | 0 | {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} | HiFi4 | 56 | 20390609323 | 20390617163 | 7840 | 4660989357713 | 4660989448783 | 644 | 91070 | 79097 | 35569 | 78561 | 62986 | 2071 | 79096 | 78013 | 65390 | 64626 | 66175 | 8 | 1 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] | ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] | ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] | 1356 | 1248 | 1080 | 652 | 1496 | 0 | 1 | 1 | 1 | [] | [] | 80 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 105 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20390632703 | 20390637423 | 4720 | 4660989407020 | 4660989578782 | 698 | 171762 | 129302 | 54606 | 129290 | 97199 | 1769 | 129302 | 84015 | 122865 | 123005 | 123107 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] | 1632 | 1032 | 2384 | 2024 | 3232 | 0 | 9216 | 9216 | 1 | [341.3333435058594; 341.3333435058594] | [2048.0] | 50 | ||||||||||||||||||||||||||||||||||||
Softmax | tt_dnn_device | 106 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} | HiFi4 | 56 | 20390671213 | 20390699404 | 28191 | 4660989505613 | 4660989793987 | 665 | 288374 | 214552 | 197403 | 214434 | 206769 | 1905 | 214539 | 201026 | 213361 | 213457 | 213739 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 1 | 32 | 384 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] | ['softmax/14128336124697917683/'] | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] | ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] | 1128 | 1576 | 3344 | 2468 | 3164 | 0 | 1 | 1 | 1 | [18874368.0] | [18874368.0] | 150 | ||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 107 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20390729594 | 20390754494 | 24900 | 4660989778306 | 4660989894968 | 622 | 116662 | 100351 | 58311 | 100309 | 77637 | 1794 | 100351 | 96858 | 98782 | 98917 | 99148 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] | 1632 | 1032 | 2616 | 2004 | 3544 | 0 | 9216 | 9216 | 1 | [2048.0; 341.3333435058594] | [341.3333435058594] | 40 | ||||||||||||||||||||||||||||||||||||
ConcatenateHeads | tt_dnn_device | 108 | 0 | {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} | 56 | 20390783255 | 20390788695 | 5440 | 4660989854487 | 4660989921534 | 714 | 67047 | 25857 | 11700 | 25136 | 21080 | 1548 | 25856 | 25214 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | [] | [] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] | 720 | 904 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | [3145728.0] | [3145728.0] | 70 | |||||||||||||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 109 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20390801635 | 20390806065 | 4430 | 4660989908419 | 4660990012039 | 712 | 103620 | 89779 | 66660 | 89692 | 75716 | 1274 | 89779 | 52367 | 62765 | 62565 | 63198 | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] | 2540 | 1324 | 2304 | 2220 | 4552 | 0 | 24576 | 24576 | 7570 | [128.0; 85.33333587646484] | [256.0] | 40 | |||||||||||||||||||||||||||||
BinaryDeviceOperation | tt_dnn_device | 110 | 0 | {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} | HiFi4 | 56 | 20390819995 | 20390825685 | 5690 | 4660989990403 | 4660990061804 | 621 | 71401 | 49157 | 39506 | 49070 | 45135 | 1708 | 49146 | 48516 | 48603 | 47634 | 48214 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] | ['eltwise_binary_kernel/5705211043871595155/'] | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] | 712 | 932 | 1312 | 716 | 1588 | 0 | 1456 | 1456 | 1 | [4321.05517578125; 4321.05517578125] | [4321.05517578125] | 40 | ||||||||||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 111 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20390862845 | 20390869355 | 6510 | 4660990053613 | 4660990153542 | 658 | 99929 | 91080 | 47837 | 90950 | 75199 | 2248 | 91073 | 65870 | 89549 | 89283 | 89890 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/17805569453407806092/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] | 740 | 1852 | 4300 | 6312 | 6956 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 50 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 112 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} | HiFi2 | 56 | 20390953306 | 20390960766 | 7460 | 4660990111703 | 4660990663831 | 615 | 552128 | 509659 | 505919 | 509518 | 507961 | 1786 | 509659 | 429635 | 507691 | 508831 | 508838 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] | 2904 | 1320 | 3116 | 2804 | 3288 | 0 | 196608 | 196608 | 30281 | [32.0; 42.66666793823242] | [64.0] | 70 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 113 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20390995767 | 20391001047 | 5280 | 4660990662010 | 4660990914639 | 752 | 252629 | 250057 | 229124 | 249982 | 237177 | 1682 | 250057 | 215727 | 226231 | 226032 | 226677 | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 4096 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] | 2488 | 1300 | 2324 | 2224 | 4556 | 0 | 98304 | 98304 | 30281 | [128.0; 85.33333587646484] | [64.0] | 40 | |||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 114 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20391014197 | 20391021897 | 7700 | 4660990895155 | 4660991038950 | 652 | 143795 | 123679 | 66350 | 123552 | 101394 | 2272 | 123679 | 95184 | 122202 | 121967 | 122557 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/16918772478690167680/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] | 740 | 2164 | 4776 | 7472 | 7912 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 30 | ||||||||||||||||||||||
Matmul | tt_dnn_device | 115 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20391065417 | 20391091737 | 26320 | 4660990983052 | 4660991511983 | 614 | 528931 | 472402 | 458433 | 472307 | 468060 | 1785 | 472400 | 396426 | 446785 | 446930 | 446822 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] | 2856 | 1328 | 2340 | 2224 | 2876 | 0 | 73728 | 73728 | 22710 | [85.33333587646484; 85.33333587646484] | [128.0] | 80 | |||||||||||||||||||||||||||||
NlpCreateHeadsDeviceOperation | tt_dnn_device | 116 | 0 | {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} | HiFi4 | 56 | 20391126678 | 20391131388 | 4710 | 4660991499481 | 4660991587628 | 649 | 88147 | 75002 | 35070 | 74870 | 61930 | 2054 | 74991 | 74445 | 62365 | 61793 | 62777 | 8 | 1 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] | ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] | ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] | 1356 | 1248 | 1080 | 652 | 1496 | 0 | 1 | 1 | 1 | [] | [] | 50 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 117 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20391166868 | 20391192858 | 25990 | 4660991549174 | 4660991717461 | 614 | 168287 | 129223 | 53682 | 129212 | 98015 | 1798 | 129223 | 84223 | 123300 | 123456 | 123550 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] | 1632 | 1032 | 2384 | 2024 | 3232 | 0 | 9216 | 9216 | 1 | [341.3333435058594; 341.3333435058594] | [2048.0] | 70 | ||||||||||||||||||||||||||||||||||||
Softmax | tt_dnn_device | 118 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} | HiFi4 | 56 | 20391202689 | 20391210559 | 7870 | 4660991643471 | 4660991929901 | 651 | 286430 | 211795 | 196653 | 211681 | 203768 | 1916 | 211779 | 198238 | 210705 | 210857 | 211092 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 1 | 32 | 384 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] | ['softmax/14128336124697917683/'] | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] | ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] | 1128 | 1576 | 3344 | 2468 | 3164 | 0 | 1 | 1 | 1 | [18874368.0] | [18874368.0] | 90 | ||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 119 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20391220469 | 20391224809 | 4340 | 4660991916260 | 4660992033235 | 625 | 116975 | 102703 | 65609 | 102694 | 81078 | 1781 | 102703 | 99026 | 100934 | 101074 | 101302 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] | 1632 | 1032 | 2616 | 2004 | 3544 | 0 | 9216 | 9216 | 1 | [2048.0; 341.3333435058594] | [341.3333435058594] | 40 | ||||||||||||||||||||||||||||||||||||
ConcatenateHeads | tt_dnn_device | 120 | 0 | {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} | 56 | 20391255119 | 20391259519 | 4400 | 4660991997707 | 4660992059447 | 653 | 61740 | 25564 | 11759 | 25471 | 20927 | 1617 | 25563 | 23963 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | [] | [] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] | 720 | 904 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | [3145728.0] | [3145728.0] | 40 | |||||||||||||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 121 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20391291999 | 20391317610 | 25611 | 4660992046688 | 4660992147903 | 631 | 101215 | 87815 | 66609 | 87728 | 75753 | 1351 | 87815 | 52050 | 62708 | 62515 | 63124 | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] | 2540 | 1324 | 2304 | 2220 | 4552 | 0 | 24576 | 24576 | 7570 | [128.0; 85.33333587646484] | [256.0] | 60 | |||||||||||||||||||||||||||||
BinaryDeviceOperation | tt_dnn_device | 122 | 0 | {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} | HiFi4 | 56 | 20391351550 | 20391357800 | 6250 | 4660992128186 | 4660992197331 | 637 | 69145 | 48761 | 41337 | 48668 | 45926 | 1700 | 48756 | 48141 | 48211 | 47169 | 47831 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] | ['eltwise_binary_kernel/5705211043871595155/'] | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] | 712 | 932 | 1312 | 716 | 1588 | 0 | 1456 | 1456 | 1 | [4321.05517578125; 4321.05517578125] | [4321.05517578125] | 40 | ||||||||||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 123 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20391391530 | 20391398390 | 6860 | 4660992191374 | 4660992288011 | 698 | 96637 | 90022 | 38866 | 89894 | 73547 | 2292 | 90022 | 64921 | 88423 | 88166 | 88775 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/17805569453407806092/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] | 740 | 1852 | 4300 | 6312 | 6956 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 50 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 124 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} | HiFi2 | 56 | 20391422671 | 20391430111 | 7440 | 4660992238268 | 4660992798844 | 603 | 560576 | 510218 | 506679 | 510133 | 508798 | 1808 | 510218 | 429935 | 508141 | 509285 | 509296 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] | 2904 | 1320 | 3116 | 2804 | 3288 | 0 | 196608 | 196608 | 30281 | [32.0; 42.66666793823242] | [64.0] | 50 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 125 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20391506711 | 20391512581 | 5870 | 4660992797739 | 4660993050204 | 684 | 252465 | 250679 | 228583 | 250490 | 237267 | 1706 | 250678 | 215546 | 226457 | 226262 | 226872 | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 4096 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] | 2488 | 1300 | 2324 | 2224 | 4556 | 0 | 98304 | 98304 | 30281 | [128.0; 85.33333587646484] | [64.0] | 60 | |||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 126 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20391526782 | 20391554992 | 28210 | 4660993029577 | 4660993178763 | 630 | 149186 | 127938 | 77926 | 127858 | 103558 | 2283 | 127938 | 99479 | 126481 | 126236 | 126810 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/16918772478690167680/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] | 740 | 2164 | 4776 | 7472 | 7912 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 40 | ||||||||||||||||||||||
Matmul | tt_dnn_device | 127 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20391605752 | 20391613312 | 7560 | 4660993130171 | 4660993651583 | 649 | 521412 | 472157 | 458850 | 472059 | 467494 | 1750 | 472157 | 396261 | 446742 | 446874 | 446771 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] | 2856 | 1328 | 2340 | 2224 | 2876 | 0 | 73728 | 73728 | 22710 | [85.33333587646484; 85.33333587646484] | [128.0] | 80 | |||||||||||||||||||||||||||||
NlpCreateHeadsDeviceOperation | tt_dnn_device | 128 | 0 | {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} | HiFi4 | 56 | 20391628293 | 20391633273 | 4980 | 4660993639733 | 4660993727959 | 661 | 88226 | 75728 | 35108 | 75670 | 62030 | 2024 | 75720 | 74779 | 63189 | 62462 | 63157 | 8 | 1 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] | ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] | ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] | 1356 | 1248 | 1080 | 652 | 1496 | 0 | 1 | 1 | 1 | [] | [] | 90 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 129 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20391709983 | 20391715663 | 5680 | 4660993688818 | 4660993859373 | 675 | 170555 | 130735 | 53551 | 130726 | 98247 | 1747 | 130735 | 84636 | 124136 | 124286 | 124388 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] | 1632 | 1032 | 2384 | 2024 | 3232 | 0 | 9216 | 9216 | 1 | [341.3333435058594; 341.3333435058594] | [2048.0] | 70 | ||||||||||||||||||||||||||||||||||||
Softmax | tt_dnn_device | 130 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} | HiFi4 | 56 | 20391725214 | 20391753324 | 28110 | 4660993783710 | 4660994073530 | 667 | 289820 | 213491 | 198063 | 213378 | 206156 | 1890 | 213475 | 199957 | 212423 | 212559 | 212835 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 1 | 32 | 384 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] | ['softmax/14128336124697917683/'] | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] | ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] | 1128 | 1576 | 3344 | 2468 | 3164 | 0 | 1 | 1 | 1 | [18874368.0] | [18874368.0] | 40 | ||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 131 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20391763804 | 20391768414 | 4610 | 4660994059555 | 4660994173376 | 618 | 113821 | 99228 | 63487 | 99218 | 79262 | 1806 | 99226 | 95742 | 97673 | 97800 | 98034 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] | 1632 | 1032 | 2616 | 2004 | 3544 | 0 | 9216 | 9216 | 1 | [2048.0; 341.3333435058594] | [341.3333435058594] | 50 | ||||||||||||||||||||||||||||||||||||
ConcatenateHeads | tt_dnn_device | 132 | 0 | {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} | 56 | 20391797664 | 20391802054 | 4390 | 4660994139163 | 4660994199615 | 654 | 60452 | 25591 | 11483 | 25502 | 20874 | 1639 | 25590 | 24348 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | [] | [] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] | 720 | 904 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | [3145728.0] | [3145728.0] | 80 | |||||||||||||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 133 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20391815204 | 20391820744 | 5540 | 4660994186550 | 4660994287883 | 635 | 101333 | 87620 | 66123 | 87530 | 75497 | 1361 | 87618 | 51968 | 62409 | 62207 | 62836 | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] | 2540 | 1324 | 2304 | 2220 | 4552 | 0 | 24576 | 24576 | 7570 | [128.0; 85.33333587646484] | [256.0] | 40 | |||||||||||||||||||||||||||||
BinaryDeviceOperation | tt_dnn_device | 134 | 0 | {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} | HiFi4 | 56 | 20391855435 | 20391880535 | 25100 | 4660994268253 | 4660994337245 | 645 | 68992 | 48721 | 39103 | 48640 | 45598 | 1693 | 48714 | 48101 | 48183 | 47234 | 47873 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] | ['eltwise_binary_kernel/5705211043871595155/'] | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] | 712 | 932 | 1312 | 716 | 1588 | 0 | 1456 | 1456 | 1 | [4321.05517578125; 4321.05517578125] | [4321.05517578125] | 140 | ||||||||||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 135 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20391913865 | 20391920945 | 7080 | 4660994329097 | 4660994429760 | 667 | 100663 | 91853 | 41796 | 91610 | 74085 | 2239 | 91853 | 66751 | 90286 | 90003 | 90630 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/17805569453407806092/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] | 740 | 1852 | 4300 | 6312 | 6956 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 50 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 136 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} | HiFi2 | 56 | 20391965496 | 20391992656 | 27160 | 4660994381146 | 4660994939604 | 639 | 558458 | 509195 | 506004 | 509107 | 507870 | 1785 | 509193 | 429463 | 507296 | 508447 | 508444 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] | 2904 | 1320 | 3116 | 2804 | 3288 | 0 | 196608 | 196608 | 30281 | [32.0; 42.66666793823242] | [64.0] | 70 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 137 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20392009166 | 20392014366 | 5200 | 4660994938818 | 4660995191114 | 614 | 252296 | 250896 | 229038 | 250264 | 237151 | 1759 | 250895 | 215202 | 226030 | 225834 | 226452 | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 4096 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] | 2488 | 1300 | 2324 | 2224 | 4556 | 0 | 98304 | 98304 | 30281 | [128.0; 85.33333587646484] | [64.0] | 100 | |||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 138 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20392027216 | 20392055217 | 28001 | 4660995170733 | 4660995316377 | 621 | 145644 | 124654 | 64106 | 124544 | 102764 | 2286 | 124652 | 95866 | 122927 | 122671 | 123265 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/16918772478690167680/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] | 740 | 2164 | 4776 | 7472 | 7912 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 40 | ||||||||||||||||||||||
Matmul | tt_dnn_device | 139 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20392115797 | 20392122497 | 6700 | 4660995257257 | 4660995790104 | 642 | 532847 | 473071 | 459779 | 472945 | 469137 | 1733 | 473071 | 396353 | 446925 | 447052 | 446951 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] | 2856 | 1328 | 2340 | 2224 | 2876 | 0 | 73728 | 73728 | 22710 | [85.33333587646484; 85.33333587646484] | [128.0] | 50 | |||||||||||||||||||||||||||||
NlpCreateHeadsDeviceOperation | tt_dnn_device | 140 | 0 | {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} | HiFi4 | 56 | 20392157928 | 20392162878 | 4950 | 4660995778269 | 4660995865706 | 653 | 87437 | 74949 | 34842 | 74854 | 62027 | 2068 | 74940 | 74143 | 62554 | 61921 | 62443 | 8 | 1 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] | ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] | ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] | 1356 | 1248 | 1080 | 652 | 1496 | 0 | 1 | 1 | 1 | [] | [] | 90 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 141 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20392198808 | 20392203788 | 4980 | 4660995827111 | 4660995994195 | 662 | 167084 | 127823 | 53498 | 127811 | 97901 | 1806 | 127823 | 85639 | 120713 | 120858 | 120949 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] | 1632 | 1032 | 2384 | 2024 | 3232 | 0 | 9216 | 9216 | 1 | [341.3333435058594; 341.3333435058594] | [2048.0] | 60 | ||||||||||||||||||||||||||||||||||||
Softmax | tt_dnn_device | 142 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} | HiFi4 | 56 | 20392212778 | 20392221378 | 8600 | 4660995921390 | 4660996207840 | 646 | 286450 | 213016 | 197128 | 212886 | 205013 | 1947 | 213013 | 199519 | 212050 | 212145 | 212428 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 1 | 32 | 384 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] | ['softmax/14128336124697917683/'] | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] | ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] | 1128 | 1576 | 3344 | 2468 | 3164 | 0 | 1 | 1 | 1 | [18874368.0] | [18874368.0] | 90 | ||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 143 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20392251989 | 20392275899 | 23910 | 4660996193361 | 4660996313334 | 597 | 119973 | 104889 | 56006 | 104880 | 79453 | 1827 | 104889 | 101415 | 103375 | 103505 | 103733 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] | 1632 | 1032 | 2616 | 2004 | 3544 | 0 | 9216 | 9216 | 1 | [2048.0; 341.3333435058594] | [341.3333435058594] | 50 | ||||||||||||||||||||||||||||||||||||
ConcatenateHeads | tt_dnn_device | 144 | 0 | {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} | 56 | 20392285199 | 20392308679 | 23480 | 4660996265990 | 4660996339421 | 643 | 73431 | 25453 | 11522 | 25159 | 20912 | 1672 | 25452 | 24758 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | [] | [] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] | 720 | 904 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | [3145728.0] | [3145728.0] | 50 | |||||||||||||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 145 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20392320919 | 20392345969 | 25050 | 4660996326532 | 4660996428225 | 641 | 101693 | 88145 | 66557 | 88038 | 75631 | 1334 | 88145 | 52184 | 62694 | 62502 | 63115 | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] | 2540 | 1324 | 2304 | 2220 | 4552 | 0 | 24576 | 24576 | 7570 | [128.0; 85.33333587646484] | [256.0] | 50 | |||||||||||||||||||||||||||||
BinaryDeviceOperation | tt_dnn_device | 146 | 0 | {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} | HiFi4 | 56 | 20392359500 | 20392365280 | 5780 | 4660996408106 | 4660996477817 | 656 | 69711 | 48951 | 40470 | 48905 | 45894 | 1672 | 48945 | 48318 | 48384 | 47343 | 48008 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] | ['eltwise_binary_kernel/5705211043871595155/'] | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] | 712 | 932 | 1312 | 716 | 1588 | 0 | 1456 | 1456 | 1 | [4321.05517578125; 4321.05517578125] | [4321.05517578125] | 50 | ||||||||||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 147 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20392398800 | 20392404970 | 6170 | 4660996470798 | 4660996568701 | 603 | 97903 | 90289 | 43885 | 89644 | 74042 | 2286 | 90283 | 65243 | 88826 | 88578 | 89192 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/17805569453407806092/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] | 740 | 1852 | 4300 | 6312 | 6956 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 40 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 148 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} | HiFi2 | 56 | 20392470131 | 20392496891 | 26760 | 4660996523718 | 4660997079552 | 571 | 555834 | 510263 | 506915 | 510132 | 508846 | 1845 | 510263 | 430297 | 508399 | 509566 | 509571 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] | 2904 | 1320 | 3116 | 2804 | 3288 | 0 | 196608 | 196608 | 30281 | [32.0; 42.66666793823242] | [64.0] | 90 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 149 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20392513311 | 20392518281 | 4970 | 4660997077710 | 4660997332246 | 648 | 254536 | 252039 | 229236 | 251610 | 238415 | 1713 | 252039 | 216430 | 227297 | 227076 | 227699 | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 4096 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] | 2488 | 1300 | 2324 | 2224 | 4556 | 0 | 98304 | 98304 | 30281 | [128.0; 85.33333587646484] | [64.0] | 60 | |||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 150 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20392552741 | 20392565121 | 12380 | 4660997311368 | 4660997461868 | 646 | 150500 | 128997 | 65618 | 128890 | 105950 | 2231 | 128997 | 100084 | 127169 | 126916 | 127505 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/16918772478690167680/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] | 740 | 2164 | 4776 | 7472 | 7912 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 40 | ||||||||||||||||||||||
Matmul | tt_dnn_device | 151 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20392607732 | 20392614982 | 7250 | 4660997399914 | 4660997934595 | 617 | 534681 | 472088 | 458118 | 471988 | 467631 | 1800 | 472088 | 396462 | 446844 | 446985 | 446871 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] | 2856 | 1328 | 2340 | 2224 | 2876 | 0 | 73728 | 73728 | 22710 | [85.33333587646484; 85.33333587646484] | [128.0] | 50 | |||||||||||||||||||||||||||||
NlpCreateHeadsDeviceOperation | tt_dnn_device | 152 | 0 | {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} | HiFi4 | 56 | 20392651652 | 20392676363 | 24711 | 4660997922020 | 4660998010387 | 645 | 88367 | 75164 | 34428 | 75105 | 61628 | 2074 | 75153 | 74193 | 62684 | 61926 | 62390 | 8 | 1 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] | ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] | ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] | 1356 | 1248 | 1080 | 652 | 1496 | 0 | 1 | 1 | 1 | [] | [] | 50 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 153 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20392712823 | 20392718013 | 5190 | 4660997971183 | 4660998140623 | 668 | 169440 | 129568 | 52657 | 129555 | 96434 | 1737 | 129568 | 87663 | 123156 | 123290 | 123393 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] | 1632 | 1032 | 2384 | 2024 | 3232 | 0 | 9216 | 9216 | 1 | [341.3333435058594; 341.3333435058594] | [2048.0] | 60 | ||||||||||||||||||||||||||||||||||||
Softmax | tt_dnn_device | 154 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} | HiFi4 | 56 | 20392746783 | 20392754453 | 7670 | 4660998065244 | 4660998355648 | 636 | 290404 | 214390 | 196842 | 214274 | 205636 | 1943 | 214387 | 200882 | 213273 | 213389 | 213654 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 1 | 32 | 384 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] | ['softmax/14128336124697917683/'] | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] | ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] | 1128 | 1576 | 3344 | 2468 | 3164 | 0 | 1 | 1 | 1 | [18874368.0] | [18874368.0] | 90 | ||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 155 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20392764223 | 20392768273 | 4050 | 4660998339578 | 4660998457573 | 635 | 117995 | 101280 | 61593 | 101261 | 80029 | 1766 | 101280 | 97842 | 99768 | 99897 | 100129 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] | 1632 | 1032 | 2616 | 2004 | 3544 | 0 | 9216 | 9216 | 1 | [2048.0; 341.3333435058594] | [341.3333435058594] | 40 | ||||||||||||||||||||||||||||||||||||
ConcatenateHeads | tt_dnn_device | 156 | 0 | {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} | 56 | 20392797984 | 20392802204 | 4220 | 4660998419437 | 4660998483697 | 724 | 64260 | 25414 | 11751 | 25288 | 20904 | 1526 | 25413 | 24530 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | [] | [] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] | 720 | 904 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | [3145728.0] | [3145728.0] | 40 | |||||||||||||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 157 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20392917555 | 20392923175 | 5620 | 4660998471072 | 4660998573533 | 613 | 102461 | 89208 | 67977 | 89124 | 76028 | 1381 | 89208 | 52321 | 62666 | 62468 | 63102 | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] | 2540 | 1324 | 2304 | 2220 | 4552 | 0 | 24576 | 24576 | 7570 | [128.0; 85.33333587646484] | [256.0] | 40 | |||||||||||||||||||||||||||||
BinaryDeviceOperation | tt_dnn_device | 158 | 0 | {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} | HiFi4 | 56 | 20392979315 | 20392985755 | 6440 | 4660998553826 | 4660998623883 | 623 | 70057 | 49716 | 40876 | 49634 | 45376 | 1742 | 49716 | 49088 | 49172 | 48240 | 48824 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] | ['eltwise_binary_kernel/5705211043871595155/'] | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] | 712 | 932 | 1312 | 716 | 1588 | 0 | 1456 | 1456 | 1 | [4321.05517578125; 4321.05517578125] | [4321.05517578125] | 70 | ||||||||||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 159 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20393001076 | 20393028246 | 27170 | 4660998616535 | 4660998717589 | 688 | 101054 | 93045 | 57041 | 92946 | 77728 | 2193 | 93045 | 67866 | 91439 | 91190 | 91770 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/17805569453407806092/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] | 740 | 1852 | 4300 | 6312 | 6956 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 40 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 160 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} | HiFi2 | 56 | 20393075406 | 20393083256 | 7850 | 4660998683017 | 4660999228797 | 648 | 545780 | 510540 | 507216 | 510450 | 508964 | 1756 | 510540 | 430016 | 508522 | 509672 | 509692 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] | 2904 | 1320 | 3116 | 2804 | 3288 | 0 | 196608 | 196608 | 30281 | [32.0; 42.66666793823242] | [64.0] | 70 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 161 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20393117287 | 20393122767 | 5480 | 4660999227431 | 4660999479377 | 656 | 251946 | 249927 | 227908 | 249395 | 236587 | 1713 | 249927 | 214483 | 225461 | 225236 | 225871 | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 4096 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] | 2488 | 1300 | 2324 | 2224 | 4556 | 0 | 98304 | 98304 | 30281 | [128.0; 85.33333587646484] | [64.0] | 120 | |||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 162 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20393175337 | 20393183027 | 7690 | 4660999458832 | 4660999604297 | 641 | 145465 | 124292 | 58099 | 124228 | 102032 | 2301 | 124292 | 95769 | 122817 | 122576 | 123153 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/16918772478690167680/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] | 740 | 2164 | 4776 | 7472 | 7912 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 60 | ||||||||||||||||||||||
Matmul | tt_dnn_device | 163 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20393230398 | 20393237238 | 6840 | 4660999539536 | 4661000077744 | 659 | 538208 | 472775 | 458797 | 472678 | 467568 | 1756 | 472775 | 396446 | 447196 | 447324 | 447230 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] | 2856 | 1328 | 2340 | 2224 | 2876 | 0 | 73728 | 73728 | 22710 | [85.33333587646484; 85.33333587646484] | [128.0] | 80 | |||||||||||||||||||||||||||||
NlpCreateHeadsDeviceOperation | tt_dnn_device | 164 | 0 | {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} | HiFi4 | 56 | 20393253188 | 20393278738 | 25550 | 4661000065252 | 4661000154025 | 633 | 88773 | 75659 | 34638 | 75600 | 61966 | 2049 | 75649 | 74624 | 63505 | 62791 | 63204 | 8 | 1 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] | ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] | ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] | 1356 | 1248 | 1080 | 652 | 1496 | 0 | 1 | 1 | 1 | [] | [] | 30 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 165 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20393315369 | 20393339469 | 24100 | 4661000114479 | 4661000284382 | 679 | 169903 | 129671 | 55044 | 129662 | 97161 | 1740 | 129671 | 88813 | 123554 | 123694 | 123789 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] | 1632 | 1032 | 2384 | 2024 | 3232 | 0 | 9216 | 9216 | 1 | [341.3333435058594; 341.3333435058594] | [2048.0] | 50 | ||||||||||||||||||||||||||||||||||||
Softmax | tt_dnn_device | 166 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} | HiFi4 | 56 | 20393349159 | 20393377299 | 28140 | 4661000211265 | 4661000497550 | 665 | 286285 | 212513 | 196897 | 212393 | 204521 | 1953 | 212499 | 198910 | 211316 | 211439 | 211705 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 1 | 32 | 384 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] | ['softmax/14128336124697917683/'] | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] | ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] | 1128 | 1576 | 3344 | 2468 | 3164 | 0 | 1 | 1 | 1 | [18874368.0] | [18874368.0] | 120 | ||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 167 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20393387729 | 20393392089 | 4360 | 4661000483328 | 4661000601483 | 621 | 118155 | 103307 | 61803 | 103266 | 80530 | 1763 | 103307 | 99796 | 101719 | 101864 | 102089 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] | 1632 | 1032 | 2616 | 2004 | 3544 | 0 | 9216 | 9216 | 1 | [2048.0; 341.3333435058594] | [341.3333435058594] | 40 | ||||||||||||||||||||||||||||||||||||
ConcatenateHeads | tt_dnn_device | 168 | 0 | {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} | 56 | 20393421690 | 20393425750 | 4060 | 4661000561541 | 4661000628503 | 704 | 66962 | 26324 | 11976 | 26244 | 20993 | 1561 | 26323 | 24107 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | [] | [] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] | 720 | 904 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | [3145728.0] | [3145728.0] | 50 | |||||||||||||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 169 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20393438560 | 20393443540 | 4980 | 4661000615206 | 4661000718646 | 672 | 103440 | 89453 | 66475 | 89362 | 75946 | 1285 | 89452 | 52261 | 62683 | 62486 | 63095 | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] | 2540 | 1324 | 2304 | 2220 | 4552 | 0 | 24576 | 24576 | 7570 | [128.0; 85.33333587646484] | [256.0] | 40 | |||||||||||||||||||||||||||||
BinaryDeviceOperation | tt_dnn_device | 170 | 0 | {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} | HiFi4 | 56 | 20393477150 | 20393484170 | 7020 | 4661000697137 | 4661000769808 | 644 | 72671 | 50531 | 40895 | 50431 | 45811 | 1692 | 50522 | 49896 | 49965 | 49028 | 49608 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] | ['eltwise_binary_kernel/5705211043871595155/'] | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] | 712 | 932 | 1312 | 716 | 1588 | 0 | 1456 | 1456 | 1 | [4321.05517578125; 4321.05517578125] | [4321.05517578125] | 90 | ||||||||||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 171 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20393537701 | 20393568561 | 30860 | 4661000761646 | 4661000860699 | 642 | 99053 | 90251 | 43424 | 90144 | 74631 | 2299 | 90251 | 65030 | 88682 | 88418 | 89030 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/17805569453407806092/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] | 740 | 1852 | 4300 | 6312 | 6956 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 40 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 172 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} | HiFi2 | 56 | 20393595411 | 20393624762 | 29351 | 4661000815301 | 4661001370935 | 643 | 555634 | 509571 | 506853 | 509483 | 508188 | 1765 | 509571 | 429802 | 507564 | 508709 | 508711 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] | 2904 | 1320 | 3116 | 2804 | 3288 | 0 | 196608 | 196608 | 30281 | [32.0; 42.66666793823242] | [64.0] | 20931 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 173 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20393639522 | 20393644532 | 5010 | 4661001370115 | 4661001621596 | 668 | 251481 | 249999 | 228726 | 249431 | 237131 | 1701 | 249998 | 215449 | 226302 | 226110 | 226742 | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 4096 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] | 2488 | 1300 | 2324 | 2224 | 4556 | 0 | 98304 | 98304 | 30281 | [128.0; 85.33333587646484] | [64.0] | 40 | |||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 174 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20393678962 | 20393706142 | 27180 | 4661001601783 | 4661001743103 | 659 | 141320 | 120860 | 57530 | 120775 | 96691 | 2281 | 120860 | 92126 | 119109 | 118868 | 119460 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/16918772478690167680/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] | 740 | 2164 | 4776 | 7472 | 7912 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 60 | ||||||||||||||||||||||
Matmul | tt_dnn_device | 175 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20393767663 | 20393774583 | 6920 | 4661001681200 | 4661002216646 | 657 | 535446 | 472875 | 459642 | 472783 | 469429 | 1743 | 472873 | 396704 | 447545 | 447690 | 447592 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] | 2856 | 1328 | 2340 | 2224 | 2876 | 0 | 73728 | 73728 | 22710 | [85.33333587646484; 85.33333587646484] | [128.0] | 90 | |||||||||||||||||||||||||||||
NlpCreateHeadsDeviceOperation | tt_dnn_device | 176 | 0 | {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} | HiFi4 | 56 | 20393789613 | 20393794093 | 4480 | 4661002204881 | 4661002291914 | 662 | 87033 | 74611 | 35016 | 74487 | 62128 | 2060 | 74610 | 74258 | 62247 | 61640 | 62069 | 8 | 1 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] | ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] | ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] | 1356 | 1248 | 1080 | 652 | 1496 | 0 | 1 | 1 | 1 | [] | [] | 40 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 177 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20393831344 | 20393836234 | 4890 | 4661002253822 | 4661002420478 | 624 | 166656 | 127936 | 51733 | 127932 | 97166 | 1789 | 127936 | 81876 | 120723 | 120871 | 120964 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] | 1632 | 1032 | 2384 | 2024 | 3232 | 0 | 9216 | 9216 | 1 | [341.3333435058594; 341.3333435058594] | [2048.0] | 40 | ||||||||||||||||||||||||||||||||||||
Softmax | tt_dnn_device | 178 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} | HiFi4 | 56 | 20393865474 | 20393873824 | 8350 | 4661002345812 | 4661002634635 | 660 | 288823 | 213511 | 197047 | 213384 | 204754 | 1921 | 213510 | 199957 | 212444 | 212546 | 212831 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 1 | 32 | 384 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] | ['softmax/14128336124697917683/'] | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] | ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] | 1128 | 1576 | 3344 | 2468 | 3164 | 0 | 1 | 1 | 1 | [18874368.0] | [18874368.0] | 100 | ||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 179 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20393904054 | 20393908374 | 4320 | 4661002619682 | 4661002741083 | 611 | 121401 | 105835 | 65642 | 105792 | 80594 | 1800 | 105835 | 102344 | 104277 | 104403 | 104624 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] | 1632 | 1032 | 2616 | 2004 | 3544 | 0 | 9216 | 9216 | 1 | [2048.0; 341.3333435058594] | [341.3333435058594] | 50 | ||||||||||||||||||||||||||||||||||||
ConcatenateHeads | tt_dnn_device | 180 | 0 | {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} | 56 | 20393956085 | 20393960285 | 4200 | 4661002702412 | 4661002767865 | 697 | 65453 | 26087 | 11653 | 25314 | 21028 | 1583 | 26086 | 25304 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | [] | [] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] | 720 | 904 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | [3145728.0] | [3145728.0] | 80 | |||||||||||||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 181 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20393972735 | 20393977845 | 5110 | 4661002754473 | 4661002857507 | 710 | 103034 | 88918 | 66523 | 88830 | 76128 | 1286 | 88918 | 52516 | 62971 | 62761 | 63381 | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] | 2540 | 1324 | 2304 | 2220 | 4552 | 0 | 24576 | 24576 | 7570 | [128.0; 85.33333587646484] | [256.0] | 40 | |||||||||||||||||||||||||||||
BinaryDeviceOperation | tt_dnn_device | 182 | 0 | {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} | HiFi4 | 56 | 20394012085 | 20394018025 | 5940 | 4661002836566 | 4661002907217 | 643 | 70651 | 49069 | 40342 | 48921 | 45816 | 1690 | 49069 | 48432 | 48520 | 47561 | 48106 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] | ['eltwise_binary_kernel/5705211043871595155/'] | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] | 712 | 932 | 1312 | 716 | 1588 | 0 | 1456 | 1456 | 1 | [4321.05517578125; 4321.05517578125] | [4321.05517578125] | 60 | ||||||||||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 183 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20394031665 | 20394058096 | 26431 | 4661002899942 | 4661002999363 | 599 | 99421 | 91555 | 43062 | 90441 | 74733 | 2335 | 91555 | 66303 | 90101 | 89846 | 90457 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/17805569453407806092/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] | 740 | 1852 | 4300 | 6312 | 6956 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 50 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 184 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} | HiFi2 | 56 | 20394120946 | 20394147197 | 26251 | 4661002952370 | 4661003509264 | 605 | 556894 | 509289 | 506030 | 509271 | 507828 | 1785 | 509288 | 428963 | 507380 | 508525 | 508540 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] | 2904 | 1320 | 3116 | 2804 | 3288 | 0 | 196608 | 196608 | 30281 | [32.0; 42.66666793823242] | [64.0] | 50 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 185 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20394162907 | 20394167957 | 5050 | 4661003508022 | 4661003761650 | 641 | 253628 | 251740 | 228369 | 251487 | 237289 | 1772 | 251740 | 215492 | 226252 | 226040 | 226651 | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 4096 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] | 2488 | 1300 | 2324 | 2224 | 4556 | 0 | 98304 | 98304 | 30281 | [128.0; 85.33333587646484] | [64.0] | 50 | |||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 186 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20394180947 | 20394209107 | 28160 | 4661003740115 | 4661003882967 | 632 | 142852 | 120702 | 55503 | 120595 | 97695 | 2307 | 120697 | 92286 | 119338 | 119091 | 119680 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/16918772478690167680/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] | 740 | 2164 | 4776 | 7472 | 7912 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 110 | ||||||||||||||||||||||
Matmul | tt_dnn_device | 187 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20394232267 | 20394258778 | 26511 | 4661003819190 | 4661004357211 | 642 | 538021 | 473586 | 458598 | 473493 | 468255 | 1754 | 473586 | 397795 | 448117 | 448239 | 448143 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] | 2856 | 1328 | 2340 | 2224 | 2876 | 0 | 73728 | 73728 | 22710 | [85.33333587646484; 85.33333587646484] | [128.0] | 40 | |||||||||||||||||||||||||||||
NlpCreateHeadsDeviceOperation | tt_dnn_device | 188 | 0 | {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} | HiFi4 | 56 | 20394293748 | 20394317878 | 24130 | 4661004343715 | 4661004432721 | 632 | 89006 | 74888 | 35024 | 74760 | 61518 | 2071 | 74870 | 74360 | 62174 | 61601 | 62621 | 8 | 1 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] | ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] | ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] | 1356 | 1248 | 1080 | 652 | 1496 | 0 | 1 | 1 | 1 | [] | [] | 120 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 189 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20394354078 | 20394359929 | 5851 | 4661004394332 | 4661004564295 | 628 | 169963 | 130951 | 55063 | 130942 | 97474 | 1765 | 130940 | 86113 | 123753 | 123876 | 123977 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] | 1632 | 1032 | 2384 | 2024 | 3232 | 0 | 9216 | 9216 | 1 | [341.3333435058594; 341.3333435058594] | [2048.0] | 40 | ||||||||||||||||||||||||||||||||||||
Softmax | tt_dnn_device | 190 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} | HiFi4 | 56 | 20394369059 | 20394377669 | 8610 | 4661004489928 | 4661004777122 | 648 | 287194 | 212177 | 196430 | 212053 | 203729 | 1918 | 212176 | 198655 | 211115 | 211216 | 211501 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 1 | 32 | 384 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] | ['softmax/14128336124697917683/'] | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] | ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] | 1128 | 1576 | 3344 | 2468 | 3164 | 0 | 1 | 1 | 1 | [18874368.0] | [18874368.0] | 70 | ||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 191 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20394408879 | 20394413499 | 4620 | 4661004762777 | 4661004888642 | 617 | 125865 | 110896 | 63271 | 110889 | 84486 | 1789 | 110896 | 107460 | 109335 | 109478 | 109696 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] | 1632 | 1032 | 2616 | 2004 | 3544 | 0 | 9216 | 9216 | 1 | [2048.0; 341.3333435058594] | [341.3333435058594] | 40 | ||||||||||||||||||||||||||||||||||||
ConcatenateHeads | tt_dnn_device | 192 | 0 | {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} | 56 | 20394422859 | 20394446989 | 24130 | 4661004842530 | 4661004914539 | 635 | 72009 | 25265 | 11664 | 25027 | 20894 | 1681 | 25264 | 24333 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | [] | [] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] | 720 | 904 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | [3145728.0] | [3145728.0] | 30 | |||||||||||||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 193 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20394478400 | 20394483550 | 5150 | 4661004901966 | 4661005003187 | 638 | 101221 | 88003 | 67300 | 87923 | 76193 | 1342 | 87993 | 52102 | 62531 | 62320 | 62938 | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] | 2540 | 1324 | 2304 | 2220 | 4552 | 0 | 24576 | 24576 | 7570 | [128.0; 85.33333587646484] | [256.0] | 40 | |||||||||||||||||||||||||||||
BinaryDeviceOperation | tt_dnn_device | 194 | 0 | {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} | HiFi4 | 56 | 20394516760 | 20394522830 | 6070 | 4661004984522 | 4661005054001 | 647 | 69479 | 50143 | 39944 | 50034 | 46374 | 1680 | 50142 | 49522 | 49585 | 48759 | 49348 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] | ['eltwise_binary_kernel/5705211043871595155/'] | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] | 712 | 932 | 1312 | 716 | 1588 | 0 | 1456 | 1456 | 1 | [4321.05517578125; 4321.05517578125] | [4321.05517578125] | 50 | ||||||||||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 195 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20394557410 | 20394569481 | 12071 | 4661005045288 | 4661005145983 | 679 | 100695 | 91341 | 40230 | 90985 | 75407 | 2317 | 91339 | 66060 | 89952 | 89687 | 90297 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/17805569453407806092/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] | 740 | 1852 | 4300 | 6312 | 6956 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 40 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 196 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} | HiFi2 | 56 | 20394615591 | 20394622611 | 7020 | 4661005096278 | 4661005656743 | 617 | 560465 | 510123 | 507104 | 509993 | 508601 | 1776 | 510123 | 429541 | 508153 | 509293 | 509301 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] | 2904 | 1320 | 3116 | 2804 | 3288 | 0 | 196608 | 196608 | 30281 | [32.0; 42.66666793823242] | [64.0] | 40 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 197 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20394657911 | 20394663281 | 5370 | 4661005655555 | 4661005907486 | 739 | 251931 | 250006 | 227924 | 249913 | 236674 | 1692 | 250006 | 214378 | 225125 | 224896 | 225539 | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 4096 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] | 2488 | 1300 | 2324 | 2224 | 4556 | 0 | 98304 | 98304 | 30281 | [128.0; 85.33333587646484] | [64.0] | 40 | |||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 198 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20394717082 | 20394723552 | 6470 | 4661005886877 | 4661006037453 | 657 | 150576 | 129329 | 77150 | 129268 | 107545 | 2266 | 129329 | 100777 | 127887 | 127636 | 128216 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/16918772478690167680/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] | 740 | 2164 | 4776 | 7472 | 7912 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 40 | ||||||||||||||||||||||
Matmul | tt_dnn_device | 199 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20394765692 | 20394772002 | 6310 | 4661005986710 | 4661006512388 | 677 | 525678 | 474238 | 458929 | 474141 | 468168 | 1744 | 474236 | 398254 | 449176 | 449326 | 449219 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] | 2856 | 1328 | 2340 | 2224 | 2876 | 0 | 73728 | 73728 | 22710 | [85.33333587646484; 85.33333587646484] | [128.0] | 40 | |||||||||||||||||||||||||||||
NlpCreateHeadsDeviceOperation | tt_dnn_device | 200 | 0 | {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} | HiFi4 | 56 | 20394806993 | 20394811873 | 4880 | 4661006498538 | 4661006589075 | 644 | 90537 | 76044 | 34951 | 75920 | 62151 | 2039 | 76036 | 75300 | 63486 | 62864 | 63676 | 8 | 1 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] | ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] | ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] | 1356 | 1248 | 1080 | 652 | 1496 | 0 | 1 | 1 | 1 | [] | [] | 50 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 201 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20394848303 | 20394854363 | 6060 | 4661006549434 | 4661006718592 | 619 | 169158 | 128909 | 52019 | 128896 | 96636 | 1840 | 128909 | 81640 | 121455 | 121616 | 121718 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] | 1632 | 1032 | 2384 | 2024 | 3232 | 0 | 9216 | 9216 | 1 | [341.3333435058594; 341.3333435058594] | [2048.0] | 90 | ||||||||||||||||||||||||||||||||||||
Softmax | tt_dnn_device | 202 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} | HiFi4 | 56 | 20394888694 | 20394916294 | 27600 | 4661006643231 | 4661006931503 | 659 | 288272 | 212259 | 196629 | 212132 | 204459 | 1940 | 212258 | 198697 | 211140 | 211252 | 211538 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 1 | 32 | 384 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] | ['softmax/14128336124697917683/'] | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] | ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] | 1128 | 1576 | 3344 | 2468 | 3164 | 0 | 1 | 1 | 1 | [18874368.0] | [18874368.0] | 60 | ||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 203 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20394926714 | 20394931064 | 4350 | 4661006917284 | 4661007035765 | 603 | 118481 | 103656 | 65653 | 103615 | 80640 | 1787 | 103651 | 100159 | 102077 | 102207 | 102434 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] | 1632 | 1032 | 2616 | 2004 | 3544 | 0 | 9216 | 9216 | 1 | [2048.0; 341.3333435058594] | [341.3333435058594] | 40 | ||||||||||||||||||||||||||||||||||||
ConcatenateHeads | tt_dnn_device | 204 | 0 | {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} | 56 | 20394960384 | 20394964714 | 4330 | 4661006999300 | 4661007061735 | 711 | 62435 | 25266 | 11829 | 25193 | 20869 | 1547 | 25265 | 24357 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | [] | [] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] | 720 | 904 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | [3145728.0] | [3145728.0] | 60 | |||||||||||||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 205 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20394997565 | 20395002535 | 4970 | 4661007049347 | 4661007151808 | 662 | 102461 | 89387 | 66121 | 89291 | 75743 | 1314 | 89387 | 52191 | 62619 | 62423 | 63047 | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] | 2540 | 1324 | 2304 | 2220 | 4552 | 0 | 24576 | 24576 | 7570 | [128.0; 85.33333587646484] | [256.0] | 40 | |||||||||||||||||||||||||||||
BinaryDeviceOperation | tt_dnn_device | 206 | 0 | {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} | HiFi4 | 56 | 20395016675 | 20395043125 | 26450 | 4661007130242 | 4661007202053 | 644 | 71811 | 49621 | 39905 | 49514 | 45266 | 1701 | 49615 | 49015 | 49081 | 48224 | 48809 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] | ['eltwise_binary_kernel/5705211043871595155/'] | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] | 712 | 932 | 1312 | 716 | 1588 | 0 | 1456 | 1456 | 1 | [4321.05517578125; 4321.05517578125] | [4321.05517578125] | 40 | ||||||||||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 207 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20395057415 | 20395084785 | 27370 | 4661007193822 | 4661007292210 | 634 | 98388 | 89518 | 40034 | 89407 | 74402 | 2312 | 89511 | 64273 | 88037 | 87783 | 88389 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/17805569453407806092/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] | 740 | 1852 | 4300 | 6312 | 6956 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 40 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 208 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} | HiFi2 | 56 | 20395148576 | 20395155966 | 7390 | 4661007244135 | 4661007802254 | 646 | 558119 | 509386 | 506383 | 509298 | 507905 | 1769 | 509386 | 429307 | 507372 | 508524 | 508542 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] | 2904 | 1320 | 3116 | 2804 | 3288 | 0 | 196608 | 196608 | 30281 | [32.0; 42.66666793823242] | [64.0] | 70 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 209 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20395170306 | 20395196816 | 26510 | 4661007801157 | 4661008053595 | 684 | 252438 | 250662 | 228047 | 249630 | 236912 | 1703 | 250661 | 215402 | 226200 | 225988 | 226620 | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 4096 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] | 2488 | 1300 | 2324 | 2224 | 4556 | 0 | 98304 | 98304 | 30281 | [128.0; 85.33333587646484] | [64.0] | 90 | |||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 210 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20395210657 | 20395217207 | 6550 | 4661008032444 | 4661008177325 | 640 | 144881 | 123098 | 55428 | 123031 | 97040 | 2273 | 123092 | 94685 | 121698 | 121453 | 122040 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/16918772478690167680/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] | 740 | 2164 | 4776 | 7472 | 7912 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 50 | ||||||||||||||||||||||
Matmul | tt_dnn_device | 211 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20395279187 | 20395305198 | 26011 | 4661008111066 | 4661008650261 | 597 | 539195 | 472331 | 459393 | 472238 | 468589 | 1760 | 472331 | 396063 | 446663 | 446791 | 446689 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] | 2856 | 1328 | 2340 | 2224 | 2876 | 0 | 73728 | 73728 | 22710 | [85.33333587646484; 85.33333587646484] | [128.0] | 40 | |||||||||||||||||||||||||||||
NlpCreateHeadsDeviceOperation | tt_dnn_device | 212 | 0 | {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} | HiFi4 | 56 | 20395320638 | 20395326078 | 5440 | 4661008638726 | 4661008726863 | 628 | 88137 | 75967 | 35069 | 75832 | 62127 | 2108 | 75956 | 75286 | 62931 | 62256 | 63374 | 8 | 1 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] | ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] | ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] | 1356 | 1248 | 1080 | 652 | 1496 | 0 | 1 | 1 | 1 | [] | [] | 60 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 213 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20395362938 | 20395388578 | 25640 | 4661008687467 | 4661008856136 | 634 | 168669 | 128648 | 55348 | 128639 | 97803 | 1799 | 128646 | 80817 | 122294 | 122417 | 122521 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] | 1632 | 1032 | 2384 | 2024 | 3232 | 0 | 9216 | 9216 | 1 | [341.3333435058594; 341.3333435058594] | [2048.0] | 90 | ||||||||||||||||||||||||||||||||||||
Softmax | tt_dnn_device | 214 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} | HiFi4 | 56 | 20395398308 | 20395405838 | 7530 | 4661008784364 | 4661009069509 | 656 | 285145 | 212727 | 197133 | 212614 | 204936 | 1915 | 212714 | 199243 | 211748 | 211864 | 212126 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 1 | 32 | 384 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] | ['softmax/14128336124697917683/'] | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] | ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] | 1128 | 1576 | 3344 | 2468 | 3164 | 0 | 1 | 1 | 1 | [18874368.0] | [18874368.0] | 40 | ||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 215 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20395415169 | 20395439939 | 24770 | 4661009055402 | 4661009174774 | 636 | 119372 | 104624 | 64173 | 104589 | 82332 | 1795 | 104617 | 101108 | 102954 | 103086 | 103307 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] | 1632 | 1032 | 2616 | 2004 | 3544 | 0 | 9216 | 9216 | 1 | [2048.0; 341.3333435058594] | [341.3333435058594] | 50 | ||||||||||||||||||||||||||||||||||||
ConcatenateHeads | tt_dnn_device | 216 | 0 | {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} | 56 | 20395468239 | 20395472269 | 4030 | 4661009135824 | 4661009200745 | 704 | 64921 | 25271 | 11706 | 25127 | 20981 | 1566 | 25270 | 24829 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | [] | [] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] | 720 | 904 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | [3145728.0] | [3145728.0] | 40 | |||||||||||||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 217 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20395504019 | 20395508789 | 4770 | 4661009188218 | 4661009289794 | 711 | 101576 | 88318 | 69010 | 87973 | 76211 | 1278 | 88318 | 52335 | 62983 | 62784 | 63401 | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] | 2540 | 1324 | 2304 | 2220 | 4552 | 0 | 24576 | 24576 | 7570 | [128.0; 85.33333587646484] | [256.0] | 60 | |||||||||||||||||||||||||||||
BinaryDeviceOperation | tt_dnn_device | 218 | 0 | {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} | HiFi4 | 56 | 20395521820 | 20395528220 | 6400 | 4661009271951 | 4661009339834 | 650 | 67883 | 49407 | 40854 | 49294 | 45803 | 1697 | 49406 | 48882 | 48971 | 48131 | 48713 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] | ['eltwise_binary_kernel/5705211043871595155/'] | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] | 712 | 932 | 1312 | 716 | 1588 | 0 | 1456 | 1456 | 1 | [4321.05517578125; 4321.05517578125] | [4321.05517578125] | 50 | ||||||||||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 219 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20395567920 | 20395593810 | 25890 | 4661009332700 | 4661009430573 | 626 | 97873 | 90115 | 40327 | 89771 | 73762 | 2312 | 90115 | 64989 | 88750 | 88496 | 89106 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/17805569453407806092/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] | 740 | 1852 | 4300 | 6312 | 6956 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 50 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 220 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} | HiFi2 | 56 | 20395639701 | 20395646911 | 7210 | 4661009382291 | 4661009942139 | 646 | 559848 | 510910 | 507786 | 510796 | 509553 | 1730 | 510908 | 431379 | 508878 | 510036 | 510052 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] | 2904 | 1320 | 3116 | 2804 | 3288 | 0 | 196608 | 196608 | 30281 | [32.0; 42.66666793823242] | [64.0] | 120 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 221 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20395681741 | 20395706781 | 25040 | 4661009940583 | 4661010195396 | 699 | 254813 | 252557 | 229721 | 251314 | 238248 | 1721 | 252556 | 216459 | 227368 | 227167 | 227813 | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 4096 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] | 2488 | 1300 | 2324 | 2224 | 4556 | 0 | 98304 | 98304 | 30281 | [128.0; 85.33333587646484] | [64.0] | 60 | |||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 222 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20395719881 | 20395726502 | 6621 | 4661010174029 | 4661010319181 | 624 | 145152 | 123175 | 67972 | 123047 | 99163 | 2263 | 123168 | 94410 | 121416 | 121166 | 121752 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/16918772478690167680/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] | 740 | 2164 | 4776 | 7472 | 7912 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 40 | ||||||||||||||||||||||
Matmul | tt_dnn_device | 223 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20395791992 | 20395800522 | 8530 | 4661010265398 | 4661010792080 | 617 | 526682 | 472265 | 458638 | 472168 | 467857 | 1781 | 472265 | 396111 | 446772 | 446919 | 446820 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] | 2856 | 1328 | 2340 | 2224 | 2876 | 0 | 73728 | 73728 | 22710 | [85.33333587646484; 85.33333587646484] | [128.0] | 140 | |||||||||||||||||||||||||||||
NlpCreateHeadsDeviceOperation | tt_dnn_device | 224 | 0 | {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} | HiFi4 | 56 | 20395816132 | 20395841053 | 24921 | 4661010779907 | 4661010867834 | 650 | 87927 | 75116 | 35010 | 75062 | 61792 | 2059 | 75098 | 74274 | 62541 | 61847 | 62637 | 8 | 1 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] | ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] | ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] | 1356 | 1248 | 1080 | 652 | 1496 | 0 | 1 | 1 | 1 | [] | [] | 90 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 225 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20395876273 | 20395901833 | 25560 | 4661010829221 | 4661010999708 | 698 | 170487 | 131176 | 53312 | 131163 | 96323 | 1739 | 131176 | 83214 | 123679 | 123846 | 123934 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] | 1632 | 1032 | 2384 | 2024 | 3232 | 0 | 9216 | 9216 | 1 | [341.3333435058594; 341.3333435058594] | [2048.0] | 120 | ||||||||||||||||||||||||||||||||||||
Softmax | tt_dnn_device | 226 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} | HiFi4 | 56 | 20395911333 | 20395918713 | 7380 | 4661010923378 | 4661011211797 | 657 | 288419 | 211433 | 196418 | 211318 | 203663 | 1919 | 211432 | 197893 | 210359 | 210493 | 210755 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 1 | 32 | 384 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] | ['softmax/14128336124697917683/'] | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] | ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] | 1128 | 1576 | 3344 | 2468 | 3164 | 0 | 1 | 1 | 1 | [18874368.0] | [18874368.0] | 40 | ||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 227 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20395928253 | 20395932353 | 4100 | 4661011198277 | 4661011315673 | 631 | 117396 | 103236 | 66644 | 103202 | 81881 | 1753 | 103236 | 99685 | 101589 | 101735 | 101959 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] | 1632 | 1032 | 2616 | 2004 | 3544 | 0 | 9216 | 9216 | 1 | [2048.0; 341.3333435058594] | [341.3333435058594] | 50 | ||||||||||||||||||||||||||||||||||||
ConcatenateHeads | tt_dnn_device | 228 | 0 | {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} | 56 | 20395940764 | 20395966034 | 25270 | 4661011280622 | 4661011341507 | 716 | 60885 | 25131 | 11757 | 24894 | 20840 | 1553 | 25130 | 24388 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | [] | [] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] | 720 | 904 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | [3145728.0] | [3145728.0] | 50 | |||||||||||||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 229 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20395997204 | 20396002504 | 5300 | 4661011329167 | 4661011432312 | 696 | 103145 | 90092 | 66459 | 90008 | 76088 | 1291 | 90091 | 52208 | 63026 | 62835 | 63451 | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] | 2540 | 1324 | 2304 | 2220 | 4552 | 0 | 24576 | 24576 | 7570 | [128.0; 85.33333587646484] | [256.0] | 40 | |||||||||||||||||||||||||||||
BinaryDeviceOperation | tt_dnn_device | 230 | 0 | {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} | HiFi4 | 56 | 20396036584 | 20396061775 | 25191 | 4661011410259 | 4661011482440 | 640 | 72181 | 49476 | 39682 | 49365 | 46001 | 1694 | 49474 | 48863 | 48931 | 48038 | 48662 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] | ['eltwise_binary_kernel/5705211043871595155/'] | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] | 712 | 932 | 1312 | 716 | 1588 | 0 | 1456 | 1456 | 1 | [4321.05517578125; 4321.05517578125] | [4321.05517578125] | 70 | ||||||||||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 231 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20396094975 | 20396101095 | 6120 | 4661011474128 | 4661011574541 | 648 | 100413 | 91482 | 50346 | 90374 | 75146 | 2283 | 91475 | 66305 | 89723 | 89471 | 90062 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/17805569453407806092/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] | 740 | 1852 | 4300 | 6312 | 6956 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 40 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 232 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} | HiFi2 | 56 | 20396124325 | 20396130475 | 6150 | 4661011534892 | 4661012085201 | 642 | 550309 | 509999 | 507139 | 509907 | 508574 | 1763 | 509999 | 429476 | 507972 | 509118 | 509132 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] | 2904 | 1320 | 3116 | 2804 | 3288 | 0 | 196608 | 196608 | 30281 | [32.0; 42.66666793823242] | [64.0] | 60 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 233 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20396185756 | 20396191036 | 5280 | 4661012083832 | 4661012335217 | 702 | 251385 | 249315 | 227841 | 249007 | 236330 | 1673 | 249304 | 214615 | 225394 | 225185 | 225825 | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 4096 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] | 2488 | 1300 | 2324 | 2224 | 4556 | 0 | 98304 | 98304 | 30281 | [128.0; 85.33333587646484] | [64.0] | 40 | |||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 234 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20396203726 | 20396230406 | 26680 | 4661012315383 | 4661012458848 | 625 | 143465 | 123020 | 82824 | 122957 | 102537 | 2291 | 123020 | 94457 | 121537 | 121294 | 121879 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/16918772478690167680/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] | 740 | 2164 | 4776 | 7472 | 7912 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 50 | ||||||||||||||||||||||
Matmul | tt_dnn_device | 235 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20396291477 | 20396298137 | 6660 | 4661012420087 | 4661012931191 | 675 | 511104 | 471651 | 457907 | 471553 | 467181 | 1717 | 471651 | 395636 | 446171 | 446309 | 446209 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] | 2856 | 1328 | 2340 | 2224 | 2876 | 0 | 73728 | 73728 | 22710 | [85.33333587646484; 85.33333587646484] | [128.0] | 50 | |||||||||||||||||||||||||||||
NlpCreateHeadsDeviceOperation | tt_dnn_device | 236 | 0 | {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} | HiFi4 | 56 | 20396312487 | 20396316827 | 4340 | 4661012918935 | 4661013007180 | 640 | 88245 | 75365 | 35094 | 75240 | 61905 | 2055 | 75357 | 74759 | 62525 | 61899 | 63050 | 8 | 1 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] | ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] | ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] | 1356 | 1248 | 1080 | 652 | 1496 | 0 | 1 | 1 | 1 | [] | [] | 30 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 237 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20396331997 | 20396357588 | 25591 | 4661012968396 | 4661013138553 | 612 | 170157 | 130758 | 52778 | 130749 | 97467 | 1792 | 130756 | 85869 | 123906 | 124055 | 124159 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] | 1632 | 1032 | 2384 | 2024 | 3232 | 0 | 9216 | 9216 | 1 | [341.3333435058594; 341.3333435058594] | [2048.0] | 50 | ||||||||||||||||||||||||||||||||||||
Softmax | tt_dnn_device | 238 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} | HiFi4 | 56 | 20396385828 | 20396393248 | 7420 | 4661013062077 | 4661013354348 | 643 | 292271 | 215155 | 196909 | 215025 | 206147 | 1971 | 215155 | 201617 | 214135 | 214246 | 214523 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 1 | 32 | 384 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] | ['softmax/14128336124697917683/'] | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] | ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] | 1128 | 1576 | 3344 | 2468 | 3164 | 0 | 1 | 1 | 1 | [18874368.0] | [18874368.0] | 50 | ||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 239 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20396441778 | 20396446038 | 4260 | 4661013337600 | 4661013457348 | 620 | 119748 | 102373 | 58757 | 102335 | 78505 | 1782 | 102373 | 98947 | 100758 | 100901 | 101125 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] | 1632 | 1032 | 2616 | 2004 | 3544 | 0 | 9216 | 9216 | 1 | [2048.0; 341.3333435058594] | [341.3333435058594] | 50 | ||||||||||||||||||||||||||||||||||||
ConcatenateHeads | tt_dnn_device | 240 | 0 | {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} | 56 | 20396474229 | 20396478349 | 4120 | 4661013415261 | 4661013483473 | 700 | 68212 | 25437 | 11706 | 25084 | 20989 | 1567 | 25436 | 24551 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | [] | [] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] | 720 | 904 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | [3145728.0] | [3145728.0] | 40 | |||||||||||||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 241 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20396491409 | 20396496359 | 4950 | 4661013470790 | 4661013573376 | 691 | 102586 | 89195 | 66522 | 89108 | 75884 | 1285 | 89195 | 52589 | 62848 | 62628 | 63250 | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] | 2540 | 1324 | 2304 | 2220 | 4552 | 0 | 24576 | 24576 | 7570 | [128.0; 85.33333587646484] | [256.0] | 50 | |||||||||||||||||||||||||||||
BinaryDeviceOperation | tt_dnn_device | 242 | 0 | {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} | HiFi4 | 56 | 20396509239 | 20396514769 | 5530 | 4661013552583 | 4661013623172 | 636 | 70589 | 49172 | 42445 | 49091 | 45844 | 1690 | 49164 | 48559 | 48638 | 47702 | 48284 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] | ['eltwise_binary_kernel/5705211043871595155/'] | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] | 712 | 932 | 1312 | 716 | 1588 | 0 | 1456 | 1456 | 1 | [4321.05517578125; 4321.05517578125] | [4321.05517578125] | 50 | ||||||||||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 243 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20396527659 | 20396556599 | 28940 | 4661013617948 | 4661013714249 | 671 | 96301 | 90411 | 44901 | 89426 | 74768 | 2264 | 90411 | 65311 | 88962 | 88699 | 89317 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/17805569453407806092/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] | 740 | 1852 | 4300 | 6312 | 6956 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 60 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 244 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} | HiFi2 | 56 | 20396650120 | 20396660200 | 10080 | 4661013670202 | 4661014224647 | 616 | 554445 | 509767 | 507073 | 509680 | 508463 | 1789 | 509766 | 430154 | 507806 | 508969 | 508975 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] | 2904 | 1320 | 3116 | 2804 | 3288 | 0 | 196608 | 196608 | 30281 | [32.0; 42.66666793823242] | [64.0] | 200 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 245 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20396706761 | 20396716001 | 9240 | 4661014223593 | 4661014475820 | 664 | 252227 | 250510 | 227892 | 249836 | 236791 | 1711 | 250498 | 214770 | 225639 | 225449 | 226067 | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 4096 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] | 2488 | 1300 | 2324 | 2224 | 4556 | 0 | 98304 | 98304 | 30281 | [128.0; 85.33333587646484] | [64.0] | 150 | |||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 246 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20396734741 | 20396765471 | 30730 | 4661014454673 | 4661014599738 | 627 | 145065 | 123304 | 65519 | 123249 | 102554 | 2267 | 123304 | 94695 | 121730 | 121471 | 122059 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/16918772478690167680/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] | 740 | 2164 | 4776 | 7472 | 7912 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 50 | ||||||||||||||||||||||
Matmul | tt_dnn_device | 247 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20396809322 | 20396899153 | 89831 | 4661014543373 | 4661015072481 | 675 | 529108 | 472054 | 459291 | 471957 | 468582 | 1772 | 472053 | 395476 | 446072 | 446218 | 446118 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] | 2856 | 1328 | 2340 | 2224 | 2876 | 0 | 73728 | 73728 | 22710 | [85.33333587646484; 85.33333587646484] | [128.0] | 90 | |||||||||||||||||||||||||||||
NlpCreateHeadsDeviceOperation | tt_dnn_device | 248 | 0 | {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} | HiFi4 | 56 | 20396937153 | 20396962653 | 25500 | 4661015061209 | 4661015149263 | 635 | 88054 | 76143 | 35454 | 75543 | 62194 | 2048 | 76133 | 75563 | 64201 | 63508 | 64227 | 8 | 1 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] | ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] | ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] | 1356 | 1248 | 1080 | 652 | 1496 | 0 | 1 | 1 | 1 | [] | [] | 80 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 249 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20397019064 | 20397024954 | 5890 | 4661015110233 | 4661015280446 | 632 | 170213 | 130555 | 52715 | 130544 | 97516 | 1805 | 130555 | 82289 | 123835 | 123977 | 124083 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] | 1632 | 1032 | 2384 | 2024 | 3232 | 0 | 9216 | 9216 | 1 | [341.3333435058594; 341.3333435058594] | [2048.0] | 50 | ||||||||||||||||||||||||||||||||||||
Softmax | tt_dnn_device | 250 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} | HiFi4 | 56 | 20397034224 | 20397042724 | 8500 | 4661015204145 | 4661015494304 | 644 | 290159 | 213226 | 197253 | 213110 | 205575 | 1930 | 213213 | 199642 | 212004 | 212098 | 212387 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 1 | 32 | 384 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] | ['softmax/14128336124697917683/'] | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] | ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] | 1128 | 1576 | 3344 | 2468 | 3164 | 0 | 1 | 1 | 1 | [18874368.0] | [18874368.0] | 40 | ||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 251 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20397072864 | 20397077564 | 4700 | 4661015479829 | 4661015595825 | 629 | 115996 | 100888 | 60503 | 100853 | 79122 | 1800 | 100888 | 97412 | 99354 | 99488 | 99715 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] | 1632 | 1032 | 2616 | 2004 | 3544 | 0 | 9216 | 9216 | 1 | [2048.0; 341.3333435058594] | [341.3333435058594] | 40 | ||||||||||||||||||||||||||||||||||||
ConcatenateHeads | tt_dnn_device | 252 | 0 | {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} | 56 | 20397086284 | 20397090604 | 4320 | 4661015556984 | 4661015621663 | 613 | 64679 | 25227 | 11824 | 25156 | 20940 | 1635 | 25226 | 24697 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | [] | [] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] | 720 | 904 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | [3145728.0] | [3145728.0] | 80 | |||||||||||||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 253 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20397122625 | 20397127925 | 5300 | 4661015609341 | 4661015711908 | 684 | 102567 | 89549 | 66177 | 89393 | 75933 | 1309 | 89535 | 52235 | 62830 | 62599 | 63238 | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] | 2540 | 1324 | 2304 | 2220 | 4552 | 0 | 24576 | 24576 | 7570 | [128.0; 85.33333587646484] | [256.0] | 50 | |||||||||||||||||||||||||||||
BinaryDeviceOperation | tt_dnn_device | 254 | 0 | {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} | HiFi4 | 56 | 20397161995 | 20397187485 | 25490 | 4661015690449 | 4661015761821 | 634 | 71372 | 49290 | 40613 | 49183 | 45569 | 1696 | 49284 | 48659 | 48746 | 47693 | 48344 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] | ['eltwise_binary_kernel/5705211043871595155/'] | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] | 712 | 932 | 1312 | 716 | 1588 | 0 | 1456 | 1456 | 1 | [4321.05517578125; 4321.05517578125] | [4321.05517578125] | 40 | ||||||||||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 255 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20397220116 | 20397226786 | 6670 | 4661015754612 | 4661015852916 | 643 | 98304 | 90452 | 43015 | 90179 | 74112 | 2301 | 90447 | 65413 | 88937 | 88679 | 89290 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/17805569453407806092/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] | 740 | 1852 | 4300 | 6312 | 6956 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 40 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 256 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} | HiFi2 | 56 | 20397271716 | 20397278966 | 7250 | 4661015806941 | 4661016362837 | 609 | 555896 | 509298 | 506305 | 509283 | 508052 | 1810 | 509296 | 429834 | 507357 | 508506 | 508514 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] | 2904 | 1320 | 3116 | 2804 | 3288 | 0 | 196608 | 196608 | 30281 | [32.0; 42.66666793823242] | [64.0] | 50 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 257 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20397313717 | 20397318947 | 5230 | 4661016361716 | 4661016616165 | 664 | 254449 | 252664 | 228639 | 252040 | 237736 | 1760 | 252664 | 215931 | 226766 | 226566 | 227199 | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 4096 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] | 2488 | 1300 | 2324 | 2224 | 4556 | 0 | 98304 | 98304 | 30281 | [128.0; 85.33333587646484] | [64.0] | 90 | |||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 258 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20397351917 | 20397377237 | 25320 | 4661016593914 | 4661016739469 | 636 | 145555 | 122686 | 60559 | 122585 | 100324 | 2277 | 122681 | 94280 | 121302 | 121054 | 121646 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/16918772478690167680/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] | 740 | 2164 | 4776 | 7472 | 7912 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 18970 | ||||||||||||||||||||||
Matmul | tt_dnn_device | 259 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20397417528 | 20397423878 | 6350 | 4661016678762 | 4661017211540 | 627 | 532778 | 471430 | 457474 | 471330 | 467167 | 1774 | 471430 | 395502 | 445758 | 445918 | 445803 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] | 2856 | 1328 | 2340 | 2224 | 2876 | 0 | 73728 | 73728 | 22710 | [85.33333587646484; 85.33333587646484] | [128.0] | 50 | |||||||||||||||||||||||||||||
NlpCreateHeadsDeviceOperation | tt_dnn_device | 260 | 0 | {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} | HiFi4 | 56 | 20397459328 | 20397464158 | 4830 | 4661017198998 | 4661017287707 | 647 | 88709 | 75525 | 34790 | 75460 | 61971 | 2040 | 75516 | 74451 | 62053 | 61328 | 61986 | 8 | 1 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] | ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] | ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] | 1356 | 1248 | 1080 | 652 | 1496 | 0 | 1 | 1 | 1 | [] | [] | 100 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 261 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20397478898 | 20397503538 | 24640 | 4661017248485 | 4661017416763 | 694 | 168278 | 128367 | 51526 | 128359 | 96105 | 1746 | 128365 | 82337 | 122404 | 122536 | 122641 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] | 1632 | 1032 | 2384 | 2024 | 3232 | 0 | 9216 | 9216 | 1 | [341.3333435058594; 341.3333435058594] | [2048.0] | 40 | ||||||||||||||||||||||||||||||||||||
Softmax | tt_dnn_device | 262 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} | HiFi4 | 56 | 20397512908 | 20397540019 | 27111 | 4661017341448 | 4661017630338 | 638 | 288890 | 212942 | 196803 | 212822 | 204683 | 1939 | 212926 | 199415 | 211619 | 211739 | 212000 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 1 | 32 | 384 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] | ['softmax/14128336124697917683/'] | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] | ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] | 1128 | 1576 | 3344 | 2468 | 3164 | 0 | 1 | 1 | 1 | [18874368.0] | [18874368.0] | 70 | ||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 263 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20397600199 | 20397605769 | 5570 | 4661017615609 | 4661017732325 | 609 | 116716 | 101367 | 62837 | 101346 | 79871 | 1772 | 101367 | 97881 | 99814 | 99950 | 100171 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] | 1632 | 1032 | 2616 | 2004 | 3544 | 0 | 9216 | 9216 | 1 | [2048.0; 341.3333435058594] | [341.3333435058594] | 60 | ||||||||||||||||||||||||||||||||||||
ConcatenateHeads | tt_dnn_device | 264 | 0 | {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} | 56 | 20397615629 | 20397622840 | 7211 | 4661017696120 | 4661017758430 | 646 | 62310 | 25469 | 11647 | 25375 | 20874 | 1661 | 25469 | 24471 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | [] | [] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] | 720 | 904 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | [3145728.0] | [3145728.0] | 30 | |||||||||||||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 265 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20397635360 | 20397661160 | 25800 | 4661017745647 | 4661017848357 | 628 | 102710 | 89281 | 66603 | 89185 | 75720 | 1349 | 89281 | 52237 | 62585 | 62395 | 63029 | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] | 2540 | 1324 | 2304 | 2220 | 4552 | 0 | 24576 | 24576 | 7570 | [128.0; 85.33333587646484] | [256.0] | 60 | |||||||||||||||||||||||||||||
BinaryDeviceOperation | tt_dnn_device | 266 | 0 | {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} | HiFi4 | 56 | 20397675540 | 20397681630 | 6090 | 4661017827408 | 4661017897677 | 649 | 70269 | 48647 | 40596 | 48555 | 45419 | 1703 | 48644 | 48035 | 48111 | 47153 | 47749 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] | ['eltwise_binary_kernel/5705211043871595155/'] | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] | 712 | 932 | 1312 | 716 | 1588 | 0 | 1456 | 1456 | 1 | [4321.05517578125; 4321.05517578125] | [4321.05517578125] | 60 | ||||||||||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 267 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20397715950 | 20397722430 | 6480 | 4661017892073 | 4661017990608 | 671 | 98535 | 92303 | 43118 | 91950 | 75779 | 2268 | 92301 | 67214 | 90920 | 90659 | 91262 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/17805569453407806092/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] | 740 | 1852 | 4300 | 6312 | 6956 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 40 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 268 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} | HiFi2 | 56 | 20397807411 | 20397815051 | 7640 | 4661017942913 | 4661018501283 | 622 | 558370 | 510037 | 507084 | 509950 | 508381 | 1759 | 510036 | 429072 | 508058 | 509210 | 509215 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] | 2904 | 1320 | 3116 | 2804 | 3288 | 0 | 196608 | 196608 | 30281 | [32.0; 42.66666793823242] | [64.0] | 150 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 269 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20397827941 | 20397853542 | 25601 | 4661018500017 | 4661018751618 | 631 | 251601 | 249708 | 227997 | 249544 | 236464 | 1761 | 249707 | 214372 | 225325 | 225122 | 225746 | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 4096 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] | 2488 | 1300 | 2324 | 2224 | 4556 | 0 | 98304 | 98304 | 30281 | [128.0; 85.33333587646484] | [64.0] | 50 | |||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 270 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20397866652 | 20397873802 | 7150 | 4661018731437 | 4661018876518 | 638 | 145081 | 124273 | 74836 | 124235 | 103142 | 2278 | 124268 | 95794 | 122831 | 122596 | 123180 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/16918772478690167680/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] | 740 | 2164 | 4776 | 7472 | 7912 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 60 | ||||||||||||||||||||||
Matmul | tt_dnn_device | 271 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20397916272 | 20397942943 | 26671 | 4661018828533 | 4661019349349 | 699 | 520816 | 472119 | 458625 | 472025 | 467790 | 1665 | 472118 | 395877 | 446538 | 446674 | 446575 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] | 2856 | 1328 | 2340 | 2224 | 2876 | 0 | 73728 | 73728 | 22710 | [85.33333587646484; 85.33333587646484] | [128.0] | 70 | |||||||||||||||||||||||||||||
NlpCreateHeadsDeviceOperation | tt_dnn_device | 272 | 0 | {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} | HiFi4 | 56 | 20397997753 | 20398002753 | 5000 | 4661019337320 | 4661019425588 | 653 | 88268 | 75595 | 35005 | 75531 | 62008 | 2022 | 75587 | 74677 | 62109 | 61409 | 62648 | 8 | 1 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] | ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] | ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] | 1356 | 1248 | 1080 | 652 | 1496 | 0 | 1 | 1 | 1 | [] | [] | 60 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 273 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20398018263 | 20398024003 | 5740 | 4661019386465 | 4661019556113 | 696 | 169648 | 129828 | 52752 | 129821 | 97714 | 1717 | 129826 | 83604 | 122624 | 122748 | 122856 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] | 1632 | 1032 | 2384 | 2024 | 3232 | 0 | 9216 | 9216 | 1 | [341.3333435058594; 341.3333435058594] | [2048.0] | 70 | ||||||||||||||||||||||||||||||||||||
Softmax | tt_dnn_device | 274 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} | HiFi4 | 56 | 20398054374 | 20398061824 | 7450 | 4661019480559 | 4661019768816 | 635 | 288257 | 212076 | 196811 | 211959 | 204411 | 1943 | 212076 | 198606 | 211094 | 211196 | 211474 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 1 | 32 | 384 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] | ['softmax/14128336124697917683/'] | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] | ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] | 1128 | 1576 | 3344 | 2468 | 3164 | 0 | 1 | 1 | 1 | [18874368.0] | [18874368.0] | 50 | ||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 275 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20398071804 | 20398096184 | 24380 | 4661019755005 | 4661019873356 | 604 | 118351 | 103929 | 60985 | 103896 | 82202 | 1796 | 103929 | 100217 | 102160 | 102280 | 102504 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] | 1632 | 1032 | 2616 | 2004 | 3544 | 0 | 9216 | 9216 | 1 | [2048.0; 341.3333435058594] | [341.3333435058594] | 50 | ||||||||||||||||||||||||||||||||||||
ConcatenateHeads | tt_dnn_device | 276 | 0 | {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} | 56 | 20398105444 | 20398129074 | 23630 | 4661019831952 | 4661019899407 | 702 | 67455 | 25356 | 11676 | 24913 | 20786 | 1556 | 25356 | 24994 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | [] | [] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] | 720 | 904 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | [3145728.0] | [3145728.0] | 40 | |||||||||||||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 277 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20398160575 | 20398165885 | 5310 | 4661019886766 | 4661019989464 | 706 | 102698 | 89338 | 67015 | 89250 | 75984 | 1277 | 89338 | 52032 | 62508 | 62305 | 62943 | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] | 2540 | 1324 | 2304 | 2220 | 4552 | 0 | 24576 | 24576 | 7570 | [128.0; 85.33333587646484] | [256.0] | 50 | |||||||||||||||||||||||||||||
BinaryDeviceOperation | tt_dnn_device | 278 | 0 | {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} | HiFi4 | 56 | 20398199875 | 20398206165 | 6290 | 4661019968943 | 4661020038901 | 629 | 69958 | 48819 | 40826 | 48720 | 45381 | 1720 | 48808 | 48205 | 48292 | 47337 | 47917 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] | ['eltwise_binary_kernel/5705211043871595155/'] | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] | 712 | 932 | 1312 | 716 | 1588 | 0 | 1456 | 1456 | 1 | [4321.05517578125; 4321.05517578125] | [4321.05517578125] | 30 | ||||||||||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 279 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20398240025 | 20398247615 | 7590 | 4661020032362 | 4661020131148 | 642 | 98786 | 91606 | 49240 | 91453 | 75932 | 2322 | 91606 | 66610 | 90250 | 90000 | 90611 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/17805569453407806092/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] | 740 | 1852 | 4300 | 6312 | 6956 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 50 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 280 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} | HiFi2 | 56 | 20398292436 | 20398299616 | 7180 | 4661020090214 | 4661020643058 | 620 | 552844 | 511276 | 508150 | 511186 | 509696 | 1779 | 511276 | 431106 | 509289 | 510443 | 510451 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] | 2904 | 1320 | 3116 | 2804 | 3288 | 0 | 196608 | 196608 | 30281 | [32.0; 42.66666793823242] | [64.0] | 50 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 281 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20398353756 | 20398359417 | 5661 | 4661020641601 | 4661020895372 | 675 | 253771 | 251633 | 228682 | 251208 | 237573 | 1705 | 251633 | 215503 | 226486 | 226260 | 226901 | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 4096 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] | 2488 | 1300 | 2324 | 2224 | 4556 | 0 | 98304 | 98304 | 30281 | [128.0; 85.33333587646484] | [64.0] | 130 | |||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 282 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20398392187 | 20398398497 | 6310 | 4661020873880 | 4661021019904 | 651 | 146024 | 123902 | 68356 | 123793 | 100340 | 2247 | 123902 | 95364 | 122449 | 122194 | 122780 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/16918772478690167680/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] | 740 | 2164 | 4776 | 7472 | 7912 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 30 | ||||||||||||||||||||||
Matmul | tt_dnn_device | 283 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20398440217 | 20398446767 | 6550 | 4661020965793 | 4661021492544 | 617 | 526751 | 472006 | 458815 | 471849 | 468599 | 1785 | 472006 | 395774 | 446290 | 446421 | 446327 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 3072 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] | 2856 | 1328 | 2340 | 2224 | 2876 | 0 | 73728 | 73728 | 22710 | [85.33333587646484; 85.33333587646484] | [128.0] | 70 | |||||||||||||||||||||||||||||
NlpCreateHeadsDeviceOperation | tt_dnn_device | 284 | 0 | {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} | HiFi4 | 56 | 20398461658 | 20398487228 | 25570 | 4661021480814 | 4661021567197 | 712 | 86383 | 73942 | 35306 | 73887 | 61547 | 1996 | 73933 | 73004 | 61325 | 60534 | 61222 | 8 | 1 | 384 | 3072 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] | ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] | ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] | 1356 | 1248 | 1080 | 652 | 1496 | 0 | 1 | 1 | 1 | [] | [] | 50 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 285 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20398541738 | 20398546928 | 5190 | 4661021530136 | 4661021697243 | 692 | 167107 | 129353 | 52811 | 129340 | 97287 | 1769 | 129353 | 81456 | 122454 | 122600 | 122698 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 64 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] | 1632 | 1032 | 2384 | 2024 | 3232 | 0 | 9216 | 9216 | 1 | [341.3333435058594; 341.3333435058594] | [2048.0] | 50 | ||||||||||||||||||||||||||||||||||||
Softmax | tt_dnn_device | 286 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} | HiFi4 | 56 | 20398556058 | 20398570809 | 14751 | 4661021622230 | 4661021911070 | 649 | 288840 | 213193 | 197270 | 213059 | 205100 | 1924 | 213193 | 199597 | 211923 | 212029 | 212304 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 1 | 32 | 384 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] | ['softmax/14128336124697917683/'] | ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] | ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] | 1128 | 1576 | 3344 | 2468 | 3164 | 0 | 1 | 1 | 1 | [18874368.0] | [18874368.0] | 30 | ||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 287 | 0 | {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} | LoFi | 56 | 20398601459 | 20398606649 | 5190 | 4661021896561 | 4661022012021 | 595 | 115460 | 100348 | 60649 | 100310 | 79302 | 1803 | 100338 | 96858 | 98804 | 98942 | 99159 | 8 | 16 | 384 | 384 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] | ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] | 1632 | 1032 | 2616 | 2004 | 3544 | 0 | 9216 | 9216 | 1 | [2048.0; 341.3333435058594] | [341.3333435058594] | 40 | ||||||||||||||||||||||||||||||||||||
ConcatenateHeads | tt_dnn_device | 288 | 0 | {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} | 56 | 20398635939 | 20398640259 | 4320 | 4661021973865 | 4661022038267 | 715 | 64402 | 25533 | 11645 | 25460 | 20968 | 1546 | 25532 | 24653 | 8 | 16 | 384 | 64 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | [] | [] | ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] | 720 | 904 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | [3145728.0] | [3145728.0] | 40 | |||||||||||||||||||||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 289 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20398652719 | 20398657839 | 5120 | 4661022025475 | 4661022126789 | 671 | 101314 | 87837 | 66714 | 87765 | 75819 | 1328 | 87829 | 52005 | 62523 | 62346 | 62941 | 1 | 8 | 384 | 1024 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] | 2540 | 1324 | 2304 | 2220 | 4552 | 0 | 24576 | 24576 | 7570 | [128.0; 85.33333587646484] | [256.0] | 50 | |||||||||||||||||||||||||||||
BinaryDeviceOperation | tt_dnn_device | 290 | 0 | {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} | HiFi4 | 56 | 20398690960 | 20398717550 | 26590 | 4661022107503 | 4661022176415 | 651 | 68912 | 48983 | 41851 | 48879 | 45769 | 1710 | 48977 | 48352 | 48426 | 47494 | 48081 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] | ['eltwise_binary_kernel/5705211043871595155/'] | ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] | ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] | 712 | 932 | 1312 | 716 | 1588 | 0 | 1456 | 1456 | 1 | [4321.05517578125; 4321.05517578125] | [4321.05517578125] | 19670 | ||||||||||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 291 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20398750590 | 20398756990 | 6400 | 4661022170743 | 4661022266939 | 647 | 96196 | 89885 | 39911 | 89351 | 72532 | 2267 | 89880 | 65064 | 88539 | 88280 | 88882 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/17805569453407806092/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] | 740 | 1852 | 4300 | 6312 | 6956 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 40 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 292 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} | HiFi2 | 56 | 20398800131 | 20398807271 | 7140 | 4661022218445 | 4661022778051 | 640 | 559606 | 510461 | 506614 | 510306 | 508577 | 1777 | 510461 | 430481 | 508565 | 509718 | 509721 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 1024 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 4096 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] | 2904 | 1320 | 3116 | 2804 | 3288 | 0 | 196608 | 196608 | 30281 | [32.0; 42.66666793823242] | [64.0] | 70 | |||||||||||||||||||||||||||||
Matmul | tt_dnn_device | 293 | 0 | {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} | LoFi | 56 | 20398842311 | 20398847321 | 5010 | 4661022776038 | 4661023029477 | 730 | 253439 | 250697 | 227875 | 250621 | 237167 | 1688 | 250687 | 215130 | 226033 | 225840 | 226485 | 1 | 8 | 384 | 4096 | TILE | BFLOAT8_B | DEV_0_L1_INTERLEAVED | 1 | 1 | 4096 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] | ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] | ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] | ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] | 2488 | 1300 | 2324 | 2224 | 4556 | 0 | 98304 | 98304 | 30281 | [128.0; 85.33333587646484] | [64.0] | 40 | |||||||||||||||||||||||||||||
LayerNorm | tt_dnn_device | 294 | 0 | {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} | HiFi4 | 56 | 20398881501 | 20398888502 | 7001 | 4661023008115 | 4661023149076 | 627 | 140961 | 118981 | 60930 | 118915 | 97590 | 2290 | 118977 | 90460 | 117527 | 117276 | 117867 | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 1 | 32 | 1024 | TILE | BFLOAT16 | DEV_0_DRAM_INTERLEAVED | 1 | 8 | 384 | 1024 | TILE | BFLOAT16 | DEV_0_L1_INTERLEAVED | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] | ['layernorm/16918772478690167680/'] | ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] | ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] | 740 | 2164 | 4776 | 7472 | 7912 | 0 | 1 | 1 | 1 | [6291456.0] | [6291456.0] | 40 | ||||||||||||||||||||||
(torch) __get__ | python_fallback | 295 | None | 20446632075 | 20446660675 | 28600 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment