Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save FL33TW00D/b3caede9601f6c5c52dee63b621b7475 to your computer and use it in GitHub Desktop.
Save FL33TW00D/b3caede9601f6c5c52dee63b621b7475 to your computer and use it in GitHub Desktop.
OP CODE OP TYPE GLOBAL CALL COUNT DEVICE ID ATTRIBUTES MATH FIDELITY CORE COUNT PARALLELIZATION STRATEGY HOST START TS HOST END TS HOST DURATION [ns] DEVICE FW START CYCLE DEVICE FW END CYCLE OP TO OP LATENCY [ns] DEVICE FW DURATION [ns] DEVICE KERNEL DURATION [ns] DEVICE KERNEL DURATION PER CORE MIN [ns] DEVICE KERNEL DURATION PER CORE MAX [ns] DEVICE KERNEL DURATION PER CORE AVG [ns] DEVICE KERNEL FIRST TO LAST START [ns] DEVICE BRISC KERNEL DURATION [ns] DEVICE NCRISC KERNEL DURATION [ns] DEVICE TRISC0 KERNEL DURATION [ns] DEVICE TRISC1 KERNEL DURATION [ns] DEVICE TRISC2 KERNEL DURATION [ns] DEVICE ERISC KERNEL DURATION [ns] DEVICE COMPUTE CB WAIT FRONT [ns] DEVICE COMPUTE CB RESERVE BACK [ns] INPUT_0_W INPUT_0_Z INPUT_0_Y INPUT_0_X INPUT_0_LAYOUT INPUT_0_DATATYPE INPUT_0_MEMORY INPUT_1_W INPUT_1_Z INPUT_1_Y INPUT_1_X INPUT_1_LAYOUT INPUT_1_DATATYPE INPUT_1_MEMORY INPUT_2_W INPUT_2_Z INPUT_2_Y INPUT_2_X INPUT_2_LAYOUT INPUT_2_DATATYPE INPUT_2_MEMORY INPUT_3_W INPUT_3_Z INPUT_3_Y INPUT_3_X INPUT_3_LAYOUT INPUT_3_DATATYPE INPUT_3_MEMORY OUTPUT_0_W OUTPUT_0_Z OUTPUT_0_Y OUTPUT_0_X OUTPUT_0_LAYOUT OUTPUT_0_DATATYPE OUTPUT_0_MEMORY OUTPUT_1_W OUTPUT_1_Z OUTPUT_1_Y OUTPUT_1_X OUTPUT_1_LAYOUT OUTPUT_1_DATATYPE OUTPUT_1_MEMORY OUTPUT_2_W OUTPUT_2_Z OUTPUT_2_Y OUTPUT_2_X OUTPUT_2_LAYOUT OUTPUT_2_DATATYPE OUTPUT_2_MEMORY METAL TRACE ID METAL TRACE REPLAY SESSION ID COMPUTE KERNEL SOURCE COMPUTE KERNEL HASH DATA MOVEMENT KERNEL SOURCE DATA MOVEMENT KERNEL HASH BRISC MAX KERNEL SIZE [B] NCRISC MAX KERNEL SIZE [B] TRISC 0 MAX KERNEL SIZE [B] TRISC 1 MAX KERNEL SIZE [B] TRISC 2 MAX KERNEL SIZE [B] ERISC MAX KERNEL SIZE [B] PM IDEAL [ns] PM COMPUTE [ns] PM BANDWIDTH [ns] PM REQ I BW PM REQ O BW CompileProgram_TT_HOST_FUNC [ns] HWCommandQueue_write_buffer_TT_HOST_FUNC [ns]
Embeddings tt_dnn_device 1 0 {'embeddings_type': 'EmbeddingsType::PADDED'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'pad_token': '1'; 'tilized': 'true'} HiFi4 56 11525792262 12077256151 551463889 4652784950082 4652785021676 0 71594 69677 25328 69562 50259 719 69657 43289 45558 45484 45707 8 1 1 384 ROW_MAJOR UINT32 DEV_0_L1_INTERLEAVED 1 1 50265 1024 ROW_MAJOR BFLOAT16 DEV_0_DRAM_INTERLEAVED 8 1 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/data_movement/tilize/device/kernels/compute/tilize.cpp'; 'ttnn/cpp/ttnn/operations/data_movement/tilize/device/kernels/compute/tilize.cpp'] ['tilize/17927064194417795862/'; 'tilize/1955606672871876418/'] ['ttnn/cpp/ttnn/operations/embedding/device/kernels/dataflow/embeddings_tilize.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['embeddings_tilize/9293793899976367391/'; 'writer_unary_interleaved_start_id/12513060238449668381/'] 712 1312 1172 668 1556 0 371600 1 371600 [0.033067815005779266; 277.0256042480469] [16.930721282958984] 550578291 14380
Embeddings tt_dnn_device 2 0 {'embeddings_type': 'EmbeddingsType::GENERIC'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'pad_token': 'std::nullopt'; 'tilized': 'true'} HiFi4 56 12077425693 12594462065 517036372 4653294717567 4653295016608 509697904 299041 297018 30011 296968 177917 610 297018 283191 285425 285368 285583 8 1 1 384 ROW_MAJOR UINT32 DEV_0_L1_INTERLEAVED 1 1 1 1024 ROW_MAJOR BFLOAT16 DEV_0_DRAM_INTERLEAVED 8 1 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/data_movement/tilize/device/kernels/compute/tilize.cpp'; 'ttnn/cpp/ttnn/operations/data_movement/tilize/device/kernels/compute/tilize.cpp'] ['tilize/5795839852233331467/'; 'tilize/5310988429227515762/'] ['ttnn/cpp/ttnn/operations/embedding/device/kernels/dataflow/embeddings_tilize.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['embeddings_tilize/13875115944824425364/'; 'writer_unary_interleaved_start_id/3927395697778282413/'] 712 1068 1172 668 1556 0 7 1 7 [1755.4285888671875; 292.5714416503906] [898779.4375] 516381656 33810
BinaryDeviceOperation tt_dnn_device 3 0 {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} HiFi4 56 12594627056 13093377525 498750469 4653786383365 4653786431916 491367415 48551 47896 42176 47828 44967 516 47592 47047 47218 46466 46910 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] ['eltwise_binary_kernel/5705211043871595155/'] ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] 712 932 1312 716 1588 0 1456 1456 1 [4321.05517578125; 4321.05517578125] [4321.05517578125] 498069652 34600
Embeddings tt_dnn_device 4 0 {'embeddings_type': 'EmbeddingsType::GENERIC'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'pad_token': 'std::nullopt'; 'tilized': 'true'} HiFi4 56 13093527596 13094712817 1185221 4653787698671 4653787764340 1268767 65669 63662 21993 63539 47695 617 63662 36226 38478 38425 38640 8 1 1 384 ROW_MAJOR UINT32 DEV_0_L1_INTERLEAVED 1 1 514 1024 ROW_MAJOR BFLOAT16 DEV_0_DRAM_INTERLEAVED 8 1 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/data_movement/tilize/device/kernels/compute/tilize.cpp'; 'ttnn/cpp/ttnn/operations/data_movement/tilize/device/kernels/compute/tilize.cpp'] ['tilize/5795839852233331467/'; 'tilize/5310988429227515762/'] ['ttnn/cpp/ttnn/operations/embedding/device/kernels/dataflow/embeddings_tilize.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['embeddings_tilize/13875115944824425364/'; 'writer_unary_interleaved_start_id/3927395697778282413/'] 712 1068 1172 668 1556 0 3799 1 3799 [3.2345354557037354; 277.09185791015625] [1656.0821533203125] 670176 3250
BinaryDeviceOperation tt_dnn_device 5 0 {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} HiFi4 56 13094795548 13094841269 45721 4653787954547 4653788003022 190858 48475 47821 41312 47749 44883 496 47524 46968 47134 46352 46806 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] ['eltwise_binary_kernel/5705211043871595155/'] ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] 712 932 1312 716 1588 0 1456 1456 1 [4321.05517578125; 4321.05517578125] [4321.05517578125] 340
LayerNorm tt_dnn_device 6 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-12'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 13094919309 13967384018 872464709 4654647675029 4654647769195 859672692 94166 93487 54968 93407 77461 1064 92643 67317 91314 91766 91751 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/17805569453407806092/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] 740 1852 4300 6312 6956 0 1 1 1 [6291456.0] [6291456.0] 871547090 15640
Matmul tt_dnn_device 7 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 13967587580 14611713730 644126150 4655282603361 4655283077856 634836596 474495 472036 457921 471802 467503 622 472016 395654 446500 446639 446523 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] 2856 1328 2340 2224 2876 0 73728 73728 22710 [85.33333587646484; 85.33333587646484] [128.0] 643129050 34141
NlpCreateHeadsDeviceOperation tt_dnn_device 8 0 {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} HiFi4 56 14611835521 15120918458 509082937 4655784417753 4655784495315 501341946 77562 75537 34535 75345 62029 993 75510 74611 63374 62655 63227 8 1 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] 1356 1248 1080 652 1496 0 1 1 1 [] [] 508283029 14820
Matmul tt_dnn_device 9 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 15121073289 15719694976 598621687 4656374547519 4656374678343 590054183 130824 128845 51220 128667 97481 628 128845 81740 122799 122927 123024 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] 1632 1032 2384 2024 3232 0 9216 9216 1 [341.3333435058594; 341.3333435058594] [2048.0] 597801429 15490
Softmax tt_dnn_device 10 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} HiFi4 56 15719815897 16514774820 794958923 4657158131646 4657158344226 783453971 212580 211915 197182 211866 204381 692 211410 198004 210589 211062 210988 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 1 32 384 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] ['softmax/14128336124697917683/'] ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] 1128 1576 3344 2468 3164 0 1 1 1 [18874368.0] [18874368.0] 794152965 14770
Matmul tt_dnn_device 11 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 16514902141 17139900439 624998298 4657774159751 4657774260532 615817492 100781 98804 60730 98609 75794 645 98804 95440 97387 97525 97744 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] 1632 1032 2616 2004 3544 0 9216 9216 1 [2048.0; 341.3333435058594] [341.3333435058594] 624206290 14420
ConcatenateHeads tt_dnn_device 12 0 {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} 56 17140025360 17479133602 339108242 4658108514090 4658108540152 334254417 26062 25216 11777 25152 20938 220 25170 24221 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED [] [] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] 720 904 0 0 0 0 1 1 1 [3145728.0] [3145728.0] 338519706 14390
Matmul tt_dnn_device 13 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 17479272753 18098134863 618862110 4658718494612 4658718585790 609956899 91178 88725 66536 88643 75810 599 88725 52433 62845 62639 63240 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 1024 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] 2540 1324 2304 2220 4552 0 24576 24576 7570 [128.0; 85.33333587646484] [256.0] 617908611 33670
BinaryDeviceOperation tt_dnn_device 14 0 {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} HiFi4 56 18098263904 18098285384 21480 4658718833836 4658718882095 248709 48259 47561 41056 47532 44875 458 47268 46725 46901 46107 46588 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] ['eltwise_binary_kernel/5705211043871595155/'] ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] 712 932 1312 716 1588 0 1456 1456 1 [4321.05517578125; 4321.05517578125] [4321.05517578125] 410
LayerNorm tt_dnn_device 15 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 18098351425 18099303794 952369 4658719762002 4658719857126 880631 95124 94445 58010 94371 78162 1117 93572 68336 92281 92724 92696 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/17805569453407806092/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] 740 1852 4300 6312 6956 0 1 1 1 [6291456.0] [6291456.0] 379304 2900
Matmul tt_dnn_device 16 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} HiFi2 56 18099468646 18785022099 685553453 4659395324444 4659395837153 675469750 512709 510259 507480 510207 509108 615 510259 430270 508301 509442 509453 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] 2904 1320 3116 2804 3288 0 196608 196608 30281 [32.0; 42.66666793823242] [64.0] 684436553 20641
Matmul tt_dnn_device 17 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 18785172190 19428655654 643483464 4660029590021 4660029842769 633755317 252748 250303 227091 250076 236456 597 250302 213825 224969 224773 225375 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 4096 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] 2488 1300 2324 2224 4556 0 98304 98304 30281 [128.0; 85.33333587646484] [64.0] 642518804 14381
LayerNorm tt_dnn_device 18 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 19428799475 20386345992 957546517 4660973470648 4660973593844 943628577 123196 122514 62411 122486 101981 1058 121671 93181 120287 120740 120674 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/16918772478690167680/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] 740 2164 4776 7472 7912 0 1 1 1 [6291456.0] [6291456.0] 956630049 15681
Matmul tt_dnn_device 19 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20386592755 20386637365 44610 4660973904640 4660974379847 313230 475207 472756 458469 472485 467917 604 472756 396661 447064 447185 447090 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] 2856 1328 2340 2224 2876 0 73728 73728 22710 [85.33333587646484; 85.33333587646484] [128.0] 490
NlpCreateHeadsDeviceOperation tt_dnn_device 20 0 {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} HiFi4 56 20386692746 20386704716 11970 4660974367061 4660974454722 642 87661 74246 34840 74183 61686 2052 74245 73242 61919 61187 61640 8 1 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] 1356 1248 1080 652 1496 0 1 1 1 [] [] 310
Matmul tt_dnn_device 21 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20386754026 20386764876 10850 4660974416798 4660974583975 692 167177 128561 53043 128554 97761 1745 128561 82934 122296 122456 122546 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] 1632 1032 2384 2024 3232 0 9216 9216 1 [341.3333435058594; 341.3333435058594] [2048.0] 60
Softmax tt_dnn_device 22 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} HiFi4 56 20386782267 20386840837 58570 4660974509977 4660974799334 647 289357 214716 197584 214596 206779 1917 214692 201237 213719 213810 214094 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 1 32 384 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] ['softmax/14128336124697917683/'] ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] 1128 1576 3344 2468 3164 0 1 1 1 [18874368.0] [18874368.0] 40
Matmul tt_dnn_device 23 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20386874527 20386884388 9861 4660974783690 4660974905574 605 121884 105623 52495 105584 79165 1809 105623 102148 104058 104186 104405 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] 1632 1032 2616 2004 3544 0 9216 9216 1 [2048.0; 341.3333435058594] [341.3333435058594] 50
ConcatenateHeads tt_dnn_device 24 0 {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} 56 20386898778 20386909428 10650 4660974853971 4660974931692 710 77721 25421 11737 24959 20911 1559 25420 24799 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED [] [] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] 720 904 0 0 0 0 1 1 1 [3145728.0] [3145728.0] 100
Matmul tt_dnn_device 25 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20386925848 20386957758 31910 4660974919064 4660975021765 708 102701 89343 66756 89251 75800 1288 89343 52354 62667 62465 63100 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 1024 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] 2540 1324 2304 2220 4552 0 24576 24576 7570 [128.0; 85.33333587646484] [256.0] 40
BinaryDeviceOperation tt_dnn_device 26 0 {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} HiFi4 56 20386984178 20386999179 15001 4660975000641 4660975073578 658 72937 51170 40956 51090 47128 1682 51169 50550 50624 49686 50294 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] ['eltwise_binary_kernel/5705211043871595155/'] ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] 712 932 1312 716 1588 0 1456 1456 1 [4321.05517578125; 4321.05517578125] [4321.05517578125] 40
LayerNorm tt_dnn_device 27 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20387070369 20387104190 33821 4660975064834 4660975166242 671 101408 91998 47716 91733 75791 2217 91992 66981 90528 90266 90887 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/17805569453407806092/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] 740 1852 4300 6312 6956 0 1 1 1 [6291456.0] [6291456.0] 160
Matmul tt_dnn_device 28 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} HiFi2 56 20387165820 20387179470 13650 4660975123394 4660975676896 630 553502 510007 507249 509858 508892 1771 510005 430394 507894 509053 509060 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] 2904 1320 3116 2804 3288 0 196608 196608 30281 [32.0; 42.66666793823242] [64.0] 70
Matmul tt_dnn_device 29 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20387219181 20387229361 10180 4660975676300 4660975929366 735 253066 251737 228729 250842 237558 1693 251736 216183 226990 226784 227388 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 4096 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] 2488 1300 2324 2224 4556 0 98304 98304 30281 [128.0; 85.33333587646484] [64.0] 300
LayerNorm tt_dnn_device 30 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20387244501 20387291461 46960 4660975907828 4660976049872 628 142044 119892 56679 119788 96858 2296 119888 91467 118495 118244 118840 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/16918772478690167680/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] 740 2164 4776 7472 7912 0 1 1 1 [6291456.0] [6291456.0] 130
Matmul tt_dnn_device 31 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20387317292 20387324582 7290 4660975988087 4660976524346 625 536259 473830 458768 473733 469560 1811 473828 397498 448206 448347 448249 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] 2856 1328 2340 2224 2876 0 73728 73728 22710 [85.33333587646484; 85.33333587646484] [128.0] 60
NlpCreateHeadsDeviceOperation tt_dnn_device 32 0 {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} HiFi4 56 20387340252 20387345152 4900 4660976510763 4660976600369 665 89606 75359 35083 75300 62022 2030 75342 74347 62730 62101 62690 8 1 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] 1356 1248 1080 652 1496 0 1 1 1 [] [] 40
Matmul tt_dnn_device 33 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20387444353 20387469153 24800 4660976561579 4660976728375 678 166796 127329 54205 127320 96436 1779 127329 83928 120338 120484 120589 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] 1632 1032 2384 2024 3232 0 9216 9216 1 [341.3333435058594; 341.3333435058594] [2048.0] 60
Softmax tt_dnn_device 34 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} HiFi4 56 20387479553 20387487463 7910 4660976656778 4660976943750 676 286972 214715 197663 214589 206705 1911 214700 201201 213715 213836 214098 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 1 32 384 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] ['softmax/14128336124697917683/'] ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] 1128 1576 3344 2468 3164 0 1 1 1 [18874368.0] [18874368.0] 50
Matmul tt_dnn_device 35 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20387498303 20387502493 4190 4660976928158 4660977045801 616 117643 101429 58143 101390 78096 1775 101429 97912 99824 99975 100196 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] 1632 1032 2616 2004 3544 0 9216 9216 1 [2048.0; 341.3333435058594] [341.3333435058594] 40
ConcatenateHeads tt_dnn_device 36 0 {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} 56 20387511833 20387515934 4101 4660977004046 4660977072453 722 68407 25939 11762 25882 21119 1532 25938 25015 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED [] [] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] 720 904 0 0 0 0 1 1 1 [3145728.0] [3145728.0] 101
Matmul tt_dnn_device 37 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20387551844 20387557064 5220 4660977059289 4660977162421 653 103132 89299 66834 89215 76030 1335 89299 52401 62770 62565 63174 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 1024 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] 2540 1324 2304 2220 4552 0 24576 24576 7570 [128.0; 85.33333587646484] [256.0] 50
BinaryDeviceOperation tt_dnn_device 38 0 {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} HiFi4 56 20387617244 20387623915 6671 4660977141428 4660977211795 634 70367 48754 41475 48700 45979 1710 48748 48129 48210 47269 47815 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] ['eltwise_binary_kernel/5705211043871595155/'] ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] 712 932 1312 716 1588 0 1456 1456 1 [4321.05517578125; 4321.05517578125] [4321.05517578125] 60
LayerNorm tt_dnn_device 39 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20387659265 20387666135 6870 4660977205981 4660977303423 698 97442 90928 48761 89830 74877 2188 90928 65950 89481 89231 89831 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/17805569453407806092/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] 740 1852 4300 6312 6956 0 1 1 1 [6291456.0] [6291456.0] 50
Matmul tt_dnn_device 40 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} HiFi2 56 20387694355 20387742176 47821 4660977262732 4660977812959 601 550227 508917 506036 508800 507564 1784 508915 429310 506925 508091 508098 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] 2904 1320 3116 2804 3288 0 196608 196608 30281 [32.0; 42.66666793823242] [64.0] 160
Matmul tt_dnn_device 41 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20387796406 20387802316 5910 4660977811577 4660978065447 734 253870 251763 228804 251466 238007 1692 251763 216221 227065 226854 227484 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 4096 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] 2488 1300 2324 2224 4556 0 98304 98304 30281 [128.0; 85.33333587646484] [64.0] 80
LayerNorm tt_dnn_device 42 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20387816516 20387823026 6510 4660978044063 4660978190475 628 146412 124407 61382 124299 102339 2295 124407 95879 122958 122704 123282 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/16918772478690167680/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] 740 2164 4776 7472 7912 0 1 1 1 [6291456.0] [6291456.0] 40
Matmul tt_dnn_device 43 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20387847647 20387876537 28890 4660978128871 4660978664382 632 535511 473264 459486 473167 468104 1772 473263 397261 447782 447927 447818 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] 2856 1328 2340 2224 2876 0 73728 73728 22710 [85.33333587646484; 85.33333587646484] [128.0] 40
NlpCreateHeadsDeviceOperation tt_dnn_device 44 0 {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} HiFi4 56 20387932317 20387956508 24191 4660978652096 4660978740204 656 88108 75172 35031 75046 61819 2032 75153 74436 62872 62265 62843 8 1 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] 1356 1248 1080 652 1496 0 1 1 1 [] [] 80
Matmul tt_dnn_device 45 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20387973258 20387978718 5460 4660978701533 4660978871243 602 169710 130437 55035 130428 97557 1833 130437 86057 123836 123993 124088 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] 1632 1032 2384 2024 3232 0 9216 9216 1 [341.3333435058594; 341.3333435058594] [2048.0] 50
Softmax tt_dnn_device 46 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} HiFi4 56 20387987988 20387995738 7750 4660978797371 4660979083714 669 286343 211811 196892 211686 204022 1922 211810 198304 210835 210955 211211 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 1 32 384 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] ['softmax/14128336124697917683/'] ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] 1128 1576 3344 2468 3164 0 1 1 1 [18874368.0] [18874368.0] 40
Matmul tt_dnn_device 47 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20388006118 20388010098 3980 4660979070191 4660979190519 610 120328 106185 68491 106124 83969 1794 106185 102606 104358 104499 104726 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] 1632 1032 2616 2004 3544 0 9216 9216 1 [2048.0; 341.3333435058594] [341.3333435058594] 40
ConcatenateHeads tt_dnn_device 48 0 {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} 56 20388040559 20388044709 4150 4660979154348 4660979216353 705 62005 25138 11977 25038 20892 1572 25137 24502 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED [] [] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] 720 904 0 0 0 0 1 1 1 [3145728.0] [3145728.0] 120
Matmul tt_dnn_device 49 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20388095489 20388100479 4990 4660979204239 4660979305490 688 101251 88433 66740 88297 75970 1294 88433 52435 62847 62656 63285 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 1024 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] 2540 1324 2304 2220 4552 0 24576 24576 7570 [128.0; 85.33333587646484] [256.0] 60
BinaryDeviceOperation tt_dnn_device 50 0 {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} HiFi4 56 20388134029 20388160500 26471 4660979285436 4660979356402 633 70966 50262 40326 50156 46157 1700 50250 49646 49726 48870 49453 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] ['eltwise_binary_kernel/5705211043871595155/'] ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] 712 932 1312 716 1588 0 1456 1456 1 [4321.05517578125; 4321.05517578125] [4321.05517578125] 80
LayerNorm tt_dnn_device 51 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20388175400 20388181620 6220 4660979347946 4660979447819 648 99873 90799 42410 90673 74560 2297 90799 65854 89358 89090 89699 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/17805569453407806092/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] 740 1852 4300 6312 6956 0 1 1 1 [6291456.0] [6291456.0] 50
Matmul tt_dnn_device 52 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} HiFi2 56 20388228210 20388255621 27411 4660979400888 4660979958524 627 557636 510070 507201 509984 508619 1775 510068 429477 508144 509301 509330 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] 2904 1320 3116 2804 3288 0 196608 196608 30281 [32.0; 42.66666793823242] [64.0] 50
Matmul tt_dnn_device 53 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20388271521 20388276761 5240 4660979957336 4660980208724 627 251388 249568 227197 248842 236094 1745 249558 214639 225154 224958 225582 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 4096 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] 2488 1300 2324 2224 4556 0 98304 98304 30281 [128.0; 85.33333587646484] [64.0] 60
LayerNorm tt_dnn_device 54 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20388311301 20388318021 6720 4660980187816 4660980333644 633 145828 124304 64538 124222 102573 2353 124304 95349 122372 122124 122716 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/16918772478690167680/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] 740 2164 4776 7472 7912 0 1 1 1 [6291456.0] [6291456.0] 80
Matmul tt_dnn_device 55 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20388381732 20388410112 28380 4660980275325 4660980806564 659 531239 472251 458825 472151 468165 1770 472251 396715 447141 447275 447176 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] 2856 1328 2340 2224 2876 0 73728 73728 22710 [85.33333587646484; 85.33333587646484] [128.0] 40
NlpCreateHeadsDeviceOperation tt_dnn_device 56 0 {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} HiFi4 56 20388444982 20388449932 4950 4660980794614 4660980882103 641 87489 74908 34655 74847 62033 2034 74907 73975 61926 61341 62309 8 1 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] 1356 1248 1080 652 1496 0 1 1 1 [] [] 120
Matmul tt_dnn_device 57 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20388466673 20388471443 4770 4660980843328 4660981012763 696 169435 129954 51939 129951 96440 1723 129952 86169 122645 122772 122876 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] 1632 1032 2384 2024 3232 0 9216 9216 1 [341.3333435058594; 341.3333435058594] [2048.0] 120
Softmax tt_dnn_device 58 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} HiFi4 56 20388480203 20388508553 28350 4660980936262 4660981226963 648 290701 213563 197379 213452 205676 1954 213546 200035 212529 212661 212924 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 1 32 384 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] ['softmax/14128336124697917683/'] ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] 1128 1576 3344 2468 3164 0 1 1 1 [18874368.0] [18874368.0] 90
Matmul tt_dnn_device 59 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20388519343 20388523513 4170 4660981212274 4660981329060 620 116786 101472 60434 101435 78674 1771 101472 97951 99875 100008 100238 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] 1632 1032 2616 2004 3544 0 9216 9216 1 [2048.0; 341.3333435058594] [341.3333435058594] 30
ConcatenateHeads tt_dnn_device 60 0 {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} 56 20388553263 20388581274 28011 4660981289564 4660981354931 714 65367 25166 11671 25094 20814 1538 25165 24033 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED [] [] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] 720 904 0 0 0 0 1 1 1 [3145728.0] [3145728.0] 100
Matmul tt_dnn_device 61 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20388614654 20388620194 5540 4660981342481 4660981444233 658 101752 88624 66888 88540 75682 1326 88624 52131 62391 62218 62832 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 1024 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] 2540 1324 2304 2220 4552 0 24576 24576 7570 [128.0; 85.33333587646484] [256.0] 50
BinaryDeviceOperation tt_dnn_device 62 0 {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} HiFi4 56 20388654304 20388660724 6420 4660981424004 4660981495063 643 71059 50203 40845 50101 46050 1687 50203 49582 49670 48741 49371 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] ['eltwise_binary_kernel/5705211043871595155/'] ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] 712 932 1312 716 1588 0 1456 1456 1 [4321.05517578125; 4321.05517578125] [4321.05517578125] 60
LayerNorm tt_dnn_device 63 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20388674405 20388702185 27780 4660981487255 4660981588078 635 100823 92381 45871 91763 75190 2301 92381 67187 90533 90260 90865 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/17805569453407806092/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] 740 1852 4300 6312 6956 0 1 1 1 [6291456.0] [6291456.0] 40
Matmul tt_dnn_device 64 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} HiFi2 56 20388747845 20388755235 7390 4660981543044 4660982097842 612 554798 509138 505778 509051 507689 1781 509136 428858 506961 508130 508137 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] 2904 1320 3116 2804 3288 0 196608 196608 30281 [32.0; 42.66666793823242] [64.0] 80
Matmul tt_dnn_device 65 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20388809436 20388899677 90241 4660982096462 4660982349205 680 252743 250686 228944 249930 237342 1702 250686 215600 226300 226093 226702 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 4096 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] 2488 1300 2324 2224 4556 0 98304 98304 30281 [128.0; 85.33333587646484] [64.0] 170
LayerNorm tt_dnn_device 66 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20388915747 20388922237 6490 4660982328952 4660982469439 620 140487 119624 57007 119524 94186 2275 119624 91084 118139 117884 118470 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/16918772478690167680/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] 740 2164 4776 7472 7912 0 1 1 1 [6291456.0] [6291456.0] 40
Matmul tt_dnn_device 67 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20388987007 20388993888 6881 4660982408253 4660982942342 633 534089 472251 460002 472118 468601 1776 472251 396225 446543 446680 446580 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] 2856 1328 2340 2224 2876 0 73728 73728 22710 [85.33333587646484; 85.33333587646484] [128.0] 40
NlpCreateHeadsDeviceOperation tt_dnn_device 68 0 {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} HiFi4 56 20389029538 20389034638 5100 4660982931506 4660983018487 701 86981 75447 34862 75383 62041 2039 75426 74824 62601 61895 63128 8 1 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] 1356 1248 1080 652 1496 0 1 1 1 [] [] 40
Matmul tt_dnn_device 69 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20389070568 20389077448 6880 4660982979455 4660983148467 686 169012 129299 55029 129290 96673 1757 129299 82954 122362 122505 122611 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] 1632 1032 2384 2024 3232 0 9216 9216 1 [341.3333435058594; 341.3333435058594] [2048.0] 90
Softmax tt_dnn_device 70 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} HiFi4 56 20389107379 20389135829 28450 4660983075742 4660983362208 637 286466 213114 196836 212987 204845 1929 213114 199568 212013 212132 212409 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 1 32 384 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] ['softmax/14128336124697917683/'] ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] 1128 1576 3344 2468 3164 0 1 1 1 [18874368.0] [18874368.0] 100
Matmul tt_dnn_device 71 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20389145779 20389150839 5060 4660983347398 4660983468665 604 121267 105844 57564 105804 79666 1805 105844 102381 104341 104462 104699 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] 1632 1032 2616 2004 3544 0 9216 9216 1 [2048.0; 341.3333435058594] [341.3333435058594] 50
ConcatenateHeads tt_dnn_device 72 0 {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} 56 20389159979 20389164579 4600 4660983421942 4660983494677 625 72735 25393 11892 25295 20994 1640 25392 24771 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED [] [] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] 720 904 0 0 0 0 1 1 1 [3145728.0] [3145728.0] 40
Matmul tt_dnn_device 73 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20389215890 20389220960 5070 4660983482258 4660983584387 635 102129 89062 65942 88963 75758 1331 89061 52167 62631 62435 63046 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 1024 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] 2540 1324 2304 2220 4552 0 24576 24576 7570 [128.0; 85.33333587646484] [256.0] 100
BinaryDeviceOperation tt_dnn_device 74 0 {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} HiFi4 56 20389254070 20389260970 6900 4660983563395 4660983634249 634 70854 49242 40609 49134 45936 1681 49242 48614 48696 47770 48359 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] ['eltwise_binary_kernel/5705211043871595155/'] ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] 712 932 1312 716 1588 0 1456 1456 1 [4321.05517578125; 4321.05517578125] [4321.05517578125] 60
LayerNorm tt_dnn_device 75 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20389274550 20389281330 6780 4660983627084 4660983727034 633 99950 92154 43598 91804 76082 2254 92149 67058 90819 90563 91176 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/17805569453407806092/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] 740 1852 4300 6312 6956 0 1 1 1 [6291456.0] [6291456.0] 100
Matmul tt_dnn_device 76 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} HiFi2 56 20389347441 20389354681 7240 4660983679892 4660984238776 638 558884 511090 507773 511024 509436 1747 511090 431117 509244 510397 510397 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] 2904 1320 3116 2804 3288 0 196608 196608 30281 [32.0; 42.66666793823242] [64.0] 70
Matmul tt_dnn_device 77 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20389389701 20389394911 5210 4660984236992 4660984490209 646 253217 250789 228219 250676 237222 1738 250789 215747 226461 226251 226881 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 4096 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] 2488 1300 2324 2224 4556 0 98304 98304 30281 [128.0; 85.33333587646484] [64.0] 100
LayerNorm tt_dnn_device 78 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20389446422 20389452992 6570 4660984469416 4660984613515 629 144099 122685 59971 122539 102099 2309 122679 94139 121230 120981 121564 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/16918772478690167680/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] 740 2164 4776 7472 7912 0 1 1 1 [6291456.0] [6291456.0] 50
Matmul tt_dnn_device 79 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20389487342 20389534733 47391 4660984552215 4660985085890 607 533675 471754 458598 471656 467511 1820 471753 396223 446748 446902 446795 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] 2856 1328 2340 2224 2876 0 73728 73728 22710 [85.33333587646484; 85.33333587646484] [128.0] 70
NlpCreateHeadsDeviceOperation tt_dnn_device 80 0 {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} HiFi4 56 20389551533 20389556453 4920 4660985074147 4660985160912 654 86765 74382 34737 74328 61769 2018 74374 73598 62184 61593 62020 8 1 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] 1356 1248 1080 652 1496 0 1 1 1 [] [] 40
Matmul tt_dnn_device 81 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20389621834 20389628314 6480 4660985122731 4660985290072 699 167341 128461 53896 128454 97350 1707 128461 86306 122195 122345 122449 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] 1632 1032 2384 2024 3232 0 9216 9216 1 [341.3333435058594; 341.3333435058594] [2048.0] 120
Softmax tt_dnn_device 82 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} HiFi4 56 20389638214 20389645624 7410 4660985217034 4660985502913 639 285879 212209 196718 212080 203925 1926 212206 198666 211145 211261 211522 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 1 32 384 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] ['softmax/14128336124697917683/'] ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] 1128 1576 3344 2468 3164 0 1 1 1 [18874368.0] [18874368.0] 40
Matmul tt_dnn_device 83 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20389676804 20389700604 23800 4660985488902 4660985610423 615 121521 106888 59914 106844 81953 1788 106888 103306 105212 105358 105582 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] 1632 1032 2616 2004 3544 0 9216 9216 1 [2048.0; 341.3333435058594] [341.3333435058594] 60
ConcatenateHeads tt_dnn_device 84 0 {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} 56 20389710144 20389714134 3990 4660985565445 4660985636303 719 70858 25164 11799 25072 20959 1532 25163 24221 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED [] [] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] 720 904 0 0 0 0 1 1 1 [3145728.0] [3145728.0] 50
Matmul tt_dnn_device 85 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20389748285 20389753805 5520 4660985623981 4660985725977 632 101996 89030 66195 88947 76024 1377 89030 52330 62919 62703 63323 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 1024 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] 2540 1324 2304 2220 4552 0 24576 24576 7570 [128.0; 85.33333587646484] [256.0] 50
BinaryDeviceOperation tt_dnn_device 86 0 {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} HiFi4 56 20389787985 20389813465 25480 4660985705215 4660985775534 641 70319 48898 41443 48814 45508 1720 48898 48275 48352 47408 48027 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] ['eltwise_binary_kernel/5705211043871595155/'] ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] 712 932 1312 716 1588 0 1456 1456 1 [4321.05517578125; 4321.05517578125] [4321.05517578125] 60
LayerNorm tt_dnn_device 87 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20389829035 20389835186 6151 4660985769540 4660985866094 670 96554 89918 41771 89781 72331 2221 89918 65058 88556 88299 88912 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/17805569453407806092/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] 740 1852 4300 6312 6956 0 1 1 1 [6291456.0] [6291456.0] 40
Matmul tt_dnn_device 88 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} HiFi2 56 20389901736 20389909656 7920 4660985819363 4660986377028 643 557665 510271 507249 510184 508765 1789 510269 429992 508279 509433 509458 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] 2904 1320 3116 2804 3288 0 196608 196608 30281 [32.0; 42.66666793823242] [64.0] 70
Matmul tt_dnn_device 89 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20389944607 20389950417 5810 4660986375823 4660986629027 659 253204 251347 228056 251139 236919 1741 251347 215228 225971 225761 226410 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 4096 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] 2488 1300 2324 2224 4556 0 98304 98304 30281 [128.0; 85.33333587646484] [64.0] 120
LayerNorm tt_dnn_device 90 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20390003247 20390010117 6870 4660986607193 4660986752419 654 145226 122750 59686 122606 101149 2271 122743 94199 121264 121020 121608 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/16918772478690167680/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] 740 2164 4776 7472 7912 0 1 1 1 [6291456.0] [6291456.0] 60
Matmul tt_dnn_device 91 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20390033367 20390039537 6170 4660986690776 4660987225004 609 534228 471967 458287 471875 467591 1784 471965 395954 446305 446446 446350 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] 2856 1328 2340 2224 2876 0 73728 73728 22710 [85.33333587646484; 85.33333587646484] [128.0] 40
NlpCreateHeadsDeviceOperation tt_dnn_device 92 0 {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} HiFi4 56 20390079098 20390084428 5330 4660987212805 4660987302253 643 89448 76619 34919 76568 62348 2037 76618 75624 63824 63102 63532 8 1 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] 1356 1248 1080 652 1496 0 1 1 1 [] [] 40
Matmul tt_dnn_device 93 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20390141238 20390166779 25541 4660987262015 4660987431151 678 169136 128210 51203 128199 96887 1727 128210 80857 121126 121292 121387 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] 1632 1032 2384 2024 3232 0 9216 9216 1 [341.3333435058594; 341.3333435058594] [2048.0] 110
Softmax tt_dnn_device 94 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} HiFi4 56 20390196889 20390204899 8010 4660987355666 4660987645342 658 289676 213540 197046 213421 205484 1951 213539 200107 212516 212630 212916 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 1 32 384 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] ['softmax/14128336124697917683/'] ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] 1128 1576 3344 2468 3164 0 1 1 1 [18874368.0] [18874368.0] 50
Matmul tt_dnn_device 95 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20390214789 20390218899 4110 4660987630349 4660987751489 625 121140 105521 64690 105494 82683 1801 105521 102099 104027 104175 104395 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] 1632 1032 2616 2004 3544 0 9216 9216 1 [2048.0; 341.3333435058594] [341.3333435058594] 40
ConcatenateHeads tt_dnn_device 96 0 {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} 56 20390227849 20390231819 3970 4660987712178 4660987777383 730 65205 25168 11709 25086 20832 1521 25167 24359 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED [] [] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] 720 904 0 0 0 0 1 1 1 [3145728.0] [3145728.0] 40
Matmul tt_dnn_device 97 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20390244789 20390249679 4890 4660987764972 4660987867307 659 102335 89250 67796 89160 76012 1320 89250 52123 62602 62414 63030 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 1024 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] 2540 1324 2304 2220 4552 0 24576 24576 7570 [128.0; 85.33333587646484] [256.0] 50
BinaryDeviceOperation tt_dnn_device 98 0 {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} HiFi4 56 20390285700 20390311630 25930 4660987847531 4660987917109 616 69578 49200 39871 49096 45925 1730 49187 48498 48575 47651 48233 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] ['eltwise_binary_kernel/5705211043871595155/'] ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] 712 932 1312 716 1588 0 1456 1456 1 [4321.05517578125; 4321.05517578125] [4321.05517578125] 110
LayerNorm tt_dnn_device 99 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20390374181 20390381441 7260 4660987909238 4660988009776 607 100538 92063 43572 91926 75538 2312 92063 67098 90658 90390 91008 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/17805569453407806092/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] 740 1852 4300 6312 6956 0 1 1 1 [6291456.0] [6291456.0] 50
Matmul tt_dnn_device 100 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} HiFi2 56 20390407241 20390414081 6840 4660987962696 4660988521041 590 558345 510656 507980 510605 509534 1799 510656 431030 508517 509679 509669 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] 2904 1320 3116 2804 3288 0 196608 196608 30281 [32.0; 42.66666793823242] [64.0] 40
Matmul tt_dnn_device 101 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20390429171 20390456441 27270 4660988520439 4660988773370 722 252931 251613 229133 250818 237552 1680 251568 215272 226369 226156 226771 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 4096 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] 2488 1300 2324 2224 4556 0 98304 98304 30281 [128.0; 85.33333587646484] [64.0] 40
LayerNorm tt_dnn_device 102 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20390470192 20390496982 26790 4660988752369 4660988897316 625 144947 123332 67222 123218 100258 2282 123325 94889 121881 121622 122213 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/16918772478690167680/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] 740 2164 4776 7472 7912 0 1 1 1 [6291456.0] [6291456.0] 110
Matmul tt_dnn_device 103 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20390585233 20390592733 7500 4660988842647 4660989369048 612 526401 471101 458293 471004 468255 1819 471101 395251 445903 446029 445932 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] 2856 1328 2340 2224 2876 0 73728 73728 22710 [85.33333587646484; 85.33333587646484] [128.0] 90
NlpCreateHeadsDeviceOperation tt_dnn_device 104 0 {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} HiFi4 56 20390609323 20390617163 7840 4660989357713 4660989448783 644 91070 79097 35569 78561 62986 2071 79096 78013 65390 64626 66175 8 1 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] 1356 1248 1080 652 1496 0 1 1 1 [] [] 80
Matmul tt_dnn_device 105 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20390632703 20390637423 4720 4660989407020 4660989578782 698 171762 129302 54606 129290 97199 1769 129302 84015 122865 123005 123107 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] 1632 1032 2384 2024 3232 0 9216 9216 1 [341.3333435058594; 341.3333435058594] [2048.0] 50
Softmax tt_dnn_device 106 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} HiFi4 56 20390671213 20390699404 28191 4660989505613 4660989793987 665 288374 214552 197403 214434 206769 1905 214539 201026 213361 213457 213739 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 1 32 384 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] ['softmax/14128336124697917683/'] ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] 1128 1576 3344 2468 3164 0 1 1 1 [18874368.0] [18874368.0] 150
Matmul tt_dnn_device 107 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20390729594 20390754494 24900 4660989778306 4660989894968 622 116662 100351 58311 100309 77637 1794 100351 96858 98782 98917 99148 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] 1632 1032 2616 2004 3544 0 9216 9216 1 [2048.0; 341.3333435058594] [341.3333435058594] 40
ConcatenateHeads tt_dnn_device 108 0 {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} 56 20390783255 20390788695 5440 4660989854487 4660989921534 714 67047 25857 11700 25136 21080 1548 25856 25214 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED [] [] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] 720 904 0 0 0 0 1 1 1 [3145728.0] [3145728.0] 70
Matmul tt_dnn_device 109 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20390801635 20390806065 4430 4660989908419 4660990012039 712 103620 89779 66660 89692 75716 1274 89779 52367 62765 62565 63198 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 1024 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] 2540 1324 2304 2220 4552 0 24576 24576 7570 [128.0; 85.33333587646484] [256.0] 40
BinaryDeviceOperation tt_dnn_device 110 0 {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} HiFi4 56 20390819995 20390825685 5690 4660989990403 4660990061804 621 71401 49157 39506 49070 45135 1708 49146 48516 48603 47634 48214 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] ['eltwise_binary_kernel/5705211043871595155/'] ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] 712 932 1312 716 1588 0 1456 1456 1 [4321.05517578125; 4321.05517578125] [4321.05517578125] 40
LayerNorm tt_dnn_device 111 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20390862845 20390869355 6510 4660990053613 4660990153542 658 99929 91080 47837 90950 75199 2248 91073 65870 89549 89283 89890 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/17805569453407806092/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] 740 1852 4300 6312 6956 0 1 1 1 [6291456.0] [6291456.0] 50
Matmul tt_dnn_device 112 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} HiFi2 56 20390953306 20390960766 7460 4660990111703 4660990663831 615 552128 509659 505919 509518 507961 1786 509659 429635 507691 508831 508838 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] 2904 1320 3116 2804 3288 0 196608 196608 30281 [32.0; 42.66666793823242] [64.0] 70
Matmul tt_dnn_device 113 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20390995767 20391001047 5280 4660990662010 4660990914639 752 252629 250057 229124 249982 237177 1682 250057 215727 226231 226032 226677 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 4096 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] 2488 1300 2324 2224 4556 0 98304 98304 30281 [128.0; 85.33333587646484] [64.0] 40
LayerNorm tt_dnn_device 114 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20391014197 20391021897 7700 4660990895155 4660991038950 652 143795 123679 66350 123552 101394 2272 123679 95184 122202 121967 122557 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/16918772478690167680/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] 740 2164 4776 7472 7912 0 1 1 1 [6291456.0] [6291456.0] 30
Matmul tt_dnn_device 115 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20391065417 20391091737 26320 4660990983052 4660991511983 614 528931 472402 458433 472307 468060 1785 472400 396426 446785 446930 446822 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] 2856 1328 2340 2224 2876 0 73728 73728 22710 [85.33333587646484; 85.33333587646484] [128.0] 80
NlpCreateHeadsDeviceOperation tt_dnn_device 116 0 {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} HiFi4 56 20391126678 20391131388 4710 4660991499481 4660991587628 649 88147 75002 35070 74870 61930 2054 74991 74445 62365 61793 62777 8 1 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] 1356 1248 1080 652 1496 0 1 1 1 [] [] 50
Matmul tt_dnn_device 117 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20391166868 20391192858 25990 4660991549174 4660991717461 614 168287 129223 53682 129212 98015 1798 129223 84223 123300 123456 123550 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] 1632 1032 2384 2024 3232 0 9216 9216 1 [341.3333435058594; 341.3333435058594] [2048.0] 70
Softmax tt_dnn_device 118 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} HiFi4 56 20391202689 20391210559 7870 4660991643471 4660991929901 651 286430 211795 196653 211681 203768 1916 211779 198238 210705 210857 211092 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 1 32 384 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] ['softmax/14128336124697917683/'] ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] 1128 1576 3344 2468 3164 0 1 1 1 [18874368.0] [18874368.0] 90
Matmul tt_dnn_device 119 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20391220469 20391224809 4340 4660991916260 4660992033235 625 116975 102703 65609 102694 81078 1781 102703 99026 100934 101074 101302 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] 1632 1032 2616 2004 3544 0 9216 9216 1 [2048.0; 341.3333435058594] [341.3333435058594] 40
ConcatenateHeads tt_dnn_device 120 0 {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} 56 20391255119 20391259519 4400 4660991997707 4660992059447 653 61740 25564 11759 25471 20927 1617 25563 23963 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED [] [] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] 720 904 0 0 0 0 1 1 1 [3145728.0] [3145728.0] 40
Matmul tt_dnn_device 121 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20391291999 20391317610 25611 4660992046688 4660992147903 631 101215 87815 66609 87728 75753 1351 87815 52050 62708 62515 63124 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 1024 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] 2540 1324 2304 2220 4552 0 24576 24576 7570 [128.0; 85.33333587646484] [256.0] 60
BinaryDeviceOperation tt_dnn_device 122 0 {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} HiFi4 56 20391351550 20391357800 6250 4660992128186 4660992197331 637 69145 48761 41337 48668 45926 1700 48756 48141 48211 47169 47831 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] ['eltwise_binary_kernel/5705211043871595155/'] ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] 712 932 1312 716 1588 0 1456 1456 1 [4321.05517578125; 4321.05517578125] [4321.05517578125] 40
LayerNorm tt_dnn_device 123 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20391391530 20391398390 6860 4660992191374 4660992288011 698 96637 90022 38866 89894 73547 2292 90022 64921 88423 88166 88775 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/17805569453407806092/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] 740 1852 4300 6312 6956 0 1 1 1 [6291456.0] [6291456.0] 50
Matmul tt_dnn_device 124 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} HiFi2 56 20391422671 20391430111 7440 4660992238268 4660992798844 603 560576 510218 506679 510133 508798 1808 510218 429935 508141 509285 509296 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] 2904 1320 3116 2804 3288 0 196608 196608 30281 [32.0; 42.66666793823242] [64.0] 50
Matmul tt_dnn_device 125 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20391506711 20391512581 5870 4660992797739 4660993050204 684 252465 250679 228583 250490 237267 1706 250678 215546 226457 226262 226872 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 4096 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] 2488 1300 2324 2224 4556 0 98304 98304 30281 [128.0; 85.33333587646484] [64.0] 60
LayerNorm tt_dnn_device 126 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20391526782 20391554992 28210 4660993029577 4660993178763 630 149186 127938 77926 127858 103558 2283 127938 99479 126481 126236 126810 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/16918772478690167680/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] 740 2164 4776 7472 7912 0 1 1 1 [6291456.0] [6291456.0] 40
Matmul tt_dnn_device 127 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20391605752 20391613312 7560 4660993130171 4660993651583 649 521412 472157 458850 472059 467494 1750 472157 396261 446742 446874 446771 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] 2856 1328 2340 2224 2876 0 73728 73728 22710 [85.33333587646484; 85.33333587646484] [128.0] 80
NlpCreateHeadsDeviceOperation tt_dnn_device 128 0 {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} HiFi4 56 20391628293 20391633273 4980 4660993639733 4660993727959 661 88226 75728 35108 75670 62030 2024 75720 74779 63189 62462 63157 8 1 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] 1356 1248 1080 652 1496 0 1 1 1 [] [] 90
Matmul tt_dnn_device 129 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20391709983 20391715663 5680 4660993688818 4660993859373 675 170555 130735 53551 130726 98247 1747 130735 84636 124136 124286 124388 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] 1632 1032 2384 2024 3232 0 9216 9216 1 [341.3333435058594; 341.3333435058594] [2048.0] 70
Softmax tt_dnn_device 130 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} HiFi4 56 20391725214 20391753324 28110 4660993783710 4660994073530 667 289820 213491 198063 213378 206156 1890 213475 199957 212423 212559 212835 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 1 32 384 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] ['softmax/14128336124697917683/'] ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] 1128 1576 3344 2468 3164 0 1 1 1 [18874368.0] [18874368.0] 40
Matmul tt_dnn_device 131 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20391763804 20391768414 4610 4660994059555 4660994173376 618 113821 99228 63487 99218 79262 1806 99226 95742 97673 97800 98034 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] 1632 1032 2616 2004 3544 0 9216 9216 1 [2048.0; 341.3333435058594] [341.3333435058594] 50
ConcatenateHeads tt_dnn_device 132 0 {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} 56 20391797664 20391802054 4390 4660994139163 4660994199615 654 60452 25591 11483 25502 20874 1639 25590 24348 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED [] [] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] 720 904 0 0 0 0 1 1 1 [3145728.0] [3145728.0] 80
Matmul tt_dnn_device 133 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20391815204 20391820744 5540 4660994186550 4660994287883 635 101333 87620 66123 87530 75497 1361 87618 51968 62409 62207 62836 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 1024 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] 2540 1324 2304 2220 4552 0 24576 24576 7570 [128.0; 85.33333587646484] [256.0] 40
BinaryDeviceOperation tt_dnn_device 134 0 {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} HiFi4 56 20391855435 20391880535 25100 4660994268253 4660994337245 645 68992 48721 39103 48640 45598 1693 48714 48101 48183 47234 47873 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] ['eltwise_binary_kernel/5705211043871595155/'] ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] 712 932 1312 716 1588 0 1456 1456 1 [4321.05517578125; 4321.05517578125] [4321.05517578125] 140
LayerNorm tt_dnn_device 135 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20391913865 20391920945 7080 4660994329097 4660994429760 667 100663 91853 41796 91610 74085 2239 91853 66751 90286 90003 90630 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/17805569453407806092/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] 740 1852 4300 6312 6956 0 1 1 1 [6291456.0] [6291456.0] 50
Matmul tt_dnn_device 136 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} HiFi2 56 20391965496 20391992656 27160 4660994381146 4660994939604 639 558458 509195 506004 509107 507870 1785 509193 429463 507296 508447 508444 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] 2904 1320 3116 2804 3288 0 196608 196608 30281 [32.0; 42.66666793823242] [64.0] 70
Matmul tt_dnn_device 137 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20392009166 20392014366 5200 4660994938818 4660995191114 614 252296 250896 229038 250264 237151 1759 250895 215202 226030 225834 226452 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 4096 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] 2488 1300 2324 2224 4556 0 98304 98304 30281 [128.0; 85.33333587646484] [64.0] 100
LayerNorm tt_dnn_device 138 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20392027216 20392055217 28001 4660995170733 4660995316377 621 145644 124654 64106 124544 102764 2286 124652 95866 122927 122671 123265 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/16918772478690167680/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] 740 2164 4776 7472 7912 0 1 1 1 [6291456.0] [6291456.0] 40
Matmul tt_dnn_device 139 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20392115797 20392122497 6700 4660995257257 4660995790104 642 532847 473071 459779 472945 469137 1733 473071 396353 446925 447052 446951 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] 2856 1328 2340 2224 2876 0 73728 73728 22710 [85.33333587646484; 85.33333587646484] [128.0] 50
NlpCreateHeadsDeviceOperation tt_dnn_device 140 0 {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} HiFi4 56 20392157928 20392162878 4950 4660995778269 4660995865706 653 87437 74949 34842 74854 62027 2068 74940 74143 62554 61921 62443 8 1 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] 1356 1248 1080 652 1496 0 1 1 1 [] [] 90
Matmul tt_dnn_device 141 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20392198808 20392203788 4980 4660995827111 4660995994195 662 167084 127823 53498 127811 97901 1806 127823 85639 120713 120858 120949 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] 1632 1032 2384 2024 3232 0 9216 9216 1 [341.3333435058594; 341.3333435058594] [2048.0] 60
Softmax tt_dnn_device 142 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} HiFi4 56 20392212778 20392221378 8600 4660995921390 4660996207840 646 286450 213016 197128 212886 205013 1947 213013 199519 212050 212145 212428 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 1 32 384 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] ['softmax/14128336124697917683/'] ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] 1128 1576 3344 2468 3164 0 1 1 1 [18874368.0] [18874368.0] 90
Matmul tt_dnn_device 143 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20392251989 20392275899 23910 4660996193361 4660996313334 597 119973 104889 56006 104880 79453 1827 104889 101415 103375 103505 103733 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] 1632 1032 2616 2004 3544 0 9216 9216 1 [2048.0; 341.3333435058594] [341.3333435058594] 50
ConcatenateHeads tt_dnn_device 144 0 {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} 56 20392285199 20392308679 23480 4660996265990 4660996339421 643 73431 25453 11522 25159 20912 1672 25452 24758 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED [] [] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] 720 904 0 0 0 0 1 1 1 [3145728.0] [3145728.0] 50
Matmul tt_dnn_device 145 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20392320919 20392345969 25050 4660996326532 4660996428225 641 101693 88145 66557 88038 75631 1334 88145 52184 62694 62502 63115 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 1024 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] 2540 1324 2304 2220 4552 0 24576 24576 7570 [128.0; 85.33333587646484] [256.0] 50
BinaryDeviceOperation tt_dnn_device 146 0 {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} HiFi4 56 20392359500 20392365280 5780 4660996408106 4660996477817 656 69711 48951 40470 48905 45894 1672 48945 48318 48384 47343 48008 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] ['eltwise_binary_kernel/5705211043871595155/'] ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] 712 932 1312 716 1588 0 1456 1456 1 [4321.05517578125; 4321.05517578125] [4321.05517578125] 50
LayerNorm tt_dnn_device 147 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20392398800 20392404970 6170 4660996470798 4660996568701 603 97903 90289 43885 89644 74042 2286 90283 65243 88826 88578 89192 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/17805569453407806092/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] 740 1852 4300 6312 6956 0 1 1 1 [6291456.0] [6291456.0] 40
Matmul tt_dnn_device 148 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} HiFi2 56 20392470131 20392496891 26760 4660996523718 4660997079552 571 555834 510263 506915 510132 508846 1845 510263 430297 508399 509566 509571 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] 2904 1320 3116 2804 3288 0 196608 196608 30281 [32.0; 42.66666793823242] [64.0] 90
Matmul tt_dnn_device 149 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20392513311 20392518281 4970 4660997077710 4660997332246 648 254536 252039 229236 251610 238415 1713 252039 216430 227297 227076 227699 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 4096 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] 2488 1300 2324 2224 4556 0 98304 98304 30281 [128.0; 85.33333587646484] [64.0] 60
LayerNorm tt_dnn_device 150 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20392552741 20392565121 12380 4660997311368 4660997461868 646 150500 128997 65618 128890 105950 2231 128997 100084 127169 126916 127505 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/16918772478690167680/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] 740 2164 4776 7472 7912 0 1 1 1 [6291456.0] [6291456.0] 40
Matmul tt_dnn_device 151 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20392607732 20392614982 7250 4660997399914 4660997934595 617 534681 472088 458118 471988 467631 1800 472088 396462 446844 446985 446871 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] 2856 1328 2340 2224 2876 0 73728 73728 22710 [85.33333587646484; 85.33333587646484] [128.0] 50
NlpCreateHeadsDeviceOperation tt_dnn_device 152 0 {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} HiFi4 56 20392651652 20392676363 24711 4660997922020 4660998010387 645 88367 75164 34428 75105 61628 2074 75153 74193 62684 61926 62390 8 1 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] 1356 1248 1080 652 1496 0 1 1 1 [] [] 50
Matmul tt_dnn_device 153 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20392712823 20392718013 5190 4660997971183 4660998140623 668 169440 129568 52657 129555 96434 1737 129568 87663 123156 123290 123393 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] 1632 1032 2384 2024 3232 0 9216 9216 1 [341.3333435058594; 341.3333435058594] [2048.0] 60
Softmax tt_dnn_device 154 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} HiFi4 56 20392746783 20392754453 7670 4660998065244 4660998355648 636 290404 214390 196842 214274 205636 1943 214387 200882 213273 213389 213654 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 1 32 384 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] ['softmax/14128336124697917683/'] ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] 1128 1576 3344 2468 3164 0 1 1 1 [18874368.0] [18874368.0] 90
Matmul tt_dnn_device 155 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20392764223 20392768273 4050 4660998339578 4660998457573 635 117995 101280 61593 101261 80029 1766 101280 97842 99768 99897 100129 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] 1632 1032 2616 2004 3544 0 9216 9216 1 [2048.0; 341.3333435058594] [341.3333435058594] 40
ConcatenateHeads tt_dnn_device 156 0 {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} 56 20392797984 20392802204 4220 4660998419437 4660998483697 724 64260 25414 11751 25288 20904 1526 25413 24530 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED [] [] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] 720 904 0 0 0 0 1 1 1 [3145728.0] [3145728.0] 40
Matmul tt_dnn_device 157 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20392917555 20392923175 5620 4660998471072 4660998573533 613 102461 89208 67977 89124 76028 1381 89208 52321 62666 62468 63102 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 1024 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] 2540 1324 2304 2220 4552 0 24576 24576 7570 [128.0; 85.33333587646484] [256.0] 40
BinaryDeviceOperation tt_dnn_device 158 0 {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} HiFi4 56 20392979315 20392985755 6440 4660998553826 4660998623883 623 70057 49716 40876 49634 45376 1742 49716 49088 49172 48240 48824 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] ['eltwise_binary_kernel/5705211043871595155/'] ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] 712 932 1312 716 1588 0 1456 1456 1 [4321.05517578125; 4321.05517578125] [4321.05517578125] 70
LayerNorm tt_dnn_device 159 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20393001076 20393028246 27170 4660998616535 4660998717589 688 101054 93045 57041 92946 77728 2193 93045 67866 91439 91190 91770 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/17805569453407806092/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] 740 1852 4300 6312 6956 0 1 1 1 [6291456.0] [6291456.0] 40
Matmul tt_dnn_device 160 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} HiFi2 56 20393075406 20393083256 7850 4660998683017 4660999228797 648 545780 510540 507216 510450 508964 1756 510540 430016 508522 509672 509692 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] 2904 1320 3116 2804 3288 0 196608 196608 30281 [32.0; 42.66666793823242] [64.0] 70
Matmul tt_dnn_device 161 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20393117287 20393122767 5480 4660999227431 4660999479377 656 251946 249927 227908 249395 236587 1713 249927 214483 225461 225236 225871 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 4096 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] 2488 1300 2324 2224 4556 0 98304 98304 30281 [128.0; 85.33333587646484] [64.0] 120
LayerNorm tt_dnn_device 162 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20393175337 20393183027 7690 4660999458832 4660999604297 641 145465 124292 58099 124228 102032 2301 124292 95769 122817 122576 123153 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/16918772478690167680/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] 740 2164 4776 7472 7912 0 1 1 1 [6291456.0] [6291456.0] 60
Matmul tt_dnn_device 163 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20393230398 20393237238 6840 4660999539536 4661000077744 659 538208 472775 458797 472678 467568 1756 472775 396446 447196 447324 447230 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] 2856 1328 2340 2224 2876 0 73728 73728 22710 [85.33333587646484; 85.33333587646484] [128.0] 80
NlpCreateHeadsDeviceOperation tt_dnn_device 164 0 {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} HiFi4 56 20393253188 20393278738 25550 4661000065252 4661000154025 633 88773 75659 34638 75600 61966 2049 75649 74624 63505 62791 63204 8 1 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] 1356 1248 1080 652 1496 0 1 1 1 [] [] 30
Matmul tt_dnn_device 165 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20393315369 20393339469 24100 4661000114479 4661000284382 679 169903 129671 55044 129662 97161 1740 129671 88813 123554 123694 123789 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] 1632 1032 2384 2024 3232 0 9216 9216 1 [341.3333435058594; 341.3333435058594] [2048.0] 50
Softmax tt_dnn_device 166 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} HiFi4 56 20393349159 20393377299 28140 4661000211265 4661000497550 665 286285 212513 196897 212393 204521 1953 212499 198910 211316 211439 211705 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 1 32 384 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] ['softmax/14128336124697917683/'] ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] 1128 1576 3344 2468 3164 0 1 1 1 [18874368.0] [18874368.0] 120
Matmul tt_dnn_device 167 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20393387729 20393392089 4360 4661000483328 4661000601483 621 118155 103307 61803 103266 80530 1763 103307 99796 101719 101864 102089 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] 1632 1032 2616 2004 3544 0 9216 9216 1 [2048.0; 341.3333435058594] [341.3333435058594] 40
ConcatenateHeads tt_dnn_device 168 0 {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} 56 20393421690 20393425750 4060 4661000561541 4661000628503 704 66962 26324 11976 26244 20993 1561 26323 24107 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED [] [] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] 720 904 0 0 0 0 1 1 1 [3145728.0] [3145728.0] 50
Matmul tt_dnn_device 169 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20393438560 20393443540 4980 4661000615206 4661000718646 672 103440 89453 66475 89362 75946 1285 89452 52261 62683 62486 63095 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 1024 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] 2540 1324 2304 2220 4552 0 24576 24576 7570 [128.0; 85.33333587646484] [256.0] 40
BinaryDeviceOperation tt_dnn_device 170 0 {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} HiFi4 56 20393477150 20393484170 7020 4661000697137 4661000769808 644 72671 50531 40895 50431 45811 1692 50522 49896 49965 49028 49608 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] ['eltwise_binary_kernel/5705211043871595155/'] ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] 712 932 1312 716 1588 0 1456 1456 1 [4321.05517578125; 4321.05517578125] [4321.05517578125] 90
LayerNorm tt_dnn_device 171 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20393537701 20393568561 30860 4661000761646 4661000860699 642 99053 90251 43424 90144 74631 2299 90251 65030 88682 88418 89030 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/17805569453407806092/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] 740 1852 4300 6312 6956 0 1 1 1 [6291456.0] [6291456.0] 40
Matmul tt_dnn_device 172 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} HiFi2 56 20393595411 20393624762 29351 4661000815301 4661001370935 643 555634 509571 506853 509483 508188 1765 509571 429802 507564 508709 508711 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] 2904 1320 3116 2804 3288 0 196608 196608 30281 [32.0; 42.66666793823242] [64.0] 20931
Matmul tt_dnn_device 173 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20393639522 20393644532 5010 4661001370115 4661001621596 668 251481 249999 228726 249431 237131 1701 249998 215449 226302 226110 226742 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 4096 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] 2488 1300 2324 2224 4556 0 98304 98304 30281 [128.0; 85.33333587646484] [64.0] 40
LayerNorm tt_dnn_device 174 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20393678962 20393706142 27180 4661001601783 4661001743103 659 141320 120860 57530 120775 96691 2281 120860 92126 119109 118868 119460 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/16918772478690167680/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] 740 2164 4776 7472 7912 0 1 1 1 [6291456.0] [6291456.0] 60
Matmul tt_dnn_device 175 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20393767663 20393774583 6920 4661001681200 4661002216646 657 535446 472875 459642 472783 469429 1743 472873 396704 447545 447690 447592 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] 2856 1328 2340 2224 2876 0 73728 73728 22710 [85.33333587646484; 85.33333587646484] [128.0] 90
NlpCreateHeadsDeviceOperation tt_dnn_device 176 0 {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} HiFi4 56 20393789613 20393794093 4480 4661002204881 4661002291914 662 87033 74611 35016 74487 62128 2060 74610 74258 62247 61640 62069 8 1 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] 1356 1248 1080 652 1496 0 1 1 1 [] [] 40
Matmul tt_dnn_device 177 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20393831344 20393836234 4890 4661002253822 4661002420478 624 166656 127936 51733 127932 97166 1789 127936 81876 120723 120871 120964 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] 1632 1032 2384 2024 3232 0 9216 9216 1 [341.3333435058594; 341.3333435058594] [2048.0] 40
Softmax tt_dnn_device 178 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} HiFi4 56 20393865474 20393873824 8350 4661002345812 4661002634635 660 288823 213511 197047 213384 204754 1921 213510 199957 212444 212546 212831 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 1 32 384 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] ['softmax/14128336124697917683/'] ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] 1128 1576 3344 2468 3164 0 1 1 1 [18874368.0] [18874368.0] 100
Matmul tt_dnn_device 179 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20393904054 20393908374 4320 4661002619682 4661002741083 611 121401 105835 65642 105792 80594 1800 105835 102344 104277 104403 104624 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] 1632 1032 2616 2004 3544 0 9216 9216 1 [2048.0; 341.3333435058594] [341.3333435058594] 50
ConcatenateHeads tt_dnn_device 180 0 {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} 56 20393956085 20393960285 4200 4661002702412 4661002767865 697 65453 26087 11653 25314 21028 1583 26086 25304 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED [] [] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] 720 904 0 0 0 0 1 1 1 [3145728.0] [3145728.0] 80
Matmul tt_dnn_device 181 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20393972735 20393977845 5110 4661002754473 4661002857507 710 103034 88918 66523 88830 76128 1286 88918 52516 62971 62761 63381 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 1024 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] 2540 1324 2304 2220 4552 0 24576 24576 7570 [128.0; 85.33333587646484] [256.0] 40
BinaryDeviceOperation tt_dnn_device 182 0 {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} HiFi4 56 20394012085 20394018025 5940 4661002836566 4661002907217 643 70651 49069 40342 48921 45816 1690 49069 48432 48520 47561 48106 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] ['eltwise_binary_kernel/5705211043871595155/'] ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] 712 932 1312 716 1588 0 1456 1456 1 [4321.05517578125; 4321.05517578125] [4321.05517578125] 60
LayerNorm tt_dnn_device 183 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20394031665 20394058096 26431 4661002899942 4661002999363 599 99421 91555 43062 90441 74733 2335 91555 66303 90101 89846 90457 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/17805569453407806092/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] 740 1852 4300 6312 6956 0 1 1 1 [6291456.0] [6291456.0] 50
Matmul tt_dnn_device 184 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} HiFi2 56 20394120946 20394147197 26251 4661002952370 4661003509264 605 556894 509289 506030 509271 507828 1785 509288 428963 507380 508525 508540 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] 2904 1320 3116 2804 3288 0 196608 196608 30281 [32.0; 42.66666793823242] [64.0] 50
Matmul tt_dnn_device 185 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20394162907 20394167957 5050 4661003508022 4661003761650 641 253628 251740 228369 251487 237289 1772 251740 215492 226252 226040 226651 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 4096 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] 2488 1300 2324 2224 4556 0 98304 98304 30281 [128.0; 85.33333587646484] [64.0] 50
LayerNorm tt_dnn_device 186 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20394180947 20394209107 28160 4661003740115 4661003882967 632 142852 120702 55503 120595 97695 2307 120697 92286 119338 119091 119680 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/16918772478690167680/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] 740 2164 4776 7472 7912 0 1 1 1 [6291456.0] [6291456.0] 110
Matmul tt_dnn_device 187 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20394232267 20394258778 26511 4661003819190 4661004357211 642 538021 473586 458598 473493 468255 1754 473586 397795 448117 448239 448143 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] 2856 1328 2340 2224 2876 0 73728 73728 22710 [85.33333587646484; 85.33333587646484] [128.0] 40
NlpCreateHeadsDeviceOperation tt_dnn_device 188 0 {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} HiFi4 56 20394293748 20394317878 24130 4661004343715 4661004432721 632 89006 74888 35024 74760 61518 2071 74870 74360 62174 61601 62621 8 1 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] 1356 1248 1080 652 1496 0 1 1 1 [] [] 120
Matmul tt_dnn_device 189 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20394354078 20394359929 5851 4661004394332 4661004564295 628 169963 130951 55063 130942 97474 1765 130940 86113 123753 123876 123977 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] 1632 1032 2384 2024 3232 0 9216 9216 1 [341.3333435058594; 341.3333435058594] [2048.0] 40
Softmax tt_dnn_device 190 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} HiFi4 56 20394369059 20394377669 8610 4661004489928 4661004777122 648 287194 212177 196430 212053 203729 1918 212176 198655 211115 211216 211501 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 1 32 384 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] ['softmax/14128336124697917683/'] ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] 1128 1576 3344 2468 3164 0 1 1 1 [18874368.0] [18874368.0] 70
Matmul tt_dnn_device 191 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20394408879 20394413499 4620 4661004762777 4661004888642 617 125865 110896 63271 110889 84486 1789 110896 107460 109335 109478 109696 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] 1632 1032 2616 2004 3544 0 9216 9216 1 [2048.0; 341.3333435058594] [341.3333435058594] 40
ConcatenateHeads tt_dnn_device 192 0 {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} 56 20394422859 20394446989 24130 4661004842530 4661004914539 635 72009 25265 11664 25027 20894 1681 25264 24333 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED [] [] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] 720 904 0 0 0 0 1 1 1 [3145728.0] [3145728.0] 30
Matmul tt_dnn_device 193 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20394478400 20394483550 5150 4661004901966 4661005003187 638 101221 88003 67300 87923 76193 1342 87993 52102 62531 62320 62938 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 1024 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] 2540 1324 2304 2220 4552 0 24576 24576 7570 [128.0; 85.33333587646484] [256.0] 40
BinaryDeviceOperation tt_dnn_device 194 0 {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} HiFi4 56 20394516760 20394522830 6070 4661004984522 4661005054001 647 69479 50143 39944 50034 46374 1680 50142 49522 49585 48759 49348 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] ['eltwise_binary_kernel/5705211043871595155/'] ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] 712 932 1312 716 1588 0 1456 1456 1 [4321.05517578125; 4321.05517578125] [4321.05517578125] 50
LayerNorm tt_dnn_device 195 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20394557410 20394569481 12071 4661005045288 4661005145983 679 100695 91341 40230 90985 75407 2317 91339 66060 89952 89687 90297 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/17805569453407806092/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] 740 1852 4300 6312 6956 0 1 1 1 [6291456.0] [6291456.0] 40
Matmul tt_dnn_device 196 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} HiFi2 56 20394615591 20394622611 7020 4661005096278 4661005656743 617 560465 510123 507104 509993 508601 1776 510123 429541 508153 509293 509301 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] 2904 1320 3116 2804 3288 0 196608 196608 30281 [32.0; 42.66666793823242] [64.0] 40
Matmul tt_dnn_device 197 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20394657911 20394663281 5370 4661005655555 4661005907486 739 251931 250006 227924 249913 236674 1692 250006 214378 225125 224896 225539 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 4096 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] 2488 1300 2324 2224 4556 0 98304 98304 30281 [128.0; 85.33333587646484] [64.0] 40
LayerNorm tt_dnn_device 198 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20394717082 20394723552 6470 4661005886877 4661006037453 657 150576 129329 77150 129268 107545 2266 129329 100777 127887 127636 128216 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/16918772478690167680/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] 740 2164 4776 7472 7912 0 1 1 1 [6291456.0] [6291456.0] 40
Matmul tt_dnn_device 199 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20394765692 20394772002 6310 4661005986710 4661006512388 677 525678 474238 458929 474141 468168 1744 474236 398254 449176 449326 449219 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] 2856 1328 2340 2224 2876 0 73728 73728 22710 [85.33333587646484; 85.33333587646484] [128.0] 40
NlpCreateHeadsDeviceOperation tt_dnn_device 200 0 {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} HiFi4 56 20394806993 20394811873 4880 4661006498538 4661006589075 644 90537 76044 34951 75920 62151 2039 76036 75300 63486 62864 63676 8 1 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] 1356 1248 1080 652 1496 0 1 1 1 [] [] 50
Matmul tt_dnn_device 201 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20394848303 20394854363 6060 4661006549434 4661006718592 619 169158 128909 52019 128896 96636 1840 128909 81640 121455 121616 121718 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] 1632 1032 2384 2024 3232 0 9216 9216 1 [341.3333435058594; 341.3333435058594] [2048.0] 90
Softmax tt_dnn_device 202 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} HiFi4 56 20394888694 20394916294 27600 4661006643231 4661006931503 659 288272 212259 196629 212132 204459 1940 212258 198697 211140 211252 211538 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 1 32 384 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] ['softmax/14128336124697917683/'] ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] 1128 1576 3344 2468 3164 0 1 1 1 [18874368.0] [18874368.0] 60
Matmul tt_dnn_device 203 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20394926714 20394931064 4350 4661006917284 4661007035765 603 118481 103656 65653 103615 80640 1787 103651 100159 102077 102207 102434 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] 1632 1032 2616 2004 3544 0 9216 9216 1 [2048.0; 341.3333435058594] [341.3333435058594] 40
ConcatenateHeads tt_dnn_device 204 0 {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} 56 20394960384 20394964714 4330 4661006999300 4661007061735 711 62435 25266 11829 25193 20869 1547 25265 24357 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED [] [] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] 720 904 0 0 0 0 1 1 1 [3145728.0] [3145728.0] 60
Matmul tt_dnn_device 205 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20394997565 20395002535 4970 4661007049347 4661007151808 662 102461 89387 66121 89291 75743 1314 89387 52191 62619 62423 63047 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 1024 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] 2540 1324 2304 2220 4552 0 24576 24576 7570 [128.0; 85.33333587646484] [256.0] 40
BinaryDeviceOperation tt_dnn_device 206 0 {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} HiFi4 56 20395016675 20395043125 26450 4661007130242 4661007202053 644 71811 49621 39905 49514 45266 1701 49615 49015 49081 48224 48809 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] ['eltwise_binary_kernel/5705211043871595155/'] ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] 712 932 1312 716 1588 0 1456 1456 1 [4321.05517578125; 4321.05517578125] [4321.05517578125] 40
LayerNorm tt_dnn_device 207 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20395057415 20395084785 27370 4661007193822 4661007292210 634 98388 89518 40034 89407 74402 2312 89511 64273 88037 87783 88389 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/17805569453407806092/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] 740 1852 4300 6312 6956 0 1 1 1 [6291456.0] [6291456.0] 40
Matmul tt_dnn_device 208 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} HiFi2 56 20395148576 20395155966 7390 4661007244135 4661007802254 646 558119 509386 506383 509298 507905 1769 509386 429307 507372 508524 508542 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] 2904 1320 3116 2804 3288 0 196608 196608 30281 [32.0; 42.66666793823242] [64.0] 70
Matmul tt_dnn_device 209 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20395170306 20395196816 26510 4661007801157 4661008053595 684 252438 250662 228047 249630 236912 1703 250661 215402 226200 225988 226620 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 4096 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] 2488 1300 2324 2224 4556 0 98304 98304 30281 [128.0; 85.33333587646484] [64.0] 90
LayerNorm tt_dnn_device 210 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20395210657 20395217207 6550 4661008032444 4661008177325 640 144881 123098 55428 123031 97040 2273 123092 94685 121698 121453 122040 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/16918772478690167680/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] 740 2164 4776 7472 7912 0 1 1 1 [6291456.0] [6291456.0] 50
Matmul tt_dnn_device 211 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20395279187 20395305198 26011 4661008111066 4661008650261 597 539195 472331 459393 472238 468589 1760 472331 396063 446663 446791 446689 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] 2856 1328 2340 2224 2876 0 73728 73728 22710 [85.33333587646484; 85.33333587646484] [128.0] 40
NlpCreateHeadsDeviceOperation tt_dnn_device 212 0 {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} HiFi4 56 20395320638 20395326078 5440 4661008638726 4661008726863 628 88137 75967 35069 75832 62127 2108 75956 75286 62931 62256 63374 8 1 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] 1356 1248 1080 652 1496 0 1 1 1 [] [] 60
Matmul tt_dnn_device 213 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20395362938 20395388578 25640 4661008687467 4661008856136 634 168669 128648 55348 128639 97803 1799 128646 80817 122294 122417 122521 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] 1632 1032 2384 2024 3232 0 9216 9216 1 [341.3333435058594; 341.3333435058594] [2048.0] 90
Softmax tt_dnn_device 214 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} HiFi4 56 20395398308 20395405838 7530 4661008784364 4661009069509 656 285145 212727 197133 212614 204936 1915 212714 199243 211748 211864 212126 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 1 32 384 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] ['softmax/14128336124697917683/'] ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] 1128 1576 3344 2468 3164 0 1 1 1 [18874368.0] [18874368.0] 40
Matmul tt_dnn_device 215 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20395415169 20395439939 24770 4661009055402 4661009174774 636 119372 104624 64173 104589 82332 1795 104617 101108 102954 103086 103307 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] 1632 1032 2616 2004 3544 0 9216 9216 1 [2048.0; 341.3333435058594] [341.3333435058594] 50
ConcatenateHeads tt_dnn_device 216 0 {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} 56 20395468239 20395472269 4030 4661009135824 4661009200745 704 64921 25271 11706 25127 20981 1566 25270 24829 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED [] [] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] 720 904 0 0 0 0 1 1 1 [3145728.0] [3145728.0] 40
Matmul tt_dnn_device 217 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20395504019 20395508789 4770 4661009188218 4661009289794 711 101576 88318 69010 87973 76211 1278 88318 52335 62983 62784 63401 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 1024 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] 2540 1324 2304 2220 4552 0 24576 24576 7570 [128.0; 85.33333587646484] [256.0] 60
BinaryDeviceOperation tt_dnn_device 218 0 {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} HiFi4 56 20395521820 20395528220 6400 4661009271951 4661009339834 650 67883 49407 40854 49294 45803 1697 49406 48882 48971 48131 48713 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] ['eltwise_binary_kernel/5705211043871595155/'] ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] 712 932 1312 716 1588 0 1456 1456 1 [4321.05517578125; 4321.05517578125] [4321.05517578125] 50
LayerNorm tt_dnn_device 219 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20395567920 20395593810 25890 4661009332700 4661009430573 626 97873 90115 40327 89771 73762 2312 90115 64989 88750 88496 89106 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/17805569453407806092/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] 740 1852 4300 6312 6956 0 1 1 1 [6291456.0] [6291456.0] 50
Matmul tt_dnn_device 220 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} HiFi2 56 20395639701 20395646911 7210 4661009382291 4661009942139 646 559848 510910 507786 510796 509553 1730 510908 431379 508878 510036 510052 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] 2904 1320 3116 2804 3288 0 196608 196608 30281 [32.0; 42.66666793823242] [64.0] 120
Matmul tt_dnn_device 221 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20395681741 20395706781 25040 4661009940583 4661010195396 699 254813 252557 229721 251314 238248 1721 252556 216459 227368 227167 227813 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 4096 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] 2488 1300 2324 2224 4556 0 98304 98304 30281 [128.0; 85.33333587646484] [64.0] 60
LayerNorm tt_dnn_device 222 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20395719881 20395726502 6621 4661010174029 4661010319181 624 145152 123175 67972 123047 99163 2263 123168 94410 121416 121166 121752 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/16918772478690167680/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] 740 2164 4776 7472 7912 0 1 1 1 [6291456.0] [6291456.0] 40
Matmul tt_dnn_device 223 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20395791992 20395800522 8530 4661010265398 4661010792080 617 526682 472265 458638 472168 467857 1781 472265 396111 446772 446919 446820 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] 2856 1328 2340 2224 2876 0 73728 73728 22710 [85.33333587646484; 85.33333587646484] [128.0] 140
NlpCreateHeadsDeviceOperation tt_dnn_device 224 0 {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} HiFi4 56 20395816132 20395841053 24921 4661010779907 4661010867834 650 87927 75116 35010 75062 61792 2059 75098 74274 62541 61847 62637 8 1 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] 1356 1248 1080 652 1496 0 1 1 1 [] [] 90
Matmul tt_dnn_device 225 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20395876273 20395901833 25560 4661010829221 4661010999708 698 170487 131176 53312 131163 96323 1739 131176 83214 123679 123846 123934 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] 1632 1032 2384 2024 3232 0 9216 9216 1 [341.3333435058594; 341.3333435058594] [2048.0] 120
Softmax tt_dnn_device 226 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} HiFi4 56 20395911333 20395918713 7380 4661010923378 4661011211797 657 288419 211433 196418 211318 203663 1919 211432 197893 210359 210493 210755 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 1 32 384 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] ['softmax/14128336124697917683/'] ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] 1128 1576 3344 2468 3164 0 1 1 1 [18874368.0] [18874368.0] 40
Matmul tt_dnn_device 227 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20395928253 20395932353 4100 4661011198277 4661011315673 631 117396 103236 66644 103202 81881 1753 103236 99685 101589 101735 101959 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] 1632 1032 2616 2004 3544 0 9216 9216 1 [2048.0; 341.3333435058594] [341.3333435058594] 50
ConcatenateHeads tt_dnn_device 228 0 {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} 56 20395940764 20395966034 25270 4661011280622 4661011341507 716 60885 25131 11757 24894 20840 1553 25130 24388 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED [] [] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] 720 904 0 0 0 0 1 1 1 [3145728.0] [3145728.0] 50
Matmul tt_dnn_device 229 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20395997204 20396002504 5300 4661011329167 4661011432312 696 103145 90092 66459 90008 76088 1291 90091 52208 63026 62835 63451 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 1024 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] 2540 1324 2304 2220 4552 0 24576 24576 7570 [128.0; 85.33333587646484] [256.0] 40
BinaryDeviceOperation tt_dnn_device 230 0 {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} HiFi4 56 20396036584 20396061775 25191 4661011410259 4661011482440 640 72181 49476 39682 49365 46001 1694 49474 48863 48931 48038 48662 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] ['eltwise_binary_kernel/5705211043871595155/'] ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] 712 932 1312 716 1588 0 1456 1456 1 [4321.05517578125; 4321.05517578125] [4321.05517578125] 70
LayerNorm tt_dnn_device 231 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20396094975 20396101095 6120 4661011474128 4661011574541 648 100413 91482 50346 90374 75146 2283 91475 66305 89723 89471 90062 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/17805569453407806092/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] 740 1852 4300 6312 6956 0 1 1 1 [6291456.0] [6291456.0] 40
Matmul tt_dnn_device 232 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} HiFi2 56 20396124325 20396130475 6150 4661011534892 4661012085201 642 550309 509999 507139 509907 508574 1763 509999 429476 507972 509118 509132 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] 2904 1320 3116 2804 3288 0 196608 196608 30281 [32.0; 42.66666793823242] [64.0] 60
Matmul tt_dnn_device 233 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20396185756 20396191036 5280 4661012083832 4661012335217 702 251385 249315 227841 249007 236330 1673 249304 214615 225394 225185 225825 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 4096 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] 2488 1300 2324 2224 4556 0 98304 98304 30281 [128.0; 85.33333587646484] [64.0] 40
LayerNorm tt_dnn_device 234 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20396203726 20396230406 26680 4661012315383 4661012458848 625 143465 123020 82824 122957 102537 2291 123020 94457 121537 121294 121879 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/16918772478690167680/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] 740 2164 4776 7472 7912 0 1 1 1 [6291456.0] [6291456.0] 50
Matmul tt_dnn_device 235 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20396291477 20396298137 6660 4661012420087 4661012931191 675 511104 471651 457907 471553 467181 1717 471651 395636 446171 446309 446209 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] 2856 1328 2340 2224 2876 0 73728 73728 22710 [85.33333587646484; 85.33333587646484] [128.0] 50
NlpCreateHeadsDeviceOperation tt_dnn_device 236 0 {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} HiFi4 56 20396312487 20396316827 4340 4661012918935 4661013007180 640 88245 75365 35094 75240 61905 2055 75357 74759 62525 61899 63050 8 1 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] 1356 1248 1080 652 1496 0 1 1 1 [] [] 30
Matmul tt_dnn_device 237 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20396331997 20396357588 25591 4661012968396 4661013138553 612 170157 130758 52778 130749 97467 1792 130756 85869 123906 124055 124159 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] 1632 1032 2384 2024 3232 0 9216 9216 1 [341.3333435058594; 341.3333435058594] [2048.0] 50
Softmax tt_dnn_device 238 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} HiFi4 56 20396385828 20396393248 7420 4661013062077 4661013354348 643 292271 215155 196909 215025 206147 1971 215155 201617 214135 214246 214523 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 1 32 384 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] ['softmax/14128336124697917683/'] ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] 1128 1576 3344 2468 3164 0 1 1 1 [18874368.0] [18874368.0] 50
Matmul tt_dnn_device 239 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20396441778 20396446038 4260 4661013337600 4661013457348 620 119748 102373 58757 102335 78505 1782 102373 98947 100758 100901 101125 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] 1632 1032 2616 2004 3544 0 9216 9216 1 [2048.0; 341.3333435058594] [341.3333435058594] 50
ConcatenateHeads tt_dnn_device 240 0 {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} 56 20396474229 20396478349 4120 4661013415261 4661013483473 700 68212 25437 11706 25084 20989 1567 25436 24551 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED [] [] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] 720 904 0 0 0 0 1 1 1 [3145728.0] [3145728.0] 40
Matmul tt_dnn_device 241 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20396491409 20396496359 4950 4661013470790 4661013573376 691 102586 89195 66522 89108 75884 1285 89195 52589 62848 62628 63250 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 1024 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] 2540 1324 2304 2220 4552 0 24576 24576 7570 [128.0; 85.33333587646484] [256.0] 50
BinaryDeviceOperation tt_dnn_device 242 0 {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} HiFi4 56 20396509239 20396514769 5530 4661013552583 4661013623172 636 70589 49172 42445 49091 45844 1690 49164 48559 48638 47702 48284 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] ['eltwise_binary_kernel/5705211043871595155/'] ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] 712 932 1312 716 1588 0 1456 1456 1 [4321.05517578125; 4321.05517578125] [4321.05517578125] 50
LayerNorm tt_dnn_device 243 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20396527659 20396556599 28940 4661013617948 4661013714249 671 96301 90411 44901 89426 74768 2264 90411 65311 88962 88699 89317 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/17805569453407806092/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] 740 1852 4300 6312 6956 0 1 1 1 [6291456.0] [6291456.0] 60
Matmul tt_dnn_device 244 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} HiFi2 56 20396650120 20396660200 10080 4661013670202 4661014224647 616 554445 509767 507073 509680 508463 1789 509766 430154 507806 508969 508975 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] 2904 1320 3116 2804 3288 0 196608 196608 30281 [32.0; 42.66666793823242] [64.0] 200
Matmul tt_dnn_device 245 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20396706761 20396716001 9240 4661014223593 4661014475820 664 252227 250510 227892 249836 236791 1711 250498 214770 225639 225449 226067 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 4096 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] 2488 1300 2324 2224 4556 0 98304 98304 30281 [128.0; 85.33333587646484] [64.0] 150
LayerNorm tt_dnn_device 246 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20396734741 20396765471 30730 4661014454673 4661014599738 627 145065 123304 65519 123249 102554 2267 123304 94695 121730 121471 122059 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/16918772478690167680/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] 740 2164 4776 7472 7912 0 1 1 1 [6291456.0] [6291456.0] 50
Matmul tt_dnn_device 247 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20396809322 20396899153 89831 4661014543373 4661015072481 675 529108 472054 459291 471957 468582 1772 472053 395476 446072 446218 446118 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] 2856 1328 2340 2224 2876 0 73728 73728 22710 [85.33333587646484; 85.33333587646484] [128.0] 90
NlpCreateHeadsDeviceOperation tt_dnn_device 248 0 {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} HiFi4 56 20396937153 20396962653 25500 4661015061209 4661015149263 635 88054 76143 35454 75543 62194 2048 76133 75563 64201 63508 64227 8 1 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] 1356 1248 1080 652 1496 0 1 1 1 [] [] 80
Matmul tt_dnn_device 249 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20397019064 20397024954 5890 4661015110233 4661015280446 632 170213 130555 52715 130544 97516 1805 130555 82289 123835 123977 124083 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] 1632 1032 2384 2024 3232 0 9216 9216 1 [341.3333435058594; 341.3333435058594] [2048.0] 50
Softmax tt_dnn_device 250 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} HiFi4 56 20397034224 20397042724 8500 4661015204145 4661015494304 644 290159 213226 197253 213110 205575 1930 213213 199642 212004 212098 212387 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 1 32 384 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] ['softmax/14128336124697917683/'] ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] 1128 1576 3344 2468 3164 0 1 1 1 [18874368.0] [18874368.0] 40
Matmul tt_dnn_device 251 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20397072864 20397077564 4700 4661015479829 4661015595825 629 115996 100888 60503 100853 79122 1800 100888 97412 99354 99488 99715 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] 1632 1032 2616 2004 3544 0 9216 9216 1 [2048.0; 341.3333435058594] [341.3333435058594] 40
ConcatenateHeads tt_dnn_device 252 0 {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} 56 20397086284 20397090604 4320 4661015556984 4661015621663 613 64679 25227 11824 25156 20940 1635 25226 24697 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED [] [] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] 720 904 0 0 0 0 1 1 1 [3145728.0] [3145728.0] 80
Matmul tt_dnn_device 253 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20397122625 20397127925 5300 4661015609341 4661015711908 684 102567 89549 66177 89393 75933 1309 89535 52235 62830 62599 63238 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 1024 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] 2540 1324 2304 2220 4552 0 24576 24576 7570 [128.0; 85.33333587646484] [256.0] 50
BinaryDeviceOperation tt_dnn_device 254 0 {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} HiFi4 56 20397161995 20397187485 25490 4661015690449 4661015761821 634 71372 49290 40613 49183 45569 1696 49284 48659 48746 47693 48344 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] ['eltwise_binary_kernel/5705211043871595155/'] ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] 712 932 1312 716 1588 0 1456 1456 1 [4321.05517578125; 4321.05517578125] [4321.05517578125] 40
LayerNorm tt_dnn_device 255 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20397220116 20397226786 6670 4661015754612 4661015852916 643 98304 90452 43015 90179 74112 2301 90447 65413 88937 88679 89290 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/17805569453407806092/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] 740 1852 4300 6312 6956 0 1 1 1 [6291456.0] [6291456.0] 40
Matmul tt_dnn_device 256 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} HiFi2 56 20397271716 20397278966 7250 4661015806941 4661016362837 609 555896 509298 506305 509283 508052 1810 509296 429834 507357 508506 508514 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] 2904 1320 3116 2804 3288 0 196608 196608 30281 [32.0; 42.66666793823242] [64.0] 50
Matmul tt_dnn_device 257 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20397313717 20397318947 5230 4661016361716 4661016616165 664 254449 252664 228639 252040 237736 1760 252664 215931 226766 226566 227199 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 4096 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] 2488 1300 2324 2224 4556 0 98304 98304 30281 [128.0; 85.33333587646484] [64.0] 90
LayerNorm tt_dnn_device 258 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20397351917 20397377237 25320 4661016593914 4661016739469 636 145555 122686 60559 122585 100324 2277 122681 94280 121302 121054 121646 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/16918772478690167680/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] 740 2164 4776 7472 7912 0 1 1 1 [6291456.0] [6291456.0] 18970
Matmul tt_dnn_device 259 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20397417528 20397423878 6350 4661016678762 4661017211540 627 532778 471430 457474 471330 467167 1774 471430 395502 445758 445918 445803 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] 2856 1328 2340 2224 2876 0 73728 73728 22710 [85.33333587646484; 85.33333587646484] [128.0] 50
NlpCreateHeadsDeviceOperation tt_dnn_device 260 0 {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} HiFi4 56 20397459328 20397464158 4830 4661017198998 4661017287707 647 88709 75525 34790 75460 61971 2040 75516 74451 62053 61328 61986 8 1 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] 1356 1248 1080 652 1496 0 1 1 1 [] [] 100
Matmul tt_dnn_device 261 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20397478898 20397503538 24640 4661017248485 4661017416763 694 168278 128367 51526 128359 96105 1746 128365 82337 122404 122536 122641 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] 1632 1032 2384 2024 3232 0 9216 9216 1 [341.3333435058594; 341.3333435058594] [2048.0] 40
Softmax tt_dnn_device 262 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} HiFi4 56 20397512908 20397540019 27111 4661017341448 4661017630338 638 288890 212942 196803 212822 204683 1939 212926 199415 211619 211739 212000 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 1 32 384 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] ['softmax/14128336124697917683/'] ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] 1128 1576 3344 2468 3164 0 1 1 1 [18874368.0] [18874368.0] 70
Matmul tt_dnn_device 263 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20397600199 20397605769 5570 4661017615609 4661017732325 609 116716 101367 62837 101346 79871 1772 101367 97881 99814 99950 100171 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] 1632 1032 2616 2004 3544 0 9216 9216 1 [2048.0; 341.3333435058594] [341.3333435058594] 60
ConcatenateHeads tt_dnn_device 264 0 {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} 56 20397615629 20397622840 7211 4661017696120 4661017758430 646 62310 25469 11647 25375 20874 1661 25469 24471 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED [] [] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] 720 904 0 0 0 0 1 1 1 [3145728.0] [3145728.0] 30
Matmul tt_dnn_device 265 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20397635360 20397661160 25800 4661017745647 4661017848357 628 102710 89281 66603 89185 75720 1349 89281 52237 62585 62395 63029 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 1024 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] 2540 1324 2304 2220 4552 0 24576 24576 7570 [128.0; 85.33333587646484] [256.0] 60
BinaryDeviceOperation tt_dnn_device 266 0 {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} HiFi4 56 20397675540 20397681630 6090 4661017827408 4661017897677 649 70269 48647 40596 48555 45419 1703 48644 48035 48111 47153 47749 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] ['eltwise_binary_kernel/5705211043871595155/'] ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] 712 932 1312 716 1588 0 1456 1456 1 [4321.05517578125; 4321.05517578125] [4321.05517578125] 60
LayerNorm tt_dnn_device 267 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20397715950 20397722430 6480 4661017892073 4661017990608 671 98535 92303 43118 91950 75779 2268 92301 67214 90920 90659 91262 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/17805569453407806092/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] 740 1852 4300 6312 6956 0 1 1 1 [6291456.0] [6291456.0] 40
Matmul tt_dnn_device 268 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} HiFi2 56 20397807411 20397815051 7640 4661017942913 4661018501283 622 558370 510037 507084 509950 508381 1759 510036 429072 508058 509210 509215 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] 2904 1320 3116 2804 3288 0 196608 196608 30281 [32.0; 42.66666793823242] [64.0] 150
Matmul tt_dnn_device 269 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20397827941 20397853542 25601 4661018500017 4661018751618 631 251601 249708 227997 249544 236464 1761 249707 214372 225325 225122 225746 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 4096 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] 2488 1300 2324 2224 4556 0 98304 98304 30281 [128.0; 85.33333587646484] [64.0] 50
LayerNorm tt_dnn_device 270 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20397866652 20397873802 7150 4661018731437 4661018876518 638 145081 124273 74836 124235 103142 2278 124268 95794 122831 122596 123180 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/16918772478690167680/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] 740 2164 4776 7472 7912 0 1 1 1 [6291456.0] [6291456.0] 60
Matmul tt_dnn_device 271 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20397916272 20397942943 26671 4661018828533 4661019349349 699 520816 472119 458625 472025 467790 1665 472118 395877 446538 446674 446575 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] 2856 1328 2340 2224 2876 0 73728 73728 22710 [85.33333587646484; 85.33333587646484] [128.0] 70
NlpCreateHeadsDeviceOperation tt_dnn_device 272 0 {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} HiFi4 56 20397997753 20398002753 5000 4661019337320 4661019425588 653 88268 75595 35005 75531 62008 2022 75587 74677 62109 61409 62648 8 1 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] 1356 1248 1080 652 1496 0 1 1 1 [] [] 60
Matmul tt_dnn_device 273 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20398018263 20398024003 5740 4661019386465 4661019556113 696 169648 129828 52752 129821 97714 1717 129826 83604 122624 122748 122856 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] 1632 1032 2384 2024 3232 0 9216 9216 1 [341.3333435058594; 341.3333435058594] [2048.0] 70
Softmax tt_dnn_device 274 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} HiFi4 56 20398054374 20398061824 7450 4661019480559 4661019768816 635 288257 212076 196811 211959 204411 1943 212076 198606 211094 211196 211474 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 1 32 384 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] ['softmax/14128336124697917683/'] ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] 1128 1576 3344 2468 3164 0 1 1 1 [18874368.0] [18874368.0] 50
Matmul tt_dnn_device 275 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20398071804 20398096184 24380 4661019755005 4661019873356 604 118351 103929 60985 103896 82202 1796 103929 100217 102160 102280 102504 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] 1632 1032 2616 2004 3544 0 9216 9216 1 [2048.0; 341.3333435058594] [341.3333435058594] 50
ConcatenateHeads tt_dnn_device 276 0 {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} 56 20398105444 20398129074 23630 4661019831952 4661019899407 702 67455 25356 11676 24913 20786 1556 25356 24994 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED [] [] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] 720 904 0 0 0 0 1 1 1 [3145728.0] [3145728.0] 40
Matmul tt_dnn_device 277 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20398160575 20398165885 5310 4661019886766 4661019989464 706 102698 89338 67015 89250 75984 1277 89338 52032 62508 62305 62943 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 1024 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] 2540 1324 2304 2220 4552 0 24576 24576 7570 [128.0; 85.33333587646484] [256.0] 50
BinaryDeviceOperation tt_dnn_device 278 0 {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} HiFi4 56 20398199875 20398206165 6290 4661019968943 4661020038901 629 69958 48819 40826 48720 45381 1720 48808 48205 48292 47337 47917 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] ['eltwise_binary_kernel/5705211043871595155/'] ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] 712 932 1312 716 1588 0 1456 1456 1 [4321.05517578125; 4321.05517578125] [4321.05517578125] 30
LayerNorm tt_dnn_device 279 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20398240025 20398247615 7590 4661020032362 4661020131148 642 98786 91606 49240 91453 75932 2322 91606 66610 90250 90000 90611 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/17805569453407806092/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] 740 1852 4300 6312 6956 0 1 1 1 [6291456.0] [6291456.0] 50
Matmul tt_dnn_device 280 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} HiFi2 56 20398292436 20398299616 7180 4661020090214 4661020643058 620 552844 511276 508150 511186 509696 1779 511276 431106 509289 510443 510451 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] 2904 1320 3116 2804 3288 0 196608 196608 30281 [32.0; 42.66666793823242] [64.0] 50
Matmul tt_dnn_device 281 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20398353756 20398359417 5661 4661020641601 4661020895372 675 253771 251633 228682 251208 237573 1705 251633 215503 226486 226260 226901 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 4096 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] 2488 1300 2324 2224 4556 0 98304 98304 30281 [128.0; 85.33333587646484] [64.0] 130
LayerNorm tt_dnn_device 282 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20398392187 20398398497 6310 4661020873880 4661021019904 651 146024 123902 68356 123793 100340 2247 123902 95364 122449 122194 122780 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/16918772478690167680/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] 740 2164 4776 7472 7912 0 1 1 1 [6291456.0] [6291456.0] 30
Matmul tt_dnn_device 283 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20398440217 20398446767 6550 4661020965793 4661021492544 617 526751 472006 458815 471849 468599 1785 472006 395774 446290 446421 446327 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 3072 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/1964607002770158222/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/10171387204552674729/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/3620545901818543209/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14040558449043769446/'; 'reader_bmm_tile_layout_in0_receiver/3741846892422066342/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/16120621507353413420/'; 'reader_bmm_tile_layout_in0_receiver/8544177351793425910/'] 2856 1328 2340 2224 2876 0 73728 73728 22710 [85.33333587646484; 85.33333587646484] [128.0] 70
NlpCreateHeadsDeviceOperation tt_dnn_device 284 0 {'head_dim': '64'; 'num_kv_heads': '16'; 'num_q_heads': '16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'transpose_k_heads': 'true'} HiFi4 56 20398461658 20398487228 25570 4661021480814 4661021567197 712 86383 73942 35306 73887 61547 1996 73933 73004 61325 60534 61222 8 1 384 3072 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'; 'ttnn/cpp/ttnn/deprecated/tt_dnn/kernels/compute/transpose_wh.cpp'] ['transpose_wh/3178062545855218618/'; 'transpose_wh/10550016288823521500/'] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_create_qkv_heads.cpp'; 'ttnn/cpp/ttnn/operations/experimental/transformer/nlp_create_qkv_heads/device/kernels/dataflow/writer_tm_tile_layout_nlp_create_qkv_heads.cpp'] ['reader_tm_tile_layout_nlp_create_qkv_heads/10655758333870859175/'; 'writer_tm_tile_layout_nlp_create_qkv_heads/6289684493255772102/'] 1356 1248 1080 652 1496 0 1 1 1 [] [] 50
Matmul tt_dnn_device 285 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20398541738 20398546928 5190 4661021530136 4661021697243 692 167107 129353 52811 129340 97287 1769 129353 81456 122454 122600 122698 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 64 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/16305281631831791739/'; 'bmm_large_block_zm_fused_bias_activation/11816330370691404186/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/13017654423404013589/'; 'reader_writer_bmm_tile_layout_in1/954737655012426725/'] 1632 1032 2384 2024 3232 0 9216 9216 1 [341.3333435058594; 341.3333435058594] [2048.0] 50
Softmax tt_dnn_device 286 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'inplace': 'true'; 'is_causal_mask': 'false'; 'is_scale_causal_mask_hw_dims_softmax': 'false'; 'numeric_stable': 'false'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'SoftmaxDefaultProgramConfig()'; 'scale': '0.125'} HiFi4 56 20398556058 20398570809 14751 4661021622230 4661021911070 649 288840 213193 197270 213059 205100 1924 213193 199597 211923 212029 212304 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 1 32 384 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/compute/softmax.cpp'] ['softmax/14128336124697917683/'] ['ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/reader_unary_interleaved_sm.cpp'; 'ttnn/cpp/ttnn/operations/normalization/softmax/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked_sm.cpp'] ['reader_unary_interleaved_sm/14855729956910835846/'; 'writer_unary_interleaved_start_id_blocked_sm/7912822949537755752/'] 1128 1576 3344 2468 3164 0 1 1 1 [18874368.0] [18874368.0] 30
Matmul tt_dnn_device 287 0 {'bcast_batch': '0'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'true'} LoFi 56 20398601459 20398606649 5190 4661021896561 4661022012021 595 115460 100348 60649 100310 79302 1803 100338 96858 98804 98942 99159 8 16 384 384 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14430505022624558712/'; 'bmm_large_block_zm_fused_bias_activation/8499094234426031006/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_writer_bmm_tile_layout_in1.cpp'] ['reader_bmm_tile_layout_in0/9089692411428126113/'; 'reader_writer_bmm_tile_layout_in1/14132425175164457887/'] 1632 1032 2616 2004 3544 0 9216 9216 1 [2048.0; 341.3333435058594] [341.3333435058594] 40
ConcatenateHeads tt_dnn_device 288 0 {'NLPConcatHeadsDeviceOperation::output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'} 56 20398635939 20398640259 4320 4661021973865 4661022038267 715 64402 25533 11645 25460 20968 1546 25532 24653 8 16 384 64 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED [] [] ['ttnn/cpp/ttnn/operations/experimental/transformer/nlp_concat_heads/device/kernels/dataflow/reader_tm_tile_layout_nlp_concat_heads.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_tm_tile_layout_nlp_concat_heads/6960558009113954434/'; 'writer_unary_interleaved_start_id/9569938723224986099/'] 720 904 0 0 0 0 1 1 1 [3145728.0] [3145728.0] 40
Matmul tt_dnn_device 289 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20398652719 20398657839 5120 4661022025475 4661022126789 671 101314 87837 66714 87765 75819 1328 87829 52005 62523 62346 62941 1 8 384 1024 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 1024 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/14734556434023176435/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/6919457500492820611/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9347022393441517796/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/5588764139449550201/'; 'reader_bmm_tile_layout_in0_receiver/2976717304524227257/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/12844204389361958428/'; 'reader_bmm_tile_layout_in0_receiver/5424071778043832743/'] 2540 1324 2304 2220 4552 0 24576 24576 7570 [128.0; 85.33333587646484] [256.0] 50
BinaryDeviceOperation tt_dnn_device 290 0 {'activations': 'std::nullopt'; 'binary_op_type': 'BinaryOpType::ADD'; 'compute_kernel_config': 'std::nullopt'; 'dtype': 'DataType::BFLOAT16'; 'input_tensor_a_activation': 'std::nullopt'; 'memory_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'scalar': 'std::nullopt'; 'worker_grid': '{[(x=0;y=0) - (x=7;y=6)]}'} HiFi4 56 20398690960 20398717550 26590 4661022107503 4661022176415 651 68912 48983 41851 48879 45769 1710 48977 48352 48426 47494 48081 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/compute/eltwise_binary_kernel.cpp'] ['eltwise_binary_kernel/5705211043871595155/'] ['ttnn/cpp/ttnn/operations/eltwise/binary/device/kernels/dataflow/reader_binary_interleaved_start_id.cpp'; 'ttnn/cpp/ttnn/operations/eltwise/unary/device/kernels/dataflow/writer_unary_interleaved_start_id.cpp'] ['reader_binary_interleaved_start_id/11854828065706211116/'; 'writer_unary_interleaved_start_id/10144635222209662953/'] 712 932 1312 716 1588 0 1456 1456 1 [4321.05517578125; 4321.05517578125] [4321.05517578125] 19670
LayerNorm tt_dnn_device 291 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20398750590 20398756990 6400 4661022170743 4661022266939 647 96196 89885 39911 89351 72532 2267 89880 65064 88539 88280 88882 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/17805569453407806092/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/10534817208772924740/'; 'writer_unary_interleaved_start_id_blocked/1043202296705612688/'] 740 1852 4300 6312 6956 0 1 1 1 [6291456.0] [6291456.0] 40
Matmul tt_dnn_device 292 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi2;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT8_B'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'UnaryWithParam(op_type=UnaryOpType::GELU;param={1})'; 'user_run_batched': 'false'} HiFi2 56 20398800131 20398807271 7140 4661022218445 4661022778051 640 559606 510461 506614 510306 508577 1777 510461 430481 508565 509718 509721 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 1024 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 4096 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/8773705861819299649/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/12660377478018547207/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/9149716475445932782/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/6384535104198044166/'; 'reader_bmm_tile_layout_in0_receiver/9484198705064493409/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/3785534505303773027/'; 'reader_bmm_tile_layout_in0_receiver/10772715124978766409/'] 2904 1320 3116 2804 3288 0 196608 196608 30281 [32.0; 42.66666793823242] [64.0] 70
Matmul tt_dnn_device 293 0 {'bcast_batch': '1'; 'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=LoFi;math_approx_mode=0;fp32_dest_acc_en=0;packer_l1_acc=1;dst_full_sync_en=0)'; 'global_cb': 'std::nullopt'; 'output_dtype': 'DataType::BFLOAT16'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'output_tile': 'Tile(tile_shape={32; 32};face_shape={16; 16};num_faces=4)'; 'program_config': 'std::nullopt'; 'transpose_a': 'false'; 'transpose_b': 'false'; 'untilize_out': 'false'; 'user_core_coord': '(x=8;y=8)'; 'user_fused_activation': 'std::nullopt'; 'user_run_batched': 'false'} LoFi 56 20398842311 20398847321 5010 4661022776038 4661023029477 730 253439 250697 227875 250621 237167 1688 250687 215130 226033 225840 226485 1 8 384 4096 TILE BFLOAT8_B DEV_0_L1_INTERLEAVED 1 1 4096 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/matmul/device/kernels/compute/bmm_large_block_zm_fused_bias_activation.cpp'] ['bmm_large_block_zm_fused_bias_activation/13833909087104853045/'] ['ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_sender_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_sender_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in1_receiver_writer_padding.cpp'; 'ttnn/cpp/ttnn/operations/matmul/device/kernels/dataflow/reader_bmm_tile_layout_in0_receiver.cpp'] ['reader_bmm_tile_layout_in0_sender_padding/9136839261552362542/'; 'reader_bmm_tile_layout_in1_sender_writer_padding/17060665106095136048/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/827038339646326823/'; 'reader_bmm_tile_layout_in0_receiver/9304053638197113836/'; 'reader_bmm_tile_layout_in1_receiver_writer_padding/14505295660414690454/'; 'reader_bmm_tile_layout_in0_receiver/12186755499796010595/'] 2488 1300 2324 2224 4556 0 98304 98304 30281 [128.0; 85.33333587646484] [64.0] 40
LayerNorm tt_dnn_device 294 0 {'compute_kernel_config': 'WormholeComputeKernelConfig(math_fidelity=HiFi4;math_approx_mode=1;fp32_dest_acc_en=0;packer_l1_acc=0;dst_full_sync_en=0)'; 'distributed_norm_stage': 'DistributedLayerNormStage::NOT_DISTRIBUTED'; 'dtype': 'std::nullopt'; 'eps': '1e-05'; 'norm_type': 'LayerNormType::LAYERNORM'; 'output_mem_config': 'MemoryConfig(memory_layout=TensorMemoryLayout::INTERLEAVED;buffer_type=BufferType::L1;shard_spec=std::nullopt)'; 'program_config': 'LayerNormDefaultProgramConfig()'} HiFi4 56 20398881501 20398888502 7001 4661023008115 4661023149076 627 140961 118981 60930 118915 97590 2290 118977 90460 117527 117276 117867 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 1 32 1024 TILE BFLOAT16 DEV_0_DRAM_INTERLEAVED 1 8 384 1024 TILE BFLOAT16 DEV_0_L1_INTERLEAVED ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/compute/layernorm.cpp'] ['layernorm/16918772478690167680/'] ['ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/reader_unary_interleaved_ln.cpp'; 'ttnn/cpp/ttnn/operations/normalization/layernorm/device/kernels/dataflow/writer_unary_interleaved_start_id_blocked.cpp'] ['reader_unary_interleaved_ln/13382430486894575066/'; 'writer_unary_interleaved_start_id_blocked/16112736523431464771/'] 740 2164 4776 7472 7912 0 1 1 1 [6291456.0] [6291456.0] 40
(torch) __get__ python_fallback 295 None 20446632075 20446660675 28600
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment