Last active
April 9, 2026 19:55
-
-
Save zoecarver/513a054a1422dba82606dfcb5fa35786 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| tt-lang compute (M=8, N=8, K=8, 21ms) | |
| // Phase 1: Zero-fill 64 acc tiles (8x8 output block) | |
| cb_reserve_back(cb2, 64); | |
| for (m = 0; m < 8; m++) // M=8 | |
| for (n = 0; n < 8; n += 4) // N=8, subblock of 4 | |
| acquire; fill x4; commit; wait; pack x4; release; | |
| cb_push_back(cb2, 64); | |
| // Phase 2: K-block loop (32 iterations, K_BLOCKS=256/8) | |
| for (k = 0; k < 32; k++) { // <-- 32 outer K iterations | |
| cb_wait_front(cb0, 64); // wait 64 A tiles (8x8) | |
| cb_wait_front(cb1, 64); // wait 64 B tiles (8x8) | |
| cb_wait_front(cb2, 64); // wait 64 acc tiles | |
| cb_reserve_back(cb2, 64); // reserve new acc | |
| mm_block_init(cb0, cb1, cb2, 0, 4, 1, 4); // FULL INIT every K block | |
| for (m = 0; m < 8; m++) // M subblocks | |
| for (n = 0; n < 8; n += 4) // N subblocks of 4 | |
| acquire; | |
| copy_tile_init(cb2); // RELOAD acc from CB to DST | |
| copy_tile(cb2, ..., 0); // 4 tiles per subblock | |
| copy_tile(cb2, ..., 1); | |
| copy_tile(cb2, ..., 2); | |
| copy_tile(cb2, ..., 3); | |
| mm_block_init_short(...); // re-init after copy_tile | |
| for (k_inner = 0; k_inner < 4; k_inner++) // inner K=4 (DST accum) | |
| matmul_block(cb0, cb1, ...); | |
| commit; wait; | |
| pack_tile x4 to cb2; // pack back to acc CB | |
| release; | |
| cb_push_back(cb2, 64); // push new acc | |
| cb_pop_front(cb2, 64); // pop old acc | |
| cb_pop_front(cb1, 64); | |
| cb_pop_front(cb0, 64); | |
| } | |
| // After the K loop, the writer kernel reads the final acc tiles from cb2 (via out_dfb.wait) | |
| TTNN compute (6ms) | |
| mm_block_init(cb0, cb1, cb_partials, ...); // ONCE | |
| for (block = 0; block < num_blocks_K; block++) { | |
| cb_wait_front(cb0, in0_block_tiles); | |
| cb_wait_front(cb1, in1_block_tiles); | |
| for (in0_sub = 0; in0_sub < in0_num_subblocks; in0_sub++) | |
| for (in1_sub = 0; in1_sub < in1_num_subblocks; in1_sub++) | |
| acquire; | |
| if (enable_reload) // ONLY on last K block | |
| copy_block_matmul_partials(...); // bulk reload (not per-tile) | |
| for (k = 0; k < in0_block_w; k++) // inner K | |
| matmul_block(cb0, cb1, ...); | |
| if (last_out): | |
| commit; reserve(cb_out); wait; | |
| PACK(pack_reconfig_l1_acc(0)); // disable L1 acc for final | |
| pack_tile_block(..., cb_out); // pack directly to output | |
| release; push(cb_out); | |
| else: | |
| commit; reserve(cb_partials); wait; | |
| PACK(pack_reconfig_l1_acc(block==0 ? 0 : 1)); // L1 ACC! | |
| pack_tile_block(..., cb_partials); // accumulate in-place in L1 | |
| release; push(cb_partials); | |
| // With L1 ACC: just advance FIFO pointer, no actual reload needed | |
| if (block < num_blocks_K - 2) | |
| { cb_wait_front(cb_partials); cb_pop_front(cb_partials); } // both guarded by the if above | |
| if (block == num_blocks_K - 2) | |
| enable_reload = true; // reload only on very last | |
| cb_pop_front(cb0); cb_pop_front(cb1); | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment