Skip to content

Instantly share code, notes, and snippets.

@zoecarver
Last active April 9, 2026 19:55
Show Gist options
  • Select an option

  • Save zoecarver/513a054a1422dba82606dfcb5fa35786 to your computer and use it in GitHub Desktop.

Select an option

Save zoecarver/513a054a1422dba82606dfcb5fa35786 to your computer and use it in GitHub Desktop.
tt-lang compute (M=8, N=8, K=8, 21ms)
// Phase 1: Zero-fill 64 acc tiles (8x8 output block)
cb_reserve_back(cb2, 64);
for (m = 0; m < 8; m++) // M=8
for (n = 0; n < 8; n += 4) // N=8, subblock of 4
acquire; fill x4; commit; wait; pack x4; release;
cb_push_back(cb2, 64);
// Phase 2: K-block loop (32 iterations, K_BLOCKS=256/8)
for (k = 0; k < 32; k++) { // <-- 32 outer K iterations
cb_wait_front(cb0, 64); // wait 64 A tiles (8x8)
cb_wait_front(cb1, 64); // wait 64 B tiles (8x8)
cb_wait_front(cb2, 64); // wait 64 acc tiles
cb_reserve_back(cb2, 64); // reserve new acc
mm_block_init(cb0, cb1, cb2, 0, 4, 1, 4); // FULL INIT every K block
for (m = 0; m < 8; m++) // M subblocks
for (n = 0; n < 8; n += 4) // N subblocks of 4
acquire;
copy_tile_init(cb2); // RELOAD acc from CB to DST
copy_tile(cb2, ..., 0); // 4 tiles per subblock
copy_tile(cb2, ..., 1);
copy_tile(cb2, ..., 2);
copy_tile(cb2, ..., 3);
mm_block_init_short(...); // re-init after copy_tile
for (k_inner = 0; k_inner < 4; k_inner++) // inner K=4 (DST accum)
matmul_block(cb0, cb1, ...);
commit; wait;
pack_tile x4 to cb2; // pack back to acc CB
release;
cb_push_back(cb2, 64); // push new acc
cb_pop_front(cb2, 64); // pop old acc
cb_pop_front(cb1, 64);
cb_pop_front(cb0, 64);
}
// (writer reads final acc from cb2 via out_dfb.wait)
TTNN compute (6ms)
mm_block_init(cb0, cb1, cb_partials, ...); // ONCE
for (block = 0; block < num_blocks_K; block++) {
cb_wait_front(cb0, in0_block_tiles);
cb_wait_front(cb1, in1_block_tiles);
for (in0_sub = 0; in0_sub < in0_num_subblocks; in0_sub++)
for (in1_sub = 0; in1_sub < in1_num_subblocks; in1_sub++)
acquire;
if (enable_reload) // ONLY on last K block
copy_block_matmul_partials(...); // bulk reload (not per-tile)
for (k = 0; k < in0_block_w; k++) // inner K
matmul_block(cb0, cb1, ...);
if (last_out):
commit; reserve(cb_out); wait;
PACK(pack_reconfig_l1_acc(0)); // disable L1 acc for the final pack to cb_out
pack_tile_block(..., cb_out); // pack directly to output
release; push(cb_out);
else:
commit; reserve(cb_partials); wait;
PACK(pack_reconfig_l1_acc(block==0 ? 0 : 1)); // L1 ACC!
pack_tile_block(..., cb_partials); // accumulate in-place in L1
release; push(cb_partials);
// With L1 ACC: just advance the cb_partials FIFO pointer (wait + pop); no actual reload is needed
if (block < num_blocks_K - 2)
cb_wait_front(cb_partials); cb_pop_front(cb_partials);
if (block == num_blocks_K - 2)
enable_reload = true; // reload only on the very last K block
cb_pop_front(cb0); cb_pop_front(cb1);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment