Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save Munksgaard/1ab28e296d7708cdc66087433a8b4efb to your computer and use it in GitHub Desktop.
-- mat_13766 : [m_13764][m_13765]f32@@mat_mem_19100->{base: [m_13764, m_13765]; contiguous: True; LMADs: [{offset: 0i32; strides: [m_13765, 1i32]; rotates: [0i32, 0i32]; shape: [m_13764, m_13765]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
entry {[?0][?1]f32@?2->{base: [?0, ?1]; contiguous: True; LMADs: [{offset: 0i32;
strides: [?1, 1i32];
rotates: [0i32, 0i32];
shape: [?0, ?1];
permutation: [0, 1];
monotonicity: [Inc, Inc]}]}}
main (mem mat_mem_19100, i32 m_13764, i32 m_13765,
[m_13764][m_13765]f32 mat_13766) = {
let {bool dim_match_13767} = eq_i32(m_13764, m_13765)
let {cert empty_or_match_cert_13768} =
assert(dim_match_13767, "function arguments of wrong shape",
"lud.fut:108:1-186:39")
-- mat_13769 aliases mat_13766
-- mat_13769 : [m_13764][m_13764]f32@@mat_mem_19100->{base: [m_13764, m_13764]; contiguous: True; LMADs: [{offset: 0i32; strides: [m_13765, 1i32]; rotates: [0i32, 0i32]; shape: [m_13764, m_13764]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[m_13764][m_13764]f32 mat_13769} =
<empty_or_match_cert_13768>
reshape((~m_13764, ~m_13764), mat_13766)
let {i32 x_13770} = add32(16i32, m_13764)
let {i32 x_13771} = sub32(x_13770, 1i32)
let {i32 num_blocks_13772} = sdiv32(x_13771, 16i32)
let {i32 n_13773} = mul32(16i32, num_blocks_13772)
let {i32 padding_13774} = sub32(n_13773, m_13764)
let {bool cond_13775} = eq_i32(padding_13774, 0i32)
let {bool cond_13776} = not cond_13775
let {i32 conc_tmp_13777} = add32(m_13764, padding_13774)
let {i32 size_13778} =
-- Branch returns: {i32}
if cond_13776
then {conc_tmp_13777} else {n_13773}
let {i64 binop_x_19102} = sext i32 padding_13774 to i64
let {i64 binop_y_19103} = sext i32 n_13773 to i64
let {i64 binop_x_19104} = mul64(binop_x_19102, binop_y_19103)
let {i64 bytes_19101} = mul64(4i64, binop_x_19104)
let {i64 binop_x_19107} = sext i32 m_13764 to i64
let {i64 binop_x_19109} = mul64(binop_x_19102, binop_x_19107)
let {i64 bytes_19106} = mul64(4i64, binop_x_19109)
let {i32 mat_ixfn_19121} =
-- Branch returns: {i32}
if cond_13776
then {n_13773} else {m_13765}
-- mat_mem_19122 aliases mat_mem_19100
-- mat_13779 aliases mat_13766
-- mat_13779 : [size_13778][n_13773]f32@@mat_mem_19122->{base: [size_13778, n_13773]; contiguous: True; LMADs: [{offset: 0i32; strides: [mat_ixfn_19121, 1i32]; rotates: [0i32, 0i32]; shape: [size_13778, n_13773]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {mem mat_mem_19122;
[size_13778][n_13773]f32 mat_13779} =
-- Branch returns: {[size_13778][n_13773]f32@?0->{base: [size_13778, n_13773];
-- contiguous: True;
-- LMADs: [{offset: 0i32;
-- strides: [mat_ixfn_19121, 1i32];
-- rotates: [0i32, 0i32];
-- shape: [size_13778, n_13773];
-- permutation: [0, 1];
-- monotonicity: [Inc, Inc]}]}}
if cond_13776
then {
let {bool bounds_invalid_upwards_13780} = slt32(padding_13774, 0i32)
let {bool valid_13781} = not bounds_invalid_upwards_13780
let {cert range_valid_c_13782} =
assert(valid_13781, "Range ", 0i32, "..", 1i32, "..<", padding_13774,
" is invalid.", "/prelude/math.fut:453:23-30")
let {mem mem_19105} =
alloc(bytes_19101)
-- res_13783 : [padding_13774][n_13773]f32@@mem_19105->{base: [padding_13774, n_13773]; contiguous: True; LMADs: [{offset: 0i32; strides: [n_13773, 1i32]; rotates: [0i32, 0i32]; shape: [padding_13774, n_13773]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[padding_13774][n_13773]f32 res_13783} = replicate([padding_13774,
n_13773], 0.0f32)
let {cert range_valid_c_13784} =
assert(valid_13781, "Range ", 0i32, "..", 1i32, "..<", padding_13774,
" is invalid.", "/prelude/math.fut:453:23-30")
let {bool dim_match_13786} = eq_i32(n_13773, conc_tmp_13777)
let {cert empty_or_match_cert_13787} =
assert(dim_match_13786, "Value of (core language) shape (",
conc_tmp_13777,
") cannot match shape of type `[", n_13773,
"]a`.", "lud.fut:103:3-36")
let {mem mem_19110} =
alloc(bytes_19106)
-- res_repd_14195 : [m_13764][padding_13774]f32@@mem_19110->{base: [m_13764, padding_13774]; contiguous: True; LMADs: [{offset: 0i32; strides: [padding_13774, 1i32]; rotates: [0i32, 0i32]; shape: [m_13764, padding_13774]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[m_13764][padding_13774]f32 res_repd_14195} = replicate([m_13764,
padding_13774],
0.0f32)
let {i64 binop_y_19113} =
<range_valid_c_13784>
sext i32 conc_tmp_13777 to i64
let {i64 binop_x_19114} =
<range_valid_c_13784>
mul64(binop_x_19107, binop_y_19113)
let {i64 bytes_19111} =
<range_valid_c_13784>
mul64(4i64, binop_x_19114)
let {mem mem_19115} =
<range_valid_c_13784>
alloc(bytes_19111)
-- res_r_14198 : [m_13764][conc_tmp_13777]f32@@mem_19115->{base: [m_13764, conc_tmp_13777]; contiguous: True; LMADs: [{offset: 0i32; strides: [conc_tmp_13777, 1i32]; rotates: [0i32, 0i32]; shape: [m_13764, conc_tmp_13777]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[m_13764][conc_tmp_13777]f32 res_r_14198} =
<range_valid_c_13784>
concat@1(mat_13769, res_repd_14195)
-- res_14199 aliases res_r_14198
-- res_14199 : [m_13764][n_13773]f32@@mem_19115->{base: [m_13764, n_13773]; contiguous: True; LMADs: [{offset: 0i32; strides: [n_13773, 1i32]; rotates: [0i32, 0i32]; shape: [m_13764, n_13773]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[m_13764][n_13773]f32 res_14199} =
<empty_or_match_cert_13787>
reshape((m_13764, n_13773), res_r_14198)
let {i64 binop_x_19119} =
<range_valid_c_13782>
mul64(binop_y_19103, binop_y_19113)
let {i64 bytes_19116} =
<range_valid_c_13782>
mul64(4i64, binop_x_19119)
let {mem mem_19120} =
<range_valid_c_13782>
alloc(bytes_19116)
-- res_13792 : [conc_tmp_13777][n_13773]f32@@mem_19120->{base: [conc_tmp_13777, n_13773]; contiguous: True; LMADs: [{offset: 0i32; strides: [n_13773, 1i32]; rotates: [0i32, 0i32]; shape: [conc_tmp_13777, n_13773]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[conc_tmp_13777][n_13773]f32 res_13792} =
<range_valid_c_13782>
concat@0(res_14199, res_13783)
-- branch_ctx_reshaped_13793 aliases res_13792
-- branch_ctx_reshaped_13793 : [size_13778][n_13773]f32@@mem_19120->{base: [size_13778, n_13773]; contiguous: True; LMADs: [{offset: 0i32; strides: [n_13773, 1i32]; rotates: [0i32, 0i32]; shape: [size_13778, n_13773]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[size_13778][n_13773]f32 branch_ctx_reshaped_13793} =
reshape((~size_13778, ~n_13773), res_13792)
in {mem_19120, branch_ctx_reshaped_13793}
} else {
let {bool dim_match_13794} = eq_i32(n_13773, m_13764)
let {bool match_13795} = logand(dim_match_13794, dim_match_13794)
let {cert empty_or_match_cert_13796} =
assert(match_13795, "Value of (core language) shape (", m_13764, ", ",
m_13764, ") cannot match shape of type `[", n_13773,
"][", n_13773, "]f32`.", "lud.fut:117:20-35")
-- branch_ctx_reshaped_13797 aliases mat_13766
-- branch_ctx_reshaped_13797 : [size_13778][n_13773]f32@@mat_mem_19100->{base: [size_13778, n_13773]; contiguous: True; LMADs: [{offset: 0i32; strides: [m_13765, 1i32]; rotates: [0i32, 0i32]; shape: [size_13778, n_13773]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[size_13778][n_13773]f32 branch_ctx_reshaped_13797} =
<empty_or_match_cert_13768, empty_or_match_cert_13796>
reshape((~size_13778, ~n_13773), mat_13766)
in {mat_mem_19100, branch_ctx_reshaped_13797}
}
let {bool bounds_invalid_upwards_13798} = slt32(num_blocks_13772, 0i32)
let {bool valid_13799} = not bounds_invalid_upwards_13798
let {cert range_valid_c_13800} =
assert(valid_13799, "Range ", 0i32, "..", 1i32, "..<", num_blocks_13772,
" is invalid.", "/prelude/math.fut:453:23-30")
let {cert range_valid_c_13802} =
assert(valid_13799, "Range ", 0i32, "..", 1i32, "..<", num_blocks_13772,
" is invalid.", "/prelude/math.fut:453:23-30")
let {i64 num_blocks_14664} = sext i32 num_blocks_13772 to i64
let {i64 y_14670} = mul64(256i64, num_blocks_14664)
let {i64 nest_size_14671} = mul64(num_blocks_14664, y_14670)
let {i32 segmap_group_size_14672} =
get_size(segmap_group_size_14346, group_size)
let {i64 segmap_group_size_14673} = sext i32 segmap_group_size_14672 to i64
let {i64 y_14674} = sub64(segmap_group_size_14673, 1i64)
let {i64 x_14675} = add64(nest_size_14671, y_14674)
let {i64 segmap_usable_groups_64_14677} =
squot64(x_14675, segmap_group_size_14673)
let {i32 segmap_usable_groups_14678} =
sext i64 segmap_usable_groups_64_14677 to i32
let {i64 binop_x_19126} =
<range_valid_c_13802>
mul64(num_blocks_14664, num_blocks_14664)
let {i64 binop_x_19128} =
<range_valid_c_13802>
mul64(16i64, binop_x_19126)
let {i64 binop_x_19130} =
<range_valid_c_13802>
mul64(16i64, binop_x_19128)
let {i64 bytes_19123} =
<range_valid_c_13802>
mul64(4i64, binop_x_19130)
let {mem mem_19131} =
<range_valid_c_13802>
alloc(bytes_19123)
-- res_14679 : [num_blocks_13772][num_blocks_13772][16i32][16i32]f32@@mem_19131->{base: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (num_blocks_13772), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]}
let {[num_blocks_13772][num_blocks_13772][16i32][16i32]f32 res_14679} =
<range_valid_c_13802>
segmap_thread
(#groups=segmap_usable_groups_14678; groupsize=segmap_group_size_14672)
(gtid_14333 < num_blocks_13772, gtid_14334 < num_blocks_13772,
gtid_14335 < 16i32, gtid_14336 < 16i32) (~phys_tid_14337) : {f32} {
let {i32 index_primexp_18535} = mul32(16i32, gtid_14334)
let {i32 binop_y_18552} = mul32(16i32, gtid_14333)
let {i32 index_primexp_18553} = add32(gtid_14335, binop_y_18552)
let {i32 i_14683} =
<range_valid_c_13802, range_valid_c_13800>
add32(gtid_14336, index_primexp_18535)
let {f32 res_14684} =
<range_valid_c_13802, range_valid_c_13800>
mat_13779[index_primexp_18553, i_14683]
return {returns res_14684}
}
let {i32 x_13817} = sdiv32(n_13773, 16i32)
let {i32 upper_bound_13818} = sub32(x_13817, 1i32)
let {bool loop_nonempty_13819} = slt32(0i32, upper_bound_13818)
let {i32 res_13820} =
-- Branch returns: {i32}
if <fallback> loop_nonempty_13819
then {
let {i32 x_13821} = opaque(1i32)
in {x_13821}
} else {0i32}
let {i32 x_13822} = mul32(16i32, res_13820)
let {bool assert_arg_13823} = eq_i32(x_13822, 16i32)
let {bool loop_not_taken_13824} = not loop_nonempty_13819
let {bool protect_assert_disj_13825} =
logor(assert_arg_13823, loop_not_taken_13824)
let {cert dim_ok_13826} =
assert(protect_assert_disj_13825,
"new shape has different number of elements than old shape",
"/prelude/array.fut:95:3-33")
let {bool y_13827} = slt32(0i32, res_13820)
let {bool protect_assert_disj_13828} = logor(loop_not_taken_13824, y_13827)
let {cert index_certs_13829} =
assert(protect_assert_disj_13828, "Index [", 0i32,
"] out of bounds for array of shape [",
res_13820, "].",
"/prelude/array.fut:15:29-32")
let {i32 max_group_size_14841} =
get_size_max(group_size)
let {bool fits_14842} = sle32(16i32, max_group_size_14841)
let {bool suff_intra_par_14840} =
get_size(suff_intra_par_6, threshold (!suff_outer_par_5)) <= 16i32
let {bool intra_suff_and_fits_14843} =
logand(suff_intra_par_14840, fits_14842)
let {i32 segmap_group_size_15829} =
get_size(segmap_group_size_15648, group_size)
let {i64 res_15435} = sext i32 res_13820 to i64
let {i32 segmap_group_size_15439} =
get_size(segmap_group_size_15195, group_size)
let {i32 segmap_group_size_15472} =
get_size(segmap_group_size_15135, group_size)
let {i32 segmap_group_size_15512} =
get_size(segmap_group_size_15059, group_size)
let {i32 segmap_group_size_15542} =
get_size(segmap_group_size_15002, group_size)
let {i64 nest_size_15438} = mul64(16i64, res_15435)
let {i64 segmap_group_size_15440} = sext i32 segmap_group_size_15439 to i64
let {i64 segmap_group_size_15473} = sext i32 segmap_group_size_15472 to i64
let {i64 segmap_group_size_15513} = sext i32 segmap_group_size_15512 to i64
let {i64 segmap_group_size_15543} = sext i32 segmap_group_size_15542 to i64
let {i64 y_15441} = sub64(segmap_group_size_15440, 1i64)
let {i64 y_15474} = sub64(segmap_group_size_15473, 1i64)
let {i64 y_15514} = sub64(segmap_group_size_15513, 1i64)
let {i64 y_15544} = sub64(segmap_group_size_15543, 1i64)
let {i64 x_15442} = add64(nest_size_15438, y_15441)
let {i64 x_15475} = add64(nest_size_15438, y_15474)
let {i64 x_15515} = add64(nest_size_15438, y_15514)
let {i64 x_15545} = add64(nest_size_15438, y_15544)
let {bool cond_neg_18554} = not intra_suff_and_fits_14843
let {bool protect_cond_conj_18564} =
logand(loop_nonempty_13819, cond_neg_18554)
let {i64 x_18314} =
-- Branch returns: {i64}
if <fallback> protect_cond_conj_18564
then {
let {i64 x_18555} = squot64(x_15442, segmap_group_size_15440)
in {x_18555}
} else {0i64}
let {i64 x_18316} =
-- Branch returns: {i64}
if <fallback> protect_cond_conj_18564
then {
let {i64 x_18557} = squot64(x_15475, segmap_group_size_15473)
in {x_18557}
} else {0i64}
let {i64 x_18318} =
-- Branch returns: {i64}
if <fallback> protect_cond_conj_18564
then {
let {i64 x_18559} = squot64(x_15515, segmap_group_size_15513)
in {x_18559}
} else {0i64}
let {i64 x_18320} =
-- Branch returns: {i64}
if <fallback> protect_cond_conj_18564
then {
let {i64 x_18561} = squot64(x_15545, segmap_group_size_15543)
in {x_18561}
} else {0i64}
let {i32 segmap_usable_groups_15445} = sext i64 x_18314 to i32
let {i32 segmap_usable_groups_15478} = sext i64 x_18316 to i32
let {i32 segmap_usable_groups_15518} = sext i64 x_18318 to i32
let {i32 segmap_usable_groups_15548} = sext i64 x_18320 to i32
let {i32 convop_x_19134} = mul32(256i32, res_13820)
let {i64 binop_x_19135} = sext i32 convop_x_19134 to i64
let {i64 bytes_19133} = mul64(4i64, binop_x_19135)
let {i64 binop_x_19152} = mul64(16i64, nest_size_15438)
let {i64 bytes_19147} = mul64(4i64, binop_x_19152)
let {i32 convop_x_19164} = mul32(16i32, x_13822)
let {i64 binop_x_19165} = sext i32 convop_x_19164 to i64
let {i64 bytes_19162} = mul64(4i64, binop_x_19165)
let {i64 bytes_19167} = mul64(4i64, nest_size_15438)
let {i32 segmap_group_size_15976} =
get_size(segmap_group_size_15932, group_size)
-- matb_13830 : [num_blocks_13772][num_blocks_13772][16i32][16i32]f32@@mem_19131->{base: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (num_blocks_13772), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]}
let {[num_blocks_13772][num_blocks_13772][16i32][16i32]f32 matb_13830} =
-- Consumes res_14679
-- matb_13831 : *[num_blocks_13772][num_blocks_13772][16i32][16i32]f32@@mem_19131->{base: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (num_blocks_13772), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]}
loop {*[num_blocks_13772][num_blocks_13772][16i32][16i32]f32 matb_13831} = {res_14679}
for step_13832:i32 < upper_bound_13818 do {
let {bool y_13834} = slt32(step_13832, num_blocks_13772)
let {bool index_ok_13836} = logand(y_13834, y_13834)
let {cert index_certs_13837} =
assert(index_ok_13836, "Index [", step_13832, ", ", step_13832,
"] out of bounds for array of shape [",
num_blocks_13772, "][", num_blocks_13772, "].",
"lud.fut:141:33-47")
-- lud_diagonal_arg_13838 aliases matb_13831
-- lud_diagonal_arg_13838 : [16i32][16i32]f32@@mem_19131->{base: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; contiguous: True; LMADs: [{offset: add32 (mul32 (step_13832) (mul32 (256i32) (num_blocks_13772))) (mul32 (step_13832) (256i32)); strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[16i32][16i32]f32 lud_diagonal_arg_13838} =
<index_certs_13837>
matb_13831[step_13832, step_13832, 0i32:+16i32*1i32, 0i32:+16i32*1i32]
-- res_13839 aliases lud_diagonal_arg_13838
-- res_13839 : [res_13820][16i32][16i32]f32@@mem_19131->{base: [res_13820, 16i32, 16i32]; contiguous: True; LMADs: [{offset: add32 (mul32 (step_13832) (mul32 (256i32) (num_blocks_13772))) (mul32 (step_13832) (256i32)); strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_13820, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {[res_13820][16i32][16i32]f32 res_13839} =
<dim_ok_13826>
reshape((res_13820, 16i32, 16i32), lud_diagonal_arg_13838)
-- res_13840 : [res_13820][16i32][16i32]f32@@res_mem_19183->{base: [res_13820, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_13820, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {mem res_mem_19183;
[res_13820][16i32][16i32]f32 res_13840} =
-- Branch returns: {[res_13820][16i32][16i32]f32@?0->{base: [res_13820, 16i32,
-- 16i32];
-- contiguous: True;
-- LMADs: [{offset: 0i32;
-- strides: [256i32, 16i32, 1i32];
-- rotates: [0i32, 0i32, 0i32];
-- shape: [res_13820, 16i32, 16i32];
-- permutation: [0, 1, 2];
-- monotonicity: [Inc, Inc, Inc]}]}}
if intra_suff_and_fits_14843
then {
let {mem mem_19136} =
alloc(bytes_19133)
-- res_coalesced_18570 : [res_13820][16i32][16i32]f32@@mem_19136->{base: [16i32, 16i32, res_13820]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (res_13820) (16i32), res_13820, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [16i32, 16i32, res_13820]; permutation: [2, 0, 1]; monotonicity: [Inc, Inc, Inc]}]}
let {[res_13820][16i32][16i32]f32 res_coalesced_18570} = manifest((1,
2,
0),
res_13839)
let {mem mem_19153} =
alloc(bytes_19147)
-- res_14844 : [res_13820][16i32][16i32]f32@@mem_19153->{base: [res_13820, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_13820, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {[res_13820][16i32][16i32]f32 res_14844} =
segmap_group
(#groups=res_13820; groupsize=16i32)
(gtid_14690 < res_13820) (~phys_tid_14733) : {[16i32][16i32]f32} {
-- x_14845 aliases res_coalesced_18570
-- x_14845 : [16i32][16i32]f32@@mem_19136->{base: [16i32, 16i32, res_13820]; contiguous: False; LMADs: [{offset: gtid_14690; strides: [mul32 (res_13820) (16i32), res_13820]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[16i32][16i32]f32 x_14845} = res_coalesced_18570[gtid_14690,
0i32:+16i32*1i32,
0i32:+16i32*1i32]
let {mem@local mem_19141} =
alloc(1024i64, @local)
-- smaller_replicate_14846 : [16i32][16i32]f32@@mem_19141->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[16i32][16i32]f32 smaller_replicate_14846} = copy(x_14845)
let {mem@local mem_19145} =
alloc(64i64, @local)
-- res_14847 : [16i32][16i32]f32@@mem_19141->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[16i32][16i32]f32 res_14847} =
-- Consumes smaller_replicate_14846
-- mat_14848 : *[16i32][16i32]f32@@mem_19141->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
loop {*[16i32][16i32]f32 mat_14848} = {smaller_replicate_14846}
for i_14849:i32 < 15i32 do {
-- res_14851 : [16i32]f32@@mem_19145->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]}
let {[16i32]f32 res_14851} =
segmap_thread
(#groups=res_13820; groupsize=16i32)
(gtid_14694 < 16i32) (~phys_tid_14695) : {f32} {
let {bool cond_14856} = slt32(i_14849, gtid_14694)
let {f32 res_14857} =
-- Branch returns: {f32}
if cond_14856
then {
let {f32 x_14858} = mat_14848[gtid_14694, i_14849]
let {f32 res_14860} =
loop {f32 redout_18909} = {0.0f32}
for i_18910:i32 < i_14849 do {
let {f32 x_14864} = mat_14848[gtid_14694, i_18910]
let {f32 x_14865} = mat_14848[i_18910, i_14849]
let {f32 res_14866} = fmul32(x_14864, x_14865)
let {f32 res_14863} =
fadd32(res_14866, redout_18909)
in {res_14863}
}
let {f32 x_14867} = fsub32(x_14858, res_14860)
let {f32 y_14868} = mat_14848[i_14849, i_14849]
let {f32 res_14869} = fdiv32(x_14867, y_14868)
in {res_14869}
} else {
let {f32 res_14870} = mat_14848[gtid_14694, i_14849]
in {res_14870}
}
return {returns res_14857}
}
-- mat_14871 : [16i32][16i32]f32@@mem_19141->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[16i32][16i32]f32 mat_14871} =
-- Consumes mat_14848
mat_14848 with [0i32:+16i32*1i32, i_14849] <- res_14851
let {i32 j_14872} = add32(1i32, i_14849)
-- mat_14889 : [16i32][16i32]f32@@mem_19141->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[16i32][16i32]f32 mat_14889} =
-- Consumes mat_14871
segmap_thread
(#groups=res_13820; groupsize=16i32)
(gtid_14716 < 16i32) (~phys_tid_14717) : {f32} {
let {bool cond_14875} = slt32(i_14849, gtid_14716)
let {f32 res_14876} =
-- Branch returns: {f32}
if cond_14875
then {
let {f32 x_14877} = mat_14871[j_14872, gtid_14716]
let {f32 res_14880} =
loop {f32 redout_18911} = {0.0f32}
for i_18912:i32 < j_14872 do {
let {f32 x_14884} = mat_14871[i_18912, gtid_14716]
let {f32 x_14885} = mat_14871[j_14872, i_18912]
let {f32 res_14886} = fmul32(x_14884, x_14885)
let {f32 res_14883} =
fadd32(res_14886, redout_18911)
in {res_14883}
}
let {f32 res_14887} = fsub32(x_14877, res_14880)
in {res_14887}
} else {
let {f32 res_14888} = mat_14871[j_14872, gtid_14716]
in {res_14888}
}
return {mat_14871 with ([j_14872 < 16i32,
gtid_14716 < 16i32] <- res_14876)}
}
in {mat_14889}
}
return {returns res_14847}
}
in {mem_19153, res_14844}
} else {
let {mem mem_19160} =
alloc(bytes_19147)
-- smaller_replicate_r_15407 : [res_13820][16i32][16i32]f32@@mem_19160->{base: [res_13820, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_13820, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {[res_13820][16i32][16i32]f32 smaller_replicate_r_15407} =
copy(res_13839)
let {mem mem_19166} =
alloc(bytes_19162)
let {mem mem_19171} =
alloc(bytes_19167)
let {mem mem_19176} =
alloc(bytes_19162)
let {mem mem_19181} =
alloc(bytes_19167)
-- res_15408 : [res_13820][16i32][16i32]f32@@mem_19160->{base: [res_13820, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_13820, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {[res_13820][16i32][16i32]f32 res_15408} =
-- Consumes smaller_replicate_r_15407
-- mat_expanded_15409 : *[res_13820][16i32][16i32]f32@@mem_19160->{base: [res_13820, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_13820, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
loop {*[res_13820][16i32][16i32]f32 mat_expanded_15409} = {smaller_replicate_r_15407}
for i_15410:i32 < 15i32 do {
-- mat_expanded_coalesced_18572 : [res_13820][16i32][16i32]f32@@mem_19166->{base: [16i32, res_13820, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (16i32) (res_13820), 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [16i32, res_13820, 16i32]; permutation: [1, 2, 0]; monotonicity: [Inc, Inc, Inc]}]}
let {[res_13820][16i32][16i32]f32 mat_expanded_coalesced_18572} =
manifest((2, 0, 1), mat_expanded_15409)
-- res_r_15446 : [res_13820][16i32]f32@@mem_19171->{base: [res_13820, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [res_13820, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[res_13820][16i32]f32 res_r_15446} =
segmap_thread
(#groups=segmap_usable_groups_15445; groupsize=segmap_group_size_15439)
(gtid_15188 < res_13820,
gtid_15189 < 16i32) (~phys_tid_15190) : {f32} {
let {bool cond_15452} = slt32(i_15410, gtid_15189)
let {f32 res_15453} =
-- Branch returns: {f32}
if cond_15452
then {
let {f32 x_15454} = mat_expanded_15409[gtid_15188,
gtid_15189,
i_15410]
let {f32 res_15456} =
loop {f32 redout_18913} = {0.0f32}
for i_18914:i32 < i_15410 do {
let {f32 x_15460} =
mat_expanded_coalesced_18572[gtid_15188, gtid_15189,
i_18914]
let {f32 x_15461} = mat_expanded_15409[gtid_15188,
i_18914,
i_15410]
let {f32 res_15462} = fmul32(x_15460, x_15461)
let {f32 res_15459} = fadd32(res_15462, redout_18913)
in {res_15459}
}
let {f32 x_15463} = fsub32(x_15454, res_15456)
let {f32 y_15464} = mat_expanded_15409[gtid_15188,
i_15410, i_15410]
let {f32 res_15465} = fdiv32(x_15463, y_15464)
in {res_15465}
} else {
let {f32 res_15466} = mat_expanded_15409[gtid_15188,
gtid_15189,
i_15410]
in {res_15466}
}
return {returns res_15453}
}
-- mat_r_15479 : [res_13820][16i32][16i32]f32@@mem_19160->{base: [res_13820, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_13820, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {[res_13820][16i32][16i32]f32 mat_r_15479} =
-- Consumes mat_expanded_15409
segmap_thread
(#groups=segmap_usable_groups_15478; groupsize=segmap_group_size_15472)
(gtid_15127 < res_13820,
gtid_slice_15128 < 16i32) (~phys_tid_15130) : {f32} {
let {f32 v_15482} = res_r_15446[gtid_15127, gtid_slice_15128]
return {mat_expanded_15409 with ([gtid_15127 < res_13820,
gtid_slice_15128 < 16i32,
i_15410 < 16i32] <- v_15482)}
}
let {i32 j_15493} = add32(1i32, i_15410)
-- mat_r_coalesced_18575 : [res_13820][16i32][16i32]f32@@mem_19176->{base: [16i32, res_13820, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (16i32) (res_13820), 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [16i32, res_13820, 16i32]; permutation: [1, 2, 0]; monotonicity: [Inc, Inc, Inc]}]}
let {[res_13820][16i32][16i32]f32 mat_r_coalesced_18575} =
manifest((2, 0, 1), mat_r_15479)
-- res_r_15519 : [res_13820][16i32]f32@@mem_19181->{base: [res_13820, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [res_13820, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[res_13820][16i32]f32 res_r_15519} =
segmap_thread
(#groups=segmap_usable_groups_15518; groupsize=segmap_group_size_15512)
(gtid_15052 < res_13820,
gtid_15053 < 16i32) (~phys_tid_15054) : {f32} {
let {bool cond_15523} = slt32(i_15410, gtid_15053)
let {f32 res_15524} =
-- Branch returns: {f32}
if cond_15523
then {
let {f32 x_15525} = mat_r_15479[gtid_15052, j_15493,
gtid_15053]
let {f32 res_15528} =
loop {f32 redout_18915} = {0.0f32}
for i_18916:i32 < j_15493 do {
let {f32 x_15532} = mat_r_15479[gtid_15052, i_18916,
gtid_15053]
let {f32 x_15533} = mat_r_coalesced_18575[gtid_15052,
j_15493,
i_18916]
let {f32 res_15534} = fmul32(x_15532, x_15533)
let {f32 res_15531} = fadd32(res_15534, redout_18915)
in {res_15531}
}
let {f32 res_15535} = fsub32(x_15525, res_15528)
in {res_15535}
} else {
let {f32 res_15536} = mat_r_15479[gtid_15052, j_15493,
gtid_15053]
in {res_15536}
}
return {returns res_15524}
}
-- res_15549 : [res_13820][16i32][16i32]f32@@mem_19160->{base: [res_13820, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_13820, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {[res_13820][16i32][16i32]f32 res_15549} =
-- Consumes mat_r_15479
segmap_thread
(#groups=segmap_usable_groups_15548; groupsize=segmap_group_size_15542)
(gtid_14994 < res_13820,
gtid_slice_14995 < 16i32) (~phys_tid_14997) : {f32} {
let {f32 v_15553} = res_r_15519[gtid_14994, gtid_slice_14995]
return {mat_r_15479 with ([gtid_14994 < res_13820,
j_15493 < 16i32,
gtid_slice_14995 < 16i32] <- v_15553)}
}
in {res_15549}
}
in {mem_19160, res_15408}
}
-- res_13927 aliases res_13840
-- res_13927 : [16i32][16i32]f32@@res_mem_19183->{base: [res_13820, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[16i32][16i32]f32 res_13927} =
<index_certs_13829>
res_13840[0i32, 0i32:+16i32*1i32, 0i32:+16i32*1i32]
let {i32 i_13928} = add32(1i32, step_13832)
let {i32 j_m_i_13929} = sub32(num_blocks_13772, i_13928)
let {bool empty_slice_13930} = eq_i32(j_m_i_13929, 0i32)
let {i32 m_13931} = sub32(j_m_i_13929, 1i32)
let {i32 i_p_m_t_s_13932} = add32(i_13928, m_13931)
let {bool zero_leq_i_p_m_t_s_13933} = sle32(0i32, i_p_m_t_s_13932)
let {bool i_p_m_t_s_leq_w_13934} =
slt32(i_p_m_t_s_13932, num_blocks_13772)
let {bool i_lte_j_13936} = sle32(i_13928, num_blocks_13772)
let {bool y_13938} =
logand(zero_leq_i_p_m_t_s_13933, i_p_m_t_s_leq_w_13934)
let {bool y_13939} = logand(i_lte_j_13936, y_13938)
let {bool ok_or_empty_13941} = logor(empty_slice_13930, y_13939)
let {bool index_ok_13942} = logand(y_13834, ok_or_empty_13941)
let {cert index_certs_13943} =
assert(index_ok_13942, "Index [", step_13832, ", ", i_13928, ":",
num_blocks_13772,
"] out of bounds for array of shape [",
num_blocks_13772, "][", num_blocks_13772, "].",
"lud.fut:146:25-52")
let {i64 j_m_i_15825} = sext i32 j_m_i_13929 to i64
let {i64 nest_size_15828} = mul64(16i64, j_m_i_15825)
let {i32 num_groups_15830} =
calc_num_groups(nest_size_15828, segmap_num_groups_15650,
segmap_group_size_15829)
let {i32 binop_x_19188} = mul32(16i32, j_m_i_13929)
let {i32 convop_x_19189} = mul32(16i32, binop_x_19188)
let {i64 binop_x_19190} = sext i32 convop_x_19189 to i64
let {i64 bytes_19187} = mul64(4i64, binop_x_19190)
let {mem mem_19191} =
alloc(bytes_19187)
-- Per-thread triangular solve. The thread space is
-- (gtid_15641 < j_m_i_13929) x (gtid_15642 < 16); every thread builds one
-- 16-element row in its own 64-byte buffer (mem_19186) and returns it.
-- For i = 0..15 the body computes
--   row[i] = matb_13831[step_13832, i_13928 + gtid_15641, i, gtid_15642]
--            - sum_{k < i} res_13840[0, i, k] * row[k]
-- i.e. forward substitution with an implicit unit diagonal (no division).
-- NOTE(review): res_13840[0] is presumably the already-factored diagonal
-- block - confirm against lud.fut.
-- res_15832 : [j_m_i_13929][16i32][16i32]f32@@mem_19191->{base: [16i32, j_m_i_13929, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (16i32) (j_m_i_13929), 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [16i32, j_m_i_13929, 16i32]; permutation: [1, 2, 0]; monotonicity: [Inc, Inc, Inc]}]}
let {[j_m_i_13929][16i32][16i32]f32 res_15832} =
segmap_thread
(#groups=num_groups_15830; groupsize=segmap_group_size_15829; virtualise)
(gtid_15641 < j_m_i_13929,
gtid_15642 < 16i32) (~phys_tid_15643) : {[16i32]f32} {
-- Thread-private scratch row: 16 x f32 = 64 bytes, zero-initialised.
let {mem@[16i32]f32 mem_19186} =
alloc(64i64, @[16i32]f32)
-- smaller_replicate_18330 : [16i32]f32@@mem_19186->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]}
let {[16i32]f32 smaller_replicate_18330} = replicate([16i32], 0.0f32)
-- Block index into matb_13831 handled by this thread.
let {i32 j_p_i_t_s_18325} = add32(i_13928, gtid_15641)
-- res_15835 : [16i32]f32@@mem_19186->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]}
let {[16i32]f32 res_15835} =
-- Consumes smaller_replicate_18330
-- row_15836 : *[16i32]f32@@mem_19186->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]}
loop {*[16i32]f32 row_15836} = {smaller_replicate_18330}
for i_15837:i32 < 16i32 do {
-- Dot product of the already computed prefix row[0..i) with the first i
-- entries of row i of res_13840[0].
let {f32 sum_15838} =
loop {f32 sum_15839} = {0.0f32}
for k_15840:i32 < i_15837 do {
let {f32 x_15841} =
<index_certs_13829>
res_13840[0i32, i_15837, k_15840]
let {f32 y_15842} = row_15836[k_15840]
let {f32 y_15843} = fmul32(x_15841, y_15842)
let {f32 loopres_15844} = fadd32(sum_15839, y_15843)
in {loopres_15844}
}
let {f32 x_15845} =
<index_certs_13943>
matb_13831[step_13832, j_p_i_t_s_18325, i_15837, gtid_15642]
let {f32 lw_val_15846} = fsub32(x_15845, sum_15838)
-- In-place update: row_15836 is consumed and the result stays in mem_19186.
-- row_15847 : [16i32]f32@@mem_19186->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]}
let {[16i32]f32 row_15847} =
-- Consumes row_15836
row_15836 with [i_15837] <- lw_val_15846
in {row_15847}
}
return {returns res_15835}
}
-- View of res_15832 with its two inner (16 x 16) dimensions swapped. Per the
-- annotations below this aliases res_15832 and stays in mem_19191; only the
-- index permutation differs ([1, 0, 2] here versus [1, 2, 0] for res_15832).
-- res_transformed_13974 aliases res_15832
-- res_transformed_13974 : [j_m_i_13929][16i32][16i32]f32@@mem_19191->{base: [16i32, j_m_i_13929, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (16i32) (j_m_i_13929), 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [16i32, j_m_i_13929, 16i32]; permutation: [1, 0, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {[j_m_i_13929][16i32][16i32]f32 res_transformed_13974} = rearrange((0,
2,
1),
res_15832)
-- Bounds-check certificates for two slices of an array of shape
-- [num_blocks_13772][num_blocks_13772], taken at lud.fut:153 and lud.fut:159.
-- The certificates are attached to the matb_13831 reads further down
-- (<index_certs_13975> and <index_certs_13978>).
-- Slice [i_13928:num_blocks_13772, step_13832].
let {cert index_certs_13975} =
assert(index_ok_13942, "Index [", i_13928, ":", num_blocks_13772, ", ",
step_13832,
"] out of bounds for array of shape [",
num_blocks_13772, "][", num_blocks_13772, "].",
"lud.fut:153:25-52")
-- Both operands are the same variable, so this is just ok_or_empty_13941.
-- Presumably one check per slice dimension, which coincide because both
-- dimensions use the range i_13928:num_blocks_13772 - TODO confirm.
let {bool index_ok_13977} = logand(ok_or_empty_13941, ok_or_empty_13941)
-- Slice [i_13928:num_blocks_13772, i_13928:num_blocks_13772].
let {cert index_certs_13978} =
assert(index_ok_13977, "Index [", i_13928, ":", num_blocks_13772, ", ",
i_13928, ":", num_blocks_13772,
"] out of bounds for array of shape [",
num_blocks_13772, "][", num_blocks_13772, "].",
"lud.fut:159:27-67")
-- Runtime choice between code versions of the following computation.
-- suff_outer_par_15850 guards the first branch of the `if` below: it holds
-- when the tunable threshold suff_outer_par_13 is <= j_m_i_13929.
let {bool suff_outer_par_15850} =
get_size(suff_outer_par_13, threshold ()) <= j_m_i_13929
-- With j = j_m_i_13929 and convop_x_19189 = 256 * j (bound earlier):
--   intra_avail_par_15924      = min(256, 4096 * j, 16, 256 * j)
--   computed_group_size_15853  = max(256, 4096 * j, 16, 256 * j)
-- All comparisons are signed 32-bit (smin32 / smax32).
let {i32 one_intra_par_min_15913} = mul32(4096i32, j_m_i_13929)
let {i32 y_15922} = smin32(16i32, convop_x_19189)
let {i32 y_15923} = smin32(one_intra_par_min_15913, y_15922)
let {i32 intra_avail_par_15924} = smin32(256i32, y_15923)
let {i32 y_15925} = smax32(16i32, convop_x_19189)
let {i32 y_15926} = smax32(one_intra_par_min_15913, y_15925)
-- Used as the groupsize of the segmap_group version further down.
let {i32 computed_group_size_15853} = smax32(256i32, y_15926)
-- The group-level version is only eligible if that group size does not
-- exceed max_group_size_14841.
let {bool fits_16021} =
sle32(computed_group_size_15853, max_group_size_14841)
-- NOTE(review): the threshold(...) argument lists !suff_outer_par_13,
-- presumably the earlier threshold that must have failed - confirm.
let {bool suff_intra_par_16019} =
get_size(suff_intra_par_14,
threshold (!suff_outer_par_13)) <= intra_avail_par_15924
-- Guards the second branch (`if intra_suff_and_fits_16022`) below.
let {bool intra_suff_and_fits_16022} =
logand(suff_intra_par_16019, fits_16021)
-- Byte sizes for the allocations made inside the branches below. Every size
-- is an element count multiplied by 4 (the arrays stored there are f32).
-- bytes_19223 = 4 * (j_m_i_13929 * 256 * j_m_i_13929), computed in i32 and
-- then sign-extended; used for mem_19228.
let {i32 convop_x_19226} = mul32(j_m_i_13929, convop_x_19189)
let {i64 binop_x_19227} = sext i32 convop_x_19226 to i64
let {i64 bytes_19223} = mul64(4i64, binop_x_19227)
-- bytes_19229 = 4 * 256 * j_m_i_13929; used for mem_19232 and mem_19293.
let {i32 convop_x_19230} = mul32(256i32, j_m_i_13929)
let {i64 binop_x_19231} = sext i32 convop_x_19230 to i64
let {i64 bytes_19229} = mul64(4i64, binop_x_19231)
-- bytes_19490 = 4 * 16 * 16 * j_m_i_15825^2, computed entirely in i64.
-- NOTE(review): j_m_i_15825 is used with mul64, so it is presumably the
-- 64-bit counterpart of j_m_i_13929 - confirm at its definition.
let {i64 binop_x_19493} = mul64(j_m_i_15825, j_m_i_15825)
let {i64 binop_x_19495} = mul64(16i64, binop_x_19493)
let {i64 binop_x_19497} = mul64(16i64, binop_x_19495)
let {i64 bytes_19490} = mul64(4i64, binop_x_19497)
-- bytes_19500 = 4 * 16 * nest_size_15828; it backs arrays of shape
-- [j_m_i_13929][16][16]f32 below, so nest_size_15828 is presumably
-- 16 * j_m_i - TODO confirm at its definition earlier in the program.
let {i64 binop_x_19505} = mul64(16i64, nest_size_15828)
let {i64 bytes_19500} = mul64(4i64, binop_x_19505)
-- Launch configuration of the thread-level version below, and the size of
-- mem_19207: one bytes_19500-sized private array per physical thread
-- (the result_18921 annotation offsets by phys_tid_15929 and strides by
-- num_groups_15977 * segmap_group_size_15976).
let {i32 num_groups_15977} =
calc_num_groups(j_m_i_15825, segmap_num_groups_15934,
segmap_group_size_15976)
let {i32 num_threads_19639} =
mul32(segmap_group_size_15976, num_groups_15977)
let {i64 num_threads64_19640} = sext i32 num_threads_19639 to i64
let {i64 total_size_19641} = mul64(bytes_19500, num_threads64_19640)
-- res_13980 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@res_mem_19508->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]}
-- res_13981 : [j_m_i_13929][16i32][16i32]f32@@res_mem_19509->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {mem res_mem_19508, mem res_mem_19509;
[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_13980,
[j_m_i_13929][16i32][16i32]f32 res_13981} =
-- Branch returns: {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32@?0->{base: [j_m_i_13929,
-- j_m_i_13929,
-- 16i32,
-- 16i32];
-- contiguous: True;
-- LMADs: [{offset: 0i32;
-- strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32];
-- rotates: [0i32, 0i32, 0i32, 0i32];
-- shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32];
-- permutation: [0, 1, 2, 3];
-- monotonicity: [Inc, Inc, Inc, Inc]}]},
-- [j_m_i_13929][16i32][16i32]f32@?1->{base: [j_m_i_13929, 16i32,
-- 16i32];
-- contiguous: True;
-- LMADs: [{offset: 0i32;
-- strides: [256i32, 16i32, 1i32];
-- rotates: [0i32, 0i32, 0i32];
-- shape: [j_m_i_13929, 16i32, 16i32];
-- permutation: [0, 1, 2];
-- monotonicity: [Inc, Inc, Inc]}]}}
if suff_outer_par_15850
then {
let {mem mem_19228} =
alloc(bytes_19223)
let {mem mem_19232} =
alloc(bytes_19229)
let {mem mem_19207} =
alloc(total_size_19641)
-- res_15979 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@mem_19228->{base: [j_m_i_13929, 16i32, 16i32, j_m_i_13929]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (mul32 (j_m_i_13929) (16i32)) (16i32), mul32 (j_m_i_13929) (16i32), j_m_i_13929, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32, j_m_i_13929]; permutation: [3, 0, 1, 2]; monotonicity: [Inc, Inc, Inc, Inc]}]}
-- res_15980 : [j_m_i_13929][16i32][16i32]f32@@mem_19232->{base: [16i32, 16i32, j_m_i_13929]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (j_m_i_13929) (16i32), j_m_i_13929, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [16i32, 16i32, j_m_i_13929]; permutation: [2, 0, 1]; monotonicity: [Inc, Inc, Inc]}]}
let {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_15979,
[j_m_i_13929][16i32][16i32]f32 res_15980} =
segmap_thread
(#groups=num_groups_15977; groupsize=segmap_group_size_15976; virtualise)
(gtid_15928 < j_m_i_13929) (~phys_tid_15929) : {[j_m_i_13929][16i32][16i32]f32,
[16i32][16i32]f32} {
let {i32 j_p_i_t_s_18340} = add32(i_13928, gtid_15928)
let {mem@[16i3216i32]f32 mem_19194} =
alloc(1024i64, @[16i3216i32]f32)
-- result_18917 : [16i32][16i32]f32@@mem_19194->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[16i32][16i32]f32 result_18917} = scratch(f32, 16i32, 16i32)
let {mem@[16i32]f32 mem_19197} =
alloc(64i64, @[16i32]f32)
-- res_15985 : [16i32]f32@@mem_19197->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]}
let {[16i32]f32 res_15985} = replicate([16i32], 0.0f32)
-- res_15983 : [16i32][16i32]f32@@mem_19194->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[16i32][16i32]f32 res_15983} =
-- Consumes result_18917
-- mapout_18918 : *[16i32][16i32]f32@@mem_19194->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
loop {*[16i32][16i32]f32 mapout_18918} = {result_18917}
for i_18919:i32 < 16i32 do {
-- modified_source_19090 : [16i32][16i32]f32@@mem_19194->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[16i32][16i32]f32 modified_source_19090} =
-- Consumes mapout_18918
mapout_18918 with [i_18919, 0i32:+16i32*1i32] <- res_15985
-- lw_dest_18920 : [16i32][16i32]f32@@mem_19194->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[16i32][16i32]f32 lw_dest_18920} =
-- Consumes modified_source_19090
-- lowered_array_19089 : *[16i32][16i32]f32@@mem_19194->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
loop {*[16i32][16i32]f32 lowered_array_19089} = {modified_source_19090}
for j_15988:i32 < 16i32 do {
let {f32 sum_15989} =
loop {f32 sum_15990} = {0.0f32}
for k_15991:i32 < j_15988 do {
let {f32 x_15992} =
<index_certs_13829>
res_13840[0i32, k_15991, j_15988]
let {f32 y_15993} = lowered_array_19089[i_18919,
k_15991]
let {f32 y_15994} = fmul32(x_15992, y_15993)
let {f32 loopres_15995} = fadd32(sum_15990, y_15994)
in {loopres_15995}
}
let {f32 x_15996} =
<index_certs_13975>
matb_13831[j_p_i_t_s_18340, step_13832, i_18919,
j_15988]
let {f32 x_15997} = fsub32(x_15996, sum_15989)
let {f32 y_15998} =
<index_certs_13829>
res_13840[0i32, j_15988, j_15988]
let {f32 lw_val_15999} = fdiv32(x_15997, y_15998)
-- lowered_array_updated_19094 : [16i32][16i32]f32@@mem_19194->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[16i32][16i32]f32 lowered_array_updated_19094} =
-- Consumes lowered_array_19089
lowered_array_19089 with [i_18919,
j_15988] <- lw_val_15999
in {lowered_array_updated_19094}
}
in {lw_dest_18920}
}
-- result_18921 : [j_m_i_13929][16i32][16i32]f32@@mem_19207->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: False; LMADs: [{offset: phys_tid_15929; strides: [mul32 (mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32)) (16i32), mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32), mul32 (num_groups_15977) (segmap_group_size_15976)]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {[j_m_i_13929][16i32][16i32]f32 result_18921} = scratch(f32,
j_m_i_13929,
16i32,
16i32)
-- res_16001 : [j_m_i_13929][16i32][16i32]f32@@mem_19207->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: False; LMADs: [{offset: phys_tid_15929; strides: [mul32 (mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32)) (16i32), mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32), mul32 (num_groups_15977) (segmap_group_size_15976)]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {[j_m_i_13929][16i32][16i32]f32 res_16001} =
-- Consumes result_18921
-- mapout_18922 : *[j_m_i_13929][16i32][16i32]f32@@mem_19207->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: False; LMADs: [{offset: phys_tid_15929; strides: [mul32 (mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32)) (16i32), mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32), mul32 (num_groups_15977) (segmap_group_size_15976)]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
loop {*[j_m_i_13929][16i32][16i32]f32 mapout_18922} = {result_18921}
for i_18923:i32 < j_m_i_13929 do {
let {i32 j_p_i_t_s_19000} = add32(i_13928, i_18923)
-- lw_dest_18924 : [j_m_i_13929][16i32][16i32]f32@@mem_19207->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: False; LMADs: [{offset: phys_tid_15929; strides: [mul32 (mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32)) (16i32), mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32), mul32 (num_groups_15977) (segmap_group_size_15976)]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {[j_m_i_13929][16i32][16i32]f32 lw_dest_18924} =
-- Consumes mapout_18922
-- lowered_array_19078 : *[j_m_i_13929][16i32][16i32]f32@@mem_19207->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: False; LMADs: [{offset: phys_tid_15929; strides: [mul32 (mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32)) (16i32), mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32), mul32 (num_groups_15977) (segmap_group_size_15976)]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
loop {*[j_m_i_13929][16i32][16i32]f32 lowered_array_19078} = {mapout_18922}
for i_18927:i32 < 16i32 do {
-- lowered_array_updated_19082 : [j_m_i_13929][16i32][16i32]f32@@mem_19207->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: False; LMADs: [{offset: phys_tid_15929; strides: [mul32 (mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32)) (16i32), mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32), mul32 (num_groups_15977) (segmap_group_size_15976)]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {[j_m_i_13929][16i32][16i32]f32 lowered_array_updated_19082} =
-- Consumes lowered_array_19078
-- lowered_array_19083 : *[j_m_i_13929][16i32][16i32]f32@@mem_19207->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: False; LMADs: [{offset: phys_tid_15929; strides: [mul32 (mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32)) (16i32), mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32), mul32 (num_groups_15977) (segmap_group_size_15976)]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
loop {*[j_m_i_13929][16i32][16i32]f32 lowered_array_19083} = {lowered_array_19078}
for i_18931:i32 < 16i32 do {
let {f32 x_16008} =
<index_certs_13978>
matb_13831[j_p_i_t_s_18340, j_p_i_t_s_19000,
i_18927, i_18931]
let {f32 res_16010} =
loop {f32 redout_18933} = {0.0f32}
for i_18934:i32 < 16i32 do {
let {f32 x_16014} = res_15983[i_18927, i_18934]
let {f32 x_16015} = res_15832[i_18923, i_18931,
i_18934]
let {f32 res_16016} = fmul32(x_16014, x_16015)
let {f32 res_16013} =
fadd32(res_16016, redout_18933)
in {res_16013}
}
let {f32 res_16017} = fsub32(x_16008, res_16010)
-- lowered_array_updated_19088 : [j_m_i_13929][16i32][16i32]f32@@mem_19207->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: False; LMADs: [{offset: phys_tid_15929; strides: [mul32 (mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32)) (16i32), mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32), mul32 (num_groups_15977) (segmap_group_size_15976)]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {[j_m_i_13929][16i32][16i32]f32 lowered_array_updated_19088} =
-- Consumes lowered_array_19083
lowered_array_19083 with [i_18923, i_18927,
i_18931] <- res_16017
in {lowered_array_updated_19088}
}
in {lowered_array_updated_19082}
}
in {lw_dest_18924}
}
return {returns res_16001, returns res_15983}
}
let {mem mem_19498} =
alloc(bytes_19490)
-- res_linear_19499 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@mem_19498->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]}
let {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_linear_19499} =
copy(res_15979)
let {mem mem_19506} =
alloc(bytes_19500)
-- res_linear_19507 : [j_m_i_13929][16i32][16i32]f32@@mem_19506->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {[j_m_i_13929][16i32][16i32]f32 res_linear_19507} =
copy(res_15980)
in {mem_19498, mem_19506, res_linear_19499, res_linear_19507}
} else {
-- res_17218 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@res_mem_19488->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]}
-- res_17219 : [j_m_i_13929][16i32][16i32]f32@@res_mem_19489->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {mem res_mem_19488, mem res_mem_19489;
[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_17218,
[j_m_i_13929][16i32][16i32]f32 res_17219} =
-- Branch returns: {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32@?0->{base: [j_m_i_13929,
-- j_m_i_13929,
-- 16i32,
-- 16i32];
-- contiguous: True;
-- LMADs: [{offset: 0i32;
-- strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32];
-- rotates: [0i32, 0i32, 0i32, 0i32];
-- shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32];
-- permutation: [0, 1, 2, 3];
-- monotonicity: [Inc, Inc, Inc, Inc]}]},
-- [j_m_i_13929][16i32][16i32]f32@?1->{base: [j_m_i_13929, 16i32,
-- 16i32];
-- contiguous: True;
-- LMADs: [{offset: 0i32;
-- strides: [256i32, 16i32, 1i32];
-- rotates: [0i32, 0i32, 0i32];
-- shape: [j_m_i_13929, 16i32, 16i32];
-- permutation: [0, 1, 2];
-- monotonicity: [Inc, Inc, Inc]}]}}
if intra_suff_and_fits_16022
then {
let {mem mem_19264} =
alloc(bytes_19490)
let {mem mem_19271} =
alloc(bytes_19500)
-- res_16023 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@mem_19264->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]}
-- res_16024 : [j_m_i_13929][16i32][16i32]f32@@mem_19271->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_16023,
[j_m_i_13929][16i32][16i32]f32 res_16024} =
segmap_group
(#groups=j_m_i_13929; groupsize=computed_group_size_15853)
(gtid_15851 < j_m_i_13929) (~phys_tid_15927) : {[j_m_i_13929][16i32][16i32]f32,
[16i32][16i32]f32} {
let {i32 j_p_i_t_s_18352} = add32(i_13928, gtid_15851)
let {mem@local mem_19240} =
alloc(1024i64, @local)
-- res_16029 : [16i32][16i32]f32@@mem_19240->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[16i32][16i32]f32 res_16029} =
segmap_thread
(#groups=j_m_i_13929; groupsize=computed_group_size_15853)
(gtid_15856 < 16i32) (~phys_tid_15857) : {[16i32]f32} {
let {mem@[16i32]f32 mem_19235} =
alloc(64i64, @[16i32]f32)
-- smaller_replicate_18357 : [16i32]f32@@mem_19235->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]}
let {[16i32]f32 smaller_replicate_18357} =
replicate([16i32], 0.0f32)
-- res_16032 : [16i32]f32@@mem_19235->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]}
let {[16i32]f32 res_16032} =
-- Consumes smaller_replicate_18357
-- row_16033 : *[16i32]f32@@mem_19235->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]}
loop {*[16i32]f32 row_16033} = {smaller_replicate_18357}
for j_16034:i32 < 16i32 do {
let {f32 sum_16035} =
loop {f32 sum_16036} = {0.0f32}
for k_16037:i32 < j_16034 do {
let {f32 x_16038} =
<index_certs_13829>
res_13840[0i32, k_16037, j_16034]
let {f32 y_16039} = row_16033[k_16037]
let {f32 y_16040} = fmul32(x_16038, y_16039)
let {f32 loopres_16041} =
fadd32(sum_16036, y_16040)
in {loopres_16041}
}
let {f32 x_16042} =
<index_certs_13975>
matb_13831[j_p_i_t_s_18352, step_13832, gtid_15856,
j_16034]
let {f32 x_16043} = fsub32(x_16042, sum_16035)
let {f32 y_16044} =
<index_certs_13829>
res_13840[0i32, j_16034, j_16034]
let {f32 lw_val_16045} = fdiv32(x_16043, y_16044)
-- row_16046 : [16i32]f32@@mem_19235->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]}
let {[16i32]f32 row_16046} =
-- Consumes row_16033
row_16033 with [j_16034] <- lw_val_16045
in {row_16046}
}
return {returns res_16032}
}
let {mem@local mem_19248} =
alloc(bytes_19500, @local)
-- res_r_r_r_16054 : [j_m_i_13929][16i32][16i32]f32@@mem_19248->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {[j_m_i_13929][16i32][16i32]f32 res_r_r_r_16054} =
segred_thread
(#groups=j_m_i_13929; groupsize=computed_group_size_15853)
({{0.0f32},
[],
commutative fn {f32} (f32 x_16055, f32 x_16056) =>
let {f32 res_16057} = fadd32(x_16055, x_16056)
in {res_16057}})
(gtid_15889 < j_m_i_13929, gtid_15890 < 16i32,
gtid_15891 < 16i32,
gtid_15899 < 16i32) (~phys_tid_15900) : {f32} {
let {f32 x_16061} = res_16029[gtid_15890, gtid_15899]
let {f32 x_16062} = res_15832[gtid_15889, gtid_15891,
gtid_15899]
let {f32 res_16063} = fmul32(x_16061, x_16062)
return {returns res_16063}
}
let {mem@local mem_19255} =
alloc(bytes_19500, @local)
-- res_16064 : [j_m_i_13929][16i32][16i32]f32@@mem_19255->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {[j_m_i_13929][16i32][16i32]f32 res_16064} =
segmap_thread
(#groups=j_m_i_13929; groupsize=computed_group_size_15853)
(gtid_15882 < j_m_i_13929, gtid_15883 < 16i32,
gtid_15884 < 16i32) (~phys_tid_15885) : {f32} {
let {i32 j_p_i_t_s_18371} = add32(i_13928, gtid_15882)
let {f32 x_16065} =
<index_certs_13978>
matb_13831[j_p_i_t_s_18352, j_p_i_t_s_18371, gtid_15883,
gtid_15884]
let {f32 res_16066} = res_r_r_r_16054[gtid_15882,
gtid_15883,
gtid_15884]
let {f32 res_16067} = fsub32(x_16065, res_16066)
return {returns res_16067}
}
return {returns res_16064, returns res_16029}
}
in {mem_19264, mem_19271, res_16023, res_16024}
} else {
let {i32 segmap_group_size_16956} =
get_size(segmap_group_size_16771, group_size)
let {i32 num_groups_16957} =
calc_num_groups(nest_size_15828, segmap_num_groups_16773,
segmap_group_size_16956)
let {mem mem_19279} =
alloc(bytes_19187)
-- res_16959 : [j_m_i_13929][16i32][16i32]f32@@mem_19279->{base: [16i32, j_m_i_13929, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (16i32) (j_m_i_13929), 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [16i32, j_m_i_13929, 16i32]; permutation: [1, 2, 0]; monotonicity: [Inc, Inc, Inc]}]}
let {[j_m_i_13929][16i32][16i32]f32 res_16959} =
segmap_thread
(#groups=num_groups_16957; groupsize=segmap_group_size_16956; virtualise)
(gtid_16764 < j_m_i_13929,
gtid_16765 < 16i32) (~phys_tid_16766) : {[16i32]f32} {
let {mem@[16i32]f32 mem_19274} =
alloc(64i64, @[16i32]f32)
-- smaller_replicate_18384 : [16i32]f32@@mem_19274->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]}
let {[16i32]f32 smaller_replicate_18384} = replicate([16i32],
0.0f32)
let {i32 j_p_i_t_s_18379} = add32(i_13928, gtid_16764)
-- res_16962 : [16i32]f32@@mem_19274->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]}
let {[16i32]f32 res_16962} =
-- Consumes smaller_replicate_18384
-- row_16963 : *[16i32]f32@@mem_19274->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]}
loop {*[16i32]f32 row_16963} = {smaller_replicate_18384}
for j_16964:i32 < 16i32 do {
let {f32 sum_16965} =
loop {f32 sum_16966} = {0.0f32}
for k_16967:i32 < j_16964 do {
let {f32 x_16968} =
<index_certs_13829>
res_13840[0i32, k_16967, j_16964]
let {f32 y_16969} = row_16963[k_16967]
let {f32 y_16970} = fmul32(x_16968, y_16969)
let {f32 loopres_16971} = fadd32(sum_16966, y_16970)
in {loopres_16971}
}
let {f32 x_16972} =
<index_certs_13975>
matb_13831[j_p_i_t_s_18379, step_13832, gtid_16765,
j_16964]
let {f32 x_16973} = fsub32(x_16972, sum_16965)
let {f32 y_16974} =
<index_certs_13829>
res_13840[0i32, j_16964, j_16964]
let {f32 lw_val_16975} = fdiv32(x_16973, y_16974)
-- row_16976 : [16i32]f32@@mem_19274->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]}
let {[16i32]f32 row_16976} =
-- Consumes row_16963
row_16963 with [j_16964] <- lw_val_16975
in {row_16976}
}
return {returns res_16962}
}
let {i32 segmap_group_size_16982} =
get_size(segmap_group_size_16075, group_size)
let {i32 num_groups_16983} =
calc_num_groups(binop_x_19493, segmap_num_groups_16077,
segmap_group_size_16982)
let {i32 comparatee_16986} = mul32(j_m_i_13929, j_m_i_13929)
let {bool suff_outer_par_16987} =
get_size(suff_outer_par_15,
threshold (!suff_outer_par_13 !suff_intra_par_14)) <= comparatee_16986
let {bool fits_16999} = sle32(4096i32, max_group_size_14841)
let {bool suff_intra_par_17001} =
get_size(suff_intra_par_16,
threshold (!suff_outer_par_15 !suff_outer_par_13 !suff_intra_par_14)) <= 256i32
let {bool intra_suff_and_fits_17002} =
logand(fits_16999, suff_intra_par_17001)
let {i32 convop_x_19305} = mul32(j_m_i_13929, convop_x_19230)
let {i64 binop_x_19306} = sext i32 convop_x_19305 to i64
let {i64 bytes_19303} = mul64(4i64, binop_x_19306)
-- res_17003 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@res_mem_19479->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]}
let {mem res_mem_19479;
[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_17003} =
-- Branch returns: {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32@?0->{base: [j_m_i_13929,
-- j_m_i_13929,
-- 16i32,
-- 16i32];
-- contiguous: True;
-- LMADs: [{offset: 0i32;
-- strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32];
-- rotates: [0i32, 0i32, 0i32, 0i32];
-- shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32];
-- permutation: [0, 1, 2, 3];
-- monotonicity: [Inc, Inc, Inc, Inc]}]}}
if suff_outer_par_16987
then {
let {mem mem_19284} =
alloc(bytes_19187)
-- res_rowmajor_18582 : [j_m_i_13929][16i32][16i32]f32@@mem_19284->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {[j_m_i_13929][16i32][16i32]f32 res_rowmajor_18582} =
manifest((0, 1, 2), res_16959)
let {mem mem_19289} =
alloc(bytes_19187)
-- res_rowmajor_18583 : [j_m_i_13929][16i32][16i32]f32@@mem_19289->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {[j_m_i_13929][16i32][16i32]f32 res_rowmajor_18583} =
manifest((0, 1, 2), res_15832)
let {mem mem_19293} =
alloc(bytes_19229)
-- res_coalesced_18584 : [j_m_i_13929][16i32][16i32]f32@@mem_19293->{base: [16i32, 16i32, j_m_i_13929]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (j_m_i_13929) (16i32), j_m_i_13929, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [16i32, 16i32, j_m_i_13929]; permutation: [2, 0, 1]; monotonicity: [Inc, Inc, Inc]}]}
let {[j_m_i_13929][16i32][16i32]f32 res_coalesced_18584} =
manifest((1, 2, 0), res_rowmajor_18583)
let {mem mem_19307} =
alloc(bytes_19303)
-- res_17004 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@mem_19307->{base: [16i32, 16i32, j_m_i_13929, j_m_i_13929]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (mul32 (j_m_i_13929) (j_m_i_13929)) (16i32), mul32 (j_m_i_13929) (j_m_i_13929), j_m_i_13929, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [16i32, 16i32, j_m_i_13929, j_m_i_13929]; permutation: [2, 3, 0, 1]; monotonicity: [Inc, Inc, Inc, Inc]}]}
let {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_17004} =
segmap_thread
(#groups=num_groups_16983; groupsize=segmap_group_size_16982; virtualise)
(gtid_16068 < j_m_i_13929,
gtid_16069 < j_m_i_13929) (~phys_tid_16070) : {[16i32][16i32]f32} {
let {i32 j_p_i_t_s_18386} = add32(i_13928, gtid_16068)
let {i32 j_p_i_t_s_18388} = add32(i_13928, gtid_16069)
let {mem@[16i3216i32]f32 mem_19296} =
alloc(1024i64, @[16i3216i32]f32)
-- result_18935 : [16i32][16i32]f32@@mem_19296->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[16i32][16i32]f32 result_18935} = scratch(f32, 16i32,
16i32)
-- res_17008 : [16i32][16i32]f32@@mem_19296->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[16i32][16i32]f32 res_17008} =
-- Consumes result_18935
-- mapout_18936 : *[16i32][16i32]f32@@mem_19296->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
loop {*[16i32][16i32]f32 mapout_18936} = {result_18935}
for i_18937:i32 < 16i32 do {
-- lw_dest_18938 : [16i32][16i32]f32@@mem_19296->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[16i32][16i32]f32 lw_dest_18938} =
-- Consumes mapout_18936
-- lowered_array_19095 : *[16i32][16i32]f32@@mem_19296->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
loop {*[16i32][16i32]f32 lowered_array_19095} = {mapout_18936}
for i_18941:i32 < 16i32 do {
let {f32 x_17012} =
<index_certs_13978>
matb_13831[j_p_i_t_s_18386, j_p_i_t_s_18388,
i_18937, i_18941]
let {f32 res_17014} =
loop {f32 redout_18943} = {0.0f32}
for i_18944:i32 < 16i32 do {
let {f32 x_17018} =
res_rowmajor_18582[gtid_16068, i_18937,
i_18944]
let {f32 x_17019} =
res_coalesced_18584[gtid_16069, i_18941,
i_18944]
let {f32 res_17020} = fmul32(x_17018, x_17019)
let {f32 res_17017} =
fadd32(res_17020, redout_18943)
in {res_17017}
}
let {f32 res_17021} = fsub32(x_17012, res_17014)
-- lowered_array_updated_19099 : [16i32][16i32]f32@@mem_19296->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[16i32][16i32]f32 lowered_array_updated_19099} =
-- Consumes lowered_array_19095
lowered_array_19095 with [i_18937,
i_18941] <- res_17021
in {lowered_array_updated_19099}
}
in {lw_dest_18938}
}
return {returns res_17008}
}
let {mem mem_19477} =
alloc(bytes_19490)
-- res_linear_19478 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@mem_19477->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]}
let {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_linear_19478} =
copy(res_17004)
in {mem_19477, res_linear_19478}
} else {
-- res_17022 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@res_mem_19468->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]}
let {mem res_mem_19468;
[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_17022} =
-- Branch returns: {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32@?0->{base: [j_m_i_13929,
-- j_m_i_13929,
-- 16i32,
-- 16i32];
-- contiguous: True;
-- LMADs: [{offset: 0i32;
-- strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32];
-- rotates: [0i32, 0i32, 0i32, 0i32];
-- shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32];
-- permutation: [0, 1, 2, 3];
-- monotonicity: [Inc, Inc, Inc, Inc]}]}}
if intra_suff_and_fits_17002
then {
-- Intra-group version, taken when intra_suff_and_fits_17002 holds.
-- One work-group per (gtid_16083, gtid_16084) pair produces a 16x16 tile:
--   tile[a, b] = matb_13831[i_13928 + gtid_16083, i_13928 + gtid_16084, a, b]
--                - sum_k res_16959[gtid_16083, a, k] * res_15832[gtid_16084, b, k]
let {mem mem_19327} =
alloc(bytes_19490)
-- res_17023 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@mem_19327->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]}
let {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_17023} =
segmap_group
-- groupsize 4096 = 16*16*16; presumably one thread per point of the
-- three-dimensional segred space below (TODO confirm).
(#groups=comparatee_16986; groupsize=4096i32)
(gtid_16083 < j_m_i_13929,
gtid_16084 < j_m_i_13929) (~phys_tid_16121) : {[16i32][16i32]f32} {
-- 1024 bytes = 16*16 f32 elements, allocated in the @local memory space.
let {mem@local mem_19313} =
alloc(1024i64, @local)
-- res_r_r_17033 : [16i32][16i32]f32@@mem_19313->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
-- Segmented sum over the innermost dimension (gtid_16104): for every
-- (gtid_16096, gtid_16097) the products below are added up with fadd32,
-- starting from the neutral element 0.0f32.
let {[16i32][16i32]f32 res_r_r_17033} =
segred_thread
(#groups=comparatee_16986; groupsize=4096i32)
({{0.0f32},
[],
commutative fn {f32} (f32 x_17034, f32 x_17035) =>
let {f32 res_17036} = fadd32(x_17034, x_17035)
in {res_17036}})
(gtid_16096 < 16i32, gtid_16097 < 16i32,
gtid_16104 < 16i32) (~phys_tid_16105) : {f32} {
let {f32 x_17039} = res_16959[gtid_16083,
gtid_16096,
gtid_16104]
let {f32 x_17040} = res_15832[gtid_16084,
gtid_16097,
gtid_16104]
let {f32 res_17041} = fmul32(x_17039, x_17040)
return {returns res_17041}
}
-- Block coordinates into matb_13831: the group indices offset by i_13928.
let {i32 j_p_i_t_s_18410} = add32(i_13928, gtid_16083)
let {i32 j_p_i_t_s_18412} = add32(i_13928, gtid_16084)
let {mem@local mem_19318} =
alloc(1024i64, @local)
-- res_17042 : [16i32][16i32]f32@@mem_19318->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
-- Element-wise update: subtract the reduced products from the matching
-- element of the matb_13831 block.
let {[16i32][16i32]f32 res_17042} =
segmap_thread
(#groups=comparatee_16986; groupsize=4096i32)
(gtid_16090 < 16i32,
gtid_16091 < 16i32) (~phys_tid_16092) : {f32} {
let {f32 x_17043} =
<index_certs_13978>
matb_13831[j_p_i_t_s_18410, j_p_i_t_s_18412,
gtid_16090, gtid_16091]
let {f32 res_17044} = res_r_r_17033[gtid_16090,
gtid_16091]
let {f32 res_17045} = fsub32(x_17043, res_17044)
return {returns res_17045}
}
return {returns res_17042}
}
in {mem_19327, res_17023}
} else {
-- Launch parameters, threshold predicates and buffer sizes for the versions
-- selected by suff_outer_par_17058 and intra_suff_and_fits_17070 below.
let {i64 nest_size_17051} =
mul64(j_m_i_15825, nest_size_15828)
let {i32 segmap_group_size_17052} =
get_size(segmap_group_size_16178, group_size)
let {i32 num_groups_17053} =
calc_num_groups(nest_size_17051,
segmap_num_groups_16180,
segmap_group_size_17052)
let {i32 comparatee_17057} =
mul32(j_m_i_13929, binop_x_19188)
-- True when the size named suff_outer_par_17 is <= comparatee_17057.
-- The !-prefixed names are presumably the thresholds that were already
-- rejected on the path to this branch (TODO confirm).
let {bool suff_outer_par_17058} =
get_size(suff_outer_par_17,
threshold (!suff_outer_par_15 !suff_intra_par_16 !suff_outer_par_13 !suff_intra_par_14)) <= comparatee_17057
-- 256 is the groupsize of the segmap_group guarded by
-- intra_suff_and_fits_17070; it must not exceed max_group_size_14841.
let {bool fits_17067} =
sle32(256i32, max_group_size_14841)
let {bool suff_intra_par_17069} =
get_size(suff_intra_par_18,
threshold (!suff_outer_par_17 !suff_outer_par_15 !suff_intra_par_16 !suff_outer_par_13 !suff_intra_par_14)) <= 16i32
let {bool intra_suff_and_fits_17070} =
logand(fits_17067, suff_intra_par_17069)
-- bytes_19337 = 4 * 16 * num_blocks_13772 * n_13773: the allocation size
-- used for mem_19342 (matb_coalesced_18589).
-- NOTE(review): the element count is multiplied in i32 and only then
-- sign-extended to i64, so it could wrap for very large inputs - confirm
-- whether this is intended.
let {i32 binop_x_19339} = mul32(num_blocks_13772, n_13773)
let {i32 convop_x_19340} = mul32(16i32, binop_x_19339)
let {i64 binop_x_19341} = sext i32 convop_x_19340 to i64
let {i64 bytes_19337} = mul64(4i64, binop_x_19341)
-- bytes_19346 = 4 * 16 * comparatee_17057: the allocation size used for
-- mem_19351 (res_17072).
let {i32 convop_x_19349} = mul32(16i32, comparatee_17057)
let {i64 binop_x_19350} = sext i32 convop_x_19349 to i64
let {i64 bytes_19346} = mul64(4i64, binop_x_19350)
-- res_17071 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@res_mem_19467->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]}
let {mem res_mem_19467;
[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_17071} =
-- Branch returns: {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32@?0->{base: [j_m_i_13929,
-- j_m_i_13929,
-- 16i32,
-- 16i32];
-- contiguous: True;
-- LMADs: [{offset: 0i32;
-- strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32];
-- rotates: [0i32, 0i32, 0i32, 0i32];
-- shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32];
-- permutation: [0, 1, 2, 3];
-- monotonicity: [Inc, Inc, Inc, Inc]}]}}
if suff_outer_par_17058
then {
-- Version taken when suff_outer_par_17058 holds: one thread per
-- (gtid_16168, gtid_16169, gtid_16170) computes a whole 16-element row of
-- the result with two nested sequential loops.
-- The operands are first copied into explicit layouts with manifest; the
-- resulting index functions are given in the annotations on each binding.
let {mem mem_19332} =
alloc(bytes_19187)
-- res_rowmajor_18587 : [j_m_i_13929][16i32][16i32]f32@@mem_19332->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {[j_m_i_13929][16i32][16i32]f32 res_rowmajor_18587} =
manifest((0, 1, 2), res_15832)
let {mem mem_19336} =
alloc(bytes_19229)
-- res_coalesced_18588 : [j_m_i_13929][16i32][16i32]f32@@mem_19336->{base: [16i32, 16i32, j_m_i_13929]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (j_m_i_13929) (16i32), j_m_i_13929, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [16i32, 16i32, j_m_i_13929]; permutation: [2, 0, 1]; monotonicity: [Inc, Inc, Inc]}]}
let {[j_m_i_13929][16i32][16i32]f32 res_coalesced_18588} =
manifest((1, 2, 0), res_rowmajor_18587)
let {mem mem_19342} =
alloc(bytes_19337)
-- matb_coalesced_18589 : [num_blocks_13772][num_blocks_13772][16i32][16i32]f32@@mem_19342->{base: [16i32, num_blocks_13772, num_blocks_13772, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (mul32 (16i32) (num_blocks_13772)) (num_blocks_13772), mul32 (16i32) (num_blocks_13772), 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [16i32, num_blocks_13772, num_blocks_13772, 16i32]; permutation: [1, 2, 3, 0]; monotonicity: [Inc, Inc, Inc, Inc]}]}
let {[num_blocks_13772][num_blocks_13772][16i32][16i32]f32 matb_coalesced_18589} =
manifest((3, 0, 1, 2), matb_13831)
let {mem mem_19351} =
alloc(bytes_19346)
-- res_17072 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@mem_19351->{base: [16i32, j_m_i_13929, j_m_i_13929, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (mul32 (16i32) (j_m_i_13929)) (j_m_i_13929), mul32 (16i32) (j_m_i_13929), 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [16i32, j_m_i_13929, j_m_i_13929, 16i32]; permutation: [1, 2, 3, 0]; monotonicity: [Inc, Inc, Inc, Inc]}]}
let {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_17072} =
segmap_thread
(#groups=num_groups_17053; groupsize=segmap_group_size_17052; virtualise)
(gtid_16168 < j_m_i_13929, gtid_16169 < j_m_i_13929,
gtid_16170 < 16i32) (~phys_tid_16171) : {[16i32]f32} {
-- Block coordinates into matb: the thread indices offset by i_13928.
let {i32 j_p_i_t_s_18418} =
add32(i_13928, gtid_16168)
let {i32 j_p_i_t_s_18420} =
add32(i_13928, gtid_16169)
-- 64 bytes = 16 f32 elements: the row buffer built by this thread.
let {mem@[16i32]f32 mem_19345} =
alloc(64i64, @[16i32]f32)
-- result_18945 : [16i32]f32@@mem_19345->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]}
let {[16i32]f32 result_18945} = scratch(f32,
16i32)
-- res_17076 : [16i32]f32@@mem_19345->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]}
-- Outer loop: one iteration per column i_18947 of the output row.
let {[16i32]f32 res_17076} =
-- Consumes result_18945
-- mapout_18946 : *[16i32]f32@@mem_19345->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]}
loop {*[16i32]f32 mapout_18946} = {result_18945}
for i_18947:i32 < 16i32 do {
let {f32 x_17077} =
<index_certs_13978>
matb_coalesced_18589[j_p_i_t_s_18418,
j_p_i_t_s_18420,
gtid_16170, i_18947]
-- Inner loop: 16-term dot product accumulated in redout_18949.
let {f32 res_17079} =
loop {f32 redout_18949} = {0.0f32}
for i_18950:i32 < 16i32 do {
let {f32 x_17083} = res_16959[gtid_16168,
gtid_16170,
i_18950]
let {f32 x_17084} =
res_coalesced_18588[gtid_16169, i_18947,
i_18950]
let {f32 res_17085} =
fmul32(x_17083, x_17084)
let {f32 res_17082} =
fadd32(res_17085, redout_18949)
in {res_17082}
}
let {f32 res_17086} =
fsub32(x_17077, res_17079)
-- lw_dest_18948 : [16i32]f32@@mem_19345->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]}
-- In-place write of the finished element into the row buffer.
let {[16i32]f32 lw_dest_18948} =
-- Consumes mapout_18946
mapout_18946 with [i_18947] <- res_17086
in {lw_dest_18948}
}
return {returns res_17076}
}
-- res_17072 has permutation [1, 2, 3, 0] (see its annotation above); copy
-- it into the unpermuted layout of res_linear_19466 before returning it
-- from this branch.
let {mem mem_19465} =
alloc(bytes_19490)
-- res_linear_19466 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@mem_19465->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]}
let {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_linear_19466} =
copy(res_17072)
in {mem_19465, res_linear_19466}
} else {
-- res_17087 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@res_mem_19456->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]}
let {mem res_mem_19456;
[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_17087} =
-- Branch returns: {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32@?0->{base: [j_m_i_13929,
-- j_m_i_13929,
-- 16i32,
-- 16i32];
-- contiguous: True;
-- LMADs: [{offset: 0i32;
-- strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32];
-- rotates: [0i32, 0i32, 0i32, 0i32];
-- shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32];
-- permutation: [0, 1, 2, 3];
-- monotonicity: [Inc, Inc, Inc, Inc]}]}}
if intra_suff_and_fits_17070
then {
-- Intra-group version, taken when intra_suff_and_fits_17070 holds.
-- One work-group per (gtid_16187, gtid_16188, gtid_16189) produces one
-- 16-element row:
--   row[c] = matb_13831[i_13928 + gtid_16187, i_13928 + gtid_16188, gtid_16189, c]
--            - sum_k res_16959[gtid_16187, gtid_16189, k] * res_15832[gtid_16188, c, k]
let {mem mem_19367} =
alloc(bytes_19490)
-- res_17088 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@mem_19367->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]}
let {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_17088} =
segmap_group
-- groupsize 256 = 16*16; presumably one thread per point of the
-- two-dimensional segred space below (TODO confirm).
(#groups=comparatee_17057; groupsize=256i32)
(gtid_16187 < j_m_i_13929,
gtid_16188 < j_m_i_13929,
gtid_16189 < 16i32) (~phys_tid_16217) : {[16i32]f32} {
-- 64 bytes = 16 f32 elements, allocated in the @local memory space.
let {mem@local mem_19355} =
alloc(64i64, @local)
-- res_r_17096 : [16i32]f32@@mem_19355->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]}
-- Segmented sum over gtid_16205 for each gtid_16200, combining the
-- products with fadd32 from the neutral element 0.0f32.
let {[16i32]f32 res_r_17096} =
segred_thread
(#groups=comparatee_17057; groupsize=256i32)
({{0.0f32},
[],
commutative fn {f32} (f32 x_17097,
f32 x_17098) =>
let {f32 res_17099} =
fadd32(x_17097, x_17098)
in {res_17099}})
(gtid_16200 < 16i32,
gtid_16205 < 16i32) (~phys_tid_16206) : {f32} {
let {f32 x_17101} = res_16959[gtid_16187,
gtid_16189,
gtid_16205]
let {f32 x_17102} = res_15832[gtid_16188,
gtid_16200,
gtid_16205]
let {f32 res_17103} =
fmul32(x_17101, x_17102)
return {returns res_17103}
}
-- Block coordinates into matb_13831: the group indices offset by i_13928.
let {i32 j_p_i_t_s_18436} =
add32(i_13928, gtid_16187)
let {i32 j_p_i_t_s_18438} =
add32(i_13928, gtid_16188)
let {mem@local mem_19358} =
alloc(64i64, @local)
-- res_17104 : [16i32]f32@@mem_19358->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]}
-- Element-wise update of the row: matb element minus the reduced sum.
let {[16i32]f32 res_17104} =
segmap_thread
(#groups=comparatee_17057; groupsize=256i32)
(gtid_16195 < 16i32) (~phys_tid_16196) : {f32} {
let {f32 x_17105} =
<index_certs_13978>
matb_13831[j_p_i_t_s_18436,
j_p_i_t_s_18438, gtid_16189,
gtid_16195]
let {f32 res_17106} =
res_r_17096[gtid_16195]
let {f32 res_17107} =
fsub32(x_17105, res_17106)
return {returns res_17107}
}
return {returns res_17104}
}
in {mem_19367, res_17088}
} else {
-- nest_size_17115 = j_m_i_15825 * 256 * j_m_i_15825, computed in i64; it is
-- reused further down for nest_size_17181 and x_17207.
let {i64 y_17114} = mul64(256i64, j_m_i_15825)
let {i64 nest_size_17115} =
mul64(j_m_i_15825, y_17114)
-- Threshold predicates choosing between the version guarded by
-- suff_outer_par_17127 and the one guarded by intra_suff_and_fits_17137.
-- The !-prefixed names are presumably thresholds already rejected on the
-- path to this branch (TODO confirm).
let {bool suff_outer_par_17127} =
get_size(suff_outer_par_19,
threshold (!suff_outer_par_17 !suff_intra_par_18 !suff_outer_par_15 !suff_intra_par_16 !suff_outer_par_13 !suff_intra_par_14)) <= convop_x_19305
let {bool suff_intra_par_17136} =
get_size(suff_intra_par_20,
threshold (!suff_outer_par_19 !suff_outer_par_17 !suff_intra_par_18 !suff_outer_par_15 !suff_intra_par_16 !suff_outer_par_13 !suff_intra_par_14)) <= 16i32
let {bool intra_suff_and_fits_17137} =
logand(fits_14842, suff_intra_par_17136)
-- res_17138 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@res_mem_19455->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]}
let {mem res_mem_19455;
[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_17138} =
-- Branch returns: {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32@?0->{base: [j_m_i_13929,
-- j_m_i_13929,
-- 16i32,
-- 16i32];
-- contiguous: True;
-- LMADs: [{offset: 0i32;
-- strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32];
-- rotates: [0i32, 0i32, 0i32, 0i32];
-- shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32];
-- permutation: [0, 1, 2, 3];
-- monotonicity: [Inc, Inc, Inc, Inc]}]}}
if suff_outer_par_17127
then {
-- Tiling geometry for the segmap_group that produces res_17139.
let {i32 tile_size_18609} =
get_size(tile_size_18608, tile_size)
-- One group holds tile_size * tile_size threads.
let {i32 group_size_18610} =
mul32(tile_size_18609, tile_size_18609)
-- num_groups_x = (16 + tile_size - 1) squot tile_size, i.e. 16 / tile_size
-- rounded up for a positive tile size.
let {i32 y_18611} =
sub32(tile_size_18609, 1i32)
let {i32 x_18612} = add32(16i32, y_18611)
let {i32 num_groups_x_18613} =
squot32(x_18612, tile_size_18609)
-- Total group count: j_m_i_13929 * j_m_i_13929 * num_groups_x * num_groups_x.
let {i32 y_18617} =
mul32(j_m_i_13929, num_groups_x_18613)
let {i32 y_18618} =
mul32(j_m_i_13929, y_18617)
let {i32 num_groups_top_18619} =
mul32(num_groups_x_18613, y_18618)
-- The 16-long reduction dimension splits into num_whole_tiles full tiles
-- plus residual_input leftover elements; cond_18751 is true when there is
-- no leftover.
let {i32 num_whole_tiles_18621} =
squot32(16i32, tile_size_18609)
let {i32 residual_input_18750} =
srem32(16i32, tile_size_18609)
let {bool cond_18751} =
eq_i32(residual_input_18750, 0i32)
let {mem mem_19414} =
alloc(bytes_19490)
-- bytes_19368 = 4 * group_size (group_size sign-extended to i64 first).
let {i64 binop_x_19370} =
sext i32 group_size_18610 to i64
let {i64 bytes_19368} =
mul64(4i64, binop_x_19370)
-- bytes_19372 = 4 * tile_size * tile_size, multiplied in i64.
let {i64 binop_x_19373} =
sext i32 tile_size_18609 to i64
let {i64 binop_x_19375} =
mul64(binop_x_19373, binop_x_19373)
let {i64 bytes_19372} =
mul64(4i64, binop_x_19375)
-- Block-tiled version, taken when suff_outer_par_17127 holds. Each group
-- handles one tile_size x tile_size patch of one 16x16 output block in
-- four steps:
--   1. zero the per-thread accumulators (mergeinit_18646),
--   2. loop over the whole tiles of the reduction dimension,
--   3. process the residual tile when tile_size does not divide 16,
--   4. subtract the accumulated sum from the matching matb_13831 element.
-- res_17139 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@mem_19414->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]}
let {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_17139} =
segmap_group
(#groups=num_groups_top_18619; groupsize=group_size_18610)
(gtid_16258 < j_m_i_13929,
gtid_16259 < j_m_i_13929,
gid_x_18606 < num_groups_x_18613,
gid_y_18607 < num_groups_x_18613) (~gid_flat_18620) : {f32} {
let {mem@[]f32 mem_19371} =
alloc(bytes_19368, @[]f32)
-- mergeinit_18646 : [tile_size_18609][tile_size_18609]f32@@mem_19371->{base: [tile_size_18609, tile_size_18609]; contiguous: True; LMADs: [{offset: 0i32; strides: [tile_size_18609, 1i32]; rotates: [0i32, 0i32]; shape: [tile_size_18609, tile_size_18609]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
-- Step 1: every thread starts with an accumulator of 0.0f32.
let {[tile_size_18609][tile_size_18609]f32 mergeinit_18646} =
segmap_thread
(#groups=num_groups_top_18619; groupsize=group_size_18610)
(ltid_x_18637 < tile_size_18609,
ltid_y_18638 < tile_size_18609) (~ltid_flat_18639) : {f32} {
return {returns (private) 0.0f32}
}
-- Offsets of this group's patch inside the 16x16 block.
let {i32 binop_x_18714} =
mul32(gid_x_18606, tile_size_18609)
let {i32 binop_x_18716} =
mul32(gid_y_18607, tile_size_18609)
-- Two @local buffers hold the operand tiles; mem_19385 receives the
-- accumulators updated in each loop iteration.
let {mem@local mem_19376} =
alloc(bytes_19372, @local)
let {mem@local mem_19381} =
alloc(bytes_19372, @local)
let {mem@[]f32 mem_19385} =
alloc(bytes_19368, @[]f32)
-- accs_18743 : [tile_size_18609][tile_size_18609]f32@@mem_19371->{base: [tile_size_18609, tile_size_18609]; contiguous: True; LMADs: [{offset: 0i32; strides: [tile_size_18609, 1i32]; rotates: [0i32, 0i32]; shape: [tile_size_18609, tile_size_18609]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
-- Step 2: one iteration per whole tile of the reduction dimension.
let {[tile_size_18609][tile_size_18609]f32 accs_18743} =
-- Consumes mergeinit_18646
-- x_merge_18647 : *[tile_size_18609][tile_size_18609]f32@@mem_19371->{base: [tile_size_18609, tile_size_18609]; contiguous: True; LMADs: [{offset: 0i32; strides: [tile_size_18609, 1i32]; rotates: [0i32, 0i32]; shape: [tile_size_18609, tile_size_18609]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
loop {*[tile_size_18609][tile_size_18609]f32 x_merge_18647} = {mergeinit_18646}
for tile_id_18648:i32 < num_whole_tiles_18621 do {
let {i32 binop_x_18710} =
mul32(tile_size_18609, tile_id_18648)
-- full_tile_18708 : [tile_size_18609][tile_size_18609]f32@@mem_19376->{base: [tile_size_18609, tile_size_18609]; contiguous: True; LMADs: [{offset: 0i32; strides: [tile_size_18609, 1i32]; rotates: [0i32, 0i32]; shape: [tile_size_18609, tile_size_18609]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
-- full_tile_18709 : [tile_size_18609][tile_size_18609]f32@@mem_19381->{base: [tile_size_18609, tile_size_18609]; contiguous: True; LMADs: [{offset: 0i32; strides: [tile_size_18609, 1i32]; rotates: [0i32, 0i32]; shape: [tile_size_18609, tile_size_18609]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
-- Each thread loads one element of each operand tile.
-- NOTE(review): unlike the residual loads further down, these reads do
-- not check gtid_18715 / gtid_18717 against 16; when tile_size_18609
-- does not divide 16 the last patch may index past the 16-element
-- dimension - confirm whether this is guarded elsewhere.
let {[tile_size_18609][tile_size_18609]f32 full_tile_18708,
[tile_size_18609][tile_size_18609]f32 full_tile_18709} =
segmap_thread
(#groups=num_groups_top_18619; groupsize=group_size_18610)
(ltid_x_18649 < tile_size_18609,
ltid_y_18650 < tile_size_18609) (~ltid_flat_18651) : {f32,
f32} {
let {i32 i_18711} =
add32(ltid_x_18649, binop_x_18710)
let {i32 j_18713} =
add32(ltid_y_18650, binop_x_18710)
let {i32 gtid_18715} =
add32(ltid_x_18649, binop_x_18714)
let {i32 gtid_18717} =
add32(ltid_y_18650, binop_x_18716)
let {f32 tile_elem_18720} =
res_16959[gtid_16258,
gtid_18715, j_18713]
let {f32 tile_elem_18721} =
res_15832[gtid_16259,
gtid_18717, i_18711]
return {returns (manifest) tile_elem_18720,
returns (manifest) tile_elem_18721}
}
-- acc_18722 : [tile_size_18609][tile_size_18609]f32@@mem_19385->{base: [tile_size_18609, tile_size_18609]; contiguous: True; LMADs: [{offset: 0i32; strides: [tile_size_18609, 1i32]; rotates: [0i32, 0i32]; shape: [tile_size_18609, tile_size_18609]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
-- Threads whose output coordinates lie inside the 16x16 block add this
-- tile's partial dot product to their accumulator; the others pass
-- their accumulator through unchanged.
let {[tile_size_18609][tile_size_18609]f32 acc_18722} =
segmap_thread
(#groups=num_groups_top_18619; groupsize=group_size_18610)
(ltid_x_18674 < tile_size_18609,
ltid_y_18675 < tile_size_18609) (~ltid_flat_18676) : {f32} {
let {i32 gtid_18724} =
add32(ltid_x_18674, binop_x_18714)
let {i32 gtid_18726} =
add32(ltid_y_18675, binop_x_18716)
let {f32 acc_18729} =
x_merge_18647[ltid_x_18674,
ltid_y_18675]
let {bool binop_x_18732} =
slt32(gtid_18724, 16i32)
let {bool binop_y_18733} =
slt32(gtid_18726, 16i32)
let {bool cond_18734} =
logand(binop_x_18732, binop_y_18733)
let {f32 acc_18735} =
-- Branch returns: {f32}
if cond_18734
then {
let {f32 x_18736} =
loop {f32 redout_18951} = {acc_18729}
for i_18952:i32 < tile_size_18609 do {
let {f32 x_18740} =
full_tile_18708[ltid_x_18674,
i_18952]
let {f32 x_18741} =
full_tile_18709[i_18952,
ltid_y_18675]
let {f32 res_18742} =
fmul32(x_18740, x_18741)
let {f32 res_18739} =
fadd32(res_18742, redout_18951)
in {res_18739}
}
in {x_18736}
} else {acc_18729}
return {returns (private) acc_18735}
}
-- acc_ensure_copy_19386 : [tile_size_18609][tile_size_18609]f32@@mem_19371->{base: [tile_size_18609, tile_size_18609]; contiguous: True; LMADs: [{offset: 0i32; strides: [tile_size_18609, 1i32]; rotates: [0i32, 0i32]; shape: [tile_size_18609, tile_size_18609]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
-- Copy the new accumulators (mem_19385) back into the loop-carried
-- buffer (mem_19371).
let {[tile_size_18609][tile_size_18609]f32 acc_ensure_copy_19386} =
copy(acc_18722)
in {acc_ensure_copy_19386}
}
-- Buffers for the residual tile and for the branch result (mem_19613).
let {mem@local mem_19391} =
alloc(bytes_19372, @local)
let {mem@local mem_19396} =
alloc(bytes_19372, @local)
let {mem@[]f32 mem_19400} =
alloc(bytes_19368, @[]f32)
let {mem@[]f32 mem_19613} =
alloc(bytes_19368, @[]f32)
-- acc_after_residual_18880 : [tile_size_18609][tile_size_18609]f32@@mem_19613->{base: [tile_size_18609, tile_size_18609]; contiguous: True; LMADs: [{offset: 0i32; strides: [tile_size_18609, 1i32]; rotates: [0i32, 0i32]; shape: [tile_size_18609, tile_size_18609]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
-- Step 3: when cond_18751 holds (no residual) the accumulators are just
-- copied into mem_19613; otherwise one more, bounds-checked tile is
-- processed first.
let {[tile_size_18609][tile_size_18609]f32 acc_after_residual_18880} =
-- Branch returns: {[tile_size_18609][tile_size_18609]f32@(mem_19613->{base: [tile_size_18609,
-- tile_size_18609];
-- contiguous: True;
-- LMADs: [{offset: 0i32;
-- strides: [tile_size_18609, 1i32];
-- rotates: [0i32, 0i32];
-- shape: [tile_size_18609, tile_size_18609];
-- permutation: [0, 1];
-- monotonicity: [Inc, Inc]}]})}
if cond_18751
then {
-- accs_nonext_copy_19614 : [tile_size_18609][tile_size_18609]f32@@mem_19613->{base: [tile_size_18609, tile_size_18609]; contiguous: True; LMADs: [{offset: 0i32; strides: [tile_size_18609, 1i32]; rotates: [0i32, 0i32]; shape: [tile_size_18609, tile_size_18609]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[tile_size_18609][tile_size_18609]f32 accs_nonext_copy_19614} =
copy(accs_18743)
in {accs_nonext_copy_19614}
} else {
-- Start of the residual part of the reduction dimension.
let {i32 binop_x_18835} =
mul32(tile_size_18609, num_whole_tiles_18621)
-- full_tile_18833 : [tile_size_18609][tile_size_18609]f32@@mem_19391->{base: [tile_size_18609, tile_size_18609]; contiguous: True; LMADs: [{offset: 0i32; strides: [tile_size_18609, 1i32]; rotates: [0i32, 0i32]; shape: [tile_size_18609, tile_size_18609]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
-- full_tile_18834 : [tile_size_18609][tile_size_18609]f32@@mem_19396->{base: [tile_size_18609, tile_size_18609]; contiguous: True; LMADs: [{offset: 0i32; strides: [tile_size_18609, 1i32]; rotates: [0i32, 0i32]; shape: [tile_size_18609, tile_size_18609]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
-- Residual loads: elements whose indices fall outside the 16-element
-- dimensions are replaced by 0.0f32.
let {[tile_size_18609][tile_size_18609]f32 full_tile_18833,
[tile_size_18609][tile_size_18609]f32 full_tile_18834} =
segmap_thread
(#groups=num_groups_top_18619; groupsize=group_size_18610)
(ltid_x_18752 < tile_size_18609,
ltid_y_18753 < tile_size_18609) (~ltid_flat_18754) : {f32,
f32} {
let {i32 i_18836} =
add32(ltid_x_18752, binop_x_18835)
let {i32 j_18838} =
add32(ltid_y_18753, binop_x_18835)
let {i32 gtid_18840} =
add32(binop_x_18714, ltid_x_18752)
let {i32 gtid_18842} =
add32(binop_x_18716, ltid_y_18753)
let {bool binop_x_18845} =
slt32(j_18838, 16i32)
let {bool binop_y_18846} =
slt32(gtid_18840, 16i32)
let {bool cond_18847} =
logand(binop_x_18845, binop_y_18846)
let {f32 pre_18848} =
-- Branch returns: {f32}
if cond_18847
then {
let {f32 x_18849} =
res_16959[gtid_16258,
gtid_18840,
j_18838]
in {x_18849}
} else {0.0f32}
let {bool binop_x_18851} =
slt32(i_18836, 16i32)
let {bool binop_y_18852} =
slt32(gtid_18842, 16i32)
let {bool cond_18853} =
logand(binop_x_18851, binop_y_18852)
let {f32 pre_18854} =
-- Branch returns: {f32}
if cond_18853
then {
let {f32 x_18855} =
res_15832[gtid_16259,
gtid_18842,
i_18836]
in {x_18855}
} else {0.0f32}
return {returns (manifest) pre_18848,
returns (manifest) pre_18854}
}
-- acc_18859 : [tile_size_18609][tile_size_18609]f32@@mem_19400->{base: [tile_size_18609, tile_size_18609]; contiguous: True; LMADs: [{offset: 0i32; strides: [tile_size_18609, 1i32]; rotates: [0i32, 0i32]; shape: [tile_size_18609, tile_size_18609]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
-- Same accumulation as in the whole-tile loop, but only over the
-- residual_input_18750 leftover elements.
let {[tile_size_18609][tile_size_18609]f32 acc_18859} =
segmap_thread
(#groups=num_groups_top_18619; groupsize=group_size_18610)
(ltid_x_18799 < tile_size_18609,
ltid_y_18800 < tile_size_18609) (~ltid_flat_18801) : {f32} {
let {i32 gtid_18861} =
add32(binop_x_18714, ltid_x_18799)
let {i32 gtid_18863} =
add32(binop_x_18716, ltid_y_18800)
let {f32 acc_18866} =
accs_18743[ltid_x_18799,
ltid_y_18800]
let {bool binop_x_18869} =
slt32(gtid_18861, 16i32)
let {bool binop_y_18870} =
slt32(gtid_18863, 16i32)
let {bool cond_18871} =
logand(binop_x_18869, binop_y_18870)
let {f32 acc_18872} =
-- Branch returns: {f32}
if cond_18871
then {
let {f32 x_18873} =
loop {f32 redout_18953} = {acc_18866}
for i_18954:i32 < residual_input_18750 do {
let {f32 x_18877} =
full_tile_18833[ltid_x_18799,
i_18954]
let {f32 x_18878} =
full_tile_18834[i_18954,
ltid_y_18800]
let {f32 res_18879} =
fmul32(x_18877, x_18878)
let {f32 res_18876} =
fadd32(res_18879, redout_18953)
in {res_18876}
}
in {x_18873}
} else {acc_18866}
return {returns (private) acc_18872}
}
-- acc_nonext_copy_19615 : [tile_size_18609][tile_size_18609]f32@@mem_19613->{base: [tile_size_18609, tile_size_18609]; contiguous: True; LMADs: [{offset: 0i32; strides: [tile_size_18609, 1i32]; rotates: [0i32, 0i32]; shape: [tile_size_18609, tile_size_18609]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[tile_size_18609][tile_size_18609]f32 acc_nonext_copy_19615} =
copy(acc_18859)
in {acc_nonext_copy_19615}
}
let {mem@[]f32 mem_19405} =
alloc(bytes_19368, @[]f32)
-- thread_res_18907 : [tile_size_18609][tile_size_18609]f32@@mem_19405->{base: [tile_size_18609, tile_size_18609]; contiguous: True; LMADs: [{offset: 0i32; strides: [tile_size_18609, 1i32]; rotates: [0i32, 0i32]; shape: [tile_size_18609, tile_size_18609]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
-- Step 4: in-bounds threads compute matb element minus accumulated sum;
-- out-of-bounds threads yield 0.0f32.
let {[tile_size_18609][tile_size_18609]f32 thread_res_18907} =
segmap_thread
(#groups=num_groups_top_18619; groupsize=group_size_18610)
(ltid_x_18881 < tile_size_18609,
ltid_y_18882 < tile_size_18609) (~ltid_flat_18883) : {f32} {
let {i32 gtid_18892} =
add32(binop_x_18714, ltid_x_18881)
let {i32 gtid_18894} =
add32(binop_x_18716, ltid_y_18882)
let {bool binop_x_18896} =
slt32(gtid_18892, 16i32)
let {bool binop_y_18897} =
slt32(gtid_18894, 16i32)
let {bool cond_18898} =
logand(binop_x_18896, binop_y_18897)
let {f32 postlude_18899} =
-- Branch returns: {f32}
if cond_18898
then {
let {f32 res_18895} =
acc_after_residual_18880[ltid_x_18881,
ltid_y_18882]
let {i32 j_p_i_t_s_18902} =
add32(i_13928, gtid_16258)
let {i32 j_p_i_t_s_18903} =
add32(i_13928, gtid_16259)
let {f32 x_18904} =
<index_certs_13978>
matb_13831[j_p_i_t_s_18902,
j_p_i_t_s_18903,
gtid_18892,
gtid_18894]
let {f32 res_18905} =
fsub32(x_18904, res_18895)
in {res_18905}
} else {0.0f32}
return {returns (private) postlude_18899}
}
-- thread_res_18908 aliases thread_res_18907
-- thread_res_18908 : [1i32][1i32][tile_size_18609][tile_size_18609]f32@@mem_19405->{base: [1i32, 1i32, tile_size_18609, tile_size_18609]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (tile_size_18609) (tile_size_18609), mul32 (tile_size_18609) (tile_size_18609), tile_size_18609, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [1i32, 1i32, tile_size_18609, tile_size_18609]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]}
-- Reshape the patch to rank 4 so that it can be returned as one tile of
-- the four-dimensional result.
let {[1i32][1i32][tile_size_18609][tile_size_18609]f32 thread_res_18908} =
reshape((1i32, 1i32, tile_size_18609,
tile_size_18609),
thread_res_18907)
return {tile(j_m_i_13929 / 1i32,
j_m_i_13929 / 1i32,
16i32 / tile_size_18609,
16i32 / tile_size_18609) thread_res_18908}
}
in {mem_19414, res_17139}
} else {
-- res_17151 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@res_mem_19454->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]}
let {mem res_mem_19454;
[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_17151} =
-- Branch returns: {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32@?0->{base: [j_m_i_13929,
-- j_m_i_13929,
-- 16i32,
-- 16i32];
-- contiguous: True;
-- LMADs: [{offset: 0i32;
-- strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32];
-- rotates: [0i32, 0i32, 0i32, 0i32];
-- shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32];
-- permutation: [0, 1, 2, 3];
-- monotonicity: [Inc, Inc, Inc, Inc]}]}}
if intra_suff_and_fits_17137
then {
-- Intra-group version, taken when intra_suff_and_fits_17137 holds.
-- One group of 16 threads per output element
-- (gtid_16284, gtid_16285, gtid_16286, gtid_16287): the 16-term dot
-- product is reduced inside the group and subtracted from the matb_13831
-- element.
let {mem mem_19424} =
alloc(bytes_19490)
-- res_17152 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@mem_19424->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]}
let {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_17152} =
segmap_group
(#groups=convop_x_19305; groupsize=16i32)
(gtid_16284 < j_m_i_13929,
gtid_16285 < j_m_i_13929,
gtid_16286 < 16i32,
gtid_16287 < 16i32) (~phys_tid_16295) : {f32} {
-- Block coordinates into matb_13831: the group indices offset by i_13928.
let {i32 j_p_i_t_s_18456} =
add32(i_13928, gtid_16284)
let {i32 j_p_i_t_s_18458} =
add32(i_13928, gtid_16285)
let {f32 x_17154} =
<index_certs_13978>
matb_13831[j_p_i_t_s_18456,
j_p_i_t_s_18458,
gtid_16286, gtid_16287]
-- Sum over gtid_16293 of the element products, starting from 0.0f32.
-- Unlike the other segreds in this function, the operator here is not
-- marked commutative.
let {f32 res_17156} =
segred_thread
(#groups=convop_x_19305; groupsize=16i32)
({{0.0f32},
[],
fn {f32} (f32 x_17157,
f32 x_17158) =>
let {f32 res_17159} =
fadd32(x_17157, x_17158)
in {res_17159}})
(gtid_16293 < 16i32) (~phys_tid_16294) : {f32} {
let {f32 x_17160} =
res_16959[gtid_16284,
gtid_16286,
gtid_16293]
let {f32 x_17161} =
res_15832[gtid_16285,
gtid_16287,
gtid_16293]
let {f32 res_17162} =
fmul32(x_17160, x_17161)
return {returns res_17162}
}
let {f32 res_17163} =
fsub32(x_17154, res_17156)
return {returns res_17163}
}
in {mem_19424, res_17152}
} else {
-- Final fallback (all earlier predicates were false). It runs in two steps:
--   1. a flat segred over (j_m_i, j_m_i, 16, 16, 16) computes every dot
--      product into res_r_r_r_r_17185,
--   2. a flat segmap subtracts those from the matching matb_13831 elements.
-- nest_size_17181 = 16 * nest_size_17115 is the size of the segred space.
let {i64 nest_size_17181} =
mul64(16i64, nest_size_17115)
let {i32 segred_group_size_17182} =
get_size(segred_group_size_16375,
group_size)
let {i32 num_groups_17183} =
calc_num_groups(nest_size_17181,
segred_num_groups_16377,
segred_group_size_17182)
-- Both operands are copied into the layouts given in the annotations
-- below before the reduction reads them.
let {mem mem_19429} =
alloc(bytes_19187)
-- res_rowmajor_18594 : [j_m_i_13929][16i32][16i32]f32@@mem_19429->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {[j_m_i_13929][16i32][16i32]f32 res_rowmajor_18594} =
manifest((0, 1, 2), res_16959)
let {mem mem_19434} =
alloc(bytes_19187)
-- res_rowmajor_18595 : [j_m_i_13929][16i32][16i32]f32@@mem_19434->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {[j_m_i_13929][16i32][16i32]f32 res_rowmajor_18595} =
manifest((0, 1, 2), res_15832)
let {mem mem_19444} =
alloc(bytes_19490)
-- res_r_r_r_r_17185 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@mem_19444->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]}
-- Step 1: segmented sum over the innermost dimension gtid_16380.
let {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_r_r_r_r_17185} =
segred_thread
(#groups=num_groups_17183; groupsize=segred_group_size_17182)
({{0.0f32},
[],
commutative fn {f32} (f32 x_17186,
f32 x_17187) =>
let {f32 res_17188} =
fadd32(x_17186, x_17187)
in {res_17188}})
(gtid_16353 < j_m_i_13929,
gtid_16354 < j_m_i_13929,
gtid_16355 < 16i32, gtid_16356 < 16i32,
gtid_16380 < 16i32) (~phys_tid_16381) : {f32} {
let {f32 x_17193} =
res_rowmajor_18594[gtid_16353,
gtid_16355,
gtid_16380]
let {f32 x_17194} =
res_rowmajor_18595[gtid_16354,
gtid_16356,
gtid_16380]
let {f32 res_17195} =
fmul32(x_17193, x_17194)
return {returns res_17195}
}
-- Group count for step 2:
-- (nest_size_17115 + group_size - 1) squot group_size, computed in i64
-- and then converted to i32.
let {i32 segmap_group_size_17204} =
get_size(segmap_group_size_16342,
group_size)
let {i64 segmap_group_size_17205} =
sext i32 segmap_group_size_17204 to i64
let {i64 y_17206} =
sub64(segmap_group_size_17205, 1i64)
let {i64 x_17207} =
add64(nest_size_17115, y_17206)
let {i64 segmap_usable_groups_64_17209} =
squot64(x_17207, segmap_group_size_17205)
let {i32 segmap_usable_groups_17210} =
sext i64 segmap_usable_groups_64_17209 to i32
let {mem mem_19453} =
alloc(bytes_19490)
-- res_17211 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@mem_19453->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]}
-- Step 2: one thread per output element subtracts the reduced sum from
-- the matb_13831 element.
let {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_17211} =
segmap_thread
(#groups=segmap_usable_groups_17210; groupsize=segmap_group_size_17204)
(gtid_16329 < j_m_i_13929,
gtid_16330 < j_m_i_13929,
gtid_16331 < 16i32,
gtid_16332 < 16i32) (~phys_tid_16333) : {f32} {
let {i32 j_p_i_t_s_18468} =
add32(i_13928, gtid_16329)
let {i32 j_p_i_t_s_18470} =
add32(i_13928, gtid_16330)
let {f32 x_17212} =
<index_certs_13978>
matb_13831[j_p_i_t_s_18468,
j_p_i_t_s_18470,
gtid_16331, gtid_16332]
let {f32 res_17213} =
res_r_r_r_r_17185[gtid_16329,
gtid_16330,
gtid_16331,
gtid_16332]
let {f32 res_17214} =
fsub32(x_17212, res_17213)
return {returns res_17214}
}
in {mem_19453, res_17211}
}
in {res_mem_19454, res_17151}
}
in {res_mem_19455, res_17138}
}
in {res_mem_19456, res_17087}
}
in {res_mem_19467, res_17071}
}
in {res_mem_19468, res_17022}
}
let {mem mem_19486} =
alloc(bytes_19500)
-- res_linear_19487 : [j_m_i_13929][16i32][16i32]f32@@mem_19486->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {[j_m_i_13929][16i32][16i32]f32 res_linear_19487} =
copy(res_16959)
in {res_mem_19479, mem_19486, res_17003, res_linear_19487}
}
in {res_mem_19488, res_mem_19489, res_17218, res_17219}
}
-- Write this step's results back into the blocked matrix as a chain of four
-- in-place updates. Each update consumes the array produced by the previous one
-- (matb_13831 -> matb_14031 -> matb_14032 -> matb_14033 -> matb_14034), and all
-- four results are annotated as living in the same memory block, mem_19131.
-- Slices are written `start:+count*stride`; `0i32:+16i32*1i32` selects a whole
-- 16-element block dimension.
--
-- Update 1: the single 16x16 block at [step_13832, step_13832] <- res_13927.
-- matb_14031 : [num_blocks_13772][num_blocks_13772][16i32][16i32]f32@@mem_19131->{base: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (num_blocks_13772), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]}
let {[num_blocks_13772][num_blocks_13772][16i32][16i32]f32 matb_14031} =
-- Consumes matb_13831
<index_certs_13837>
matb_13831 with [step_13832, step_13832, 0i32:+16i32*1i32,
0i32:+16i32*1i32] <- res_13927
-- Update 2: block row step_13832, the j_m_i_13929 block columns starting at
-- i_13928 <- res_transformed_13974.
-- matb_14032 : [num_blocks_13772][num_blocks_13772][16i32][16i32]f32@@mem_19131->{base: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (num_blocks_13772), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]}
let {[num_blocks_13772][num_blocks_13772][16i32][16i32]f32 matb_14032} =
-- Consumes matb_14031
<index_certs_13943>
matb_14031 with [step_13832, i_13928:+j_m_i_13929*1i32,
0i32:+16i32*1i32,
0i32:+16i32*1i32] <- res_transformed_13974
-- Update 3: block column step_13832, the j_m_i_13929 block rows starting at
-- i_13928 <- res_13981.
-- matb_14033 : [num_blocks_13772][num_blocks_13772][16i32][16i32]f32@@mem_19131->{base: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (num_blocks_13772), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]}
let {[num_blocks_13772][num_blocks_13772][16i32][16i32]f32 matb_14033} =
-- Consumes matb_14032
<index_certs_13975, index_certs_13943>
matb_14032 with [i_13928:+j_m_i_13929*1i32, step_13832,
0i32:+16i32*1i32, 0i32:+16i32*1i32] <- res_13981
-- Update 4: the j_m_i_13929 x j_m_i_13929 square of blocks whose top-left block
-- is [i_13928, i_13928] <- res_13980.
-- matb_14034 : [num_blocks_13772][num_blocks_13772][16i32][16i32]f32@@mem_19131->{base: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (num_blocks_13772), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]}
let {[num_blocks_13772][num_blocks_13772][16i32][16i32]f32 matb_14034} =
-- Consumes matb_14033
<index_certs_13978>
matb_14033 with [i_13928:+j_m_i_13929*1i32, i_13928:+j_m_i_13929*1i32,
0i32:+16i32*1i32, 0i32:+16i32*1i32] <- res_13980
-- The fully updated blocked matrix is the value of the enclosing body.
in {matb_14034}
}
-- Bounds-check the block index upper_bound_13818 against num_blocks_13772
-- (0 <= upper_bound_13818 < num_blocks_13772), then slice the 16x16 block at
-- [upper_bound_13818, upper_bound_13818] out of matb_13830.
let {bool x_14035} = sle32(0i32, upper_bound_13818)
let {bool y_14036} = slt32(upper_bound_13818, num_blocks_13772)
let {bool bounds_check_14037} = logand(x_14035, y_14036)
-- Both indices are the same value, so the single check is and-ed with itself.
let {bool index_ok_14038} = logand(bounds_check_14037, bounds_check_14037)
-- The resulting certificate guards the slice below (and any later use that
-- lists index_certs_14039).
let {cert index_certs_14039} =
assert(index_ok_14038, "Index [", upper_bound_13818, ", ",
upper_bound_13818,
"] out of bounds for array of shape [",
num_blocks_13772, "][", num_blocks_13772, "].",
"lud.fut:177:27-52")
-- The slice is an alias (no copy): per the annotation it is a view into
-- mem_19131 at the offset of block [upper_bound_13818, upper_bound_13818].
-- lud_diagonal_arg_14040 aliases matb_13830
-- lud_diagonal_arg_14040 : [16i32][16i32]f32@@mem_19131->{base: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; contiguous: True; LMADs: [{offset: add32 (mul32 (upper_bound_13818) (mul32 (256i32) (num_blocks_13772))) (mul32 (upper_bound_13818) (256i32)); strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[16i32][16i32]f32 lud_diagonal_arg_14040} =
<index_certs_14039>
matb_13830[upper_bound_13818, upper_bound_13818, 0i32:+16i32*1i32,
0i32:+16i32*1i32]
-- Reshape the 16x16 block lud_diagonal_arg_14040 into a batch of res_14041
-- matrices, shape [res_14041][16][16], where res_14041 = opaque(1i32).
-- NOTE(review): `opaque` presumably hides the constant 1 from the compiler's
-- simplifier, which would explain why the size is kept symbolic and checked at
-- run time -- confirm against the Futhark documentation.
let {i32 res_14041} = opaque(1i32)
let {i32 x_14042} = mul32(16i32, res_14041)
-- The reshape is only valid when 16 * res_14041 == 16.
let {bool assert_arg_14043} = eq_i32(x_14042, 16i32)
let {cert dim_ok_14044} =
assert(assert_arg_14043,
"new shape has different number of elements than old shape",
"/prelude/array.fut:95:3-33")
-- Still an alias: per the annotation, same memory block (mem_19131) and offset
-- as lud_diagonal_arg_14040, with an added outer dimension of stride 256.
-- res_14045 aliases lud_diagonal_arg_14040
-- res_14045 : [res_14041][16i32][16i32]f32@@mem_19131->{base: [res_14041, 16i32, 16i32]; contiguous: True; LMADs: [{offset: add32 (mul32 (upper_bound_13818) (mul32 (256i32) (num_blocks_13772))) (mul32 (upper_bound_13818) (256i32)); strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_14041, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {[res_14041][16i32][16i32]f32 res_14045} =
<dim_ok_14044>
reshape((res_14041, 16i32, 16i32), lud_diagonal_arg_14040)
-- Predicate that selects between the two code versions of the computation that
-- follows: the tunable threshold suff_intra_par_24 is compared against 16, and
-- the result is and-ed with fits_14842 (defined outside this view).
let {bool suff_intra_par_17374} =
get_size(suff_intra_par_24, threshold (!suff_outer_par_23)) <= 16i32
let {bool intra_suff_and_fits_17377} =
logand(fits_14842, suff_intra_par_17374)
-- Byte sizes for the buffers allocated by the two versions (4 bytes per f32):
-- bytes_19511 = 4 * sext(256 * res_14041)
let {i32 convop_x_19512} = mul32(256i32, res_14041)
let {i64 binop_x_19513} = sext i32 convop_x_19512 to i64
let {i64 bytes_19511} = mul64(4i64, binop_x_19513)
-- bytes_19525 = 4 * 16 * 16 * sext(res_14041); binop_x_19528 (= 16 * res_14041
-- in i64) is also reused later as a thread count.
let {i64 binop_x_19526} = sext i32 res_14041 to i64
let {i64 binop_x_19528} = mul64(16i64, binop_x_19526)
let {i64 binop_x_19530} = mul64(16i64, binop_x_19528)
let {i64 bytes_19525} = mul64(4i64, binop_x_19530)
-- bytes_19540 = 4 * sext(16 * x_14042), with x_14042 = 16 * res_14041.
let {i32 convop_x_19542} = mul32(16i32, x_14042)
let {i64 binop_x_19543} = sext i32 convop_x_19542 to i64
let {i64 bytes_19540} = mul64(4i64, binop_x_19543)
-- bytes_19545 = 4 * 16 * sext(res_14041): one 16-element f32 vector per matrix.
let {i64 bytes_19545} = mul64(4i64, binop_x_19528)
-- Compute res_14046 from the batch res_14045 using one of two code versions,
-- chosen by intra_suff_and_fits_17377. Both versions run the same 15-iteration
-- loop over i on every 16x16 matrix; each iteration
--   (1) rewrites column i: for rows r > i,
--         mat[r, i] = (mat[r, i] - sum_{k < i} mat[r, k] * mat[k, i]) / mat[i, i]
--   (2) rewrites row j = i + 1: for columns c > i,
--         mat[j, c] = mat[j, c] - sum_{k < j} mat[k, c] * mat[j, k]
-- and leaves all other elements of that column/row unchanged.
-- NOTE(review): given the input's name (lud_diagonal_arg_14040) this is
-- presumably the in-place LU factorisation of the diagonal block -- confirm
-- against lud.fut.
-- res_14046 : [res_14041][16i32][16i32]f32@@res_mem_19561->{base: [res_14041, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_14041, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {mem res_mem_19561;
[res_14041][16i32][16i32]f32 res_14046} =
-- Branch returns: {[res_14041][16i32][16i32]f32@?0->{base: [res_14041, 16i32,
-- 16i32];
-- contiguous: True;
-- LMADs: [{offset: 0i32;
-- strides: [256i32, 16i32, 1i32];
-- rotates: [0i32, 0i32, 0i32];
-- shape: [res_14041, 16i32, 16i32];
-- permutation: [0, 1, 2];
-- monotonicity: [Inc, Inc, Inc]}]}}
if intra_suff_and_fits_17377
then {
-- Version 1: one group of 16 threads per matrix (segmap_group), working on a
-- copy of the matrix held in @local memory.
let {mem mem_19514} =
alloc(bytes_19511)
-- manifest materialises res_14045 in mem_19514 with the batch dimension
-- innermost (see the annotation's base/permutation).
-- NOTE(review): the name suggests this is done for coalesced reads -- confirm.
-- res_coalesced_18598 : [res_14041][16i32][16i32]f32@@mem_19514->{base: [16i32, 16i32, res_14041]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (res_14041) (16i32), res_14041, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [16i32, 16i32, res_14041]; permutation: [2, 0, 1]; monotonicity: [Inc, Inc, Inc]}]}
let {[res_14041][16i32][16i32]f32 res_coalesced_18598} = manifest((1, 2,
0),
res_14045)
let {mem mem_19531} =
alloc(bytes_19525)
-- res_17378 : [res_14041][16i32][16i32]f32@@mem_19531->{base: [res_14041, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_14041, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {[res_14041][16i32][16i32]f32 res_17378} =
segmap_group
(#groups=res_14041; groupsize=16i32)
(gtid_17224 < res_14041) (~phys_tid_17267) : {[16i32][16i32]f32} {
-- This group's matrix, as a view into the manifested array.
-- x_17379 aliases res_coalesced_18598
-- x_17379 : [16i32][16i32]f32@@mem_19514->{base: [16i32, 16i32, res_14041]; contiguous: False; LMADs: [{offset: gtid_17224; strides: [mul32 (res_14041) (16i32), res_14041]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[16i32][16i32]f32 x_17379} = res_coalesced_18598[gtid_17224,
0i32:+16i32*1i32,
0i32:+16i32*1i32]
-- 1024 bytes of local memory = 16 * 16 f32: the working copy of the matrix.
let {mem@local mem_19519} =
alloc(1024i64, @local)
-- smaller_replicate_17380 : [16i32][16i32]f32@@mem_19519->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[16i32][16i32]f32 smaller_replicate_17380} = copy(x_17379)
-- 64 bytes of local memory = 16 f32: holds the new column (res_17385).
let {mem@local mem_19523} =
alloc(64i64, @local)
-- res_17381 : [16i32][16i32]f32@@mem_19519->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[16i32][16i32]f32 res_17381} =
-- Consumes smaller_replicate_17380
-- mat_17382 : *[16i32][16i32]f32@@mem_19519->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
loop {*[16i32][16i32]f32 mat_17382} = {smaller_replicate_17380}
for i_17383:i32 < 15i32 do {
-- Step (1): one thread per row computes the new value of column i_17383.
-- res_17385 : [16i32]f32@@mem_19523->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]}
let {[16i32]f32 res_17385} =
segmap_thread
(#groups=res_14041; groupsize=16i32)
(gtid_17228 < 16i32) (~phys_tid_17229) : {f32} {
-- Only rows strictly below row i_17383 are recomputed.
let {bool cond_17390} = slt32(i_17383, gtid_17228)
let {f32 res_17391} =
-- Branch returns: {f32}
if cond_17390
then {
let {f32 x_17392} = mat_17382[gtid_17228, i_17383]
-- Sequential dot product of row gtid_17228 and column i_17383 over k < i_17383.
let {f32 res_17394} =
loop {f32 redout_18955} = {0.0f32}
for i_18956:i32 < i_17383 do {
let {f32 x_17398} = mat_17382[gtid_17228, i_18956]
let {f32 x_17399} = mat_17382[i_18956, i_17383]
let {f32 res_17400} = fmul32(x_17398, x_17399)
let {f32 res_17397} = fadd32(res_17400, redout_18955)
in {res_17397}
}
let {f32 x_17401} = fsub32(x_17392, res_17394)
-- Divide by the diagonal element mat[i, i].
let {f32 y_17402} = mat_17382[i_17383, i_17383]
let {f32 res_17403} = fdiv32(x_17401, y_17402)
in {res_17403}
} else {
-- Rows <= i_17383 keep their current value.
let {f32 res_17404} = mat_17382[gtid_17228, i_17383]
in {res_17404}
}
return {returns res_17391}
}
-- Store the new column i_17383 in place.
-- mat_17405 : [16i32][16i32]f32@@mem_19519->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[16i32][16i32]f32 mat_17405} =
-- Consumes mat_17382
mat_17382 with [0i32:+16i32*1i32, i_17383] <- res_17385
let {i32 j_17406} = add32(1i32, i_17383)
-- Step (2): one thread per column computes the new value of row j_17406 and
-- writes it directly into mat_17405 (see the `return` below).
-- mat_17423 : [16i32][16i32]f32@@mem_19519->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[16i32][16i32]f32 mat_17423} =
-- Consumes mat_17405
segmap_thread
(#groups=res_14041; groupsize=16i32)
(gtid_17250 < 16i32) (~phys_tid_17251) : {f32} {
-- Only columns strictly right of column i_17383 are recomputed.
let {bool cond_17409} = slt32(i_17383, gtid_17250)
let {f32 res_17410} =
-- Branch returns: {f32}
if cond_17409
then {
let {f32 x_17411} = mat_17405[j_17406, gtid_17250]
-- Sequential dot product of column gtid_17250 and row j_17406 over k < j_17406.
let {f32 res_17414} =
loop {f32 redout_18957} = {0.0f32}
for i_18958:i32 < j_17406 do {
let {f32 x_17418} = mat_17405[i_18958, gtid_17250]
let {f32 x_17419} = mat_17405[j_17406, i_18958]
let {f32 res_17420} = fmul32(x_17418, x_17419)
let {f32 res_17417} = fadd32(res_17420, redout_18957)
in {res_17417}
}
let {f32 res_17421} = fsub32(x_17411, res_17414)
in {res_17421}
} else {
-- Columns <= i_17383 keep their current value.
let {f32 res_17422} = mat_17405[j_17406, gtid_17250]
in {res_17422}
}
-- In-place write of element [j_17406, gtid_17250].
return {mat_17405 with ([j_17406 < 16i32,
gtid_17250 < 16i32] <- res_17410)}
}
in {mat_17423}
}
return {returns res_17381}
}
in {mem_19531, res_17378}
} else {
-- Version 2: the same loop, but the batch stays in ordinary memory
-- (mem_19538) and each iteration launches four flat segmap_thread kernels
-- over (matrix, row-or-column) pairs.
let {mem mem_19538} =
alloc(bytes_19525)
-- Working copy of the batch; the loop below updates it in place.
-- smaller_replicate_r_17941 : [res_14041][16i32][16i32]f32@@mem_19538->{base: [res_14041, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_14041, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {[res_14041][16i32][16i32]f32 smaller_replicate_r_17941} =
copy(res_14045)
-- Launch configuration for each of the four kernels. Every kernel has its own
-- tunable group size gs, and its group count is
--   (gs - 1 + binop_x_19528) squot gs
-- i.e. binop_x_19528 (= 16 * res_14041 threads) divided by gs, rounded up.
let {i32 segmap_group_size_17973} =
get_size(segmap_group_size_17729, group_size)
let {i64 segmap_group_size_17974} =
sext i32 segmap_group_size_17973 to i64
let {i64 y_17975} = sub64(segmap_group_size_17974, 1i64)
let {i64 x_17976} = add64(y_17975, binop_x_19528)
let {i64 x_18524} = squot64(x_17976, segmap_group_size_17974)
let {i32 segmap_usable_groups_17979} = sext i64 x_18524 to i32
let {i32 segmap_group_size_18006} =
get_size(segmap_group_size_17669, group_size)
let {i64 segmap_group_size_18007} =
sext i32 segmap_group_size_18006 to i64
let {i64 y_18008} = sub64(segmap_group_size_18007, 1i64)
let {i64 x_18009} = add64(y_18008, binop_x_19528)
let {i64 x_18526} = squot64(x_18009, segmap_group_size_18007)
let {i32 segmap_usable_groups_18012} = sext i64 x_18526 to i32
let {i32 segmap_group_size_18046} =
get_size(segmap_group_size_17593, group_size)
let {i64 segmap_group_size_18047} =
sext i32 segmap_group_size_18046 to i64
let {i64 y_18048} = sub64(segmap_group_size_18047, 1i64)
let {i64 x_18049} = add64(y_18048, binop_x_19528)
let {i64 x_18528} = squot64(x_18049, segmap_group_size_18047)
let {i32 segmap_usable_groups_18052} = sext i64 x_18528 to i32
let {i32 segmap_group_size_18076} =
get_size(segmap_group_size_17536, group_size)
let {i64 segmap_group_size_18077} =
sext i32 segmap_group_size_18076 to i64
let {i64 y_18078} = sub64(segmap_group_size_18077, 1i64)
let {i64 x_18079} = add64(y_18078, binop_x_19528)
let {i64 x_18530} = squot64(x_18079, segmap_group_size_18077)
let {i32 segmap_usable_groups_18082} = sext i64 x_18530 to i32
-- Scratch buffers allocated once, outside the loop: mem_19544 / mem_19554 hold
-- the re-laid-out copies of the batch, mem_19549 / mem_19559 hold the new
-- column / row values (per the annotations on the bindings below).
let {mem mem_19544} =
alloc(bytes_19540)
let {mem mem_19549} =
alloc(bytes_19545)
let {mem mem_19554} =
alloc(bytes_19540)
let {mem mem_19559} =
alloc(bytes_19545)
-- res_17942 : [res_14041][16i32][16i32]f32@@mem_19538->{base: [res_14041, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_14041, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {[res_14041][16i32][16i32]f32 res_17942} =
-- Consumes smaller_replicate_r_17941
-- mat_expanded_17943 : *[res_14041][16i32][16i32]f32@@mem_19538->{base: [res_14041, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_14041, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
loop {*[res_14041][16i32][16i32]f32 mat_expanded_17943} = {smaller_replicate_r_17941}
for i_17944:i32 < 15i32 do {
-- Same logical array as mat_expanded_17943, materialised in mem_19544 with a
-- different physical layout (see the annotation's base/permutation).
-- mat_expanded_coalesced_18600 : [res_14041][16i32][16i32]f32@@mem_19544->{base: [16i32, res_14041, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (16i32) (res_14041), 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [16i32, res_14041, 16i32]; permutation: [1, 2, 0]; monotonicity: [Inc, Inc, Inc]}]}
let {[res_14041][16i32][16i32]f32 mat_expanded_coalesced_18600} =
manifest((2, 0, 1), mat_expanded_17943)
-- Step (1), kernel 1: one thread per (matrix, row) computes the new value of
-- column i_17944 into the scratch array res_r_17980.
-- res_r_17980 : [res_14041][16i32]f32@@mem_19549->{base: [res_14041, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [res_14041, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[res_14041][16i32]f32 res_r_17980} =
segmap_thread
(#groups=segmap_usable_groups_17979; groupsize=segmap_group_size_17973)
(gtid_17722 < res_14041,
gtid_17723 < 16i32) (~phys_tid_17724) : {f32} {
-- Only rows strictly below row i_17944 are recomputed.
let {bool cond_17986} = slt32(i_17944, gtid_17723)
let {f32 res_17987} =
-- Branch returns: {f32}
if cond_17986
then {
let {f32 x_17988} = mat_expanded_17943[gtid_17722, gtid_17723,
i_17944]
-- Dot product over k < i_17944; the row operand is read through the
-- re-laid-out copy, the column operand through the original array.
let {f32 res_17990} =
loop {f32 redout_18959} = {0.0f32}
for i_18960:i32 < i_17944 do {
let {f32 x_17994} =
mat_expanded_coalesced_18600[gtid_17722, gtid_17723,
i_18960]
let {f32 x_17995} = mat_expanded_17943[gtid_17722,
i_18960, i_17944]
let {f32 res_17996} = fmul32(x_17994, x_17995)
let {f32 res_17993} = fadd32(res_17996, redout_18959)
in {res_17993}
}
let {f32 x_17997} = fsub32(x_17988, res_17990)
-- Divide by the diagonal element mat[i, i] of this matrix.
let {f32 y_17998} = mat_expanded_17943[gtid_17722, i_17944,
i_17944]
let {f32 res_17999} = fdiv32(x_17997, y_17998)
in {res_17999}
} else {
-- Rows <= i_17944 keep their current value.
let {f32 res_18000} = mat_expanded_17943[gtid_17722,
gtid_17723, i_17944]
in {res_18000}
}
return {returns res_17987}
}
-- Step (1), kernel 2: scatter res_r_17980 into column i_17944 of every matrix,
-- in place.
-- mat_r_18013 : [res_14041][16i32][16i32]f32@@mem_19538->{base: [res_14041, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_14041, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {[res_14041][16i32][16i32]f32 mat_r_18013} =
-- Consumes mat_expanded_17943
segmap_thread
(#groups=segmap_usable_groups_18012; groupsize=segmap_group_size_18006)
(gtid_17661 < res_14041,
gtid_slice_17662 < 16i32) (~phys_tid_17664) : {f32} {
let {f32 v_18016} = res_r_17980[gtid_17661, gtid_slice_17662]
return {mat_expanded_17943 with ([gtid_17661 < res_14041,
gtid_slice_17662 < 16i32,
i_17944 < 16i32] <- v_18016)}
}
let {i32 j_18027} = add32(1i32, i_17944)
-- Re-laid-out copy of the updated batch in mem_19554, used for the row
-- operand of the dot product below.
-- mat_r_coalesced_18603 : [res_14041][16i32][16i32]f32@@mem_19554->{base: [16i32, res_14041, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (16i32) (res_14041), 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [16i32, res_14041, 16i32]; permutation: [1, 2, 0]; monotonicity: [Inc, Inc, Inc]}]}
let {[res_14041][16i32][16i32]f32 mat_r_coalesced_18603} =
manifest((2, 0, 1), mat_r_18013)
-- Step (2), kernel 3: one thread per (matrix, column) computes the new value
-- of row j_18027 into the scratch array res_r_18053.
-- res_r_18053 : [res_14041][16i32]f32@@mem_19559->{base: [res_14041, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [res_14041, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[res_14041][16i32]f32 res_r_18053} =
segmap_thread
(#groups=segmap_usable_groups_18052; groupsize=segmap_group_size_18046)
(gtid_17586 < res_14041,
gtid_17587 < 16i32) (~phys_tid_17588) : {f32} {
-- Only columns strictly right of column i_17944 are recomputed.
let {bool cond_18057} = slt32(i_17944, gtid_17587)
let {f32 res_18058} =
-- Branch returns: {f32}
if cond_18057
then {
let {f32 x_18059} = mat_r_18013[gtid_17586, j_18027,
gtid_17587]
-- Dot product of column gtid_17587 and row j_18027 over k < j_18027.
let {f32 res_18062} =
loop {f32 redout_18961} = {0.0f32}
for i_18962:i32 < j_18027 do {
let {f32 x_18066} = mat_r_18013[gtid_17586, i_18962,
gtid_17587]
let {f32 x_18067} = mat_r_coalesced_18603[gtid_17586,
j_18027,
i_18962]
let {f32 res_18068} = fmul32(x_18066, x_18067)
let {f32 res_18065} = fadd32(res_18068, redout_18961)
in {res_18065}
}
let {f32 res_18069} = fsub32(x_18059, res_18062)
in {res_18069}
} else {
-- Columns <= i_17944 keep their current value.
let {f32 res_18070} = mat_r_18013[gtid_17586, j_18027,
gtid_17587]
in {res_18070}
}
return {returns res_18058}
}
-- Step (2), kernel 4: scatter res_r_18053 into row j_18027 of every matrix,
-- in place; the result is the loop's next state.
-- res_18083 : [res_14041][16i32][16i32]f32@@mem_19538->{base: [res_14041, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_14041, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]}
let {[res_14041][16i32][16i32]f32 res_18083} =
-- Consumes mat_r_18013
segmap_thread
(#groups=segmap_usable_groups_18082; groupsize=segmap_group_size_18076)
(gtid_17528 < res_14041,
gtid_slice_17529 < 16i32) (~phys_tid_17531) : {f32} {
let {f32 v_18087} = res_r_18053[gtid_17528, gtid_slice_17529]
return {mat_r_18013 with ([gtid_17528 < res_14041,
j_18027 < 16i32,
gtid_slice_17529 < 16i32] <- v_18087)}
}
in {res_18083}
}
in {mem_19538, res_17942}
}
-- Take element 0 of the batch result res_14046 and write it back into the
-- blocked matrix at block [upper_bound_13818, upper_bound_13818].
-- Index 0 is valid only if the batch is non-empty (0 < res_14041).
let {bool y_14133} = slt32(0i32, res_14041)
let {cert index_certs_14134} =
assert(y_14133, "Index [", 0i32, "] out of bounds for array of shape [",
res_14041, "].", "/prelude/array.fut:15:29-32")
-- The 16x16 result is a view into res_mem_19561 (no copy).
-- res_14135 aliases res_14046
-- res_14135 : [16i32][16i32]f32@@res_mem_19561->{base: [res_14041, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[16i32][16i32]f32 res_14135} =
<index_certs_14134>
res_14046[0i32, 0i32:+16i32*1i32, 0i32:+16i32*1i32]
-- In-place update of matb_13830 (consumed); the result matb_14136 is annotated
-- as living in the same memory block, mem_19131. The update is guarded by
-- index_certs_14039, the certificate of the bounds check on upper_bound_13818.
-- matb_14136 : [num_blocks_13772][num_blocks_13772][16i32][16i32]f32@@mem_19131->{base: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (num_blocks_13772), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]}
let {[num_blocks_13772][num_blocks_13772][16i32][16i32]f32 matb_14136} =
-- Consumes matb_13830
<index_certs_14039>
matb_13830 with [upper_bound_13818, upper_bound_13818, 0i32:+16i32*1i32,
0i32:+16i32*1i32] <- res_14135
-- Flatten the blocked matrix matb_14136 ([num_blocks][num_blocks][16][16])
-- into an ordinary n_13773 x n_13773 matrix, one thread per output element.
-- The range 0..<n_13773 is valid only when n_13773 is non-negative.
let {bool bounds_invalid_upwards_14137} = slt32(n_13773, 0i32)
let {bool valid_14138} = not bounds_invalid_upwards_14137
let {cert range_valid_c_14139} =
assert(valid_14138, "Range ", 0i32, "..", 1i32, "..<", n_13773,
" is invalid.", "/prelude/math.fut:453:23-30")
-- Total thread count: binop_y_19103 squared (binop_y_19103 is n_13773
-- sign-extended to i64).
let {i64 nest_size_18244} = mul64(binop_y_19103, binop_y_19103)
let {i32 segmap_group_size_18245} =
get_size(segmap_group_size_18162, group_size)
let {i64 segmap_group_size_18246} = sext i32 segmap_group_size_18245 to i64
-- Group count = (nest_size + group_size - 1) squot group_size, i.e. the
-- thread count divided by the group size, rounded up.
let {i64 y_18247} = sub64(segmap_group_size_18246, 1i64)
let {i64 x_18248} = add64(nest_size_18244, y_18247)
let {i64 segmap_usable_groups_64_18250} =
squot64(x_18248, segmap_group_size_18246)
let {i32 segmap_usable_groups_18251} =
sext i64 segmap_usable_groups_64_18250 to i32
-- Output buffer: 4 bytes per f32 element.
let {i64 bytes_19562} = mul64(4i64, nest_size_18244)
let {mem mem_19566} =
alloc(bytes_19562)
-- res_18252 : [n_13773][n_13773]f32@@mem_19566->{base: [n_13773, n_13773]; contiguous: True; LMADs: [{offset: 0i32; strides: [n_13773, 1i32]; rotates: [0i32, 0i32]; shape: [n_13773, n_13773]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[n_13773][n_13773]f32 res_18252} =
segmap_thread
(#groups=segmap_usable_groups_18251; groupsize=segmap_group_size_18245)
(gtid_18155 < n_13773, gtid_18156 < n_13773) (~phys_tid_18157) : {f32} {
-- Row gtid_18155 -> block row (gtid_18155 / 16) and row within the block
-- (gtid_18155 - 16 * block row).
let {i32 index_primexp_18551} = sdiv32(gtid_18155, 16i32)
let {i32 binop_y_18549} = mul32(16i32, index_primexp_18551)
let {i32 index_primexp_18550} = sub32(gtid_18155, binop_y_18549)
-- Column gtid_18156 -> block column and column within the block, likewise.
let {i32 res_18256} =
<range_valid_c_14139>
sdiv32(gtid_18156, 16i32)
let {i32 y_18257} = mul32(16i32, res_18256)
let {i32 res_18258} =
<range_valid_c_14139>
sub32(gtid_18156, y_18257)
-- Read element [block row, block column, row in block, column in block].
let {f32 res_18259} =
<range_valid_c_14139>
matb_14136[index_primexp_18551, res_18256, index_primexp_18550,
res_18258]
return {returns res_18259}
}
-- Cut the n_13773 x n_13773 matrix res_18252 down to its top-left
-- m_13764 x m_13764 corner, copy it into a fresh contiguous buffer, and
-- return it as the result of `main`.
-- The slice 0:m_13764 is accepted when it is empty (m_13764 == 0) or when its
-- last index m_13764 - 1 lies in [0, n_13773).
let {bool empty_slice_14153} = eq_i32(m_13764, 0i32)
let {i32 m_14154} = sub32(m_13764, 1i32)
let {bool zero_leq_i_p_m_t_s_14155} = sle32(0i32, m_14154)
let {bool i_p_m_t_s_leq_w_14156} = slt32(m_14154, n_13773)
let {bool y_14158} = logand(zero_leq_i_p_m_t_s_14155, i_p_m_t_s_leq_w_14156)
let {bool ok_or_empty_14160} = logor(empty_slice_14153, y_14158)
let {cert index_certs_14161} =
assert(ok_or_empty_14160, "Index [", 0i32, ":", m_13764,
"] out of bounds for array of shape [", n_13773,
"].", "/prelude/array.fut:27:44-49")
-- The slice is a non-contiguous view into mem_19566: it keeps row stride
-- n_13773 but has only m_13764 columns (see the annotation).
-- res_14162 aliases res_18252
-- res_14162 : [m_13764][m_13764]f32@@mem_19566->{base: [n_13773, n_13773]; contiguous: False; LMADs: [{offset: mul32 (n_13773) (0i32); strides: [n_13773, 1i32]; rotates: [0i32, 0i32]; shape: [m_13764, m_13764]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[m_13764][m_13764]f32 res_14162} =
<index_certs_14161, range_valid_c_14139>
res_18252[0i32:+m_13764*1i32, 0i32:+m_13764*1i32]
-- Size of the result buffer: 4 bytes * binop_x_19107 squared.
-- NOTE(review): binop_x_19107 is defined outside this view; presumably it is
-- m_13764 sign-extended to i64 -- confirm.
let {i64 binop_x_19570} = mul64(binop_x_19107, binop_x_19107)
let {i64 bytes_19567} = mul64(4i64, binop_x_19570)
let {mem mem_19571} =
alloc(bytes_19567)
-- The copy materialises the view as a contiguous row-major array
-- (row stride m_13764) in mem_19571.
-- res_linear_19572 : [m_13764][m_13764]f32@@mem_19571->{base: [m_13764, m_13764]; contiguous: True; LMADs: [{offset: 0i32; strides: [m_13764, 1i32]; rotates: [0i32, 0i32]; shape: [m_13764, m_13764]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]}
let {[m_13764][m_13764]f32 res_linear_19572} = copy(res_14162)
-- Return values: the two dimension sizes, the memory block and the array,
-- matching the entry point's [?0][?1]f32@?2 return type.
in {m_13764, m_13764, mem_19571, res_linear_19572}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment