Created
May 5, 2020 13:26
-
-
Save Munksgaard/1ab28e296d7708cdc66087433a8b4efb to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- mat_13766 : [m_13764][m_13765]f32@@mat_mem_19100->{base: [m_13764, m_13765]; contiguous: True; LMADs: [{offset: 0i32; strides: [m_13765, 1i32]; rotates: [0i32, 0i32]; shape: [m_13764, m_13765]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
entry {[?0][?1]f32@?2->{base: [?0, ?1]; contiguous: True; LMADs: [{offset: 0i32; | |
strides: [?1, 1i32]; | |
rotates: [0i32, 0i32]; | |
shape: [?0, ?1]; | |
permutation: [0, 1]; | |
monotonicity: [Inc, Inc]}]}} | |
main (mem mat_mem_19100, i32 m_13764, i32 m_13765, | |
[m_13764][m_13765]f32 mat_13766) = { | |
let {bool dim_match_13767} = eq_i32(m_13764, m_13765) | |
let {cert empty_or_match_cert_13768} = | |
assert(dim_match_13767, "function arguments of wrong shape", | |
"lud.fut:108:1-186:39") | |
-- mat_13769 aliases mat_13766 | |
-- mat_13769 : [m_13764][m_13764]f32@@mat_mem_19100->{base: [m_13764, m_13764]; contiguous: True; LMADs: [{offset: 0i32; strides: [m_13765, 1i32]; rotates: [0i32, 0i32]; shape: [m_13764, m_13764]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[m_13764][m_13764]f32 mat_13769} = | |
<empty_or_match_cert_13768> | |
reshape((~m_13764, ~m_13764), mat_13766) | |
let {i32 x_13770} = add32(16i32, m_13764) | |
let {i32 x_13771} = sub32(x_13770, 1i32) | |
let {i32 num_blocks_13772} = sdiv32(x_13771, 16i32) | |
let {i32 n_13773} = mul32(16i32, num_blocks_13772) | |
let {i32 padding_13774} = sub32(n_13773, m_13764) | |
let {bool cond_13775} = eq_i32(padding_13774, 0i32) | |
let {bool cond_13776} = not cond_13775 | |
let {i32 conc_tmp_13777} = add32(m_13764, padding_13774) | |
let {i32 size_13778} = | |
-- Branch returns: {i32} | |
if cond_13776 | |
then {conc_tmp_13777} else {n_13773} | |
let {i64 binop_x_19102} = sext i32 padding_13774 to i64 | |
let {i64 binop_y_19103} = sext i32 n_13773 to i64 | |
let {i64 binop_x_19104} = mul64(binop_x_19102, binop_y_19103) | |
let {i64 bytes_19101} = mul64(4i64, binop_x_19104) | |
let {i64 binop_x_19107} = sext i32 m_13764 to i64 | |
let {i64 binop_x_19109} = mul64(binop_x_19102, binop_x_19107) | |
let {i64 bytes_19106} = mul64(4i64, binop_x_19109) | |
let {i32 mat_ixfn_19121} = | |
-- Branch returns: {i32} | |
if cond_13776 | |
then {n_13773} else {m_13765} | |
-- mat_mem_19122 aliases mat_mem_19100 | |
-- mat_13779 aliases mat_13766 | |
-- mat_13779 : [size_13778][n_13773]f32@@mat_mem_19122->{base: [size_13778, n_13773]; contiguous: True; LMADs: [{offset: 0i32; strides: [mat_ixfn_19121, 1i32]; rotates: [0i32, 0i32]; shape: [size_13778, n_13773]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {mem mat_mem_19122; | |
[size_13778][n_13773]f32 mat_13779} = | |
-- Branch returns: {[size_13778][n_13773]f32@?0->{base: [size_13778, n_13773]; | |
-- contiguous: True; | |
-- LMADs: [{offset: 0i32; | |
-- strides: [mat_ixfn_19121, 1i32]; | |
-- rotates: [0i32, 0i32]; | |
-- shape: [size_13778, n_13773]; | |
-- permutation: [0, 1]; | |
-- monotonicity: [Inc, Inc]}]}} | |
if cond_13776 | |
then { | |
let {bool bounds_invalid_upwards_13780} = slt32(padding_13774, 0i32) | |
let {bool valid_13781} = not bounds_invalid_upwards_13780 | |
let {cert range_valid_c_13782} = | |
assert(valid_13781, "Range ", 0i32, "..", 1i32, "..<", padding_13774, | |
" is invalid.", "/prelude/math.fut:453:23-30") | |
let {mem mem_19105} = | |
alloc(bytes_19101) | |
-- res_13783 : [padding_13774][n_13773]f32@@mem_19105->{base: [padding_13774, n_13773]; contiguous: True; LMADs: [{offset: 0i32; strides: [n_13773, 1i32]; rotates: [0i32, 0i32]; shape: [padding_13774, n_13773]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[padding_13774][n_13773]f32 res_13783} = replicate([padding_13774, | |
n_13773], 0.0f32) | |
let {cert range_valid_c_13784} = | |
assert(valid_13781, "Range ", 0i32, "..", 1i32, "..<", padding_13774, | |
" is invalid.", "/prelude/math.fut:453:23-30") | |
let {bool dim_match_13786} = eq_i32(n_13773, conc_tmp_13777) | |
let {cert empty_or_match_cert_13787} = | |
assert(dim_match_13786, "Value of (core language) shape (", | |
conc_tmp_13777, | |
") cannot match shape of type `[", n_13773, | |
"]a`.", "lud.fut:103:3-36") | |
let {mem mem_19110} = | |
alloc(bytes_19106) | |
-- res_repd_14195 : [m_13764][padding_13774]f32@@mem_19110->{base: [m_13764, padding_13774]; contiguous: True; LMADs: [{offset: 0i32; strides: [padding_13774, 1i32]; rotates: [0i32, 0i32]; shape: [m_13764, padding_13774]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[m_13764][padding_13774]f32 res_repd_14195} = replicate([m_13764, | |
padding_13774], | |
0.0f32) | |
let {i64 binop_y_19113} = | |
<range_valid_c_13784> | |
sext i32 conc_tmp_13777 to i64 | |
let {i64 binop_x_19114} = | |
<range_valid_c_13784> | |
mul64(binop_x_19107, binop_y_19113) | |
let {i64 bytes_19111} = | |
<range_valid_c_13784> | |
mul64(4i64, binop_x_19114) | |
let {mem mem_19115} = | |
<range_valid_c_13784> | |
alloc(bytes_19111) | |
-- res_r_14198 : [m_13764][conc_tmp_13777]f32@@mem_19115->{base: [m_13764, conc_tmp_13777]; contiguous: True; LMADs: [{offset: 0i32; strides: [conc_tmp_13777, 1i32]; rotates: [0i32, 0i32]; shape: [m_13764, conc_tmp_13777]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[m_13764][conc_tmp_13777]f32 res_r_14198} = | |
<range_valid_c_13784> | |
concat@1(mat_13769, res_repd_14195) | |
-- res_14199 aliases res_r_14198 | |
-- res_14199 : [m_13764][n_13773]f32@@mem_19115->{base: [m_13764, n_13773]; contiguous: True; LMADs: [{offset: 0i32; strides: [n_13773, 1i32]; rotates: [0i32, 0i32]; shape: [m_13764, n_13773]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[m_13764][n_13773]f32 res_14199} = | |
<empty_or_match_cert_13787> | |
reshape((m_13764, n_13773), res_r_14198) | |
let {i64 binop_x_19119} = | |
<range_valid_c_13782> | |
mul64(binop_y_19103, binop_y_19113) | |
let {i64 bytes_19116} = | |
<range_valid_c_13782> | |
mul64(4i64, binop_x_19119) | |
let {mem mem_19120} = | |
<range_valid_c_13782> | |
alloc(bytes_19116) | |
-- res_13792 : [conc_tmp_13777][n_13773]f32@@mem_19120->{base: [conc_tmp_13777, n_13773]; contiguous: True; LMADs: [{offset: 0i32; strides: [n_13773, 1i32]; rotates: [0i32, 0i32]; shape: [conc_tmp_13777, n_13773]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[conc_tmp_13777][n_13773]f32 res_13792} = | |
<range_valid_c_13782> | |
concat@0(res_14199, res_13783) | |
-- branch_ctx_reshaped_13793 aliases res_13792 | |
-- branch_ctx_reshaped_13793 : [size_13778][n_13773]f32@@mem_19120->{base: [size_13778, n_13773]; contiguous: True; LMADs: [{offset: 0i32; strides: [n_13773, 1i32]; rotates: [0i32, 0i32]; shape: [size_13778, n_13773]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[size_13778][n_13773]f32 branch_ctx_reshaped_13793} = | |
reshape((~size_13778, ~n_13773), res_13792) | |
in {mem_19120, branch_ctx_reshaped_13793} | |
} else { | |
let {bool dim_match_13794} = eq_i32(n_13773, m_13764) | |
let {bool match_13795} = logand(dim_match_13794, dim_match_13794) | |
let {cert empty_or_match_cert_13796} = | |
assert(match_13795, "Value of (core language) shape (", m_13764, ", ", | |
m_13764, ") cannot match shape of type `[", n_13773, | |
"][", n_13773, "]f32`.", "lud.fut:117:20-35") | |
-- branch_ctx_reshaped_13797 aliases mat_13766 | |
-- branch_ctx_reshaped_13797 : [size_13778][n_13773]f32@@mat_mem_19100->{base: [size_13778, n_13773]; contiguous: True; LMADs: [{offset: 0i32; strides: [m_13765, 1i32]; rotates: [0i32, 0i32]; shape: [size_13778, n_13773]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[size_13778][n_13773]f32 branch_ctx_reshaped_13797} = | |
<empty_or_match_cert_13768, empty_or_match_cert_13796> | |
reshape((~size_13778, ~n_13773), mat_13766) | |
in {mat_mem_19100, branch_ctx_reshaped_13797} | |
} | |
let {bool bounds_invalid_upwards_13798} = slt32(num_blocks_13772, 0i32) | |
let {bool valid_13799} = not bounds_invalid_upwards_13798 | |
let {cert range_valid_c_13800} = | |
assert(valid_13799, "Range ", 0i32, "..", 1i32, "..<", num_blocks_13772, | |
" is invalid.", "/prelude/math.fut:453:23-30") | |
let {cert range_valid_c_13802} = | |
assert(valid_13799, "Range ", 0i32, "..", 1i32, "..<", num_blocks_13772, | |
" is invalid.", "/prelude/math.fut:453:23-30") | |
let {i64 num_blocks_14664} = sext i32 num_blocks_13772 to i64 | |
let {i64 y_14670} = mul64(256i64, num_blocks_14664) | |
let {i64 nest_size_14671} = mul64(num_blocks_14664, y_14670) | |
let {i32 segmap_group_size_14672} = | |
get_size(segmap_group_size_14346, group_size) | |
let {i64 segmap_group_size_14673} = sext i32 segmap_group_size_14672 to i64 | |
let {i64 y_14674} = sub64(segmap_group_size_14673, 1i64) | |
let {i64 x_14675} = add64(nest_size_14671, y_14674) | |
let {i64 segmap_usable_groups_64_14677} = | |
squot64(x_14675, segmap_group_size_14673) | |
let {i32 segmap_usable_groups_14678} = | |
sext i64 segmap_usable_groups_64_14677 to i32 | |
let {i64 binop_x_19126} = | |
<range_valid_c_13802> | |
mul64(num_blocks_14664, num_blocks_14664) | |
let {i64 binop_x_19128} = | |
<range_valid_c_13802> | |
mul64(16i64, binop_x_19126) | |
let {i64 binop_x_19130} = | |
<range_valid_c_13802> | |
mul64(16i64, binop_x_19128) | |
let {i64 bytes_19123} = | |
<range_valid_c_13802> | |
mul64(4i64, binop_x_19130) | |
let {mem mem_19131} = | |
<range_valid_c_13802> | |
alloc(bytes_19123) | |
-- res_14679 : [num_blocks_13772][num_blocks_13772][16i32][16i32]f32@@mem_19131->{base: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (num_blocks_13772), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
let {[num_blocks_13772][num_blocks_13772][16i32][16i32]f32 res_14679} = | |
<range_valid_c_13802> | |
segmap_thread | |
(#groups=segmap_usable_groups_14678; groupsize=segmap_group_size_14672) | |
(gtid_14333 < num_blocks_13772, gtid_14334 < num_blocks_13772, | |
gtid_14335 < 16i32, gtid_14336 < 16i32) (~phys_tid_14337) : {f32} { | |
let {i32 index_primexp_18535} = mul32(16i32, gtid_14334) | |
let {i32 binop_y_18552} = mul32(16i32, gtid_14333) | |
let {i32 index_primexp_18553} = add32(gtid_14335, binop_y_18552) | |
let {i32 i_14683} = | |
<range_valid_c_13802, range_valid_c_13800> | |
add32(gtid_14336, index_primexp_18535) | |
let {f32 res_14684} = | |
<range_valid_c_13802, range_valid_c_13800> | |
mat_13779[index_primexp_18553, i_14683] | |
return {returns res_14684} | |
} | |
let {i32 x_13817} = sdiv32(n_13773, 16i32) | |
let {i32 upper_bound_13818} = sub32(x_13817, 1i32) | |
let {bool loop_nonempty_13819} = slt32(0i32, upper_bound_13818) | |
let {i32 res_13820} = | |
-- Branch returns: {i32} | |
if <fallback> loop_nonempty_13819 | |
then { | |
let {i32 x_13821} = opaque(1i32) | |
in {x_13821} | |
} else {0i32} | |
let {i32 x_13822} = mul32(16i32, res_13820) | |
let {bool assert_arg_13823} = eq_i32(x_13822, 16i32) | |
let {bool loop_not_taken_13824} = not loop_nonempty_13819 | |
let {bool protect_assert_disj_13825} = | |
logor(assert_arg_13823, loop_not_taken_13824) | |
let {cert dim_ok_13826} = | |
assert(protect_assert_disj_13825, | |
"new shape has different number of elements than old shape", | |
"/prelude/array.fut:95:3-33") | |
let {bool y_13827} = slt32(0i32, res_13820) | |
let {bool protect_assert_disj_13828} = logor(loop_not_taken_13824, y_13827) | |
let {cert index_certs_13829} = | |
assert(protect_assert_disj_13828, "Index [", 0i32, | |
"] out of bounds for array of shape [", | |
res_13820, "].", | |
"/prelude/array.fut:15:29-32") | |
let {i32 max_group_size_14841} = | |
get_size_max(group_size) | |
let {bool fits_14842} = sle32(16i32, max_group_size_14841) | |
let {bool suff_intra_par_14840} = | |
get_size(suff_intra_par_6, threshold (!suff_outer_par_5)) <= 16i32 | |
let {bool intra_suff_and_fits_14843} = | |
logand(suff_intra_par_14840, fits_14842) | |
let {i32 segmap_group_size_15829} = | |
get_size(segmap_group_size_15648, group_size) | |
let {i64 res_15435} = sext i32 res_13820 to i64 | |
let {i32 segmap_group_size_15439} = | |
get_size(segmap_group_size_15195, group_size) | |
let {i32 segmap_group_size_15472} = | |
get_size(segmap_group_size_15135, group_size) | |
let {i32 segmap_group_size_15512} = | |
get_size(segmap_group_size_15059, group_size) | |
let {i32 segmap_group_size_15542} = | |
get_size(segmap_group_size_15002, group_size) | |
let {i64 nest_size_15438} = mul64(16i64, res_15435) | |
let {i64 segmap_group_size_15440} = sext i32 segmap_group_size_15439 to i64 | |
let {i64 segmap_group_size_15473} = sext i32 segmap_group_size_15472 to i64 | |
let {i64 segmap_group_size_15513} = sext i32 segmap_group_size_15512 to i64 | |
let {i64 segmap_group_size_15543} = sext i32 segmap_group_size_15542 to i64 | |
let {i64 y_15441} = sub64(segmap_group_size_15440, 1i64) | |
let {i64 y_15474} = sub64(segmap_group_size_15473, 1i64) | |
let {i64 y_15514} = sub64(segmap_group_size_15513, 1i64) | |
let {i64 y_15544} = sub64(segmap_group_size_15543, 1i64) | |
let {i64 x_15442} = add64(nest_size_15438, y_15441) | |
let {i64 x_15475} = add64(nest_size_15438, y_15474) | |
let {i64 x_15515} = add64(nest_size_15438, y_15514) | |
let {i64 x_15545} = add64(nest_size_15438, y_15544) | |
let {bool cond_neg_18554} = not intra_suff_and_fits_14843 | |
let {bool protect_cond_conj_18564} = | |
logand(loop_nonempty_13819, cond_neg_18554) | |
let {i64 x_18314} = | |
-- Branch returns: {i64} | |
if <fallback> protect_cond_conj_18564 | |
then { | |
let {i64 x_18555} = squot64(x_15442, segmap_group_size_15440) | |
in {x_18555} | |
} else {0i64} | |
let {i64 x_18316} = | |
-- Branch returns: {i64} | |
if <fallback> protect_cond_conj_18564 | |
then { | |
let {i64 x_18557} = squot64(x_15475, segmap_group_size_15473) | |
in {x_18557} | |
} else {0i64} | |
let {i64 x_18318} = | |
-- Branch returns: {i64} | |
if <fallback> protect_cond_conj_18564 | |
then { | |
let {i64 x_18559} = squot64(x_15515, segmap_group_size_15513) | |
in {x_18559} | |
} else {0i64} | |
let {i64 x_18320} = | |
-- Branch returns: {i64} | |
if <fallback> protect_cond_conj_18564 | |
then { | |
let {i64 x_18561} = squot64(x_15545, segmap_group_size_15543) | |
in {x_18561} | |
} else {0i64} | |
let {i32 segmap_usable_groups_15445} = sext i64 x_18314 to i32 | |
let {i32 segmap_usable_groups_15478} = sext i64 x_18316 to i32 | |
let {i32 segmap_usable_groups_15518} = sext i64 x_18318 to i32 | |
let {i32 segmap_usable_groups_15548} = sext i64 x_18320 to i32 | |
let {i32 convop_x_19134} = mul32(256i32, res_13820) | |
let {i64 binop_x_19135} = sext i32 convop_x_19134 to i64 | |
let {i64 bytes_19133} = mul64(4i64, binop_x_19135) | |
let {i64 binop_x_19152} = mul64(16i64, nest_size_15438) | |
let {i64 bytes_19147} = mul64(4i64, binop_x_19152) | |
let {i32 convop_x_19164} = mul32(16i32, x_13822) | |
let {i64 binop_x_19165} = sext i32 convop_x_19164 to i64 | |
let {i64 bytes_19162} = mul64(4i64, binop_x_19165) | |
let {i64 bytes_19167} = mul64(4i64, nest_size_15438) | |
let {i32 segmap_group_size_15976} = | |
get_size(segmap_group_size_15932, group_size) | |
-- matb_13830 : [num_blocks_13772][num_blocks_13772][16i32][16i32]f32@@mem_19131->{base: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (num_blocks_13772), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
let {[num_blocks_13772][num_blocks_13772][16i32][16i32]f32 matb_13830} = | |
-- Consumes res_14679 | |
-- matb_13831 : *[num_blocks_13772][num_blocks_13772][16i32][16i32]f32@@mem_19131->{base: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (num_blocks_13772), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
loop {*[num_blocks_13772][num_blocks_13772][16i32][16i32]f32 matb_13831} = {res_14679} | |
for step_13832:i32 < upper_bound_13818 do { | |
let {bool y_13834} = slt32(step_13832, num_blocks_13772) | |
let {bool index_ok_13836} = logand(y_13834, y_13834) | |
let {cert index_certs_13837} = | |
assert(index_ok_13836, "Index [", step_13832, ", ", step_13832, | |
"] out of bounds for array of shape [", | |
num_blocks_13772, "][", num_blocks_13772, "].", | |
"lud.fut:141:33-47") | |
-- lud_diagonal_arg_13838 aliases matb_13831 | |
-- lud_diagonal_arg_13838 : [16i32][16i32]f32@@mem_19131->{base: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; contiguous: True; LMADs: [{offset: add32 (mul32 (step_13832) (mul32 (256i32) (num_blocks_13772))) (mul32 (step_13832) (256i32)); strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[16i32][16i32]f32 lud_diagonal_arg_13838} = | |
<index_certs_13837> | |
matb_13831[step_13832, step_13832, 0i32:+16i32*1i32, 0i32:+16i32*1i32] | |
-- res_13839 aliases lud_diagonal_arg_13838 | |
-- res_13839 : [res_13820][16i32][16i32]f32@@mem_19131->{base: [res_13820, 16i32, 16i32]; contiguous: True; LMADs: [{offset: add32 (mul32 (step_13832) (mul32 (256i32) (num_blocks_13772))) (mul32 (step_13832) (256i32)); strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_13820, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[res_13820][16i32][16i32]f32 res_13839} = | |
<dim_ok_13826> | |
reshape((res_13820, 16i32, 16i32), lud_diagonal_arg_13838) | |
-- res_13840 : [res_13820][16i32][16i32]f32@@res_mem_19183->{base: [res_13820, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_13820, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {mem res_mem_19183; | |
[res_13820][16i32][16i32]f32 res_13840} = | |
-- Branch returns: {[res_13820][16i32][16i32]f32@?0->{base: [res_13820, 16i32, | |
-- 16i32]; | |
-- contiguous: True; | |
-- LMADs: [{offset: 0i32; | |
-- strides: [256i32, 16i32, 1i32]; | |
-- rotates: [0i32, 0i32, 0i32]; | |
-- shape: [res_13820, 16i32, 16i32]; | |
-- permutation: [0, 1, 2]; | |
-- monotonicity: [Inc, Inc, Inc]}]}} | |
if intra_suff_and_fits_14843 | |
then { | |
let {mem mem_19136} = | |
alloc(bytes_19133) | |
-- res_coalesced_18570 : [res_13820][16i32][16i32]f32@@mem_19136->{base: [16i32, 16i32, res_13820]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (res_13820) (16i32), res_13820, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [16i32, 16i32, res_13820]; permutation: [2, 0, 1]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[res_13820][16i32][16i32]f32 res_coalesced_18570} = manifest((1, | |
2, | |
0), | |
res_13839) | |
let {mem mem_19153} = | |
alloc(bytes_19147) | |
-- res_14844 : [res_13820][16i32][16i32]f32@@mem_19153->{base: [res_13820, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_13820, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[res_13820][16i32][16i32]f32 res_14844} = | |
segmap_group | |
(#groups=res_13820; groupsize=16i32) | |
(gtid_14690 < res_13820) (~phys_tid_14733) : {[16i32][16i32]f32} { | |
-- x_14845 aliases res_coalesced_18570 | |
-- x_14845 : [16i32][16i32]f32@@mem_19136->{base: [16i32, 16i32, res_13820]; contiguous: False; LMADs: [{offset: gtid_14690; strides: [mul32 (res_13820) (16i32), res_13820]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[16i32][16i32]f32 x_14845} = res_coalesced_18570[gtid_14690, | |
0i32:+16i32*1i32, | |
0i32:+16i32*1i32] | |
let {mem@local mem_19141} = | |
alloc(1024i64, @local) | |
-- smaller_replicate_14846 : [16i32][16i32]f32@@mem_19141->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[16i32][16i32]f32 smaller_replicate_14846} = copy(x_14845) | |
let {mem@local mem_19145} = | |
alloc(64i64, @local) | |
-- res_14847 : [16i32][16i32]f32@@mem_19141->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[16i32][16i32]f32 res_14847} = | |
-- Consumes smaller_replicate_14846 | |
-- mat_14848 : *[16i32][16i32]f32@@mem_19141->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
loop {*[16i32][16i32]f32 mat_14848} = {smaller_replicate_14846} | |
for i_14849:i32 < 15i32 do { | |
-- res_14851 : [16i32]f32@@mem_19145->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]} | |
let {[16i32]f32 res_14851} = | |
segmap_thread | |
(#groups=res_13820; groupsize=16i32) | |
(gtid_14694 < 16i32) (~phys_tid_14695) : {f32} { | |
let {bool cond_14856} = slt32(i_14849, gtid_14694) | |
let {f32 res_14857} = | |
-- Branch returns: {f32} | |
if cond_14856 | |
then { | |
let {f32 x_14858} = mat_14848[gtid_14694, i_14849] | |
let {f32 res_14860} = | |
loop {f32 redout_18909} = {0.0f32} | |
for i_18910:i32 < i_14849 do { | |
let {f32 x_14864} = mat_14848[gtid_14694, i_18910] | |
let {f32 x_14865} = mat_14848[i_18910, i_14849] | |
let {f32 res_14866} = fmul32(x_14864, x_14865) | |
let {f32 res_14863} = | |
fadd32(res_14866, redout_18909) | |
in {res_14863} | |
} | |
let {f32 x_14867} = fsub32(x_14858, res_14860) | |
let {f32 y_14868} = mat_14848[i_14849, i_14849] | |
let {f32 res_14869} = fdiv32(x_14867, y_14868) | |
in {res_14869} | |
} else { | |
let {f32 res_14870} = mat_14848[gtid_14694, i_14849] | |
in {res_14870} | |
} | |
return {returns res_14857} | |
} | |
-- mat_14871 : [16i32][16i32]f32@@mem_19141->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[16i32][16i32]f32 mat_14871} = | |
-- Consumes mat_14848 | |
mat_14848 with [0i32:+16i32*1i32, i_14849] <- res_14851 | |
let {i32 j_14872} = add32(1i32, i_14849) | |
-- mat_14889 : [16i32][16i32]f32@@mem_19141->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[16i32][16i32]f32 mat_14889} = | |
-- Consumes mat_14871 | |
segmap_thread | |
(#groups=res_13820; groupsize=16i32) | |
(gtid_14716 < 16i32) (~phys_tid_14717) : {f32} { | |
let {bool cond_14875} = slt32(i_14849, gtid_14716) | |
let {f32 res_14876} = | |
-- Branch returns: {f32} | |
if cond_14875 | |
then { | |
let {f32 x_14877} = mat_14871[j_14872, gtid_14716] | |
let {f32 res_14880} = | |
loop {f32 redout_18911} = {0.0f32} | |
for i_18912:i32 < j_14872 do { | |
let {f32 x_14884} = mat_14871[i_18912, gtid_14716] | |
let {f32 x_14885} = mat_14871[j_14872, i_18912] | |
let {f32 res_14886} = fmul32(x_14884, x_14885) | |
let {f32 res_14883} = | |
fadd32(res_14886, redout_18911) | |
in {res_14883} | |
} | |
let {f32 res_14887} = fsub32(x_14877, res_14880) | |
in {res_14887} | |
} else { | |
let {f32 res_14888} = mat_14871[j_14872, gtid_14716] | |
in {res_14888} | |
} | |
return {mat_14871 with ([j_14872 < 16i32, | |
gtid_14716 < 16i32] <- res_14876)} | |
} | |
in {mat_14889} | |
} | |
return {returns res_14847} | |
} | |
in {mem_19153, res_14844} | |
} else { | |
let {mem mem_19160} = | |
alloc(bytes_19147) | |
-- smaller_replicate_r_15407 : [res_13820][16i32][16i32]f32@@mem_19160->{base: [res_13820, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_13820, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[res_13820][16i32][16i32]f32 smaller_replicate_r_15407} = | |
copy(res_13839) | |
let {mem mem_19166} = | |
alloc(bytes_19162) | |
let {mem mem_19171} = | |
alloc(bytes_19167) | |
let {mem mem_19176} = | |
alloc(bytes_19162) | |
let {mem mem_19181} = | |
alloc(bytes_19167) | |
-- res_15408 : [res_13820][16i32][16i32]f32@@mem_19160->{base: [res_13820, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_13820, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[res_13820][16i32][16i32]f32 res_15408} = | |
-- Consumes smaller_replicate_r_15407 | |
-- mat_expanded_15409 : *[res_13820][16i32][16i32]f32@@mem_19160->{base: [res_13820, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_13820, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
loop {*[res_13820][16i32][16i32]f32 mat_expanded_15409} = {smaller_replicate_r_15407} | |
for i_15410:i32 < 15i32 do { | |
-- mat_expanded_coalesced_18572 : [res_13820][16i32][16i32]f32@@mem_19166->{base: [16i32, res_13820, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (16i32) (res_13820), 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [16i32, res_13820, 16i32]; permutation: [1, 2, 0]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[res_13820][16i32][16i32]f32 mat_expanded_coalesced_18572} = | |
manifest((2, 0, 1), mat_expanded_15409) | |
-- res_r_15446 : [res_13820][16i32]f32@@mem_19171->{base: [res_13820, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [res_13820, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[res_13820][16i32]f32 res_r_15446} = | |
segmap_thread | |
(#groups=segmap_usable_groups_15445; groupsize=segmap_group_size_15439) | |
(gtid_15188 < res_13820, | |
gtid_15189 < 16i32) (~phys_tid_15190) : {f32} { | |
let {bool cond_15452} = slt32(i_15410, gtid_15189) | |
let {f32 res_15453} = | |
-- Branch returns: {f32} | |
if cond_15452 | |
then { | |
let {f32 x_15454} = mat_expanded_15409[gtid_15188, | |
gtid_15189, | |
i_15410] | |
let {f32 res_15456} = | |
loop {f32 redout_18913} = {0.0f32} | |
for i_18914:i32 < i_15410 do { | |
let {f32 x_15460} = | |
mat_expanded_coalesced_18572[gtid_15188, gtid_15189, | |
i_18914] | |
let {f32 x_15461} = mat_expanded_15409[gtid_15188, | |
i_18914, | |
i_15410] | |
let {f32 res_15462} = fmul32(x_15460, x_15461) | |
let {f32 res_15459} = fadd32(res_15462, redout_18913) | |
in {res_15459} | |
} | |
let {f32 x_15463} = fsub32(x_15454, res_15456) | |
let {f32 y_15464} = mat_expanded_15409[gtid_15188, | |
i_15410, i_15410] | |
let {f32 res_15465} = fdiv32(x_15463, y_15464) | |
in {res_15465} | |
} else { | |
let {f32 res_15466} = mat_expanded_15409[gtid_15188, | |
gtid_15189, | |
i_15410] | |
in {res_15466} | |
} | |
return {returns res_15453} | |
} | |
-- mat_r_15479 : [res_13820][16i32][16i32]f32@@mem_19160->{base: [res_13820, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_13820, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[res_13820][16i32][16i32]f32 mat_r_15479} = | |
-- Consumes mat_expanded_15409 | |
segmap_thread | |
(#groups=segmap_usable_groups_15478; groupsize=segmap_group_size_15472) | |
(gtid_15127 < res_13820, | |
gtid_slice_15128 < 16i32) (~phys_tid_15130) : {f32} { | |
let {f32 v_15482} = res_r_15446[gtid_15127, gtid_slice_15128] | |
return {mat_expanded_15409 with ([gtid_15127 < res_13820, | |
gtid_slice_15128 < 16i32, | |
i_15410 < 16i32] <- v_15482)} | |
} | |
let {i32 j_15493} = add32(1i32, i_15410) | |
-- mat_r_coalesced_18575 : [res_13820][16i32][16i32]f32@@mem_19176->{base: [16i32, res_13820, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (16i32) (res_13820), 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [16i32, res_13820, 16i32]; permutation: [1, 2, 0]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[res_13820][16i32][16i32]f32 mat_r_coalesced_18575} = | |
manifest((2, 0, 1), mat_r_15479) | |
-- res_r_15519 : [res_13820][16i32]f32@@mem_19181->{base: [res_13820, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [res_13820, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[res_13820][16i32]f32 res_r_15519} = | |
segmap_thread | |
(#groups=segmap_usable_groups_15518; groupsize=segmap_group_size_15512) | |
(gtid_15052 < res_13820, | |
gtid_15053 < 16i32) (~phys_tid_15054) : {f32} { | |
let {bool cond_15523} = slt32(i_15410, gtid_15053) | |
let {f32 res_15524} = | |
-- Branch returns: {f32} | |
if cond_15523 | |
then { | |
let {f32 x_15525} = mat_r_15479[gtid_15052, j_15493, | |
gtid_15053] | |
let {f32 res_15528} = | |
loop {f32 redout_18915} = {0.0f32} | |
for i_18916:i32 < j_15493 do { | |
let {f32 x_15532} = mat_r_15479[gtid_15052, i_18916, | |
gtid_15053] | |
let {f32 x_15533} = mat_r_coalesced_18575[gtid_15052, | |
j_15493, | |
i_18916] | |
let {f32 res_15534} = fmul32(x_15532, x_15533) | |
let {f32 res_15531} = fadd32(res_15534, redout_18915) | |
in {res_15531} | |
} | |
let {f32 res_15535} = fsub32(x_15525, res_15528) | |
in {res_15535} | |
} else { | |
let {f32 res_15536} = mat_r_15479[gtid_15052, j_15493, | |
gtid_15053] | |
in {res_15536} | |
} | |
return {returns res_15524} | |
} | |
-- res_15549 : [res_13820][16i32][16i32]f32@@mem_19160->{base: [res_13820, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_13820, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[res_13820][16i32][16i32]f32 res_15549} = | |
-- Consumes mat_r_15479 | |
segmap_thread | |
(#groups=segmap_usable_groups_15548; groupsize=segmap_group_size_15542) | |
(gtid_14994 < res_13820, | |
gtid_slice_14995 < 16i32) (~phys_tid_14997) : {f32} { | |
let {f32 v_15553} = res_r_15519[gtid_14994, gtid_slice_14995] | |
return {mat_r_15479 with ([gtid_14994 < res_13820, | |
j_15493 < 16i32, | |
gtid_slice_14995 < 16i32] <- v_15553)} | |
} | |
in {res_15549} | |
} | |
in {mem_19160, res_15408} | |
} | |
-- res_13927 aliases res_13840 | |
-- res_13927 : [16i32][16i32]f32@@res_mem_19183->{base: [res_13820, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[16i32][16i32]f32 res_13927} = | |
<index_certs_13829> | |
res_13840[0i32, 0i32:+16i32*1i32, 0i32:+16i32*1i32] | |
-- Bounds check for the access reported as "[step, i:num_blocks]" in the assertion below. | |
-- i = step + 1 is the slice start; j_m_i = num_blocks - i is the slice length. | |
let {i32 i_13928} = add32(1i32, step_13832) | |
let {i32 j_m_i_13929} = sub32(num_blocks_13772, i_13928) | |
-- A zero-length slice is accepted regardless of the range checks below. | |
let {bool empty_slice_13930} = eq_i32(j_m_i_13929, 0i32) | |
-- i + (j_m_i - 1): index of the last element covered by the slice. | |
let {i32 m_13931} = sub32(j_m_i_13929, 1i32) | |
let {i32 i_p_m_t_s_13932} = add32(i_13928, m_13931) | |
-- Range checks: 0 <= last index < num_blocks, and slice start <= num_blocks. | |
let {bool zero_leq_i_p_m_t_s_13933} = sle32(0i32, i_p_m_t_s_13932) | |
let {bool i_p_m_t_s_leq_w_13934} = | |
slt32(i_p_m_t_s_13932, num_blocks_13772) | |
let {bool i_lte_j_13936} = sle32(i_13928, num_blocks_13772) | |
let {bool y_13938} = | |
logand(zero_leq_i_p_m_t_s_13933, i_p_m_t_s_leq_w_13934) | |
let {bool y_13939} = logand(i_lte_j_13936, y_13938) | |
let {bool ok_or_empty_13941} = logor(empty_slice_13930, y_13939) | |
-- y_13834 is defined outside this view; presumably the check for the fixed index step_13832 -- TODO confirm. | |
let {bool index_ok_13942} = logand(y_13834, ok_or_empty_13941) | |
-- The resulting certificate is attached to the matb_13831 reads that depend on this check. | |
let {cert index_certs_13943} = | |
assert(index_ok_13942, "Index [", step_13832, ", ", i_13928, ":", | |
num_blocks_13772, | |
"] out of bounds for array of shape [", | |
num_blocks_13772, "][", num_blocks_13772, "].", | |
"lud.fut:146:25-52") | |
-- Sizes for the kernel that produces res_15832 (a [j_m_i][16][16]f32 array stored in mem_19191). | |
-- nest_size = 16 * j_m_i, computed in i64; it is passed to calc_num_groups together with | |
-- segmap_num_groups_15650 and segmap_group_size_15829 (both defined outside this view). | |
let {i64 j_m_i_15825} = sext i32 j_m_i_13929 to i64 | |
let {i64 nest_size_15828} = mul64(16i64, j_m_i_15825) | |
let {i32 num_groups_15830} = | |
calc_num_groups(nest_size_15828, segmap_num_groups_15650, | |
segmap_group_size_15829) | |
-- Element count 16 * 16 * j_m_i, then 4 bytes per element for the allocation size. | |
-- NOTE(review): the element count is multiplied in i32 and only then sign-extended to i64. | |
let {i32 binop_x_19188} = mul32(16i32, j_m_i_13929) | |
let {i32 convop_x_19189} = mul32(16i32, binop_x_19188) | |
let {i64 binop_x_19190} = sext i32 convop_x_19189 to i64 | |
let {i64 bytes_19187} = mul64(4i64, binop_x_19190) | |
let {mem mem_19191} = | |
alloc(bytes_19187) | |
-- Kernel over (gtid_15641 < j_m_i, gtid_15642 < 16): each thread returns one 16-element row. | |
-- For i = 0..15 the row is filled sequentially as | |
--   row[i] = matb[step, i_13928 + gtid_15641, i, gtid_15642] - sum_{k < i} res_13840[0, i, k] * row[k] | |
-- so every entry depends on the entries already written to the same row (no division is performed). | |
-- res_15832 : [j_m_i_13929][16i32][16i32]f32@@mem_19191->{base: [16i32, j_m_i_13929, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (16i32) (j_m_i_13929), 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [16i32, j_m_i_13929, 16i32]; permutation: [1, 2, 0]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][16i32][16i32]f32 res_15832} = | |
segmap_thread | |
(#groups=num_groups_15830; groupsize=segmap_group_size_15829; virtualise) | |
(gtid_15641 < j_m_i_13929, | |
gtid_15642 < 16i32) (~phys_tid_15643) : {[16i32]f32} { | |
-- 64 bytes = 16 f32 values: backing memory for the row accumulator. | |
let {mem@[16i32]f32 mem_19186} = | |
alloc(64i64, @[16i32]f32) | |
-- smaller_replicate_18330 : [16i32]f32@@mem_19186->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]} | |
let {[16i32]f32 smaller_replicate_18330} = replicate([16i32], 0.0f32) | |
-- Block index along the second dimension of matb_13831: slice start i_13928 plus this thread's outer id. | |
let {i32 j_p_i_t_s_18325} = add32(i_13928, gtid_15641) | |
-- res_15835 : [16i32]f32@@mem_19186->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]} | |
let {[16i32]f32 res_15835} = | |
-- Consumes smaller_replicate_18330 | |
-- row_15836 : *[16i32]f32@@mem_19186->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]} | |
loop {*[16i32]f32 row_15836} = {smaller_replicate_18330} | |
for i_15837:i32 < 16i32 do { | |
-- Dot product of res_13840[0, i, 0..i-1] with the row entries computed so far. | |
let {f32 sum_15838} = | |
loop {f32 sum_15839} = {0.0f32} | |
for k_15840:i32 < i_15837 do { | |
let {f32 x_15841} = | |
<index_certs_13829> | |
res_13840[0i32, i_15837, k_15840] | |
let {f32 y_15842} = row_15836[k_15840] | |
let {f32 y_15843} = fmul32(x_15841, y_15842) | |
let {f32 loopres_15844} = fadd32(sum_15839, y_15843) | |
in {loopres_15844} | |
} | |
-- Read guarded by index_certs_13943, the bounds check for [step, i:num_blocks]. | |
let {f32 x_15845} = | |
<index_certs_13943> | |
matb_13831[step_13832, j_p_i_t_s_18325, i_15837, gtid_15642] | |
let {f32 lw_val_15846} = fsub32(x_15845, sum_15838) | |
-- row_15847 : [16i32]f32@@mem_19186->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]} | |
let {[16i32]f32 row_15847} = | |
-- Consumes row_15836 | |
row_15836 with [i_15837] <- lw_val_15846 | |
in {row_15847} | |
} | |
return {returns res_15835} | |
} | |
-- Reorder the dimensions of res_15832 with permutation (0, 2, 1), i.e. swap the two inner 16-wide dimensions. | |
-- Per the annotations below, the result aliases res_15832 and stays in mem_19191. | |
-- res_transformed_13974 aliases res_15832 | |
-- res_transformed_13974 : [j_m_i_13929][16i32][16i32]f32@@mem_19191->{base: [16i32, j_m_i_13929, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (16i32) (j_m_i_13929), 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [16i32, j_m_i_13929, 16i32]; permutation: [1, 0, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][16i32][16i32]f32 res_transformed_13974} = rearrange((0, | |
2, | |
1), | |
res_15832) | |
-- Certificate for the access reported as "[i:num_blocks, step]"; it reuses the condition | |
-- index_ok_13942 and differs from index_certs_13943 only in message and source location. | |
let {cert index_certs_13975} = | |
assert(index_ok_13942, "Index [", i_13928, ":", num_blocks_13772, ", ", | |
step_13832, | |
"] out of bounds for array of shape [", | |
num_blocks_13772, "][", num_blocks_13772, "].", | |
"lud.fut:153:25-52") | |
-- Both operands of the logand are the same value, so this equals ok_or_empty_13941: | |
-- the identical slice i:num_blocks is applied to both dimensions. | |
let {bool index_ok_13977} = logand(ok_or_empty_13941, ok_or_empty_13941) | |
-- Certificate for the access reported as "[i:num_blocks, i:num_blocks]". | |
let {cert index_certs_13978} = | |
assert(index_ok_13977, "Index [", i_13928, ":", num_blocks_13772, ", ", | |
i_13928, ":", num_blocks_13772, | |
"] out of bounds for array of shape [", | |
num_blocks_13772, "][", num_blocks_13772, "].", | |
"lud.fut:159:27-67") | |
-- Version-selection predicates and buffer sizes consumed by the branches that follow. | |
-- Outer-parallel version is chosen when the threshold suff_outer_par_13 is <= j_m_i. | |
let {bool suff_outer_par_15850} = | |
get_size(suff_outer_par_13, threshold ()) <= j_m_i_13929 | |
-- intra_avail_par = min(256, 4096 * j_m_i, 16, convop_x_19189), where convop_x_19189 = 16 * 16 * j_m_i. | |
let {i32 one_intra_par_min_15913} = mul32(4096i32, j_m_i_13929) | |
let {i32 y_15922} = smin32(16i32, convop_x_19189) | |
let {i32 y_15923} = smin32(one_intra_par_min_15913, y_15922) | |
let {i32 intra_avail_par_15924} = smin32(256i32, y_15923) | |
-- computed_group_size = max(256, 4096 * j_m_i, 16, convop_x_19189). | |
let {i32 y_15925} = smax32(16i32, convop_x_19189) | |
let {i32 y_15926} = smax32(one_intra_par_min_15913, y_15925) | |
let {i32 computed_group_size_15853} = smax32(256i32, y_15926) | |
-- The intra-group version is only eligible if that group size does not exceed | |
-- max_group_size_14841 (defined outside this view) and the threshold suff_intra_par_14 is met. | |
let {bool fits_16021} = | |
sle32(computed_group_size_15853, max_group_size_14841) | |
let {bool suff_intra_par_16019} = | |
get_size(suff_intra_par_14, | |
threshold (!suff_outer_par_13)) <= intra_avail_par_15924 | |
let {bool intra_suff_and_fits_16022} = | |
logand(suff_intra_par_16019, fits_16021) | |
-- bytes_19223 = 4 * (j_m_i * 16 * 16 * j_m_i), element count computed in i32 before widening. | |
let {i32 convop_x_19226} = mul32(j_m_i_13929, convop_x_19189) | |
let {i64 binop_x_19227} = sext i32 convop_x_19226 to i64 | |
let {i64 bytes_19223} = mul64(4i64, binop_x_19227) | |
-- bytes_19229 = 4 * (256 * j_m_i), element count computed in i32 before widening. | |
let {i32 convop_x_19230} = mul32(256i32, j_m_i_13929) | |
let {i64 binop_x_19231} = sext i32 convop_x_19230 to i64 | |
let {i64 bytes_19229} = mul64(4i64, binop_x_19231) | |
-- bytes_19490 = 4 * 16 * 16 * j_m_i * j_m_i, computed entirely in i64. | |
let {i64 binop_x_19493} = mul64(j_m_i_15825, j_m_i_15825) | |
let {i64 binop_x_19495} = mul64(16i64, binop_x_19493) | |
let {i64 binop_x_19497} = mul64(16i64, binop_x_19495) | |
let {i64 bytes_19490} = mul64(4i64, binop_x_19497) | |
-- bytes_19500 = 4 * 16 * nest_size_15828 = 4 * 16 * 16 * j_m_i, computed in i64. | |
let {i64 binop_x_19505} = mul64(16i64, nest_size_15828) | |
let {i64 bytes_19500} = mul64(4i64, binop_x_19505) | |
-- Launch configuration for the outer-parallel kernel, and total_size = bytes_19500 * num_threads, | |
-- i.e. one bytes_19500-sized region per launched thread (used to allocate mem_19207 in the branch below). | |
let {i32 num_groups_15977} = | |
calc_num_groups(j_m_i_15825, segmap_num_groups_15934, | |
segmap_group_size_15976) | |
let {i32 num_threads_19639} = | |
mul32(segmap_group_size_15976, num_groups_15977) | |
let {i64 num_threads64_19640} = sext i32 num_threads_19639 to i64 | |
let {i64 total_size_19641} = mul64(bytes_19500, num_threads64_19640) | |
-- res_13980 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@res_mem_19508->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
-- res_13981 : [j_m_i_13929][16i32][16i32]f32@@res_mem_19509->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {mem res_mem_19508, mem res_mem_19509; | |
[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_13980, | |
[j_m_i_13929][16i32][16i32]f32 res_13981} = | |
-- Branch returns: {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32@?0->{base: [j_m_i_13929, | |
-- j_m_i_13929, | |
-- 16i32, | |
-- 16i32]; | |
-- contiguous: True; | |
-- LMADs: [{offset: 0i32; | |
-- strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; | |
-- rotates: [0i32, 0i32, 0i32, 0i32]; | |
-- shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; | |
-- permutation: [0, 1, 2, 3]; | |
-- monotonicity: [Inc, Inc, Inc, Inc]}]}, | |
-- [j_m_i_13929][16i32][16i32]f32@?1->{base: [j_m_i_13929, 16i32, | |
-- 16i32]; | |
-- contiguous: True; | |
-- LMADs: [{offset: 0i32; | |
-- strides: [256i32, 16i32, 1i32]; | |
-- rotates: [0i32, 0i32, 0i32]; | |
-- shape: [j_m_i_13929, 16i32, 16i32]; | |
-- permutation: [0, 1, 2]; | |
-- monotonicity: [Inc, Inc, Inc]}]}} | |
if suff_outer_par_15850 | |
then { | |
let {mem mem_19228} = | |
alloc(bytes_19223) | |
let {mem mem_19232} = | |
alloc(bytes_19229) | |
let {mem mem_19207} = | |
alloc(total_size_19641) | |
-- res_15979 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@mem_19228->{base: [j_m_i_13929, 16i32, 16i32, j_m_i_13929]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (mul32 (j_m_i_13929) (16i32)) (16i32), mul32 (j_m_i_13929) (16i32), j_m_i_13929, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32, j_m_i_13929]; permutation: [3, 0, 1, 2]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
-- res_15980 : [j_m_i_13929][16i32][16i32]f32@@mem_19232->{base: [16i32, 16i32, j_m_i_13929]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (j_m_i_13929) (16i32), j_m_i_13929, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [16i32, 16i32, j_m_i_13929]; permutation: [2, 0, 1]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_15979, | |
[j_m_i_13929][16i32][16i32]f32 res_15980} = | |
segmap_thread | |
(#groups=num_groups_15977; groupsize=segmap_group_size_15976; virtualise) | |
(gtid_15928 < j_m_i_13929) (~phys_tid_15929) : {[j_m_i_13929][16i32][16i32]f32, | |
[16i32][16i32]f32} { | |
let {i32 j_p_i_t_s_18340} = add32(i_13928, gtid_15928) | |
let {mem@[16i3216i32]f32 mem_19194} = | |
alloc(1024i64, @[16i3216i32]f32) | |
-- result_18917 : [16i32][16i32]f32@@mem_19194->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[16i32][16i32]f32 result_18917} = scratch(f32, 16i32, 16i32) | |
let {mem@[16i32]f32 mem_19197} = | |
alloc(64i64, @[16i32]f32) | |
-- res_15985 : [16i32]f32@@mem_19197->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]} | |
let {[16i32]f32 res_15985} = replicate([16i32], 0.0f32) | |
-- res_15983 : [16i32][16i32]f32@@mem_19194->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[16i32][16i32]f32 res_15983} = | |
-- Consumes result_18917 | |
-- mapout_18918 : *[16i32][16i32]f32@@mem_19194->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
loop {*[16i32][16i32]f32 mapout_18918} = {result_18917} | |
for i_18919:i32 < 16i32 do { | |
-- modified_source_19090 : [16i32][16i32]f32@@mem_19194->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[16i32][16i32]f32 modified_source_19090} = | |
-- Consumes mapout_18918 | |
mapout_18918 with [i_18919, 0i32:+16i32*1i32] <- res_15985 | |
-- lw_dest_18920 : [16i32][16i32]f32@@mem_19194->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[16i32][16i32]f32 lw_dest_18920} = | |
-- Consumes modified_source_19090 | |
-- lowered_array_19089 : *[16i32][16i32]f32@@mem_19194->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
loop {*[16i32][16i32]f32 lowered_array_19089} = {modified_source_19090} | |
for j_15988:i32 < 16i32 do { | |
let {f32 sum_15989} = | |
loop {f32 sum_15990} = {0.0f32} | |
for k_15991:i32 < j_15988 do { | |
let {f32 x_15992} = | |
<index_certs_13829> | |
res_13840[0i32, k_15991, j_15988] | |
let {f32 y_15993} = lowered_array_19089[i_18919, | |
k_15991] | |
let {f32 y_15994} = fmul32(x_15992, y_15993) | |
let {f32 loopres_15995} = fadd32(sum_15990, y_15994) | |
in {loopres_15995} | |
} | |
let {f32 x_15996} = | |
<index_certs_13975> | |
matb_13831[j_p_i_t_s_18340, step_13832, i_18919, | |
j_15988] | |
let {f32 x_15997} = fsub32(x_15996, sum_15989) | |
let {f32 y_15998} = | |
<index_certs_13829> | |
res_13840[0i32, j_15988, j_15988] | |
let {f32 lw_val_15999} = fdiv32(x_15997, y_15998) | |
-- lowered_array_updated_19094 : [16i32][16i32]f32@@mem_19194->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[16i32][16i32]f32 lowered_array_updated_19094} = | |
-- Consumes lowered_array_19089 | |
lowered_array_19089 with [i_18919, | |
j_15988] <- lw_val_15999 | |
in {lowered_array_updated_19094} | |
} | |
in {lw_dest_18920} | |
} | |
-- result_18921 : [j_m_i_13929][16i32][16i32]f32@@mem_19207->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: False; LMADs: [{offset: phys_tid_15929; strides: [mul32 (mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32)) (16i32), mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32), mul32 (num_groups_15977) (segmap_group_size_15976)]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][16i32][16i32]f32 result_18921} = scratch(f32, | |
j_m_i_13929, | |
16i32, | |
16i32) | |
-- res_16001 : [j_m_i_13929][16i32][16i32]f32@@mem_19207->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: False; LMADs: [{offset: phys_tid_15929; strides: [mul32 (mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32)) (16i32), mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32), mul32 (num_groups_15977) (segmap_group_size_15976)]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][16i32][16i32]f32 res_16001} = | |
-- Consumes result_18921 | |
-- mapout_18922 : *[j_m_i_13929][16i32][16i32]f32@@mem_19207->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: False; LMADs: [{offset: phys_tid_15929; strides: [mul32 (mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32)) (16i32), mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32), mul32 (num_groups_15977) (segmap_group_size_15976)]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
loop {*[j_m_i_13929][16i32][16i32]f32 mapout_18922} = {result_18921} | |
for i_18923:i32 < j_m_i_13929 do { | |
let {i32 j_p_i_t_s_19000} = add32(i_13928, i_18923) | |
-- lw_dest_18924 : [j_m_i_13929][16i32][16i32]f32@@mem_19207->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: False; LMADs: [{offset: phys_tid_15929; strides: [mul32 (mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32)) (16i32), mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32), mul32 (num_groups_15977) (segmap_group_size_15976)]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][16i32][16i32]f32 lw_dest_18924} = | |
-- Consumes mapout_18922 | |
-- lowered_array_19078 : *[j_m_i_13929][16i32][16i32]f32@@mem_19207->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: False; LMADs: [{offset: phys_tid_15929; strides: [mul32 (mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32)) (16i32), mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32), mul32 (num_groups_15977) (segmap_group_size_15976)]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
loop {*[j_m_i_13929][16i32][16i32]f32 lowered_array_19078} = {mapout_18922} | |
for i_18927:i32 < 16i32 do { | |
-- lowered_array_updated_19082 : [j_m_i_13929][16i32][16i32]f32@@mem_19207->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: False; LMADs: [{offset: phys_tid_15929; strides: [mul32 (mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32)) (16i32), mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32), mul32 (num_groups_15977) (segmap_group_size_15976)]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][16i32][16i32]f32 lowered_array_updated_19082} = | |
-- Consumes lowered_array_19078 | |
-- lowered_array_19083 : *[j_m_i_13929][16i32][16i32]f32@@mem_19207->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: False; LMADs: [{offset: phys_tid_15929; strides: [mul32 (mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32)) (16i32), mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32), mul32 (num_groups_15977) (segmap_group_size_15976)]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
loop {*[j_m_i_13929][16i32][16i32]f32 lowered_array_19083} = {lowered_array_19078} | |
for i_18931:i32 < 16i32 do { | |
let {f32 x_16008} = | |
<index_certs_13978> | |
matb_13831[j_p_i_t_s_18340, j_p_i_t_s_19000, | |
i_18927, i_18931] | |
let {f32 res_16010} = | |
loop {f32 redout_18933} = {0.0f32} | |
for i_18934:i32 < 16i32 do { | |
let {f32 x_16014} = res_15983[i_18927, i_18934] | |
let {f32 x_16015} = res_15832[i_18923, i_18931, | |
i_18934] | |
let {f32 res_16016} = fmul32(x_16014, x_16015) | |
let {f32 res_16013} = | |
fadd32(res_16016, redout_18933) | |
in {res_16013} | |
} | |
let {f32 res_16017} = fsub32(x_16008, res_16010) | |
-- lowered_array_updated_19088 : [j_m_i_13929][16i32][16i32]f32@@mem_19207->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: False; LMADs: [{offset: phys_tid_15929; strides: [mul32 (mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32)) (16i32), mul32 (mul32 (num_groups_15977) (segmap_group_size_15976)) (16i32), mul32 (num_groups_15977) (segmap_group_size_15976)]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][16i32][16i32]f32 lowered_array_updated_19088} = | |
-- Consumes lowered_array_19083 | |
lowered_array_19083 with [i_18923, i_18927, | |
i_18931] <- res_16017 | |
in {lowered_array_updated_19088} | |
} | |
in {lowered_array_updated_19082} | |
} | |
in {lw_dest_18924} | |
} | |
return {returns res_16001, returns res_15983} | |
} | |
let {mem mem_19498} = | |
alloc(bytes_19490) | |
-- res_linear_19499 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@mem_19498->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_linear_19499} = | |
copy(res_15979) | |
let {mem mem_19506} = | |
alloc(bytes_19500) | |
-- res_linear_19507 : [j_m_i_13929][16i32][16i32]f32@@mem_19506->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][16i32][16i32]f32 res_linear_19507} = | |
copy(res_15980) | |
in {mem_19498, mem_19506, res_linear_19499, res_linear_19507} | |
} else { | |
-- res_17218 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@res_mem_19488->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
-- res_17219 : [j_m_i_13929][16i32][16i32]f32@@res_mem_19489->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {mem res_mem_19488, mem res_mem_19489; | |
[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_17218, | |
[j_m_i_13929][16i32][16i32]f32 res_17219} = | |
-- Branch returns: {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32@?0->{base: [j_m_i_13929, | |
-- j_m_i_13929, | |
-- 16i32, | |
-- 16i32]; | |
-- contiguous: True; | |
-- LMADs: [{offset: 0i32; | |
-- strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; | |
-- rotates: [0i32, 0i32, 0i32, 0i32]; | |
-- shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; | |
-- permutation: [0, 1, 2, 3]; | |
-- monotonicity: [Inc, Inc, Inc, Inc]}]}, | |
-- [j_m_i_13929][16i32][16i32]f32@?1->{base: [j_m_i_13929, 16i32, | |
-- 16i32]; | |
-- contiguous: True; | |
-- LMADs: [{offset: 0i32; | |
-- strides: [256i32, 16i32, 1i32]; | |
-- rotates: [0i32, 0i32, 0i32]; | |
-- shape: [j_m_i_13929, 16i32, 16i32]; | |
-- permutation: [0, 1, 2]; | |
-- monotonicity: [Inc, Inc, Inc]}]}} | |
if intra_suff_and_fits_16022 | |
then { | |
let {mem mem_19264} = | |
alloc(bytes_19490) | |
let {mem mem_19271} = | |
alloc(bytes_19500) | |
-- res_16023 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@mem_19264->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
-- res_16024 : [j_m_i_13929][16i32][16i32]f32@@mem_19271->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_16023, | |
[j_m_i_13929][16i32][16i32]f32 res_16024} = | |
segmap_group | |
(#groups=j_m_i_13929; groupsize=computed_group_size_15853) | |
(gtid_15851 < j_m_i_13929) (~phys_tid_15927) : {[j_m_i_13929][16i32][16i32]f32, | |
[16i32][16i32]f32} { | |
let {i32 j_p_i_t_s_18352} = add32(i_13928, gtid_15851) | |
let {mem@local mem_19240} = | |
alloc(1024i64, @local) | |
-- res_16029 : [16i32][16i32]f32@@mem_19240->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[16i32][16i32]f32 res_16029} = | |
segmap_thread | |
(#groups=j_m_i_13929; groupsize=computed_group_size_15853) | |
(gtid_15856 < 16i32) (~phys_tid_15857) : {[16i32]f32} { | |
let {mem@[16i32]f32 mem_19235} = | |
alloc(64i64, @[16i32]f32) | |
-- smaller_replicate_18357 : [16i32]f32@@mem_19235->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]} | |
let {[16i32]f32 smaller_replicate_18357} = | |
replicate([16i32], 0.0f32) | |
-- res_16032 : [16i32]f32@@mem_19235->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]} | |
let {[16i32]f32 res_16032} = | |
-- Consumes smaller_replicate_18357 | |
-- row_16033 : *[16i32]f32@@mem_19235->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]} | |
loop {*[16i32]f32 row_16033} = {smaller_replicate_18357} | |
for j_16034:i32 < 16i32 do { | |
let {f32 sum_16035} = | |
loop {f32 sum_16036} = {0.0f32} | |
for k_16037:i32 < j_16034 do { | |
let {f32 x_16038} = | |
<index_certs_13829> | |
res_13840[0i32, k_16037, j_16034] | |
let {f32 y_16039} = row_16033[k_16037] | |
let {f32 y_16040} = fmul32(x_16038, y_16039) | |
let {f32 loopres_16041} = | |
fadd32(sum_16036, y_16040) | |
in {loopres_16041} | |
} | |
let {f32 x_16042} = | |
<index_certs_13975> | |
matb_13831[j_p_i_t_s_18352, step_13832, gtid_15856, | |
j_16034] | |
let {f32 x_16043} = fsub32(x_16042, sum_16035) | |
let {f32 y_16044} = | |
<index_certs_13829> | |
res_13840[0i32, j_16034, j_16034] | |
let {f32 lw_val_16045} = fdiv32(x_16043, y_16044) | |
-- row_16046 : [16i32]f32@@mem_19235->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]} | |
let {[16i32]f32 row_16046} = | |
-- Consumes row_16033 | |
row_16033 with [j_16034] <- lw_val_16045 | |
in {row_16046} | |
} | |
return {returns res_16032} | |
} | |
let {mem@local mem_19248} = | |
alloc(bytes_19500, @local) | |
-- res_r_r_r_16054 : [j_m_i_13929][16i32][16i32]f32@@mem_19248->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][16i32][16i32]f32 res_r_r_r_16054} = | |
segred_thread | |
(#groups=j_m_i_13929; groupsize=computed_group_size_15853) | |
({{0.0f32}, | |
[], | |
commutative fn {f32} (f32 x_16055, f32 x_16056) => | |
let {f32 res_16057} = fadd32(x_16055, x_16056) | |
in {res_16057}}) | |
(gtid_15889 < j_m_i_13929, gtid_15890 < 16i32, | |
gtid_15891 < 16i32, | |
gtid_15899 < 16i32) (~phys_tid_15900) : {f32} { | |
let {f32 x_16061} = res_16029[gtid_15890, gtid_15899] | |
let {f32 x_16062} = res_15832[gtid_15889, gtid_15891, | |
gtid_15899] | |
let {f32 res_16063} = fmul32(x_16061, x_16062) | |
return {returns res_16063} | |
} | |
let {mem@local mem_19255} = | |
alloc(bytes_19500, @local) | |
-- res_16064 : [j_m_i_13929][16i32][16i32]f32@@mem_19255->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][16i32][16i32]f32 res_16064} = | |
segmap_thread | |
(#groups=j_m_i_13929; groupsize=computed_group_size_15853) | |
(gtid_15882 < j_m_i_13929, gtid_15883 < 16i32, | |
gtid_15884 < 16i32) (~phys_tid_15885) : {f32} { | |
let {i32 j_p_i_t_s_18371} = add32(i_13928, gtid_15882) | |
let {f32 x_16065} = | |
<index_certs_13978> | |
matb_13831[j_p_i_t_s_18352, j_p_i_t_s_18371, gtid_15883, | |
gtid_15884] | |
let {f32 res_16066} = res_r_r_r_16054[gtid_15882, | |
gtid_15883, | |
gtid_15884] | |
let {f32 res_16067} = fsub32(x_16065, res_16066) | |
return {returns res_16067} | |
} | |
return {returns res_16064, returns res_16029} | |
} | |
in {mem_19264, mem_19271, res_16023, res_16024} | |
} else { | |
let {i32 segmap_group_size_16956} = | |
get_size(segmap_group_size_16771, group_size) | |
let {i32 num_groups_16957} = | |
calc_num_groups(nest_size_15828, segmap_num_groups_16773, | |
segmap_group_size_16956) | |
let {mem mem_19279} = | |
alloc(bytes_19187) | |
-- res_16959 : [j_m_i_13929][16i32][16i32]f32@@mem_19279->{base: [16i32, j_m_i_13929, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (16i32) (j_m_i_13929), 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [16i32, j_m_i_13929, 16i32]; permutation: [1, 2, 0]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][16i32][16i32]f32 res_16959} = | |
segmap_thread | |
(#groups=num_groups_16957; groupsize=segmap_group_size_16956; virtualise) | |
(gtid_16764 < j_m_i_13929, | |
gtid_16765 < 16i32) (~phys_tid_16766) : {[16i32]f32} { | |
let {mem@[16i32]f32 mem_19274} = | |
alloc(64i64, @[16i32]f32) | |
-- smaller_replicate_18384 : [16i32]f32@@mem_19274->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]} | |
let {[16i32]f32 smaller_replicate_18384} = replicate([16i32], | |
0.0f32) | |
let {i32 j_p_i_t_s_18379} = add32(i_13928, gtid_16764) | |
-- res_16962 : [16i32]f32@@mem_19274->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]} | |
let {[16i32]f32 res_16962} = | |
-- Consumes smaller_replicate_18384 | |
-- row_16963 : *[16i32]f32@@mem_19274->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]} | |
loop {*[16i32]f32 row_16963} = {smaller_replicate_18384} | |
for j_16964:i32 < 16i32 do { | |
let {f32 sum_16965} = | |
loop {f32 sum_16966} = {0.0f32} | |
for k_16967:i32 < j_16964 do { | |
let {f32 x_16968} = | |
<index_certs_13829> | |
res_13840[0i32, k_16967, j_16964] | |
let {f32 y_16969} = row_16963[k_16967] | |
let {f32 y_16970} = fmul32(x_16968, y_16969) | |
let {f32 loopres_16971} = fadd32(sum_16966, y_16970) | |
in {loopres_16971} | |
} | |
let {f32 x_16972} = | |
<index_certs_13975> | |
matb_13831[j_p_i_t_s_18379, step_13832, gtid_16765, | |
j_16964] | |
let {f32 x_16973} = fsub32(x_16972, sum_16965) | |
let {f32 y_16974} = | |
<index_certs_13829> | |
res_13840[0i32, j_16964, j_16964] | |
let {f32 lw_val_16975} = fdiv32(x_16973, y_16974) | |
-- row_16976 : [16i32]f32@@mem_19274->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]} | |
let {[16i32]f32 row_16976} = | |
-- Consumes row_16963 | |
row_16963 with [j_16964] <- lw_val_16975 | |
in {row_16976} | |
} | |
return {returns res_16962} | |
} | |
let {i32 segmap_group_size_16982} = | |
get_size(segmap_group_size_16075, group_size) | |
let {i32 num_groups_16983} = | |
calc_num_groups(binop_x_19493, segmap_num_groups_16077, | |
segmap_group_size_16982) | |
let {i32 comparatee_16986} = mul32(j_m_i_13929, j_m_i_13929) | |
let {bool suff_outer_par_16987} = | |
get_size(suff_outer_par_15, | |
threshold (!suff_outer_par_13 !suff_intra_par_14)) <= comparatee_16986 | |
let {bool fits_16999} = sle32(4096i32, max_group_size_14841) | |
let {bool suff_intra_par_17001} = | |
get_size(suff_intra_par_16, | |
threshold (!suff_outer_par_15 !suff_outer_par_13 !suff_intra_par_14)) <= 256i32 | |
let {bool intra_suff_and_fits_17002} = | |
logand(fits_16999, suff_intra_par_17001) | |
let {i32 convop_x_19305} = mul32(j_m_i_13929, convop_x_19230) | |
let {i64 binop_x_19306} = sext i32 convop_x_19305 to i64 | |
let {i64 bytes_19303} = mul64(4i64, binop_x_19306) | |
-- res_17003 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@res_mem_19479->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
let {mem res_mem_19479; | |
[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_17003} = | |
-- Branch returns: {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32@?0->{base: [j_m_i_13929, | |
-- j_m_i_13929, | |
-- 16i32, | |
-- 16i32]; | |
-- contiguous: True; | |
-- LMADs: [{offset: 0i32; | |
-- strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; | |
-- rotates: [0i32, 0i32, 0i32, 0i32]; | |
-- shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; | |
-- permutation: [0, 1, 2, 3]; | |
-- monotonicity: [Inc, Inc, Inc, Inc]}]}} | |
if suff_outer_par_16987 | |
then { | |
let {mem mem_19284} = | |
alloc(bytes_19187) | |
-- res_rowmajor_18582 : [j_m_i_13929][16i32][16i32]f32@@mem_19284->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][16i32][16i32]f32 res_rowmajor_18582} = | |
manifest((0, 1, 2), res_16959) | |
let {mem mem_19289} = | |
alloc(bytes_19187) | |
-- res_rowmajor_18583 : [j_m_i_13929][16i32][16i32]f32@@mem_19289->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][16i32][16i32]f32 res_rowmajor_18583} = | |
manifest((0, 1, 2), res_15832) | |
let {mem mem_19293} = | |
alloc(bytes_19229) | |
-- res_coalesced_18584 : [j_m_i_13929][16i32][16i32]f32@@mem_19293->{base: [16i32, 16i32, j_m_i_13929]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (j_m_i_13929) (16i32), j_m_i_13929, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [16i32, 16i32, j_m_i_13929]; permutation: [2, 0, 1]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][16i32][16i32]f32 res_coalesced_18584} = | |
manifest((1, 2, 0), res_rowmajor_18583) | |
let {mem mem_19307} = | |
alloc(bytes_19303) | |
-- res_17004 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@mem_19307->{base: [16i32, 16i32, j_m_i_13929, j_m_i_13929]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (mul32 (j_m_i_13929) (j_m_i_13929)) (16i32), mul32 (j_m_i_13929) (j_m_i_13929), j_m_i_13929, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [16i32, 16i32, j_m_i_13929, j_m_i_13929]; permutation: [2, 3, 0, 1]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_17004} = | |
segmap_thread | |
(#groups=num_groups_16983; groupsize=segmap_group_size_16982; virtualise) | |
(gtid_16068 < j_m_i_13929, | |
gtid_16069 < j_m_i_13929) (~phys_tid_16070) : {[16i32][16i32]f32} { | |
let {i32 j_p_i_t_s_18386} = add32(i_13928, gtid_16068) | |
let {i32 j_p_i_t_s_18388} = add32(i_13928, gtid_16069) | |
let {mem@[16i3216i32]f32 mem_19296} = | |
alloc(1024i64, @[16i3216i32]f32) | |
-- result_18935 : [16i32][16i32]f32@@mem_19296->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[16i32][16i32]f32 result_18935} = scratch(f32, 16i32, | |
16i32) | |
-- res_17008 : [16i32][16i32]f32@@mem_19296->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[16i32][16i32]f32 res_17008} = | |
-- Consumes result_18935 | |
-- mapout_18936 : *[16i32][16i32]f32@@mem_19296->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
loop {*[16i32][16i32]f32 mapout_18936} = {result_18935} | |
for i_18937:i32 < 16i32 do { | |
-- lw_dest_18938 : [16i32][16i32]f32@@mem_19296->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[16i32][16i32]f32 lw_dest_18938} = | |
-- Consumes mapout_18936 | |
-- lowered_array_19095 : *[16i32][16i32]f32@@mem_19296->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
loop {*[16i32][16i32]f32 lowered_array_19095} = {mapout_18936} | |
for i_18941:i32 < 16i32 do { | |
let {f32 x_17012} = | |
<index_certs_13978> | |
matb_13831[j_p_i_t_s_18386, j_p_i_t_s_18388, | |
i_18937, i_18941] | |
let {f32 res_17014} = | |
loop {f32 redout_18943} = {0.0f32} | |
for i_18944:i32 < 16i32 do { | |
let {f32 x_17018} = | |
res_rowmajor_18582[gtid_16068, i_18937, | |
i_18944] | |
let {f32 x_17019} = | |
res_coalesced_18584[gtid_16069, i_18941, | |
i_18944] | |
let {f32 res_17020} = fmul32(x_17018, x_17019) | |
let {f32 res_17017} = | |
fadd32(res_17020, redout_18943) | |
in {res_17017} | |
} | |
let {f32 res_17021} = fsub32(x_17012, res_17014) | |
-- lowered_array_updated_19099 : [16i32][16i32]f32@@mem_19296->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[16i32][16i32]f32 lowered_array_updated_19099} = | |
-- Consumes lowered_array_19095 | |
lowered_array_19095 with [i_18937, | |
i_18941] <- res_17021 | |
in {lowered_array_updated_19099} | |
} | |
in {lw_dest_18938} | |
} | |
return {returns res_17008} | |
} | |
let {mem mem_19477} = | |
alloc(bytes_19490) | |
-- res_linear_19478 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@mem_19477->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_linear_19478} = | |
copy(res_17004) | |
in {mem_19477, res_linear_19478} | |
} else { | |
-- res_17022 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@res_mem_19468->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
let {mem res_mem_19468; | |
[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_17022} = | |
-- Branch returns: {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32@?0->{base: [j_m_i_13929, | |
-- j_m_i_13929, | |
-- 16i32, | |
-- 16i32]; | |
-- contiguous: True; | |
-- LMADs: [{offset: 0i32; | |
-- strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; | |
-- rotates: [0i32, 0i32, 0i32, 0i32]; | |
-- shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; | |
-- permutation: [0, 1, 2, 3]; | |
-- monotonicity: [Inc, Inc, Inc, Inc]}]}} | |
if intra_suff_and_fits_17002 | |
then { | |
let {mem mem_19327} = | |
alloc(bytes_19490) | |
-- res_17023 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@mem_19327->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_17023} = | |
segmap_group | |
(#groups=comparatee_16986; groupsize=4096i32) | |
(gtid_16083 < j_m_i_13929, | |
gtid_16084 < j_m_i_13929) (~phys_tid_16121) : {[16i32][16i32]f32} { | |
let {mem@local mem_19313} = | |
alloc(1024i64, @local) | |
-- res_r_r_17033 : [16i32][16i32]f32@@mem_19313->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[16i32][16i32]f32 res_r_r_17033} = | |
segred_thread | |
(#groups=comparatee_16986; groupsize=4096i32) | |
({{0.0f32}, | |
[], | |
commutative fn {f32} (f32 x_17034, f32 x_17035) => | |
let {f32 res_17036} = fadd32(x_17034, x_17035) | |
in {res_17036}}) | |
(gtid_16096 < 16i32, gtid_16097 < 16i32, | |
gtid_16104 < 16i32) (~phys_tid_16105) : {f32} { | |
let {f32 x_17039} = res_16959[gtid_16083, | |
gtid_16096, | |
gtid_16104] | |
let {f32 x_17040} = res_15832[gtid_16084, | |
gtid_16097, | |
gtid_16104] | |
let {f32 res_17041} = fmul32(x_17039, x_17040) | |
return {returns res_17041} | |
} | |
let {i32 j_p_i_t_s_18410} = add32(i_13928, gtid_16083) | |
let {i32 j_p_i_t_s_18412} = add32(i_13928, gtid_16084) | |
let {mem@local mem_19318} = | |
alloc(1024i64, @local) | |
-- res_17042 : [16i32][16i32]f32@@mem_19318->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[16i32][16i32]f32 res_17042} = | |
segmap_thread | |
(#groups=comparatee_16986; groupsize=4096i32) | |
(gtid_16090 < 16i32, | |
gtid_16091 < 16i32) (~phys_tid_16092) : {f32} { | |
let {f32 x_17043} = | |
<index_certs_13978> | |
matb_13831[j_p_i_t_s_18410, j_p_i_t_s_18412, | |
gtid_16090, gtid_16091] | |
let {f32 res_17044} = res_r_r_17033[gtid_16090, | |
gtid_16091] | |
let {f32 res_17045} = fsub32(x_17043, res_17044) | |
return {returns res_17045} | |
} | |
return {returns res_17042} | |
} | |
in {mem_19327, res_17023} | |
} else { | |
let {i64 nest_size_17051} = | |
mul64(j_m_i_15825, nest_size_15828) | |
let {i32 segmap_group_size_17052} = | |
get_size(segmap_group_size_16178, group_size) | |
let {i32 num_groups_17053} = | |
calc_num_groups(nest_size_17051, | |
segmap_num_groups_16180, | |
segmap_group_size_17052) | |
let {i32 comparatee_17057} = | |
mul32(j_m_i_13929, binop_x_19188) | |
let {bool suff_outer_par_17058} = | |
get_size(suff_outer_par_17, | |
threshold (!suff_outer_par_15 !suff_intra_par_16 !suff_outer_par_13 !suff_intra_par_14)) <= comparatee_17057 | |
let {bool fits_17067} = | |
sle32(256i32, max_group_size_14841) | |
let {bool suff_intra_par_17069} = | |
get_size(suff_intra_par_18, | |
threshold (!suff_outer_par_17 !suff_outer_par_15 !suff_intra_par_16 !suff_outer_par_13 !suff_intra_par_14)) <= 16i32 | |
let {bool intra_suff_and_fits_17070} = | |
logand(fits_17067, suff_intra_par_17069) | |
let {i32 binop_x_19339} = mul32(num_blocks_13772, n_13773) | |
let {i32 convop_x_19340} = mul32(16i32, binop_x_19339) | |
let {i64 binop_x_19341} = sext i32 convop_x_19340 to i64 | |
let {i64 bytes_19337} = mul64(4i64, binop_x_19341) | |
let {i32 convop_x_19349} = mul32(16i32, comparatee_17057) | |
let {i64 binop_x_19350} = sext i32 convop_x_19349 to i64 | |
let {i64 bytes_19346} = mul64(4i64, binop_x_19350) | |
-- res_17071 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@res_mem_19467->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
let {mem res_mem_19467; | |
[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_17071} = | |
-- Branch returns: {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32@?0->{base: [j_m_i_13929, | |
-- j_m_i_13929, | |
-- 16i32, | |
-- 16i32]; | |
-- contiguous: True; | |
-- LMADs: [{offset: 0i32; | |
-- strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; | |
-- rotates: [0i32, 0i32, 0i32, 0i32]; | |
-- shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; | |
-- permutation: [0, 1, 2, 3]; | |
-- monotonicity: [Inc, Inc, Inc, Inc]}]}} | |
if suff_outer_par_17058 | |
then { | |
let {mem mem_19332} = | |
alloc(bytes_19187) | |
-- res_rowmajor_18587 : [j_m_i_13929][16i32][16i32]f32@@mem_19332->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][16i32][16i32]f32 res_rowmajor_18587} = | |
manifest((0, 1, 2), res_15832) | |
let {mem mem_19336} = | |
alloc(bytes_19229) | |
-- res_coalesced_18588 : [j_m_i_13929][16i32][16i32]f32@@mem_19336->{base: [16i32, 16i32, j_m_i_13929]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (j_m_i_13929) (16i32), j_m_i_13929, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [16i32, 16i32, j_m_i_13929]; permutation: [2, 0, 1]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][16i32][16i32]f32 res_coalesced_18588} = | |
manifest((1, 2, 0), res_rowmajor_18587) | |
let {mem mem_19342} = | |
alloc(bytes_19337) | |
-- matb_coalesced_18589 : [num_blocks_13772][num_blocks_13772][16i32][16i32]f32@@mem_19342->{base: [16i32, num_blocks_13772, num_blocks_13772, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (mul32 (16i32) (num_blocks_13772)) (num_blocks_13772), mul32 (16i32) (num_blocks_13772), 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [16i32, num_blocks_13772, num_blocks_13772, 16i32]; permutation: [1, 2, 3, 0]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
let {[num_blocks_13772][num_blocks_13772][16i32][16i32]f32 matb_coalesced_18589} = | |
manifest((3, 0, 1, 2), matb_13831) | |
let {mem mem_19351} = | |
alloc(bytes_19346) | |
-- res_17072 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@mem_19351->{base: [16i32, j_m_i_13929, j_m_i_13929, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (mul32 (16i32) (j_m_i_13929)) (j_m_i_13929), mul32 (16i32) (j_m_i_13929), 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [16i32, j_m_i_13929, j_m_i_13929, 16i32]; permutation: [1, 2, 3, 0]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_17072} = | |
segmap_thread | |
(#groups=num_groups_17053; groupsize=segmap_group_size_17052; virtualise) | |
(gtid_16168 < j_m_i_13929, gtid_16169 < j_m_i_13929, | |
gtid_16170 < 16i32) (~phys_tid_16171) : {[16i32]f32} { | |
let {i32 j_p_i_t_s_18418} = | |
add32(i_13928, gtid_16168) | |
let {i32 j_p_i_t_s_18420} = | |
add32(i_13928, gtid_16169) | |
let {mem@[16i32]f32 mem_19345} = | |
alloc(64i64, @[16i32]f32) | |
-- result_18945 : [16i32]f32@@mem_19345->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]} | |
let {[16i32]f32 result_18945} = scratch(f32, | |
16i32) | |
-- res_17076 : [16i32]f32@@mem_19345->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]} | |
let {[16i32]f32 res_17076} = | |
-- Consumes result_18945 | |
-- mapout_18946 : *[16i32]f32@@mem_19345->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]} | |
loop {*[16i32]f32 mapout_18946} = {result_18945} | |
for i_18947:i32 < 16i32 do { | |
let {f32 x_17077} = | |
<index_certs_13978> | |
matb_coalesced_18589[j_p_i_t_s_18418, | |
j_p_i_t_s_18420, | |
gtid_16170, i_18947] | |
let {f32 res_17079} = | |
loop {f32 redout_18949} = {0.0f32} | |
for i_18950:i32 < 16i32 do { | |
let {f32 x_17083} = res_16959[gtid_16168, | |
gtid_16170, | |
i_18950] | |
let {f32 x_17084} = | |
res_coalesced_18588[gtid_16169, i_18947, | |
i_18950] | |
let {f32 res_17085} = | |
fmul32(x_17083, x_17084) | |
let {f32 res_17082} = | |
fadd32(res_17085, redout_18949) | |
in {res_17082} | |
} | |
let {f32 res_17086} = | |
fsub32(x_17077, res_17079) | |
-- lw_dest_18948 : [16i32]f32@@mem_19345->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]} | |
let {[16i32]f32 lw_dest_18948} = | |
-- Consumes mapout_18946 | |
mapout_18946 with [i_18947] <- res_17086 | |
in {lw_dest_18948} | |
} | |
return {returns res_17076} | |
} | |
let {mem mem_19465} = | |
alloc(bytes_19490) | |
-- res_linear_19466 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@mem_19465->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_linear_19466} = | |
copy(res_17072) | |
in {mem_19465, res_linear_19466} | |
} else { | |
-- res_17087 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@res_mem_19456->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
let {mem res_mem_19456; | |
[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_17087} = | |
-- Branch returns: {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32@?0->{base: [j_m_i_13929, | |
-- j_m_i_13929, | |
-- 16i32, | |
-- 16i32]; | |
-- contiguous: True; | |
-- LMADs: [{offset: 0i32; | |
-- strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; | |
-- rotates: [0i32, 0i32, 0i32, 0i32]; | |
-- shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; | |
-- permutation: [0, 1, 2, 3]; | |
-- monotonicity: [Inc, Inc, Inc, Inc]}]}} | |
if intra_suff_and_fits_17070 | |
then { | |
let {mem mem_19367} = | |
alloc(bytes_19490) | |
-- res_17088 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@mem_19367->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_17088} = | |
segmap_group | |
(#groups=comparatee_17057; groupsize=256i32) | |
(gtid_16187 < j_m_i_13929, | |
gtid_16188 < j_m_i_13929, | |
gtid_16189 < 16i32) (~phys_tid_16217) : {[16i32]f32} { | |
let {mem@local mem_19355} = | |
alloc(64i64, @local) | |
-- res_r_17096 : [16i32]f32@@mem_19355->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]} | |
let {[16i32]f32 res_r_17096} = | |
segred_thread | |
(#groups=comparatee_17057; groupsize=256i32) | |
({{0.0f32}, | |
[], | |
commutative fn {f32} (f32 x_17097, | |
f32 x_17098) => | |
let {f32 res_17099} = | |
fadd32(x_17097, x_17098) | |
in {res_17099}}) | |
(gtid_16200 < 16i32, | |
gtid_16205 < 16i32) (~phys_tid_16206) : {f32} { | |
let {f32 x_17101} = res_16959[gtid_16187, | |
gtid_16189, | |
gtid_16205] | |
let {f32 x_17102} = res_15832[gtid_16188, | |
gtid_16200, | |
gtid_16205] | |
let {f32 res_17103} = | |
fmul32(x_17101, x_17102) | |
return {returns res_17103} | |
} | |
let {i32 j_p_i_t_s_18436} = | |
add32(i_13928, gtid_16187) | |
let {i32 j_p_i_t_s_18438} = | |
add32(i_13928, gtid_16188) | |
let {mem@local mem_19358} = | |
alloc(64i64, @local) | |
-- res_17104 : [16i32]f32@@mem_19358->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]} | |
let {[16i32]f32 res_17104} = | |
segmap_thread | |
(#groups=comparatee_17057; groupsize=256i32) | |
(gtid_16195 < 16i32) (~phys_tid_16196) : {f32} { | |
let {f32 x_17105} = | |
<index_certs_13978> | |
matb_13831[j_p_i_t_s_18436, | |
j_p_i_t_s_18438, gtid_16189, | |
gtid_16195] | |
let {f32 res_17106} = | |
res_r_17096[gtid_16195] | |
let {f32 res_17107} = | |
fsub32(x_17105, res_17106) | |
return {returns res_17107} | |
} | |
return {returns res_17104} | |
} | |
in {mem_19367, res_17088} | |
} else { | |
let {i64 y_17114} = mul64(256i64, j_m_i_15825) | |
let {i64 nest_size_17115} = | |
mul64(j_m_i_15825, y_17114) | |
let {bool suff_outer_par_17127} = | |
get_size(suff_outer_par_19, | |
threshold (!suff_outer_par_17 !suff_intra_par_18 !suff_outer_par_15 !suff_intra_par_16 !suff_outer_par_13 !suff_intra_par_14)) <= convop_x_19305 | |
let {bool suff_intra_par_17136} = | |
get_size(suff_intra_par_20, | |
threshold (!suff_outer_par_19 !suff_outer_par_17 !suff_intra_par_18 !suff_outer_par_15 !suff_intra_par_16 !suff_outer_par_13 !suff_intra_par_14)) <= 16i32 | |
let {bool intra_suff_and_fits_17137} = | |
logand(fits_14842, suff_intra_par_17136) | |
-- res_17138 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@res_mem_19455->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
let {mem res_mem_19455; | |
[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_17138} = | |
-- Branch returns: {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32@?0->{base: [j_m_i_13929, | |
-- j_m_i_13929, | |
-- 16i32, | |
-- 16i32]; | |
-- contiguous: True; | |
-- LMADs: [{offset: 0i32; | |
-- strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; | |
-- rotates: [0i32, 0i32, 0i32, 0i32]; | |
-- shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; | |
-- permutation: [0, 1, 2, 3]; | |
-- monotonicity: [Inc, Inc, Inc, Inc]}]}} | |
if suff_outer_par_17127 | |
then { | |
let {i32 tile_size_18609} = | |
get_size(tile_size_18608, tile_size) | |
let {i32 group_size_18610} = | |
mul32(tile_size_18609, tile_size_18609) | |
let {i32 y_18611} = | |
sub32(tile_size_18609, 1i32) | |
let {i32 x_18612} = add32(16i32, y_18611) | |
let {i32 num_groups_x_18613} = | |
squot32(x_18612, tile_size_18609) | |
let {i32 y_18617} = | |
mul32(j_m_i_13929, num_groups_x_18613) | |
let {i32 y_18618} = | |
mul32(j_m_i_13929, y_18617) | |
let {i32 num_groups_top_18619} = | |
mul32(num_groups_x_18613, y_18618) | |
let {i32 num_whole_tiles_18621} = | |
squot32(16i32, tile_size_18609) | |
let {i32 residual_input_18750} = | |
srem32(16i32, tile_size_18609) | |
let {bool cond_18751} = | |
eq_i32(residual_input_18750, 0i32) | |
let {mem mem_19414} = | |
alloc(bytes_19490) | |
let {i64 binop_x_19370} = | |
sext i32 group_size_18610 to i64 | |
let {i64 bytes_19368} = | |
mul64(4i64, binop_x_19370) | |
let {i64 binop_x_19373} = | |
sext i32 tile_size_18609 to i64 | |
let {i64 binop_x_19375} = | |
mul64(binop_x_19373, binop_x_19373) | |
let {i64 bytes_19372} = | |
mul64(4i64, binop_x_19375) | |
-- res_17139 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@mem_19414->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_17139} = | |
segmap_group | |
(#groups=num_groups_top_18619; groupsize=group_size_18610) | |
(gtid_16258 < j_m_i_13929, | |
gtid_16259 < j_m_i_13929, | |
gid_x_18606 < num_groups_x_18613, | |
gid_y_18607 < num_groups_x_18613) (~gid_flat_18620) : {f32} { | |
let {mem@[]f32 mem_19371} = | |
alloc(bytes_19368, @[]f32) | |
-- mergeinit_18646 : [tile_size_18609][tile_size_18609]f32@@mem_19371->{base: [tile_size_18609, tile_size_18609]; contiguous: True; LMADs: [{offset: 0i32; strides: [tile_size_18609, 1i32]; rotates: [0i32, 0i32]; shape: [tile_size_18609, tile_size_18609]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[tile_size_18609][tile_size_18609]f32 mergeinit_18646} = | |
segmap_thread | |
(#groups=num_groups_top_18619; groupsize=group_size_18610) | |
(ltid_x_18637 < tile_size_18609, | |
ltid_y_18638 < tile_size_18609) (~ltid_flat_18639) : {f32} { | |
return {returns (private) 0.0f32} | |
} | |
let {i32 binop_x_18714} = | |
mul32(gid_x_18606, tile_size_18609) | |
let {i32 binop_x_18716} = | |
mul32(gid_y_18607, tile_size_18609) | |
let {mem@local mem_19376} = | |
alloc(bytes_19372, @local) | |
let {mem@local mem_19381} = | |
alloc(bytes_19372, @local) | |
let {mem@[]f32 mem_19385} = | |
alloc(bytes_19368, @[]f32) | |
-- accs_18743 : [tile_size_18609][tile_size_18609]f32@@mem_19371->{base: [tile_size_18609, tile_size_18609]; contiguous: True; LMADs: [{offset: 0i32; strides: [tile_size_18609, 1i32]; rotates: [0i32, 0i32]; shape: [tile_size_18609, tile_size_18609]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[tile_size_18609][tile_size_18609]f32 accs_18743} = | |
-- Consumes mergeinit_18646 | |
-- x_merge_18647 : *[tile_size_18609][tile_size_18609]f32@@mem_19371->{base: [tile_size_18609, tile_size_18609]; contiguous: True; LMADs: [{offset: 0i32; strides: [tile_size_18609, 1i32]; rotates: [0i32, 0i32]; shape: [tile_size_18609, tile_size_18609]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
loop {*[tile_size_18609][tile_size_18609]f32 x_merge_18647} = {mergeinit_18646} | |
for tile_id_18648:i32 < num_whole_tiles_18621 do { | |
let {i32 binop_x_18710} = | |
mul32(tile_size_18609, tile_id_18648) | |
-- full_tile_18708 : [tile_size_18609][tile_size_18609]f32@@mem_19376->{base: [tile_size_18609, tile_size_18609]; contiguous: True; LMADs: [{offset: 0i32; strides: [tile_size_18609, 1i32]; rotates: [0i32, 0i32]; shape: [tile_size_18609, tile_size_18609]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
-- full_tile_18709 : [tile_size_18609][tile_size_18609]f32@@mem_19381->{base: [tile_size_18609, tile_size_18609]; contiguous: True; LMADs: [{offset: 0i32; strides: [tile_size_18609, 1i32]; rotates: [0i32, 0i32]; shape: [tile_size_18609, tile_size_18609]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[tile_size_18609][tile_size_18609]f32 full_tile_18708, | |
[tile_size_18609][tile_size_18609]f32 full_tile_18709} = | |
segmap_thread | |
(#groups=num_groups_top_18619; groupsize=group_size_18610) | |
(ltid_x_18649 < tile_size_18609, | |
ltid_y_18650 < tile_size_18609) (~ltid_flat_18651) : {f32, | |
f32} { | |
let {i32 i_18711} = | |
add32(ltid_x_18649, binop_x_18710) | |
let {i32 j_18713} = | |
add32(ltid_y_18650, binop_x_18710) | |
let {i32 gtid_18715} = | |
add32(ltid_x_18649, binop_x_18714) | |
let {i32 gtid_18717} = | |
add32(ltid_y_18650, binop_x_18716) | |
let {f32 tile_elem_18720} = | |
res_16959[gtid_16258, | |
gtid_18715, j_18713] | |
let {f32 tile_elem_18721} = | |
res_15832[gtid_16259, | |
gtid_18717, i_18711] | |
return {returns (manifest) tile_elem_18720, | |
returns (manifest) tile_elem_18721} | |
} | |
-- acc_18722 : [tile_size_18609][tile_size_18609]f32@@mem_19385->{base: [tile_size_18609, tile_size_18609]; contiguous: True; LMADs: [{offset: 0i32; strides: [tile_size_18609, 1i32]; rotates: [0i32, 0i32]; shape: [tile_size_18609, tile_size_18609]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[tile_size_18609][tile_size_18609]f32 acc_18722} = | |
segmap_thread | |
(#groups=num_groups_top_18619; groupsize=group_size_18610) | |
(ltid_x_18674 < tile_size_18609, | |
ltid_y_18675 < tile_size_18609) (~ltid_flat_18676) : {f32} { | |
let {i32 gtid_18724} = | |
add32(ltid_x_18674, binop_x_18714) | |
let {i32 gtid_18726} = | |
add32(ltid_y_18675, binop_x_18716) | |
let {f32 acc_18729} = | |
x_merge_18647[ltid_x_18674, | |
ltid_y_18675] | |
let {bool binop_x_18732} = | |
slt32(gtid_18724, 16i32) | |
let {bool binop_y_18733} = | |
slt32(gtid_18726, 16i32) | |
let {bool cond_18734} = | |
logand(binop_x_18732, binop_y_18733) | |
let {f32 acc_18735} = | |
-- Branch returns: {f32} | |
if cond_18734 | |
then { | |
let {f32 x_18736} = | |
loop {f32 redout_18951} = {acc_18729} | |
for i_18952:i32 < tile_size_18609 do { | |
let {f32 x_18740} = | |
full_tile_18708[ltid_x_18674, | |
i_18952] | |
let {f32 x_18741} = | |
full_tile_18709[i_18952, | |
ltid_y_18675] | |
let {f32 res_18742} = | |
fmul32(x_18740, x_18741) | |
let {f32 res_18739} = | |
fadd32(res_18742, redout_18951) | |
in {res_18739} | |
} | |
in {x_18736} | |
} else {acc_18729} | |
return {returns (private) acc_18735} | |
} | |
-- acc_ensure_copy_19386 : [tile_size_18609][tile_size_18609]f32@@mem_19371->{base: [tile_size_18609, tile_size_18609]; contiguous: True; LMADs: [{offset: 0i32; strides: [tile_size_18609, 1i32]; rotates: [0i32, 0i32]; shape: [tile_size_18609, tile_size_18609]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[tile_size_18609][tile_size_18609]f32 acc_ensure_copy_19386} = | |
copy(acc_18722) | |
in {acc_ensure_copy_19386} | |
} | |
let {mem@local mem_19391} = | |
alloc(bytes_19372, @local) | |
let {mem@local mem_19396} = | |
alloc(bytes_19372, @local) | |
let {mem@[]f32 mem_19400} = | |
alloc(bytes_19368, @[]f32) | |
let {mem@[]f32 mem_19613} = | |
alloc(bytes_19368, @[]f32) | |
-- acc_after_residual_18880 : [tile_size_18609][tile_size_18609]f32@@mem_19613->{base: [tile_size_18609, tile_size_18609]; contiguous: True; LMADs: [{offset: 0i32; strides: [tile_size_18609, 1i32]; rotates: [0i32, 0i32]; shape: [tile_size_18609, tile_size_18609]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[tile_size_18609][tile_size_18609]f32 acc_after_residual_18880} = | |
-- Branch returns: {[tile_size_18609][tile_size_18609]f32@(mem_19613->{base: [tile_size_18609, | |
-- tile_size_18609]; | |
-- contiguous: True; | |
-- LMADs: [{offset: 0i32; | |
-- strides: [tile_size_18609, 1i32]; | |
-- rotates: [0i32, 0i32]; | |
-- shape: [tile_size_18609, tile_size_18609]; | |
-- permutation: [0, 1]; | |
-- monotonicity: [Inc, Inc]}]})} | |
if cond_18751 | |
then { | |
-- accs_nonext_copy_19614 : [tile_size_18609][tile_size_18609]f32@@mem_19613->{base: [tile_size_18609, tile_size_18609]; contiguous: True; LMADs: [{offset: 0i32; strides: [tile_size_18609, 1i32]; rotates: [0i32, 0i32]; shape: [tile_size_18609, tile_size_18609]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[tile_size_18609][tile_size_18609]f32 accs_nonext_copy_19614} = | |
copy(accs_18743) | |
in {accs_nonext_copy_19614} | |
} else { | |
let {i32 binop_x_18835} = | |
mul32(tile_size_18609, num_whole_tiles_18621) | |
-- full_tile_18833 : [tile_size_18609][tile_size_18609]f32@@mem_19391->{base: [tile_size_18609, tile_size_18609]; contiguous: True; LMADs: [{offset: 0i32; strides: [tile_size_18609, 1i32]; rotates: [0i32, 0i32]; shape: [tile_size_18609, tile_size_18609]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
-- full_tile_18834 : [tile_size_18609][tile_size_18609]f32@@mem_19396->{base: [tile_size_18609, tile_size_18609]; contiguous: True; LMADs: [{offset: 0i32; strides: [tile_size_18609, 1i32]; rotates: [0i32, 0i32]; shape: [tile_size_18609, tile_size_18609]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[tile_size_18609][tile_size_18609]f32 full_tile_18833, | |
[tile_size_18609][tile_size_18609]f32 full_tile_18834} = | |
segmap_thread | |
(#groups=num_groups_top_18619; groupsize=group_size_18610) | |
(ltid_x_18752 < tile_size_18609, | |
ltid_y_18753 < tile_size_18609) (~ltid_flat_18754) : {f32, | |
f32} { | |
let {i32 i_18836} = | |
add32(ltid_x_18752, binop_x_18835) | |
let {i32 j_18838} = | |
add32(ltid_y_18753, binop_x_18835) | |
let {i32 gtid_18840} = | |
add32(binop_x_18714, ltid_x_18752) | |
let {i32 gtid_18842} = | |
add32(binop_x_18716, ltid_y_18753) | |
let {bool binop_x_18845} = | |
slt32(j_18838, 16i32) | |
let {bool binop_y_18846} = | |
slt32(gtid_18840, 16i32) | |
let {bool cond_18847} = | |
logand(binop_x_18845, binop_y_18846) | |
let {f32 pre_18848} = | |
-- Branch returns: {f32} | |
if cond_18847 | |
then { | |
let {f32 x_18849} = | |
res_16959[gtid_16258, | |
gtid_18840, | |
j_18838] | |
in {x_18849} | |
} else {0.0f32} | |
let {bool binop_x_18851} = | |
slt32(i_18836, 16i32) | |
let {bool binop_y_18852} = | |
slt32(gtid_18842, 16i32) | |
let {bool cond_18853} = | |
logand(binop_x_18851, binop_y_18852) | |
let {f32 pre_18854} = | |
-- Branch returns: {f32} | |
if cond_18853 | |
then { | |
let {f32 x_18855} = | |
res_15832[gtid_16259, | |
gtid_18842, | |
i_18836] | |
in {x_18855} | |
} else {0.0f32} | |
return {returns (manifest) pre_18848, | |
returns (manifest) pre_18854} | |
} | |
-- Per-thread accumulation over the tile: one thread for each (ltid_x, ltid_y) | |
-- with both indices below tile_size_18609. A thread whose offset indices | |
-- (binop_x_18714 + ltid_x, binop_x_18716 + ltid_y) are both < 16 starts from | |
-- accs_18743[ltid_x, ltid_y] and adds, for every i < residual_input_18750, | |
-- full_tile_18833[ltid_x, i] * full_tile_18834[i, ltid_y]. Any other thread | |
-- returns its accs_18743 value unchanged. | |
-- NOTE(review): the name residual_input suggests this handles the leftover part | |
-- of a tiled reduction; the surrounding tiling loop is outside this excerpt. | |
-- acc_18859 : [tile_size_18609][tile_size_18609]f32@@mem_19400->{base: [tile_size_18609, tile_size_18609]; contiguous: True; LMADs: [{offset: 0i32; strides: [tile_size_18609, 1i32]; rotates: [0i32, 0i32]; shape: [tile_size_18609, tile_size_18609]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[tile_size_18609][tile_size_18609]f32 acc_18859} = | |
segmap_thread | |
(#groups=num_groups_top_18619; groupsize=group_size_18610) | |
(ltid_x_18799 < tile_size_18609, | |
ltid_y_18800 < tile_size_18609) (~ltid_flat_18801) : {f32} { | |
let {i32 gtid_18861} = | |
add32(binop_x_18714, ltid_x_18799) | |
let {i32 gtid_18863} = | |
add32(binop_x_18716, ltid_y_18800) | |
-- Running accumulator this thread starts from. | |
let {f32 acc_18866} = | |
accs_18743[ltid_x_18799, | |
ltid_y_18800] | |
-- Only threads whose offset indices fall inside the 16 x 16 range do work. | |
let {bool binop_x_18869} = | |
slt32(gtid_18861, 16i32) | |
let {bool binop_y_18870} = | |
slt32(gtid_18863, 16i32) | |
let {bool cond_18871} = | |
logand(binop_x_18869, binop_y_18870) | |
let {f32 acc_18872} = | |
-- Branch returns: {f32} | |
if cond_18871 | |
then { | |
-- Sequential multiply-add; the loop-carried value is seeded with acc_18866. | |
let {f32 x_18873} = | |
loop {f32 redout_18953} = {acc_18866} | |
for i_18954:i32 < residual_input_18750 do { | |
let {f32 x_18877} = | |
full_tile_18833[ltid_x_18799, | |
i_18954] | |
let {f32 x_18878} = | |
full_tile_18834[i_18954, | |
ltid_y_18800] | |
let {f32 res_18879} = | |
fmul32(x_18877, x_18878) | |
let {f32 res_18876} = | |
fadd32(res_18879, redout_18953) | |
in {res_18876} | |
} | |
in {x_18873} | |
} else {acc_18866} | |
return {returns (private) acc_18872} | |
} | |
-- Copy of acc_18859 into a separate array; the annotation below places the | |
-- copy in mem_19613 with a row-major layout (strides [tile_size_18609, 1]). | |
-- acc_nonext_copy_19615 : [tile_size_18609][tile_size_18609]f32@@mem_19613->{base: [tile_size_18609, tile_size_18609]; contiguous: True; LMADs: [{offset: 0i32; strides: [tile_size_18609, 1i32]; rotates: [0i32, 0i32]; shape: [tile_size_18609, tile_size_18609]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[tile_size_18609][tile_size_18609]f32 acc_nonext_copy_19615} = | |
copy(acc_18859) | |
in {acc_nonext_copy_19615} | |
} | |
-- Final per-thread result of the tile: allocate mem_19405 (bytes_19368 bytes) | |
-- and let each (ltid_x, ltid_y) thread produce one element. A thread whose | |
-- offset indices (binop_x_18714 + ltid_x, binop_x_18716 + ltid_y) are both < 16 | |
-- returns | |
--   matb_13831[i_13928 + gtid_16258, i_13928 + gtid_16259, gtid_x, gtid_y] | |
--     - acc_after_residual_18880[ltid_x, ltid_y] | |
-- and every other thread returns 0.0. | |
let {mem@[]f32 mem_19405} = | |
alloc(bytes_19368, @[]f32) | |
-- thread_res_18907 : [tile_size_18609][tile_size_18609]f32@@mem_19405->{base: [tile_size_18609, tile_size_18609]; contiguous: True; LMADs: [{offset: 0i32; strides: [tile_size_18609, 1i32]; rotates: [0i32, 0i32]; shape: [tile_size_18609, tile_size_18609]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[tile_size_18609][tile_size_18609]f32 thread_res_18907} = | |
segmap_thread | |
(#groups=num_groups_top_18619; groupsize=group_size_18610) | |
(ltid_x_18881 < tile_size_18609, | |
ltid_y_18882 < tile_size_18609) (~ltid_flat_18883) : {f32} { | |
let {i32 gtid_18892} = | |
add32(binop_x_18714, ltid_x_18881) | |
let {i32 gtid_18894} = | |
add32(binop_x_18716, ltid_y_18882) | |
let {bool binop_x_18896} = | |
slt32(gtid_18892, 16i32) | |
let {bool binop_y_18897} = | |
slt32(gtid_18894, 16i32) | |
let {bool cond_18898} = | |
logand(binop_x_18896, binop_y_18897) | |
let {f32 postlude_18899} = | |
-- Branch returns: {f32} | |
if cond_18898 | |
then { | |
-- Accumulated product value for this position. | |
let {f32 res_18895} = | |
acc_after_residual_18880[ltid_x_18881, | |
ltid_y_18882] | |
-- Block coordinates are offset by i_13928 before indexing matb_13831. | |
let {i32 j_p_i_t_s_18902} = | |
add32(i_13928, gtid_16258) | |
let {i32 j_p_i_t_s_18903} = | |
add32(i_13928, gtid_16259) | |
let {f32 x_18904} = | |
<index_certs_13978> | |
matb_13831[j_p_i_t_s_18902, | |
j_p_i_t_s_18903, | |
gtid_18892, | |
gtid_18894] | |
let {f32 res_18905} = | |
fsub32(x_18904, res_18895) | |
in {res_18905} | |
} else {0.0f32} | |
return {returns (private) postlude_18899} | |
} | |
-- View thread_res_18907 as a 4-D array with two leading unit dimensions | |
-- ([1][1][tile_size][tile_size]). Per the annotations below this is an alias: | |
-- it stays in mem_19405 and no data is moved. | |
-- thread_res_18908 aliases thread_res_18907 | |
-- thread_res_18908 : [1i32][1i32][tile_size_18609][tile_size_18609]f32@@mem_19405->{base: [1i32, 1i32, tile_size_18609, tile_size_18609]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (tile_size_18609) (tile_size_18609), mul32 (tile_size_18609) (tile_size_18609), tile_size_18609, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [1i32, 1i32, tile_size_18609, tile_size_18609]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
let {[1i32][1i32][tile_size_18609][tile_size_18609]f32 thread_res_18908} = | |
reshape((1i32, 1i32, tile_size_18609, | |
tile_size_18609), | |
thread_res_18907) | |
return {tile(j_m_i_13929 / 1i32, | |
j_m_i_13929 / 1i32, | |
16i32 / tile_size_18609, | |
16i32 / tile_size_18609) thread_res_18908} | |
} | |
in {mem_19414, res_17139} | |
} else { | |
-- res_17151 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@res_mem_19454->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
let {mem res_mem_19454; | |
[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_17151} = | |
-- Branch returns: {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32@?0->{base: [j_m_i_13929, | |
-- j_m_i_13929, | |
-- 16i32, | |
-- 16i32]; | |
-- contiguous: True; | |
-- LMADs: [{offset: 0i32; | |
-- strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; | |
-- rotates: [0i32, 0i32, 0i32, 0i32]; | |
-- shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; | |
-- permutation: [0, 1, 2, 3]; | |
-- monotonicity: [Inc, Inc, Inc, Inc]}]}} | |
if intra_suff_and_fits_17137 | |
then { | |
-- Group-level version of the block update. The index space is | |
-- (gtid_16284, gtid_16285) < j_m_i_13929 and (gtid_16286, gtid_16287) < 16, | |
-- with a group size of 16. For each index the result is | |
--   matb_13831[i_13928 + gtid_16284, i_13928 + gtid_16285, gtid_16286, gtid_16287] | |
--     - sum over k < 16 of res_16959[gtid_16284, gtid_16286, k] | |
--                          * res_15832[gtid_16285, gtid_16287, k] | |
-- where the sum is computed by the nested segred_thread. | |
let {mem mem_19424} = | |
alloc(bytes_19490) | |
-- res_17152 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@mem_19424->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_17152} = | |
segmap_group | |
(#groups=convop_x_19305; groupsize=16i32) | |
(gtid_16284 < j_m_i_13929, | |
gtid_16285 < j_m_i_13929, | |
gtid_16286 < 16i32, | |
gtid_16287 < 16i32) (~phys_tid_16295) : {f32} { | |
-- Block coordinates are offset by i_13928 before indexing matb_13831. | |
let {i32 j_p_i_t_s_18456} = | |
add32(i_13928, gtid_16284) | |
let {i32 j_p_i_t_s_18458} = | |
add32(i_13928, gtid_16285) | |
let {f32 x_17154} = | |
<index_certs_13978> | |
matb_13831[j_p_i_t_s_18456, | |
j_p_i_t_s_18458, | |
gtid_16286, gtid_16287] | |
-- Reduction with fadd32 and neutral element 0.0 over gtid_16293 < 16. | |
let {f32 res_17156} = | |
segred_thread | |
(#groups=convop_x_19305; groupsize=16i32) | |
({{0.0f32}, | |
[], | |
fn {f32} (f32 x_17157, | |
f32 x_17158) => | |
let {f32 res_17159} = | |
fadd32(x_17157, x_17158) | |
in {res_17159}}) | |
(gtid_16293 < 16i32) (~phys_tid_16294) : {f32} { | |
let {f32 x_17160} = | |
res_16959[gtid_16284, | |
gtid_16286, | |
gtid_16293] | |
let {f32 x_17161} = | |
res_15832[gtid_16285, | |
gtid_16287, | |
gtid_16293] | |
let {f32 res_17162} = | |
fmul32(x_17160, x_17161) | |
return {returns res_17162} | |
} | |
let {f32 res_17163} = | |
fsub32(x_17154, res_17156) | |
return {returns res_17163} | |
} | |
in {mem_19424, res_17152} | |
} else { | |
-- Fully flattened version, part 1: the sum of products as one segmented | |
-- reduction. | |
--  * nest_size_17181 = 16 * nest_size_17115 is the size passed to | |
--    calc_num_groups together with the segred group-size and num-groups | |
--    tuning parameters. | |
--  * res_16959 and res_15832 are first manifested with the identity | |
--    permutation (0, 1, 2) into fresh row-major buffers (strides [256, 16, 1]). | |
--  * The segred has a 5-D index space but a 4-D result, so the last dimension | |
--    (gtid_16380 < 16) is the one being summed: | |
--      result[a, b, c, d] = sum over k < 16 of | |
--        res_rowmajor_18594[a, c, k] * res_rowmajor_18595[b, d, k] | |
let {i64 nest_size_17181} = | |
mul64(16i64, nest_size_17115) | |
let {i32 segred_group_size_17182} = | |
get_size(segred_group_size_16375, | |
group_size) | |
let {i32 num_groups_17183} = | |
calc_num_groups(nest_size_17181, | |
segred_num_groups_16377, | |
segred_group_size_17182) | |
let {mem mem_19429} = | |
alloc(bytes_19187) | |
-- res_rowmajor_18594 : [j_m_i_13929][16i32][16i32]f32@@mem_19429->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][16i32][16i32]f32 res_rowmajor_18594} = | |
manifest((0, 1, 2), res_16959) | |
let {mem mem_19434} = | |
alloc(bytes_19187) | |
-- res_rowmajor_18595 : [j_m_i_13929][16i32][16i32]f32@@mem_19434->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][16i32][16i32]f32 res_rowmajor_18595} = | |
manifest((0, 1, 2), res_15832) | |
let {mem mem_19444} = | |
alloc(bytes_19490) | |
-- res_r_r_r_r_17185 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@mem_19444->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_r_r_r_r_17185} = | |
segred_thread | |
(#groups=num_groups_17183; groupsize=segred_group_size_17182) | |
-- Reduction operator: fadd32 with neutral element 0.0, marked commutative. | |
({{0.0f32}, | |
[], | |
commutative fn {f32} (f32 x_17186, | |
f32 x_17187) => | |
let {f32 res_17188} = | |
fadd32(x_17186, x_17187) | |
in {res_17188}}) | |
(gtid_16353 < j_m_i_13929, | |
gtid_16354 < j_m_i_13929, | |
gtid_16355 < 16i32, gtid_16356 < 16i32, | |
gtid_16380 < 16i32) (~phys_tid_16381) : {f32} { | |
let {f32 x_17193} = | |
res_rowmajor_18594[gtid_16353, | |
gtid_16355, | |
gtid_16380] | |
let {f32 x_17194} = | |
res_rowmajor_18595[gtid_16354, | |
gtid_16356, | |
gtid_16380] | |
let {f32 res_17195} = | |
fmul32(x_17193, x_17194) | |
return {returns res_17195} | |
} | |
-- Fully flattened version, part 2: element-wise subtraction. | |
-- The group count is nest_size_17115 divided by the tuned group size, rounded | |
-- up ((n + gs - 1) squot gs), then converted to i32. Each thread computes | |
--   matb_13831[i_13928 + a, i_13928 + b, c, d] - res_r_r_r_r_17185[a, b, c, d] | |
-- for (a, b) < j_m_i_13929 and (c, d) < 16, into the fresh buffer mem_19453. | |
let {i32 segmap_group_size_17204} = | |
get_size(segmap_group_size_16342, | |
group_size) | |
let {i64 segmap_group_size_17205} = | |
sext i32 segmap_group_size_17204 to i64 | |
let {i64 y_17206} = | |
sub64(segmap_group_size_17205, 1i64) | |
let {i64 x_17207} = | |
add64(nest_size_17115, y_17206) | |
let {i64 segmap_usable_groups_64_17209} = | |
squot64(x_17207, segmap_group_size_17205) | |
let {i32 segmap_usable_groups_17210} = | |
sext i64 segmap_usable_groups_64_17209 to i32 | |
let {mem mem_19453} = | |
alloc(bytes_19490) | |
-- res_17211 : [j_m_i_13929][j_m_i_13929][16i32][16i32]f32@@mem_19453->{base: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (j_m_i_13929), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [j_m_i_13929, j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][j_m_i_13929][16i32][16i32]f32 res_17211} = | |
segmap_thread | |
(#groups=segmap_usable_groups_17210; groupsize=segmap_group_size_17204) | |
(gtid_16329 < j_m_i_13929, | |
gtid_16330 < j_m_i_13929, | |
gtid_16331 < 16i32, | |
gtid_16332 < 16i32) (~phys_tid_16333) : {f32} { | |
-- Block coordinates are offset by i_13928 before indexing matb_13831. | |
let {i32 j_p_i_t_s_18468} = | |
add32(i_13928, gtid_16329) | |
let {i32 j_p_i_t_s_18470} = | |
add32(i_13928, gtid_16330) | |
let {f32 x_17212} = | |
<index_certs_13978> | |
matb_13831[j_p_i_t_s_18468, | |
j_p_i_t_s_18470, | |
gtid_16331, gtid_16332] | |
let {f32 res_17213} = | |
res_r_r_r_r_17185[gtid_16329, | |
gtid_16330, | |
gtid_16331, | |
gtid_16332] | |
let {f32 res_17214} = | |
fsub32(x_17212, res_17213) | |
return {returns res_17214} | |
} | |
in {mem_19453, res_17211} | |
} | |
in {res_mem_19454, res_17151} | |
} | |
in {res_mem_19455, res_17138} | |
} | |
in {res_mem_19456, res_17087} | |
} | |
in {res_mem_19467, res_17071} | |
} | |
in {res_mem_19468, res_17022} | |
} | |
-- Copy res_16959 into a freshly allocated buffer (mem_19486, bytes_19500 | |
-- bytes). The annotation below gives the copy a row-major layout with strides | |
-- [256, 16, 1]. | |
let {mem mem_19486} = | |
alloc(bytes_19500) | |
-- res_linear_19487 : [j_m_i_13929][16i32][16i32]f32@@mem_19486->{base: [j_m_i_13929, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [j_m_i_13929, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[j_m_i_13929][16i32][16i32]f32 res_linear_19487} = | |
copy(res_16959) | |
in {res_mem_19479, mem_19486, res_17003, res_linear_19487} | |
} | |
in {res_mem_19488, res_mem_19489, res_17218, res_17219} | |
} | |
-- Write this step's four results back into the blocked matrix. Each update | |
-- consumes the previous array and, per the annotations, all of them live in | |
-- mem_19131, so the writes are in place. Slices written, in order: | |
--   1. [step, step, :, :]                  <- res_13927 | |
--   2. [step, i : i + j_m_i, :, :]         <- res_transformed_13974 | |
--   3. [i : i + j_m_i, step, :, :]         <- res_13981 | |
--   4. [i : i + j_m_i, i : i + j_m_i, :, :] <- res_13980 | |
-- (step = step_13832, i = i_13928, j_m_i = j_m_i_13929; `a:+n*s` denotes n | |
-- elements starting at a with stride s.) | |
-- NOTE(review): by their index patterns these look like the diagonal block, | |
-- the block row, the block column and the trailing interior of a blocked LU | |
-- step - confirm against lud.fut. | |
-- matb_14031 : [num_blocks_13772][num_blocks_13772][16i32][16i32]f32@@mem_19131->{base: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (num_blocks_13772), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
let {[num_blocks_13772][num_blocks_13772][16i32][16i32]f32 matb_14031} = | |
-- Consumes matb_13831 | |
<index_certs_13837> | |
matb_13831 with [step_13832, step_13832, 0i32:+16i32*1i32, | |
0i32:+16i32*1i32] <- res_13927 | |
-- matb_14032 : [num_blocks_13772][num_blocks_13772][16i32][16i32]f32@@mem_19131->{base: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (num_blocks_13772), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
let {[num_blocks_13772][num_blocks_13772][16i32][16i32]f32 matb_14032} = | |
-- Consumes matb_14031 | |
<index_certs_13943> | |
matb_14031 with [step_13832, i_13928:+j_m_i_13929*1i32, | |
0i32:+16i32*1i32, | |
0i32:+16i32*1i32] <- res_transformed_13974 | |
-- matb_14033 : [num_blocks_13772][num_blocks_13772][16i32][16i32]f32@@mem_19131->{base: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (num_blocks_13772), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
let {[num_blocks_13772][num_blocks_13772][16i32][16i32]f32 matb_14033} = | |
-- Consumes matb_14032 | |
<index_certs_13975, index_certs_13943> | |
matb_14032 with [i_13928:+j_m_i_13929*1i32, step_13832, | |
0i32:+16i32*1i32, 0i32:+16i32*1i32] <- res_13981 | |
-- matb_14034 : [num_blocks_13772][num_blocks_13772][16i32][16i32]f32@@mem_19131->{base: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (num_blocks_13772), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
let {[num_blocks_13772][num_blocks_13772][16i32][16i32]f32 matb_14034} = | |
-- Consumes matb_14033 | |
<index_certs_13978> | |
matb_14033 with [i_13928:+j_m_i_13929*1i32, i_13928:+j_m_i_13929*1i32, | |
0i32:+16i32*1i32, 0i32:+16i32*1i32] <- res_13980 | |
in {matb_14034} | |
} | |
-- Extract the block at [upper_bound, upper_bound] from matb_13830 and view it | |
-- as a batch of 16 x 16 matrices. | |
-- Bounds check: 0 <= upper_bound_13818 < num_blocks_13772. The same index is | |
-- used in both block dimensions, which is why index_ok is the logand of the | |
-- check with itself. | |
let {bool x_14035} = sle32(0i32, upper_bound_13818) | |
let {bool y_14036} = slt32(upper_bound_13818, num_blocks_13772) | |
let {bool bounds_check_14037} = logand(x_14035, y_14036) | |
let {bool index_ok_14038} = logand(bounds_check_14037, bounds_check_14037) | |
let {cert index_certs_14039} = | |
assert(index_ok_14038, "Index [", upper_bound_13818, ", ", | |
upper_bound_13818, | |
"] out of bounds for array of shape [", | |
num_blocks_13772, "][", num_blocks_13772, "].", | |
"lud.fut:177:27-52") | |
-- The slice is an alias into mem_19131 (see the offset in the annotation); no | |
-- copy is made here. | |
-- lud_diagonal_arg_14040 aliases matb_13830 | |
-- lud_diagonal_arg_14040 : [16i32][16i32]f32@@mem_19131->{base: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; contiguous: True; LMADs: [{offset: add32 (mul32 (upper_bound_13818) (mul32 (256i32) (num_blocks_13772))) (mul32 (upper_bound_13818) (256i32)); strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[16i32][16i32]f32 lud_diagonal_arg_14040} = | |
<index_certs_14039> | |
matb_13830[upper_bound_13818, upper_bound_13818, 0i32:+16i32*1i32, | |
0i32:+16i32*1i32] | |
-- The batch size is the constant 1 wrapped in opaque(), so the reshape | |
-- precondition (16 * res_14041 == 16) is still checked by an assert. | |
let {i32 res_14041} = opaque(1i32) | |
let {i32 x_14042} = mul32(16i32, res_14041) | |
let {bool assert_arg_14043} = eq_i32(x_14042, 16i32) | |
let {cert dim_ok_14044} = | |
assert(assert_arg_14043, | |
"new shape has different number of elements than old shape", | |
"/prelude/array.fut:95:3-33") | |
-- res_14045 aliases lud_diagonal_arg_14040 | |
-- res_14045 : [res_14041][16i32][16i32]f32@@mem_19131->{base: [res_14041, 16i32, 16i32]; contiguous: True; LMADs: [{offset: add32 (mul32 (upper_bound_13818) (mul32 (256i32) (num_blocks_13772))) (mul32 (upper_bound_13818) (256i32)); strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_14041, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[res_14041][16i32][16i32]f32 res_14045} = | |
<dim_ok_14044> | |
reshape((res_14041, 16i32, 16i32), lud_diagonal_arg_14040) | |
-- Version selection and buffer sizes for the block computation that follows. | |
-- intra_suff_and_fits_17377 is true when fits_14842 holds and the tuning | |
-- threshold suff_intra_par_24 is at most 16. | |
-- Byte sizes (each is an element count times 4, matching 4-byte f32 elements): | |
--   bytes_19511 = 4 * (256 * res_14041)      (element count computed in i32) | |
--   bytes_19525 = 4 * 16 * 16 * res_14041    (element count computed in i64) | |
--   bytes_19540 = 4 * (16 * x_14042) | |
--   bytes_19545 = 4 * 16 * res_14041 | |
let {bool suff_intra_par_17374} = | |
get_size(suff_intra_par_24, threshold (!suff_outer_par_23)) <= 16i32 | |
let {bool intra_suff_and_fits_17377} = | |
logand(fits_14842, suff_intra_par_17374) | |
let {i32 convop_x_19512} = mul32(256i32, res_14041) | |
let {i64 binop_x_19513} = sext i32 convop_x_19512 to i64 | |
let {i64 bytes_19511} = mul64(4i64, binop_x_19513) | |
let {i64 binop_x_19526} = sext i32 res_14041 to i64 | |
let {i64 binop_x_19528} = mul64(16i64, binop_x_19526) | |
let {i64 binop_x_19530} = mul64(16i64, binop_x_19528) | |
let {i64 bytes_19525} = mul64(4i64, binop_x_19530) | |
let {i32 convop_x_19542} = mul32(16i32, x_14042) | |
let {i64 binop_x_19543} = sext i32 convop_x_19542 to i64 | |
let {i64 bytes_19540} = mul64(4i64, binop_x_19543) | |
let {i64 bytes_19545} = mul64(4i64, binop_x_19528) | |
-- res_14046 : [res_14041][16i32][16i32]f32@@res_mem_19561->{base: [res_14041, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_14041, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {mem res_mem_19561; | |
[res_14041][16i32][16i32]f32 res_14046} = | |
-- Branch returns: {[res_14041][16i32][16i32]f32@?0->{base: [res_14041, 16i32, | |
-- 16i32]; | |
-- contiguous: True; | |
-- LMADs: [{offset: 0i32; | |
-- strides: [256i32, 16i32, 1i32]; | |
-- rotates: [0i32, 0i32, 0i32]; | |
-- shape: [res_14041, 16i32, 16i32]; | |
-- permutation: [0, 1, 2]; | |
-- monotonicity: [Inc, Inc, Inc]}]}} | |
if intra_suff_and_fits_17377 | |
then { | |
-- Intra-group version: one group of 16 threads per 16 x 16 matrix of the batch. | |
--  * res_14045 is manifested with permutation (1, 2, 0), so the batch index | |
--    becomes the innermost dimension of the underlying buffer (see the | |
--    annotation). | |
--  * Inside the group, the matrix is copied into a 1024-byte @local buffer | |
--    (16 * 16 f32 values); a second 64-byte @local buffer holds one column. | |
--  * The loop runs i = 0 .. 14 and updates the matrix in place: | |
--      column i:  for rows g > i, | |
--        mat[g, i] = (mat[g, i] - sum_{k < i} mat[g, k] * mat[k, i]) / mat[i, i] | |
--      row i + 1: for columns g > i, | |
--        mat[i+1, g] = mat[i+1, g] - sum_{k < i+1} mat[k, g] * mat[i+1, k] | |
--    Elements outside those ranges are rewritten with their current value. | |
-- NOTE(review): the division uses mat[i, i] directly and no row exchange is | |
-- visible here, so this looks like LU factorisation without pivoting - confirm | |
-- against lud.fut. | |
let {mem mem_19514} = | |
alloc(bytes_19511) | |
-- res_coalesced_18598 : [res_14041][16i32][16i32]f32@@mem_19514->{base: [16i32, 16i32, res_14041]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (res_14041) (16i32), res_14041, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [16i32, 16i32, res_14041]; permutation: [2, 0, 1]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[res_14041][16i32][16i32]f32 res_coalesced_18598} = manifest((1, 2, | |
0), | |
res_14045) | |
let {mem mem_19531} = | |
alloc(bytes_19525) | |
-- res_17378 : [res_14041][16i32][16i32]f32@@mem_19531->{base: [res_14041, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_14041, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[res_14041][16i32][16i32]f32 res_17378} = | |
segmap_group | |
(#groups=res_14041; groupsize=16i32) | |
(gtid_17224 < res_14041) (~phys_tid_17267) : {[16i32][16i32]f32} { | |
-- x_17379 aliases res_coalesced_18598 | |
-- x_17379 : [16i32][16i32]f32@@mem_19514->{base: [16i32, 16i32, res_14041]; contiguous: False; LMADs: [{offset: gtid_17224; strides: [mul32 (res_14041) (16i32), res_14041]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[16i32][16i32]f32 x_17379} = res_coalesced_18598[gtid_17224, | |
0i32:+16i32*1i32, | |
0i32:+16i32*1i32] | |
-- Working copy of this group's matrix; the loop below consumes it. | |
let {mem@local mem_19519} = | |
alloc(1024i64, @local) | |
-- smaller_replicate_17380 : [16i32][16i32]f32@@mem_19519->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[16i32][16i32]f32 smaller_replicate_17380} = copy(x_17379) | |
-- Scratch buffer for the 16 freshly computed values of column i. | |
let {mem@local mem_19523} = | |
alloc(64i64, @local) | |
-- res_17381 : [16i32][16i32]f32@@mem_19519->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[16i32][16i32]f32 res_17381} = | |
-- Consumes smaller_replicate_17380 | |
-- mat_17382 : *[16i32][16i32]f32@@mem_19519->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
loop {*[16i32][16i32]f32 mat_17382} = {smaller_replicate_17380} | |
for i_17383:i32 < 15i32 do { | |
-- Column step: one thread per row gtid_17228. | |
-- res_17385 : [16i32]f32@@mem_19523->{base: [16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32]; rotates: [0i32]; shape: [16i32]; permutation: [0]; monotonicity: [Inc]}]} | |
let {[16i32]f32 res_17385} = | |
segmap_thread | |
(#groups=res_14041; groupsize=16i32) | |
(gtid_17228 < 16i32) (~phys_tid_17229) : {f32} { | |
let {bool cond_17390} = slt32(i_17383, gtid_17228) | |
let {f32 res_17391} = | |
-- Branch returns: {f32} | |
if cond_17390 | |
then { | |
let {f32 x_17392} = mat_17382[gtid_17228, i_17383] | |
let {f32 res_17394} = | |
loop {f32 redout_18955} = {0.0f32} | |
for i_18956:i32 < i_17383 do { | |
let {f32 x_17398} = mat_17382[gtid_17228, i_18956] | |
let {f32 x_17399} = mat_17382[i_18956, i_17383] | |
let {f32 res_17400} = fmul32(x_17398, x_17399) | |
let {f32 res_17397} = fadd32(res_17400, redout_18955) | |
in {res_17397} | |
} | |
let {f32 x_17401} = fsub32(x_17392, res_17394) | |
-- Divide by the diagonal element of the current column. | |
let {f32 y_17402} = mat_17382[i_17383, i_17383] | |
let {f32 res_17403} = fdiv32(x_17401, y_17402) | |
in {res_17403} | |
} else { | |
let {f32 res_17404} = mat_17382[gtid_17228, i_17383] | |
in {res_17404} | |
} | |
return {returns res_17391} | |
} | |
-- Store the new column i back into the matrix (in place, same memory). | |
-- mat_17405 : [16i32][16i32]f32@@mem_19519->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[16i32][16i32]f32 mat_17405} = | |
-- Consumes mat_17382 | |
mat_17382 with [0i32:+16i32*1i32, i_17383] <- res_17385 | |
let {i32 j_17406} = add32(1i32, i_17383) | |
-- Row step: one thread per column gtid_17250, writing row j = i + 1. | |
-- mat_17423 : [16i32][16i32]f32@@mem_19519->{base: [16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[16i32][16i32]f32 mat_17423} = | |
-- Consumes mat_17405 | |
segmap_thread | |
(#groups=res_14041; groupsize=16i32) | |
(gtid_17250 < 16i32) (~phys_tid_17251) : {f32} { | |
let {bool cond_17409} = slt32(i_17383, gtid_17250) | |
let {f32 res_17410} = | |
-- Branch returns: {f32} | |
if cond_17409 | |
then { | |
let {f32 x_17411} = mat_17405[j_17406, gtid_17250] | |
let {f32 res_17414} = | |
loop {f32 redout_18957} = {0.0f32} | |
for i_18958:i32 < j_17406 do { | |
let {f32 x_17418} = mat_17405[i_18958, gtid_17250] | |
let {f32 x_17419} = mat_17405[j_17406, i_18958] | |
let {f32 res_17420} = fmul32(x_17418, x_17419) | |
let {f32 res_17417} = fadd32(res_17420, redout_18957) | |
in {res_17417} | |
} | |
let {f32 res_17421} = fsub32(x_17411, res_17414) | |
in {res_17421} | |
} else { | |
let {f32 res_17422} = mat_17405[j_17406, gtid_17250] | |
in {res_17422} | |
} | |
return {mat_17405 with ([j_17406 < 16i32, | |
gtid_17250 < 16i32] <- res_17410)} | |
} | |
in {mat_17423} | |
} | |
return {returns res_17381} | |
} | |
in {mem_19531, res_17378} | |
} else { | |
-- Setup for the flattened (non-intra-group) version of the same computation. | |
--  * res_14045 is copied into a fresh buffer (mem_19538) that the loop below | |
--    updates in place. | |
--  * Four kernels are launched per loop iteration; for each, the group size is | |
--    read from its own tuning parameter and the group count is | |
--    binop_x_19528 divided by that group size, rounded up | |
--    ((gs - 1 + n) squot gs), then converted to i32. | |
--  * Four scratch buffers are allocated once, outside the loop: two of | |
--    bytes_19540 bytes (mem_19544, mem_19554) and two of bytes_19545 bytes | |
--    (mem_19549, mem_19559). | |
let {mem mem_19538} = | |
alloc(bytes_19525) | |
-- smaller_replicate_r_17941 : [res_14041][16i32][16i32]f32@@mem_19538->{base: [res_14041, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_14041, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[res_14041][16i32][16i32]f32 smaller_replicate_r_17941} = | |
copy(res_14045) | |
let {i32 segmap_group_size_17973} = | |
get_size(segmap_group_size_17729, group_size) | |
let {i64 segmap_group_size_17974} = | |
sext i32 segmap_group_size_17973 to i64 | |
let {i64 y_17975} = sub64(segmap_group_size_17974, 1i64) | |
let {i64 x_17976} = add64(y_17975, binop_x_19528) | |
let {i64 x_18524} = squot64(x_17976, segmap_group_size_17974) | |
let {i32 segmap_usable_groups_17979} = sext i64 x_18524 to i32 | |
let {i32 segmap_group_size_18006} = | |
get_size(segmap_group_size_17669, group_size) | |
let {i64 segmap_group_size_18007} = | |
sext i32 segmap_group_size_18006 to i64 | |
let {i64 y_18008} = sub64(segmap_group_size_18007, 1i64) | |
let {i64 x_18009} = add64(y_18008, binop_x_19528) | |
let {i64 x_18526} = squot64(x_18009, segmap_group_size_18007) | |
let {i32 segmap_usable_groups_18012} = sext i64 x_18526 to i32 | |
let {i32 segmap_group_size_18046} = | |
get_size(segmap_group_size_17593, group_size) | |
let {i64 segmap_group_size_18047} = | |
sext i32 segmap_group_size_18046 to i64 | |
let {i64 y_18048} = sub64(segmap_group_size_18047, 1i64) | |
let {i64 x_18049} = add64(y_18048, binop_x_19528) | |
let {i64 x_18528} = squot64(x_18049, segmap_group_size_18047) | |
let {i32 segmap_usable_groups_18052} = sext i64 x_18528 to i32 | |
let {i32 segmap_group_size_18076} = | |
get_size(segmap_group_size_17536, group_size) | |
let {i64 segmap_group_size_18077} = | |
sext i32 segmap_group_size_18076 to i64 | |
let {i64 y_18078} = sub64(segmap_group_size_18077, 1i64) | |
let {i64 x_18079} = add64(y_18078, binop_x_19528) | |
let {i64 x_18530} = squot64(x_18079, segmap_group_size_18077) | |
let {i32 segmap_usable_groups_18082} = sext i64 x_18530 to i32 | |
let {mem mem_19544} = | |
alloc(bytes_19540) | |
let {mem mem_19549} = | |
alloc(bytes_19545) | |
let {mem mem_19554} = | |
alloc(bytes_19540) | |
let {mem mem_19559} = | |
alloc(bytes_19545) | |
-- Flattened version of the factorisation loop over the whole batch | |
-- (b < res_14041). Each of the 15 iterations (i = 0 .. 14) runs four kernels: | |
--   1. res_r_17980[b, g]: new value for element [b, g, i] of column i; | |
--      for g > i it is | |
--        (mat[b, g, i] - sum_{k < i} mat[b, g, k] * mat[b, k, i]) / mat[b, i, i] | |
--      otherwise the current value. | |
--   2. mat_r_18013: writes those values into [b, g, i] in place. | |
--   3. res_r_18053[b, g]: new value for element [b, j, g] of row j = i + 1; | |
--      for g > i it is | |
--        mat[b, j, g] - sum_{k < j} mat[b, k, g] * mat[b, j, k] | |
--      otherwise the current value. | |
--   4. res_18083: writes those values into [b, j, g] in place. | |
-- Before steps 1 and 3 the matrix is manifested with permutation (2, 0, 1) | |
-- into a scratch buffer, and one operand of each inner product is read from | |
-- that copy instead of the original array. | |
-- res_17942 : [res_14041][16i32][16i32]f32@@mem_19538->{base: [res_14041, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_14041, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[res_14041][16i32][16i32]f32 res_17942} = | |
-- Consumes smaller_replicate_r_17941 | |
-- mat_expanded_17943 : *[res_14041][16i32][16i32]f32@@mem_19538->{base: [res_14041, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_14041, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
loop {*[res_14041][16i32][16i32]f32 mat_expanded_17943} = {smaller_replicate_r_17941} | |
for i_17944:i32 < 15i32 do { | |
-- mat_expanded_coalesced_18600 : [res_14041][16i32][16i32]f32@@mem_19544->{base: [16i32, res_14041, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (16i32) (res_14041), 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [16i32, res_14041, 16i32]; permutation: [1, 2, 0]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[res_14041][16i32][16i32]f32 mat_expanded_coalesced_18600} = | |
manifest((2, 0, 1), mat_expanded_17943) | |
-- Kernel 1: compute the new column i for every matrix in the batch. | |
-- res_r_17980 : [res_14041][16i32]f32@@mem_19549->{base: [res_14041, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [res_14041, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[res_14041][16i32]f32 res_r_17980} = | |
segmap_thread | |
(#groups=segmap_usable_groups_17979; groupsize=segmap_group_size_17973) | |
(gtid_17722 < res_14041, | |
gtid_17723 < 16i32) (~phys_tid_17724) : {f32} { | |
let {bool cond_17986} = slt32(i_17944, gtid_17723) | |
let {f32 res_17987} = | |
-- Branch returns: {f32} | |
if cond_17986 | |
then { | |
let {f32 x_17988} = mat_expanded_17943[gtid_17722, gtid_17723, | |
i_17944] | |
let {f32 res_17990} = | |
loop {f32 redout_18959} = {0.0f32} | |
for i_18960:i32 < i_17944 do { | |
let {f32 x_17994} = | |
mat_expanded_coalesced_18600[gtid_17722, gtid_17723, | |
i_18960] | |
let {f32 x_17995} = mat_expanded_17943[gtid_17722, | |
i_18960, i_17944] | |
let {f32 res_17996} = fmul32(x_17994, x_17995) | |
let {f32 res_17993} = fadd32(res_17996, redout_18959) | |
in {res_17993} | |
} | |
let {f32 x_17997} = fsub32(x_17988, res_17990) | |
-- Divide by the diagonal element of the current column. | |
let {f32 y_17998} = mat_expanded_17943[gtid_17722, i_17944, | |
i_17944] | |
let {f32 res_17999} = fdiv32(x_17997, y_17998) | |
in {res_17999} | |
} else { | |
let {f32 res_18000} = mat_expanded_17943[gtid_17722, | |
gtid_17723, i_17944] | |
in {res_18000} | |
} | |
return {returns res_17987} | |
} | |
-- Kernel 2: write column i back into the matrices (in place, mem_19538). | |
-- mat_r_18013 : [res_14041][16i32][16i32]f32@@mem_19538->{base: [res_14041, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_14041, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[res_14041][16i32][16i32]f32 mat_r_18013} = | |
-- Consumes mat_expanded_17943 | |
segmap_thread | |
(#groups=segmap_usable_groups_18012; groupsize=segmap_group_size_18006) | |
(gtid_17661 < res_14041, | |
gtid_slice_17662 < 16i32) (~phys_tid_17664) : {f32} { | |
let {f32 v_18016} = res_r_17980[gtid_17661, gtid_slice_17662] | |
return {mat_expanded_17943 with ([gtid_17661 < res_14041, | |
gtid_slice_17662 < 16i32, | |
i_17944 < 16i32] <- v_18016)} | |
} | |
let {i32 j_18027} = add32(1i32, i_17944) | |
-- mat_r_coalesced_18603 : [res_14041][16i32][16i32]f32@@mem_19554->{base: [16i32, res_14041, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (16i32) (res_14041), 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [16i32, res_14041, 16i32]; permutation: [1, 2, 0]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[res_14041][16i32][16i32]f32 mat_r_coalesced_18603} = | |
manifest((2, 0, 1), mat_r_18013) | |
-- Kernel 3: compute the new row j = i + 1 for every matrix in the batch. | |
-- res_r_18053 : [res_14041][16i32]f32@@mem_19559->{base: [res_14041, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [res_14041, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[res_14041][16i32]f32 res_r_18053} = | |
segmap_thread | |
(#groups=segmap_usable_groups_18052; groupsize=segmap_group_size_18046) | |
(gtid_17586 < res_14041, | |
gtid_17587 < 16i32) (~phys_tid_17588) : {f32} { | |
let {bool cond_18057} = slt32(i_17944, gtid_17587) | |
let {f32 res_18058} = | |
-- Branch returns: {f32} | |
if cond_18057 | |
then { | |
let {f32 x_18059} = mat_r_18013[gtid_17586, j_18027, | |
gtid_17587] | |
let {f32 res_18062} = | |
loop {f32 redout_18961} = {0.0f32} | |
for i_18962:i32 < j_18027 do { | |
let {f32 x_18066} = mat_r_18013[gtid_17586, i_18962, | |
gtid_17587] | |
let {f32 x_18067} = mat_r_coalesced_18603[gtid_17586, | |
j_18027, | |
i_18962] | |
let {f32 res_18068} = fmul32(x_18066, x_18067) | |
let {f32 res_18065} = fadd32(res_18068, redout_18961) | |
in {res_18065} | |
} | |
let {f32 res_18069} = fsub32(x_18059, res_18062) | |
in {res_18069} | |
} else { | |
let {f32 res_18070} = mat_r_18013[gtid_17586, j_18027, | |
gtid_17587] | |
in {res_18070} | |
} | |
return {returns res_18058} | |
} | |
-- Kernel 4: write row j back into the matrices (in place, mem_19538). | |
-- res_18083 : [res_14041][16i32][16i32]f32@@mem_19538->{base: [res_14041, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32]; shape: [res_14041, 16i32, 16i32]; permutation: [0, 1, 2]; monotonicity: [Inc, Inc, Inc]}]} | |
let {[res_14041][16i32][16i32]f32 res_18083} = | |
-- Consumes mat_r_18013 | |
segmap_thread | |
(#groups=segmap_usable_groups_18082; groupsize=segmap_group_size_18076) | |
(gtid_17528 < res_14041, | |
gtid_slice_17529 < 16i32) (~phys_tid_17531) : {f32} { | |
let {f32 v_18087} = res_r_18053[gtid_17528, gtid_slice_17529] | |
return {mat_r_18013 with ([gtid_17528 < res_14041, | |
j_18027 < 16i32, | |
gtid_slice_17529 < 16i32] <- v_18087)} | |
} | |
in {res_18083} | |
} | |
in {mem_19538, res_17942} | |
} | |
-- Take the first matrix of the batch result res_14046 (after asserting that | |
-- the batch is non-empty, 0 < res_14041) and write it into the block at | |
-- [upper_bound_13818, upper_bound_13818] of matb_13830. The update consumes | |
-- matb_13830 and, per the annotation, the result stays in mem_19131. | |
let {bool y_14133} = slt32(0i32, res_14041) | |
let {cert index_certs_14134} = | |
assert(y_14133, "Index [", 0i32, "] out of bounds for array of shape [", | |
res_14041, "].", "/prelude/array.fut:15:29-32") | |
-- res_14135 aliases res_14046 | |
-- res_14135 : [16i32][16i32]f32@@res_mem_19561->{base: [res_14041, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [16i32, 1i32]; rotates: [0i32, 0i32]; shape: [16i32, 16i32]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[16i32][16i32]f32 res_14135} = | |
<index_certs_14134> | |
res_14046[0i32, 0i32:+16i32*1i32, 0i32:+16i32*1i32] | |
-- matb_14136 : [num_blocks_13772][num_blocks_13772][16i32][16i32]f32@@mem_19131->{base: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [mul32 (256i32) (num_blocks_13772), 256i32, 16i32, 1i32]; rotates: [0i32, 0i32, 0i32, 0i32]; shape: [num_blocks_13772, num_blocks_13772, 16i32, 16i32]; permutation: [0, 1, 2, 3]; monotonicity: [Inc, Inc, Inc, Inc]}]} | |
let {[num_blocks_13772][num_blocks_13772][16i32][16i32]f32 matb_14136} = | |
-- Consumes matb_13830 | |
<index_certs_14039> | |
matb_13830 with [upper_bound_13818, upper_bound_13818, 0i32:+16i32*1i32, | |
0i32:+16i32*1i32] <- res_14135 | |
let {bool bounds_invalid_upwards_14137} = slt32(n_13773, 0i32) | |
let {bool valid_14138} = not bounds_invalid_upwards_14137 | |
let {cert range_valid_c_14139} = | |
assert(valid_14138, "Range ", 0i32, "..", 1i32, "..<", n_13773, | |
" is invalid.", "/prelude/math.fut:453:23-30") | |
let {i64 nest_size_18244} = mul64(binop_y_19103, binop_y_19103) | |
let {i32 segmap_group_size_18245} = | |
get_size(segmap_group_size_18162, group_size) | |
let {i64 segmap_group_size_18246} = sext i32 segmap_group_size_18245 to i64 | |
let {i64 y_18247} = sub64(segmap_group_size_18246, 1i64) | |
let {i64 x_18248} = add64(nest_size_18244, y_18247) | |
let {i64 segmap_usable_groups_64_18250} = | |
squot64(x_18248, segmap_group_size_18246) | |
let {i32 segmap_usable_groups_18251} = | |
sext i64 segmap_usable_groups_64_18250 to i32 | |
let {i64 bytes_19562} = mul64(4i64, nest_size_18244) | |
let {mem mem_19566} = | |
alloc(bytes_19562) | |
-- res_18252 : [n_13773][n_13773]f32@@mem_19566->{base: [n_13773, n_13773]; contiguous: True; LMADs: [{offset: 0i32; strides: [n_13773, 1i32]; rotates: [0i32, 0i32]; shape: [n_13773, n_13773]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[n_13773][n_13773]f32 res_18252} = | |
segmap_thread | |
(#groups=segmap_usable_groups_18251; groupsize=segmap_group_size_18245) | |
(gtid_18155 < n_13773, gtid_18156 < n_13773) (~phys_tid_18157) : {f32} { | |
let {i32 index_primexp_18551} = sdiv32(gtid_18155, 16i32) | |
let {i32 binop_y_18549} = mul32(16i32, index_primexp_18551) | |
let {i32 index_primexp_18550} = sub32(gtid_18155, binop_y_18549) | |
let {i32 res_18256} = | |
<range_valid_c_14139> | |
sdiv32(gtid_18156, 16i32) | |
let {i32 y_18257} = mul32(16i32, res_18256) | |
let {i32 res_18258} = | |
<range_valid_c_14139> | |
sub32(gtid_18156, y_18257) | |
let {f32 res_18259} = | |
<range_valid_c_14139> | |
matb_14136[index_primexp_18551, res_18256, index_primexp_18550, | |
res_18258] | |
return {returns res_18259} | |
} | |
let {bool empty_slice_14153} = eq_i32(m_13764, 0i32) | |
let {i32 m_14154} = sub32(m_13764, 1i32) | |
let {bool zero_leq_i_p_m_t_s_14155} = sle32(0i32, m_14154) | |
let {bool i_p_m_t_s_leq_w_14156} = slt32(m_14154, n_13773) | |
let {bool y_14158} = logand(zero_leq_i_p_m_t_s_14155, i_p_m_t_s_leq_w_14156) | |
let {bool ok_or_empty_14160} = logor(empty_slice_14153, y_14158) | |
let {cert index_certs_14161} = | |
assert(ok_or_empty_14160, "Index [", 0i32, ":", m_13764, | |
"] out of bounds for array of shape [", n_13773, | |
"].", "/prelude/array.fut:27:44-49") | |
-- res_14162 aliases res_18252 | |
-- res_14162 : [m_13764][m_13764]f32@@mem_19566->{base: [n_13773, n_13773]; contiguous: False; LMADs: [{offset: mul32 (n_13773) (0i32); strides: [n_13773, 1i32]; rotates: [0i32, 0i32]; shape: [m_13764, m_13764]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[m_13764][m_13764]f32 res_14162} = | |
<index_certs_14161, range_valid_c_14139> | |
res_18252[0i32:+m_13764*1i32, 0i32:+m_13764*1i32] | |
let {i64 binop_x_19570} = mul64(binop_x_19107, binop_x_19107) | |
let {i64 bytes_19567} = mul64(4i64, binop_x_19570) | |
let {mem mem_19571} = | |
alloc(bytes_19567) | |
-- res_linear_19572 : [m_13764][m_13764]f32@@mem_19571->{base: [m_13764, m_13764]; contiguous: True; LMADs: [{offset: 0i32; strides: [m_13764, 1i32]; rotates: [0i32, 0i32]; shape: [m_13764, m_13764]; permutation: [0, 1]; monotonicity: [Inc, Inc]}]} | |
let {[m_13764][m_13764]f32 res_linear_19572} = copy(res_14162) | |
in {m_13764, m_13764, mem_19571, res_linear_19572} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment