Last active
March 18, 2017 13:07
-
-
Save titouanc/61640a9911c53e184c28a20ec8b74b82 to your computer and use it in GitHub Desktop.
Futhark changes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
fun convk1(kern: [x]f32, input: [x]f32): f32 = | |
reduceComm (+) 0f32 (map (*) kern input) | |
-- Futhark from a few weeks ago | |
fun conv1d(kern: [m]f32, input: [n]f32): []f32 = | |
map (fn i => convk1(kern, input[i:i+m])) | |
(iota (1 + n - m)) | |
-- Futhark ca9f992 | |
fun conv1d(kern: [m]f32, input: [n]f32): []f32 = | |
map (\i -> convk1(kern, unsafe input[i:i+m])) | |
(iota (1 + n - m)) | |
entry main(img: []f32, k: []f32): []f32 = | |
conv1d(k, img) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__kernel void chunked_reduce_kernel_288(int32_t i_264, int32_t n_244, __global | |
unsigned char *k_mem_329, | |
int32_t per_thread_elements_283, | |
__global unsigned char *img_mem_327, | |
__global unsigned char *mem_335) | |
{ | |
ALIGNED_LOCAL_MEMORY(mem_333, 4 * DEFAULT_GROUP_SIZE); | |
int32_t wave_sizze_344; | |
int32_t group_sizze_345; | |
char thread_active_346; | |
int32_t global_tid_288; | |
int32_t local_tid_289; | |
int32_t group_id_290; | |
global_tid_288 = get_global_id(0); | |
local_tid_289 = get_local_id(0); | |
group_sizze_345 = get_local_size(0); | |
wave_sizze_344 = LOCKSTEP_WIDTH; | |
group_id_290 = get_group_id(0); | |
thread_active_346 = 1; | |
int32_t chunk_sizze_294; | |
int32_t remaining_elements_347 = squot32(n_244 - global_tid_288 + | |
num_threads_280 - 1, | |
num_threads_280); | |
if (slt32(per_thread_elements_283, remaining_elements_347)) { | |
chunk_sizze_294 = per_thread_elements_283; | |
} else { | |
chunk_sizze_294 = remaining_elements_347; | |
} | |
float res_298; | |
if (thread_active_346) { | |
float acc_303 = 0.0F; | |
for (int32_t i_304 = 0; i_304 < chunk_sizze_294; i_304++) { | |
float binop_param_x_306 = *(__global float *) &k_mem_329[(i_304 * | |
num_threads_280 + | |
global_tid_288) * | |
4]; | |
float binop_param_y_307 = *(__global float *) &img_mem_327[(i_264 + | |
(i_304 * | |
num_threads_280 + | |
global_tid_288)) * | |
4]; | |
float res_308 = binop_param_x_306 * binop_param_y_307; | |
float res_309 = acc_303 + res_308; | |
float acc_tmp_348 = res_309; | |
acc_303 = acc_tmp_348; | |
} | |
res_298 = acc_303; | |
} | |
float final_result_312; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (slt32(local_tid_289, group_sizze_279) && 1) { | |
*(__local float *) &mem_333[local_tid_289 * 4] = res_298; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
int32_t skip_waves_349; | |
float binop_param_y_316; | |
int32_t my_index_313; | |
int32_t other_offset_314; | |
float binop_param_x_315; | |
my_index_313 = local_tid_289; | |
other_offset_314 = 0; | |
binop_param_x_315 = *(__local float *) &mem_333[(local_tid_289 + | |
other_offset_314) * 4]; | |
other_offset_314 = 1; | |
while (slt32(other_offset_314, wave_sizze_344) & slt32(wave_sizze_344, | |
group_sizze_345)) { | |
if (((local_tid_289 - squot32(local_tid_289, wave_sizze_344) * | |
wave_sizze_344) & (2 * other_offset_314 - 1)) == 0) { | |
// read array element | |
{ | |
binop_param_y_316 = *(volatile __local | |
float *) &mem_333[(local_tid_289 + | |
other_offset_314) * 4]; | |
} | |
float res_317; | |
if (thread_active_346) { | |
res_317 = binop_param_x_315 + binop_param_y_316; | |
} | |
binop_param_x_315 = res_317; | |
*(volatile __local float *) &mem_333[local_tid_289 * 4] = | |
binop_param_x_315; | |
} | |
other_offset_314 *= 2; | |
} | |
skip_waves_349 = 1; | |
while (slt32(skip_waves_349, squot32(group_sizze_345 + wave_sizze_344 - 1, | |
wave_sizze_344))) { | |
barrier(CLK_LOCAL_MEM_FENCE); | |
other_offset_314 = skip_waves_349 * wave_sizze_344; | |
if ((local_tid_289 - squot32(local_tid_289, wave_sizze_344) * | |
wave_sizze_344) == 0 && (squot32(local_tid_289, wave_sizze_344) & | |
(2 * skip_waves_349 - 1)) == 0) { | |
// read array element | |
{ | |
binop_param_y_316 = *(__local float *) &mem_333[(local_tid_289 + | |
other_offset_314) * | |
4]; | |
} | |
float res_317; | |
if (thread_active_346) { | |
res_317 = binop_param_x_315 + binop_param_y_316; | |
} | |
binop_param_x_315 = res_317; | |
*(__local float *) &mem_333[local_tid_289 * 4] = binop_param_x_315; | |
} | |
skip_waves_349 *= 2; | |
} | |
final_result_312 = binop_param_x_315; | |
if (local_tid_289 == 0) { | |
*(__global float *) &mem_335[group_id_290 * 4] = final_result_312; | |
} | |
} | |
__kernel void reduce_kernel_319(__global unsigned char *mem_339, __global | |
unsigned char *mem_335) | |
{ | |
ALIGNED_LOCAL_MEMORY(mem_337, 4 * DEFAULT_NUM_GROUPS); | |
int32_t wave_sizze_350; | |
int32_t group_sizze_351; | |
char thread_active_352; | |
int32_t local_tid_320; | |
int32_t group_id_321; | |
int32_t global_tid_319; | |
global_tid_319 = get_global_id(0); | |
local_tid_320 = get_local_id(0); | |
group_sizze_351 = get_local_size(0); | |
wave_sizze_350 = LOCKSTEP_WIDTH; | |
group_id_321 = get_group_id(0); | |
thread_active_352 = 1; | |
float res_index_322; | |
if (thread_active_352) { | |
res_index_322 = *(__global float *) &mem_335[global_tid_319 * 4]; | |
} | |
float final_result_324; | |
barrier(CLK_LOCAL_MEM_FENCE); | |
if (slt32(local_tid_320, num_groups_278) && 1) { | |
*(__local float *) &mem_337[local_tid_320 * 4] = res_index_322; | |
} | |
barrier(CLK_LOCAL_MEM_FENCE); | |
int32_t skip_waves_353; | |
float binop_param_x_253; | |
int32_t my_index_286; | |
float binop_param_y_254; | |
int32_t other_offset_287; | |
my_index_286 = local_tid_320; | |
other_offset_287 = 0; | |
binop_param_x_253 = *(__local float *) &mem_337[(local_tid_320 + | |
other_offset_287) * 4]; | |
other_offset_287 = 1; | |
while (slt32(other_offset_287, wave_sizze_350) & slt32(wave_sizze_350, | |
group_sizze_351)) { | |
if (((local_tid_320 - squot32(local_tid_320, wave_sizze_350) * | |
wave_sizze_350) & (2 * other_offset_287 - 1)) == 0) { | |
// read array element | |
{ | |
binop_param_y_254 = *(volatile __local | |
float *) &mem_337[(local_tid_320 + | |
other_offset_287) * 4]; | |
} | |
float res_255; | |
if (thread_active_352) { | |
res_255 = binop_param_x_253 + binop_param_y_254; | |
} | |
binop_param_x_253 = res_255; | |
*(volatile __local float *) &mem_337[local_tid_320 * 4] = | |
binop_param_x_253; | |
} | |
other_offset_287 *= 2; | |
} | |
skip_waves_353 = 1; | |
while (slt32(skip_waves_353, squot32(group_sizze_351 + wave_sizze_350 - 1, | |
wave_sizze_350))) { | |
barrier(CLK_LOCAL_MEM_FENCE); | |
other_offset_287 = skip_waves_353 * wave_sizze_350; | |
if ((local_tid_320 - squot32(local_tid_320, wave_sizze_350) * | |
wave_sizze_350) == 0 && (squot32(local_tid_320, wave_sizze_350) & | |
(2 * skip_waves_353 - 1)) == 0) { | |
// read array element | |
{ | |
binop_param_y_254 = *(__local float *) &mem_337[(local_tid_320 + | |
other_offset_287) * | |
4]; | |
} | |
float res_255; | |
if (thread_active_352) { | |
res_255 = binop_param_x_253 + binop_param_y_254; | |
} | |
binop_param_x_253 = res_255; | |
*(__local float *) &mem_337[local_tid_320 * 4] = binop_param_x_253; | |
} | |
skip_waves_353 *= 2; | |
} | |
final_result_324 = binop_param_x_253; | |
if (local_tid_320 == 0) { | |
*(__global float *) &mem_339[group_id_321 * 4] = final_result_324; | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__kernel void kernel_copy_421(int32_t tmp_offs_420, __global | |
unsigned char *img_mem_405, int32_t sizze_298, | |
__global unsigned char *mem_411, | |
int32_t padded_sizze_373) | |
{ | |
const uint copy_global_thread_index_421 = get_global_id(0); | |
if (copy_global_thread_index_421 >= sizze_298) | |
return; | |
*(__global float *) &mem_411[(tmp_offs_420 + copy_global_thread_index_421) * | |
4] = *(__global | |
float *) &img_mem_405[copy_global_thread_index_421 * | |
4]; | |
} | |
__kernel void kernel_copy_424(int32_t tmp_offs_420, int32_t y_372, __global | |
unsigned char *mem_409, __global | |
unsigned char *mem_411, int32_t padded_sizze_373) | |
{ | |
const uint copy_global_thread_index_424 = get_global_id(0); | |
if (copy_global_thread_index_424 >= y_372) | |
return; | |
*(__global float *) &mem_411[(tmp_offs_420 + copy_global_thread_index_424) * | |
4] = *(__global | |
float *) &mem_409[copy_global_thread_index_424 * | |
4]; | |
} | |
__kernel void map_kernel_347(__global unsigned char *mem_416, int32_t x_305, | |
__global unsigned char *mem_414, | |
int32_t num_threads_346, int32_t n_303, __global | |
unsigned char *k_mem_407) | |
{ | |
int32_t wave_sizze_427; | |
int32_t group_sizze_428; | |
char thread_active_429; | |
int32_t gtid_340; | |
int32_t local_tid_348; | |
int32_t group_id_349; | |
int32_t global_tid_347; | |
global_tid_347 = get_global_id(0); | |
local_tid_348 = get_local_id(0); | |
group_sizze_428 = get_local_size(0); | |
wave_sizze_427 = LOCKSTEP_WIDTH; | |
group_id_349 = get_group_id(0); | |
gtid_340 = global_tid_347; | |
thread_active_429 = slt32(gtid_340, n_303); | |
float res_352; | |
if (thread_active_429) { | |
float binop_param_x_355 = 0.0F; | |
for (int32_t chunk_offset_354 = 0; chunk_offset_354 < x_305; | |
chunk_offset_354++) { | |
float binop_param_x_364 = *(__global | |
float *) &k_mem_407[chunk_offset_354 * | |
4]; | |
int32_t j_p_i_t_s_386 = gtid_340 + chunk_offset_354; | |
int32_t new_index_400 = squot32(j_p_i_t_s_386, x_305); | |
int32_t binop_y_402 = new_index_400 * x_305; | |
int32_t new_index_403 = j_p_i_t_s_386 - binop_y_402; | |
float binop_param_y_365 = *(__global | |
float *) &mem_414[(new_index_403 * | |
num_threads_346 + | |
new_index_400) * 4]; | |
float res_367 = binop_param_x_364 * binop_param_y_365; | |
float res_368 = binop_param_x_355 + res_367; | |
float binop_param_x_tmp_430 = res_368; | |
binop_param_x_355 = binop_param_x_tmp_430; | |
} | |
res_352 = binop_param_x_355; | |
} | |
if (thread_active_429) { | |
*(__global float *) &mem_416[gtid_340 * 4] = res_352; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment