Skip to content

Instantly share code, notes, and snippets.

@titouanc
Last active March 18, 2017 13:07
Show Gist options
  • Save titouanc/61640a9911c53e184c28a20ec8b74b82 to your computer and use it in GitHub Desktop.
Save titouanc/61640a9911c53e184c28a20ec8b74b82 to your computer and use it in GitHub Desktop.
Futhark changes
fun convk1(kern: [x]f32, input: [x]f32): f32 =
reduceComm (+) 0f32 (map (*) kern input)
-- Futhark from a few weeks ago
fun conv1d(kern: [m]f32, input: [n]f32): []f32 =
map (fn i => convk1(kern, input[i:i+m]))
(iota (1 + n - m))
-- Futhark ca9f992
fun conv1d(kern: [m]f32, input: [n]f32): []f32 =
map (\i -> convk1(kern, unsafe input[i:i+m]))
(iota (1 + n - m))
entry main(img: []f32, k: []f32): []f32 =
conv1d(k, img)
__kernel void chunked_reduce_kernel_288(int32_t i_264, int32_t n_244, __global
unsigned char *k_mem_329,
int32_t per_thread_elements_283,
__global unsigned char *img_mem_327,
__global unsigned char *mem_335)
{
ALIGNED_LOCAL_MEMORY(mem_333, 4 * DEFAULT_GROUP_SIZE);
int32_t wave_sizze_344;
int32_t group_sizze_345;
char thread_active_346;
int32_t global_tid_288;
int32_t local_tid_289;
int32_t group_id_290;
global_tid_288 = get_global_id(0);
local_tid_289 = get_local_id(0);
group_sizze_345 = get_local_size(0);
wave_sizze_344 = LOCKSTEP_WIDTH;
group_id_290 = get_group_id(0);
thread_active_346 = 1;
int32_t chunk_sizze_294;
int32_t remaining_elements_347 = squot32(n_244 - global_tid_288 +
num_threads_280 - 1,
num_threads_280);
if (slt32(per_thread_elements_283, remaining_elements_347)) {
chunk_sizze_294 = per_thread_elements_283;
} else {
chunk_sizze_294 = remaining_elements_347;
}
float res_298;
if (thread_active_346) {
float acc_303 = 0.0F;
for (int32_t i_304 = 0; i_304 < chunk_sizze_294; i_304++) {
float binop_param_x_306 = *(__global float *) &k_mem_329[(i_304 *
num_threads_280 +
global_tid_288) *
4];
float binop_param_y_307 = *(__global float *) &img_mem_327[(i_264 +
(i_304 *
num_threads_280 +
global_tid_288)) *
4];
float res_308 = binop_param_x_306 * binop_param_y_307;
float res_309 = acc_303 + res_308;
float acc_tmp_348 = res_309;
acc_303 = acc_tmp_348;
}
res_298 = acc_303;
}
float final_result_312;
barrier(CLK_LOCAL_MEM_FENCE);
if (slt32(local_tid_289, group_sizze_279) && 1) {
*(__local float *) &mem_333[local_tid_289 * 4] = res_298;
}
barrier(CLK_LOCAL_MEM_FENCE);
int32_t skip_waves_349;
float binop_param_y_316;
int32_t my_index_313;
int32_t other_offset_314;
float binop_param_x_315;
my_index_313 = local_tid_289;
other_offset_314 = 0;
binop_param_x_315 = *(__local float *) &mem_333[(local_tid_289 +
other_offset_314) * 4];
other_offset_314 = 1;
while (slt32(other_offset_314, wave_sizze_344) & slt32(wave_sizze_344,
group_sizze_345)) {
if (((local_tid_289 - squot32(local_tid_289, wave_sizze_344) *
wave_sizze_344) & (2 * other_offset_314 - 1)) == 0) {
// read array element
{
binop_param_y_316 = *(volatile __local
float *) &mem_333[(local_tid_289 +
other_offset_314) * 4];
}
float res_317;
if (thread_active_346) {
res_317 = binop_param_x_315 + binop_param_y_316;
}
binop_param_x_315 = res_317;
*(volatile __local float *) &mem_333[local_tid_289 * 4] =
binop_param_x_315;
}
other_offset_314 *= 2;
}
skip_waves_349 = 1;
while (slt32(skip_waves_349, squot32(group_sizze_345 + wave_sizze_344 - 1,
wave_sizze_344))) {
barrier(CLK_LOCAL_MEM_FENCE);
other_offset_314 = skip_waves_349 * wave_sizze_344;
if ((local_tid_289 - squot32(local_tid_289, wave_sizze_344) *
wave_sizze_344) == 0 && (squot32(local_tid_289, wave_sizze_344) &
(2 * skip_waves_349 - 1)) == 0) {
// read array element
{
binop_param_y_316 = *(__local float *) &mem_333[(local_tid_289 +
other_offset_314) *
4];
}
float res_317;
if (thread_active_346) {
res_317 = binop_param_x_315 + binop_param_y_316;
}
binop_param_x_315 = res_317;
*(__local float *) &mem_333[local_tid_289 * 4] = binop_param_x_315;
}
skip_waves_349 *= 2;
}
final_result_312 = binop_param_x_315;
if (local_tid_289 == 0) {
*(__global float *) &mem_335[group_id_290 * 4] = final_result_312;
}
}
__kernel void reduce_kernel_319(__global unsigned char *mem_339, __global
unsigned char *mem_335)
{
ALIGNED_LOCAL_MEMORY(mem_337, 4 * DEFAULT_NUM_GROUPS);
int32_t wave_sizze_350;
int32_t group_sizze_351;
char thread_active_352;
int32_t local_tid_320;
int32_t group_id_321;
int32_t global_tid_319;
global_tid_319 = get_global_id(0);
local_tid_320 = get_local_id(0);
group_sizze_351 = get_local_size(0);
wave_sizze_350 = LOCKSTEP_WIDTH;
group_id_321 = get_group_id(0);
thread_active_352 = 1;
float res_index_322;
if (thread_active_352) {
res_index_322 = *(__global float *) &mem_335[global_tid_319 * 4];
}
float final_result_324;
barrier(CLK_LOCAL_MEM_FENCE);
if (slt32(local_tid_320, num_groups_278) && 1) {
*(__local float *) &mem_337[local_tid_320 * 4] = res_index_322;
}
barrier(CLK_LOCAL_MEM_FENCE);
int32_t skip_waves_353;
float binop_param_x_253;
int32_t my_index_286;
float binop_param_y_254;
int32_t other_offset_287;
my_index_286 = local_tid_320;
other_offset_287 = 0;
binop_param_x_253 = *(__local float *) &mem_337[(local_tid_320 +
other_offset_287) * 4];
other_offset_287 = 1;
while (slt32(other_offset_287, wave_sizze_350) & slt32(wave_sizze_350,
group_sizze_351)) {
if (((local_tid_320 - squot32(local_tid_320, wave_sizze_350) *
wave_sizze_350) & (2 * other_offset_287 - 1)) == 0) {
// read array element
{
binop_param_y_254 = *(volatile __local
float *) &mem_337[(local_tid_320 +
other_offset_287) * 4];
}
float res_255;
if (thread_active_352) {
res_255 = binop_param_x_253 + binop_param_y_254;
}
binop_param_x_253 = res_255;
*(volatile __local float *) &mem_337[local_tid_320 * 4] =
binop_param_x_253;
}
other_offset_287 *= 2;
}
skip_waves_353 = 1;
while (slt32(skip_waves_353, squot32(group_sizze_351 + wave_sizze_350 - 1,
wave_sizze_350))) {
barrier(CLK_LOCAL_MEM_FENCE);
other_offset_287 = skip_waves_353 * wave_sizze_350;
if ((local_tid_320 - squot32(local_tid_320, wave_sizze_350) *
wave_sizze_350) == 0 && (squot32(local_tid_320, wave_sizze_350) &
(2 * skip_waves_353 - 1)) == 0) {
// read array element
{
binop_param_y_254 = *(__local float *) &mem_337[(local_tid_320 +
other_offset_287) *
4];
}
float res_255;
if (thread_active_352) {
res_255 = binop_param_x_253 + binop_param_y_254;
}
binop_param_x_253 = res_255;
*(__local float *) &mem_337[local_tid_320 * 4] = binop_param_x_253;
}
skip_waves_353 *= 2;
}
final_result_324 = binop_param_x_253;
if (local_tid_320 == 0) {
*(__global float *) &mem_339[group_id_321 * 4] = final_result_324;
}
}
__kernel void kernel_copy_421(int32_t tmp_offs_420, __global
unsigned char *img_mem_405, int32_t sizze_298,
__global unsigned char *mem_411,
int32_t padded_sizze_373)
{
const uint copy_global_thread_index_421 = get_global_id(0);
if (copy_global_thread_index_421 >= sizze_298)
return;
*(__global float *) &mem_411[(tmp_offs_420 + copy_global_thread_index_421) *
4] = *(__global
float *) &img_mem_405[copy_global_thread_index_421 *
4];
}
__kernel void kernel_copy_424(int32_t tmp_offs_420, int32_t y_372, __global
unsigned char *mem_409, __global
unsigned char *mem_411, int32_t padded_sizze_373)
{
const uint copy_global_thread_index_424 = get_global_id(0);
if (copy_global_thread_index_424 >= y_372)
return;
*(__global float *) &mem_411[(tmp_offs_420 + copy_global_thread_index_424) *
4] = *(__global
float *) &mem_409[copy_global_thread_index_424 *
4];
}
__kernel void map_kernel_347(__global unsigned char *mem_416, int32_t x_305,
__global unsigned char *mem_414,
int32_t num_threads_346, int32_t n_303, __global
unsigned char *k_mem_407)
{
int32_t wave_sizze_427;
int32_t group_sizze_428;
char thread_active_429;
int32_t gtid_340;
int32_t local_tid_348;
int32_t group_id_349;
int32_t global_tid_347;
global_tid_347 = get_global_id(0);
local_tid_348 = get_local_id(0);
group_sizze_428 = get_local_size(0);
wave_sizze_427 = LOCKSTEP_WIDTH;
group_id_349 = get_group_id(0);
gtid_340 = global_tid_347;
thread_active_429 = slt32(gtid_340, n_303);
float res_352;
if (thread_active_429) {
float binop_param_x_355 = 0.0F;
for (int32_t chunk_offset_354 = 0; chunk_offset_354 < x_305;
chunk_offset_354++) {
float binop_param_x_364 = *(__global
float *) &k_mem_407[chunk_offset_354 *
4];
int32_t j_p_i_t_s_386 = gtid_340 + chunk_offset_354;
int32_t new_index_400 = squot32(j_p_i_t_s_386, x_305);
int32_t binop_y_402 = new_index_400 * x_305;
int32_t new_index_403 = j_p_i_t_s_386 - binop_y_402;
float binop_param_y_365 = *(__global
float *) &mem_414[(new_index_403 *
num_threads_346 +
new_index_400) * 4];
float res_367 = binop_param_x_364 * binop_param_y_365;
float res_368 = binop_param_x_355 + res_367;
float binop_param_x_tmp_430 = res_368;
binop_param_x_355 = binop_param_x_tmp_430;
}
res_352 = binop_param_x_355;
}
if (thread_active_429) {
*(__global float *) &mem_416[gtid_340 * 4] = res_352;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment