@benvanik
Last active November 13, 2024 16:19
hsa prototypes
set -x
~/src/iree-build/llvm-project/bin/clang \
-x c -std=c23 \
-target amdgcn-amd-amdhsa -march=gfx1100 \
-nogpulib \
-fgpu-rdc \
-fno-short-wchar \
-fno-ident \
-Xclang -finclude-default-header \
-fvisibility=hidden \
-O3 \
kernels.c \
-c -emit-llvm -o kernels_c.bc
~/src/iree-build/llvm-project/bin/llvm-dis kernels_c.bc
~/src/iree-build/llvm-project/bin/llvm-link \
-internalize \
-only-needed \
kernels_c.bc \
-o kernels_c_linked.bc
# /opt/rocm/lib/llvm/lib/clang/17/lib/amdgcn/bitcode/ockl.bc \
# ~/src/iree-build/tools/iree_platform_libs/rocm/ockl.bc \
~/src/iree-build/llvm-project/bin/llvm-dis kernels_c_linked.bc
~/src/iree-build/llvm-project/bin/opt -O3 kernels_c_linked.bc -o kernels_c_opt.bc
~/src/iree-build/llvm-project/bin/llvm-dis kernels_c_opt.bc
~/src/iree-build/llvm-project/bin/lld \
-flavor gnu \
-m elf64_amdgpu \
--build-id=none \
--no-undefined \
-shared \
-plugin-opt=mcpu=gfx1100 \
-plugin-opt=O3 \
--lto-CGO3 \
--no-whole-archive \
--gc-sections \
--strip-debug \
--discard-all \
--discard-locals \
-o kernels_c.elf \
kernels_c_linked.bc
#--strip-all \
# -save-temps \
~/src/iree-build/llvm-project/bin/llvm-readelf \
kernels_c.elf --all \
>kernels_c.txt
# rm kernels_cl_linked.bc
# ~/src/iree-build/llvm-project/bin/clang \
# -x hip \
# --offload-device-only \
# --offload-arch=gfx1100 \
# -fuse-cuid=none \
# -nogpulib \
# -fgpu-rdc \
# -fvisibility=hidden \
# -O3 \
# kernels.cpp \
# -c -emit-llvm -o kernels_hip.bc
# ~/src/iree-build/llvm-project/bin/llvm-link \
# -internalize \
# -only-needed \
# kernels_hip.bc \
# /opt/rocm/lib/llvm/lib/clang/17/lib/amdgcn/bitcode/ockl.bc \
# -o kernels_hip_linked.bc
# # ~/src/iree-build/tools/iree_platform_libs/rocm/ockl.bc \
# ~/src/iree-build/llvm-project/bin/lld \
# -flavor gnu \
# -m elf64_amdgpu \
# --build-id=none \
# --no-undefined \
# -shared \
# -plugin-opt=mcpu=gfx1100 \
# -plugin-opt=O3 \
# --lto-CGO3 \
# --no-whole-archive \
# -o kernels_hip.elf \
# kernels_hip_linked.bc
# # -save-temps \
# ~/src/iree-build/llvm-project/bin/llvm-readelf \
# kernels_hip.elf --all \
# >kernels_hip.txt
# rm kernels_hip_linked.bc
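// kernels.c sketch (not part of the original gist)
// The build script above compiles a kernels.c that is not included here. The
// following is a hedged sketch of what it might contain, inferred from the
// host-side kernarg structs used below (add_one_args_t: {uint32_t n; void*
// buffer;}, mul_x_args_t: {uint32_t x; uint32_t n; void* buffer;}) and the
// fixed 32-wide workgroups the dispatches use. The hardcoded workgroup width
// and the KERNEL macro are assumptions of this sketch, not necessarily what
// the real kernels.c does. The host code later looks these kernels up via
// their ".kd" kernel-descriptor symbols; the add_one_with_timestamp variant it
// also references (which appears to write a 64-bit timestamp into buffer[0..1]
// and a device-side source-location pointer into buffer[2..3]) is not sketched
// here.
#include <stdint.h>

#define KERNEL \
__attribute__((amdgpu_kernel)) __attribute__((visibility("default")))

// One workitem per element: buffer[i] += 1 for i < n.
// Workgroup width of 32 is assumed to match the host dispatch code.
KERNEL void add_one(uint32_t n, uint32_t* buffer) {
uint32_t i =
__builtin_amdgcn_workgroup_id_x() * 32u + __builtin_amdgcn_workitem_id_x();
if (i < n) buffer[i] += 1;
}

// One workitem per element: buffer[i] *= x for i < n.
KERNEL void mul_x(uint32_t x, uint32_t n, uint32_t* buffer) {
uint32_t i =
__builtin_amdgcn_workgroup_id_x() * 32u + __builtin_amdgcn_workitem_id_x();
if (i < n) buffer[i] *= x;
}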
// info queries
#include <memory.h>
#include <stdio.h>
#include <string.h>
#include "third_party/hsa-runtime-headers/include/hsa/hsa.h"
#include "third_party/hsa-runtime-headers/include/hsa/hsa_ext_amd.h"
typedef struct {
uint32_t agent_count;
hsa_agent_t all_agents[32];
hsa_agent_t cpu_agent;
hsa_agent_t gpu_agent;
} agents_t;
static hsa_status_t iterate_agent(hsa_agent_t agent, void* user_data) {
agents_t* agents = (agents_t*)user_data;
hsa_status_t err;
char product_name[64];
err = hsa_agent_get_info(
agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, product_name);
agents->all_agents[agents->agent_count++] = agent;
return HSA_STATUS_SUCCESS;
}
typedef struct {
uint32_t count;
hsa_amd_memory_pool_t pools[32];
} memory_pools_t;
static hsa_status_t iterate_memory_pool(hsa_amd_memory_pool_t memory_pool,
void* user_data) {
memory_pools_t* memory_pools = (memory_pools_t*)user_data;
memory_pools->pools[memory_pools->count++] = memory_pool;
return HSA_STATUS_SUCCESS;
}
static hsa_status_t pcs_config_callback(
const hsa_ven_amd_pcs_configuration_t* configuration, void* callback_data) {
fprintf(stderr, "HAVE CONFIG\n");
return HSA_STATUS_SUCCESS;
}
int main(int argc, char** argv) {
hsa_status_t err;
err = hsa_init();
uint16_t version_major = 0;
err = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &version_major);
uint16_t version_minor = 0;
err = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &version_minor);
uint64_t timestamp_frequency = 0;
err = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY,
&timestamp_frequency);
bool svm_by_default = false;
err = hsa_system_get_info(HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT,
&svm_by_default);
uint64_t max_wait = 0;
err = hsa_system_get_info(HSA_SYSTEM_INFO_SIGNAL_MAX_WAIT, &max_wait);
hsa_endianness_t endianness = HSA_ENDIANNESS_LITTLE;
err = hsa_system_get_info(HSA_SYSTEM_INFO_ENDIANNESS, &endianness);
hsa_machine_model_t machine_model = HSA_MACHINE_MODEL_SMALL;
err = hsa_system_get_info(HSA_SYSTEM_INFO_MACHINE_MODEL, &machine_model);
agents_t agents;
memset(&agents, 0, sizeof(agents));
err = hsa_iterate_agents(iterate_agent, &agents);
for (uint32_t i = 0; i < agents.agent_count; ++i) {
hsa_device_type_t device_type = 0;
err = hsa_agent_get_info(agents.all_agents[i], HSA_AGENT_INFO_DEVICE,
&device_type);
if (device_type == HSA_DEVICE_TYPE_GPU) {
agents.gpu_agent = agents.all_agents[i];
err = hsa_agent_get_info(agents.gpu_agent,
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_NEAREST_CPU,
&agents.cpu_agent);
break;
}
}
//
char gpu_name[64];
err = hsa_agent_get_info(agents.gpu_agent, HSA_AGENT_INFO_NAME, gpu_name);
uint8_t gpu_extensions[128] = {0};
err = hsa_agent_get_info(agents.gpu_agent, HSA_AGENT_INFO_EXTENSIONS,
gpu_extensions);
// HSA_AGENT_INFO_EXTENSIONS is a 128-byte bitmask: extension e is supported
// when bit (e % 8) of byte (e / 8) is set.
bool has_amd_profiler = gpu_extensions[HSA_EXTENSION_AMD_PROFILER / 8] &
(1u << (HSA_EXTENSION_AMD_PROFILER % 8));
if (has_amd_profiler) fprintf(stderr, "HSA_EXTENSION_AMD_PROFILER\n");
bool has_amd_loader = gpu_extensions[HSA_EXTENSION_AMD_LOADER / 8] &
(1u << (HSA_EXTENSION_AMD_LOADER % 8));
if (has_amd_loader) fprintf(stderr, "HSA_EXTENSION_AMD_LOADER\n");
bool has_amd_aqlprofile = gpu_extensions[HSA_EXTENSION_AMD_AQLPROFILE / 8] &
(1u << (HSA_EXTENSION_AMD_AQLPROFILE % 8));
if (has_amd_aqlprofile) fprintf(stderr, "HSA_EXTENSION_AMD_AQLPROFILE\n");
bool has_amd_pc_sampling = gpu_extensions[HSA_EXTENSION_AMD_PC_SAMPLING / 8] &
(1u << (HSA_EXTENSION_AMD_PC_SAMPLING % 8));
if (has_amd_pc_sampling) fprintf(stderr, "HSA_EXTENSION_AMD_PC_SAMPLING\n");
char cpu_name[64];
err = hsa_agent_get_info(agents.cpu_agent, HSA_AGENT_INFO_NAME, cpu_name);
uint32_t cpu_numa_node = 0;
err =
hsa_agent_get_info(agents.cpu_agent, HSA_AGENT_INFO_NODE, &cpu_numa_node);
memory_pools_t gpu_memory_pools;
memset(&gpu_memory_pools, 0, sizeof(gpu_memory_pools));
err = hsa_amd_agent_iterate_memory_pools(
agents.gpu_agent, iterate_memory_pool, &gpu_memory_pools);
memory_pools_t cpu_memory_pools;
memset(&cpu_memory_pools, 0, sizeof(cpu_memory_pools));
err = hsa_amd_agent_iterate_memory_pools(
agents.cpu_agent, iterate_memory_pool, &cpu_memory_pools);
for (uint32_t i = 0; i < gpu_memory_pools.count; ++i) {
hsa_amd_memory_pool_access_t access =
HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED;
err = hsa_amd_agent_memory_pool_get_info(
agents.cpu_agent, gpu_memory_pools.pools[i],
HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access);
uint32_t num_link_hops = 0;
err = hsa_amd_agent_memory_pool_get_info(
agents.cpu_agent, gpu_memory_pools.pools[i],
HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS, &num_link_hops);
hsa_amd_memory_pool_link_info_t link_infos[32];
err = hsa_amd_agent_memory_pool_get_info(
agents.cpu_agent, gpu_memory_pools.pools[i],
HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO, link_infos);
(void)err;  // silence set-but-unused warnings in this prototype
}
for (uint32_t i = 0; i < cpu_memory_pools.count; ++i) {
hsa_amd_memory_pool_access_t access =
HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED;
err = hsa_amd_agent_memory_pool_get_info(
agents.gpu_agent, cpu_memory_pools.pools[i],
HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access);
uint32_t num_link_hops = 0;
err = hsa_amd_agent_memory_pool_get_info(
agents.gpu_agent, cpu_memory_pools.pools[i],
HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS, &num_link_hops);
hsa_amd_memory_pool_link_info_t link_infos[32];
err = hsa_amd_agent_memory_pool_get_info(
agents.gpu_agent, cpu_memory_pools.pools[i],
HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO, link_infos);
(void)err;  // silence set-but-unused warnings in this prototype
}
hsa_ven_amd_pc_sampling_1_00_pfn_t pcs_table;
err = hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_PC_SAMPLING, 1,
sizeof(pcs_table), &pcs_table);
// err = pcs_table.hsa_ven_amd_pcs_iterate_configuration(
// agents.gpu_agent, pcs_config_callback, NULL);
err = hsa_shut_down();
return 0;
}
// dispatch hip or opencl kernel and timing
#include <assert.h>
#include <errno.h>
#include <fcntl.h> // open
#include <fcntl.h>
#include <memory.h>
#include <stdatomic.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include "third_party/hsa-runtime-headers/include/hsa/hsa.h"
#include "third_party/hsa-runtime-headers/include/hsa/hsa_ext_amd.h"
#include "third_party/hsa-runtime-headers/include/hsa/hsa_ven_amd_loader.h"
static inline size_t iree_host_align(size_t value, size_t alignment) {
return (value + (alignment - 1)) & ~(alignment - 1);
}
// AMD Signal Kind Enumeration Values.
enum iree_amd_signal_kind_t {
IREE_AMD_SIGNAL_KIND_INVALID = 0,
IREE_AMD_SIGNAL_KIND_USER = 1,
IREE_AMD_SIGNAL_KIND_DOORBELL = -1,
IREE_AMD_SIGNAL_KIND_LEGACY_DOORBELL = -2
};
typedef int64_t iree_amd_signal_kind64_t;
#define IREE_AMDGPU_GLOBAL
#define IREE_AMDGPU_ALIGNAS(x) __attribute__((aligned(x)))
typedef struct IREE_AMDGPU_ALIGNAS(64) iree_amd_signal_s {
iree_amd_signal_kind64_t kind;
union {
volatile int64_t value;
IREE_AMDGPU_GLOBAL volatile uint32_t* legacy_hardware_doorbell_ptr;
IREE_AMDGPU_GLOBAL volatile uint64_t* hardware_doorbell_ptr;
};
uint64_t event_mailbox_ptr;
uint32_t event_id;
uint32_t reserved1;
uint64_t start_ts;
uint64_t end_ts;
union {
IREE_AMDGPU_GLOBAL /*iree_amd_queue_t*/ void* queue_ptr;
uint64_t reserved2;
};
uint32_t reserved3[2];
} iree_amd_signal_t;
/* Call ioctl, restarting if it is interrupted */
int hsakmt_ioctl(int fd, unsigned long request, void* arg) {
int ret;
do {
ret = ioctl(fd, request, arg);
} while (ret == -1 && (errno == EINTR || errno == EAGAIN));
return ret;
}
#define AMDKFD_IOCTL_BASE 'K'
#define AMDKFD_IO(nr) _IO(AMDKFD_IOCTL_BASE, nr)
#define AMDKFD_IOR(nr, type) _IOR(AMDKFD_IOCTL_BASE, nr, type)
#define AMDKFD_IOW(nr, type) _IOW(AMDKFD_IOCTL_BASE, nr, type)
#define AMDKFD_IOWR(nr, type) _IOWR(AMDKFD_IOCTL_BASE, nr, type)
#define AMDKFD_IOC_GET_CLOCK_COUNTERS \
AMDKFD_IOWR(0x05, struct kfd_ioctl_get_clock_counters_args)
struct kfd_ioctl_get_clock_counters_args {
uint64_t gpu_clock_counter; /* from KFD */
uint64_t cpu_clock_counter; /* from KFD */
uint64_t system_clock_counter; /* from KFD */
uint64_t system_clock_freq; /* from KFD */
uint32_t gpu_id; /* to KFD */
uint32_t pad;
};
#if defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || \
defined(__amd64__) || defined(_M_X64) || defined(_M_AMD64)
#include <xmmintrin.h>
#define IS_X86 1
#endif
// https://gitlab.freedesktop.org/mesa/mesa/-/blob/main/src/util/streaming-load-memcpy.c
__attribute__((always_inline)) static inline void nontemporalMemcpy(
void* __restrict dst, const void* __restrict src, size_t size) {
// #if defined(IS_X86)
// #if defined(__AVX512F__)
// #pragma unroll
// for (size_t i = 0; i != size / sizeof(__m512i); ++i) {
// _mm512_stream_si512((__m512i* __restrict)(dst),
// *(const __m512i* __restrict)(src));
// src += sizeof(__m512i);
// dst += sizeof(__m512i);
// }
// size = size % sizeof(__m512i);
// #endif
// #if defined(__AVX__)
// #pragma unroll
// for (size_t i = 0; i != size / sizeof(__m256i); ++i) {
// _mm256_stream_si256((__m256i* __restrict)(dst),
// *(const __m256i* __restrict)(src));
// src += sizeof(__m256i);
// dst += sizeof(__m256i);
// }
// size = size % sizeof(__m256i);
// #endif
// #pragma unroll
// for (size_t i = 0; i != size / sizeof(__m128i); ++i) {
// _mm_stream_si128((__m128i* __restrict)(dst),
// *((const __m128i* __restrict)(src)));
// src += sizeof(__m128i);
// dst += sizeof(__m128i);
// }
// size = size % sizeof(__m128i);
// #pragma unroll
// for (size_t i = 0; i != size / sizeof(long long); ++i) {
// _mm_stream_si64((long long* __restrict)(dst),
// *(const long long* __restrict)(src));
// src += sizeof(long long);
// dst += sizeof(long long);
// }
// size = size % sizeof(long long);
// #pragma unroll
// for (size_t i = 0; i != size / sizeof(int); ++i) {
// _mm_stream_si32((int* __restrict)(dst), *(const int* __restrict)(src));
// src += sizeof(int);
// dst += sizeof(int);
// }
// #else
memcpy(dst, src, size);
// #endif
}
typedef struct {
uint32_t agent_count;
hsa_agent_t all_agents[32];
hsa_agent_t cpu_agent;
hsa_agent_t gpu_agent;
} agents_t;
static hsa_status_t iterate_agent(hsa_agent_t agent, void* user_data) {
agents_t* agents = (agents_t*)user_data;
agents->all_agents[agents->agent_count++] = agent;
return HSA_STATUS_SUCCESS;
}
typedef struct {
uint32_t count;
hsa_amd_memory_pool_t pools[32];
} memory_pools_t;
static hsa_status_t iterate_memory_pool(hsa_amd_memory_pool_t memory_pool,
void* user_data) {
memory_pools_t* memory_pools = (memory_pools_t*)user_data;
memory_pools->pools[memory_pools->count++] = memory_pool;
return HSA_STATUS_SUCCESS;
}
typedef struct {
uint32_t count;
hsa_region_t regions[32];
} regions_t;
static hsa_status_t iterate_regions(hsa_region_t region, void* user_data) {
regions_t* regions = (regions_t*)user_data;
regions->regions[regions->count++] = region;
return HSA_STATUS_SUCCESS;
}
static hsa_status_t iterate_symbol(hsa_executable_t executable,
hsa_executable_symbol_t symbol,
void* user_data) {
uint32_t name_length = 0;
hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH,
&name_length);
char name[128];
hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
fprintf(stderr, "symbol: %.*s\n", (int)name_length, name);
return HSA_STATUS_SUCCESS;
}
static hsa_status_t iterate_program_symbol(hsa_executable_t executable,
hsa_executable_symbol_t symbol,
void* user_data) {
uint32_t name_length = 0;
hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH,
&name_length);
char name[128];
hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
fprintf(stderr, "program symbol: %.*s\n", (int)name_length, name);
return HSA_STATUS_SUCCESS;
}
static hsa_status_t iterate_agent_symbol(hsa_executable_t executable,
hsa_agent_t agent,
hsa_executable_symbol_t symbol,
void* user_data) {
uint32_t name_length = 0;
hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH,
&name_length);
char name[128];
hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
fprintf(stderr, "agent symbol: %.*s\n", (int)name_length, name);
return HSA_STATUS_SUCCESS;
}
static void gpu_queue_callback(hsa_status_t status, hsa_queue_t* queue,
void* user_data) {
const char* status_str = NULL;
hsa_status_string(status, &status_str);
fprintf(stderr, "gpu_queue_callback %s\n", status_str);
}
int main(int argc, char** argv) {
hsa_status_t err;
err = hsa_init();
agents_t agents;
memset(&agents, 0, sizeof(agents));
err = hsa_iterate_agents(iterate_agent, &agents);
for (uint32_t i = 0; i < agents.agent_count; ++i) {
hsa_device_type_t device_type = 0;
err = hsa_agent_get_info(agents.all_agents[i], HSA_AGENT_INFO_DEVICE,
&device_type);
if (device_type == HSA_DEVICE_TYPE_GPU) {
agents.gpu_agent = agents.all_agents[i];
err = hsa_agent_get_info(agents.gpu_agent,
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_NEAREST_CPU,
&agents.cpu_agent);
break;
}
}
memory_pools_t gpu_memory_pools;
memset(&gpu_memory_pools, 0, sizeof(gpu_memory_pools));
err = hsa_amd_agent_iterate_memory_pools(
agents.gpu_agent, iterate_memory_pool, &gpu_memory_pools);
regions_t gpu_regions;
memset(&gpu_regions, 0, sizeof(gpu_regions));
err = hsa_agent_iterate_regions(agents.gpu_agent, iterate_regions,
&gpu_regions);
hsa_amd_memory_pool_t kernarg_pool = {0};
for (uint32_t i = 0; i < gpu_memory_pools.count; ++i) {
hsa_amd_memory_pool_t pool = gpu_memory_pools.pools[i];
hsa_region_segment_t segment;
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT,
&segment);
if (segment == HSA_REGION_SEGMENT_GLOBAL) {
hsa_region_global_flag_t global_flag;
err = hsa_amd_memory_pool_get_info(
pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag);
if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
// gpu_coarse_pool = pool;
} else if (global_flag &
HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED) {
// gpu_fine_pool = pool;
kernarg_pool = pool;
}
}
}
// for (uint32_t i = 0; i < gpu_regions.count; ++i) {
// hsa_region_t region = gpu_regions.regions[i];
// hsa_region_segment_t segment;
// err = hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment);
// if (segment == HSA_REGION_SEGMENT_GLOBAL) {
// hsa_region_global_flag_t flags;
// err = hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS,
// &flags); if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG) {
// bool host_accessible = false;
// err = hsa_region_get_info(
// region, (hsa_region_info_t)HSA_AMD_REGION_INFO_HOST_ACCESSIBLE,
// &host_accessible);
// void* base_address = NULL;
// err = hsa_region_get_info(
// region, (hsa_region_info_t)HSA_AMD_REGION_INFO_BASE,
// &base_address);
// }
// }
// }
memory_pools_t cpu_memory_pools;
memset(&cpu_memory_pools, 0, sizeof(cpu_memory_pools));
err = hsa_amd_agent_iterate_memory_pools(
agents.cpu_agent, iterate_memory_pool, &cpu_memory_pools);
regions_t cpu_regions;
memset(&cpu_regions, 0, sizeof(cpu_regions));
err = hsa_agent_iterate_regions(agents.cpu_agent, iterate_regions,
&cpu_regions);
hsa_amd_memory_pool_t cpu_fine_pool = {0};
hsa_amd_memory_pool_t cpu_coarse_pool = {0};
// hsa_amd_memory_pool_t kernarg_pool = {0};
for (uint32_t i = 0; i < cpu_memory_pools.count; ++i) {
hsa_amd_memory_pool_t pool = cpu_memory_pools.pools[i];
hsa_region_segment_t segment;
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT,
&segment);
if (segment == HSA_REGION_SEGMENT_GLOBAL) {
hsa_region_global_flag_t global_flag;
err = hsa_amd_memory_pool_get_info(
pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag);
if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) {
// kernarg_pool = pool;
} else if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
cpu_coarse_pool = pool;
} else if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
cpu_fine_pool = pool;
}
}
}
uint32_t gpu_queue_min_size = 0;
err = hsa_agent_get_info(agents.gpu_agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE,
&gpu_queue_min_size);
uint32_t gpu_queue_max_size = 0;
err = hsa_agent_get_info(agents.gpu_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
&gpu_queue_max_size);
uint32_t gpu_queue_size = gpu_queue_max_size;
hsa_queue_t* gpu_queue = NULL;
err = hsa_queue_create(agents.gpu_agent, gpu_queue_size, HSA_QUEUE_TYPE_MULTI,
gpu_queue_callback,
/*callback_data=*/NULL,
/*private_segment_size=*/UINT32_MAX,
/*group_segment_size=*/UINT32_MAX, &gpu_queue);
//
hsa_file_t object_file =
open("experimental/hsa_tests/kernels_c.elf", O_RDONLY);
// hsa_file_t object_file =
// open("experimental/hsa_tests/kernels_cl.elf", O_RDONLY);
// hsa_file_t object_file =
// open("experimental/hsa_tests/kernels_hip.elf", O_RDONLY);
hsa_code_object_reader_t object_reader;
err = hsa_code_object_reader_create_from_file(object_file, &object_reader);
hsa_executable_t executable;
err = hsa_executable_create_alt(HSA_PROFILE_FULL,
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL,
&executable);
hsa_loaded_code_object_t loaded_code_object;
err = hsa_executable_load_agent_code_object(
executable, agents.gpu_agent, object_reader, NULL, &loaded_code_object);
err = hsa_executable_freeze(executable, NULL);
hsa_ven_amd_loader_1_03_pfn_t loader;
err = hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1,
sizeof(loader), &loader);
uint64_t code_base = 0;
err = loader.hsa_ven_amd_loader_loaded_code_object_get_info(
loaded_code_object, HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_BASE,
&code_base);
uint64_t code_size = 0;
err = loader.hsa_ven_amd_loader_loaded_code_object_get_info(
loaded_code_object, HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_SIZE,
&code_size);
int64_t code_delta = 0;
err = loader.hsa_ven_amd_loader_loaded_code_object_get_info(
loaded_code_object, HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_DELTA,
&code_delta);
err = hsa_executable_iterate_agent_symbols(executable, agents.gpu_agent,
iterate_agent_symbol, NULL);
err = hsa_executable_iterate_symbols(executable, iterate_symbol, NULL);
err = hsa_executable_iterate_program_symbols(executable,
iterate_program_symbol, NULL);
struct kernel_info_t {
uint64_t handle;
uint32_t private_size;
uint32_t group_size;
uint32_t kernarg_alignment;
uint32_t kernarg_size;
} kernel_info;
hsa_executable_symbol_t symbol;
err = hsa_executable_get_symbol_by_name(
executable, "add_one_with_timestamp.kd", &agents.gpu_agent, &symbol);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_info.handle);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
&kernel_info.private_size);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
&kernel_info.group_size);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT,
&kernel_info.kernarg_alignment);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
&kernel_info.kernarg_size);
void* kernarg_storage = NULL;
err = hsa_amd_memory_pool_allocate(kernarg_pool, kernel_info.kernarg_size,
HSA_AMD_MEMORY_POOL_STANDARD_FLAG,
&kernarg_storage);
err =
hsa_amd_agents_allow_access(1, &agents.gpu_agent, NULL, kernarg_storage);
hsa_signal_t dispatch_signal;
err = hsa_signal_create(1, 0, NULL, &dispatch_signal);
// err = hsa_amd_signal_create(1, 0, NULL, HSA_AMD_SIGNAL_IPC,
// &dispatch_signal);
err = hsa_amd_profiling_set_profiler_enabled(gpu_queue, 1);
// submit
// hidden args
// https://github.com/ROCm/clr/blob/amd-staging/rocclr/device/rocm/rocvirtual.cpp#L3134
// uint8_t kind
// uint8_t value
// uint16_t offset
//
// or hardcode
// uint16_t implicit_offset;
// uint8_t kind_offsets[hidden enum];
// if (kind_offsets[hidden_block_count_x] != 0xFF) {
// kernargs[kind_offsets[hidden_block_count_x]] = block_count_x;
// }
// ...
// keeps data small, no loops just conditional moves
// could have a bit for all?
//
// or bitmap of hidden args
// bit indicates presence
// assume dense and ordered because of the way the compiler does things
//
// amdgpu-no-implicitarg-ptr is the only way to disable
// otherwise all are required with a fixed 256b size
// so just need to find the base offset
//
// explicit args always start at 0
// implicit are explicit + 8-byte aligned
//
// third_party/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
typedef struct implicit_kernargs_t {
// Grid dispatch workgroup count.
// Some languages, such as OpenCL, support a last workgroup in each
// dimension being partial. This count only includes the non-partial
// workgroup count. This is not the same as the value in the AQL dispatch
// packet, which has the grid size in workitems.
//
// Represented in metadata as:
// hidden_block_count_x
// hidden_block_count_y
// hidden_block_count_z
uint32_t block_count[3]; // + 0/4/8
// Grid dispatch workgroup size.
// This size only applies to the non-partial workgroups. This is the same
// value as the AQL dispatch packet workgroup size.
//
// Represented in metadata as:
// hidden_group_size_x
// hidden_group_size_y
// hidden_group_size_z
uint16_t group_size[3]; // + 12/14/16
// Grid dispatch work group size of the partial work group, if it exists.
// Any dimension that does not exist must be 0.
//
// Represented in metadata as:
// hidden_remainder_x
// hidden_remainder_y
// hidden_remainder_z
uint16_t remainder[3]; // + 18/20/22
uint64_t reserved0; // + 24 hidden_tool_correlation_id
uint64_t reserved1; // + 32
// OpenCL grid dispatch global offset.
//
// Represented in metadata as:
// hidden_global_offset_x
// hidden_global_offset_y
// hidden_global_offset_z
uint64_t global_offset[3]; // + 40/48/56
// Grid dispatch dimensionality. This is the same value as the AQL
// dispatch packet dimensionality. Must be a value between 1 and 3.
//
// Represented in metadata as:
// hidden_grid_dims
uint16_t grid_dims; // + 64
} implicit_kernargs_t;
uint32_t element_count = 64;
uint32_t* buffer = NULL;
err = hsa_amd_memory_pool_allocate(
cpu_coarse_pool, element_count * sizeof(uint32_t),
HSA_AMD_MEMORY_POOL_STANDARD_FLAG, (void**)&buffer);
for (uint32_t i = 0; i < element_count; ++i) {
buffer[i] = i;
}
err = hsa_amd_agents_allow_access(1, &agents.gpu_agent, NULL, buffer);
uint32_t grid_size[3] = {element_count, 1, 1};
uint16_t workgroup_size[3] = {32, 1, 1};
typedef struct add_one_args_t {
uint32_t n;
void* buffer;
} add_one_args_t;
add_one_args_t* explicit_kernargs = (add_one_args_t*)kernarg_storage;
explicit_kernargs->n = element_count;
explicit_kernargs->buffer = buffer;
implicit_kernargs_t* implicit_kernargs =
(implicit_kernargs_t*)((uint8_t*)kernarg_storage +
iree_host_align(sizeof(add_one_args_t), 8));
implicit_kernargs->block_count[0] = grid_size[0] / workgroup_size[0];
implicit_kernargs->block_count[1] = grid_size[1] / workgroup_size[1];
implicit_kernargs->block_count[2] = grid_size[2] / workgroup_size[2];
implicit_kernargs->group_size[0] = workgroup_size[0];
implicit_kernargs->group_size[1] = workgroup_size[1];
implicit_kernargs->group_size[2] = workgroup_size[2];
implicit_kernargs->remainder[0] =
(uint16_t)(grid_size[0] % workgroup_size[0]);
implicit_kernargs->remainder[1] =
(uint16_t)(grid_size[1] % workgroup_size[1]);
implicit_kernargs->remainder[2] =
(uint16_t)(grid_size[2] % workgroup_size[2]);
implicit_kernargs->reserved0 = 0;
implicit_kernargs->reserved1 = 0;
implicit_kernargs->global_offset[0] = 0; // newOffset[0];
implicit_kernargs->global_offset[1] = 0; // newOffset[1];
implicit_kernargs->global_offset[2] = 0; // newOffset[2];
implicit_kernargs->grid_dims = 3;
// DO NOT SUBMIT should do nontemporal kernarg update?
hsa_kernel_dispatch_packet_t packet;
packet.header = HSA_PACKET_TYPE_INVALID;
packet.setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
packet.workgroup_size_x = workgroup_size[0];
packet.workgroup_size_y = workgroup_size[1];
packet.workgroup_size_z = workgroup_size[2];
packet.reserved0 = 0;
packet.grid_size_x = grid_size[0];
packet.grid_size_y = grid_size[1];
packet.grid_size_z = grid_size[2];
packet.private_segment_size = kernel_info.private_size;
packet.group_segment_size = kernel_info.group_size;
packet.kernel_object = kernel_info.handle;
packet.kernarg_address = kernarg_storage;
packet.reserved2 = 0;
packet.completion_signal = dispatch_signal;
uint16_t packet_header =
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
uint32_t packet_header_setup = packet_header | (packet.setup << 16);
uint64_t packet_id = hsa_queue_add_write_index_screlease(gpu_queue, 1);
while ((packet_id - hsa_queue_load_read_index_acquire(gpu_queue)) >=
gpu_queue->size) {
sleep(0);
}
hsa_kernel_dispatch_packet_t* packet_ptr =
(hsa_kernel_dispatch_packet_t*)((uint8_t*)gpu_queue->base_address +
(packet_id & (gpu_queue->size - 1)) * 64);
nontemporalMemcpy(packet_ptr, &packet, sizeof(packet));
atomic_store_explicit((volatile atomic_uint*)packet_ptr, packet_header_setup,
memory_order_release);
// value ignored in MULTI cases
hsa_signal_store_relaxed(gpu_queue->doorbell_signal, packet_id);
hsa_signal_value_t wait_value =
hsa_signal_wait_scacquire(dispatch_signal, HSA_SIGNAL_CONDITION_EQ, 0,
UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
if (wait_value != 0) {
fprintf(stderr, "wait failed\n");
}
// can directly access agent-specific ticks on the signal
// could store these as part of the trace provider
// must call hsa_amd_profiling_set_profiler_enabled to ensure populated
// request a batch hsa_amd_profiling_convert_tick_to_system_domain?
// may still want to adjust, but avoid API overheads when converting 1000's
iree_amd_signal_t* amd_dispatch_signal =
(iree_amd_signal_t*)dispatch_signal.handle;
// amd_dispatch_signal->start_ts;
// amd_dispatch_signal->end_ts;
uint64_t start_ts = 0;
err = hsa_amd_profiling_convert_tick_to_system_domain(
agents.gpu_agent, amd_dispatch_signal->start_ts, &start_ts);
uint64_t end_ts = 0;
err = hsa_amd_profiling_convert_tick_to_system_domain(
agents.gpu_agent, amd_dispatch_signal->end_ts, &end_ts);
// (end - start) / system_frequency = seconds
hsa_amd_profiling_dispatch_time_t time;
err = hsa_amd_profiling_get_dispatch_time(agents.gpu_agent, dispatch_signal,
&time);
uint64_t system_frequency = 0;
err = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY,
&system_frequency);
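// Worked example of the formula noted above (added here for illustration,
// not in the original prototype): the converted timestamps are in the system
// timestamp domain, so the delta divided by the system timestamp frequency
// gives seconds.
double dispatch_seconds =
(double)(end_ts - start_ts) / (double)system_frequency;
fprintf(stderr, "dispatch took %.3f us\n", dispatch_seconds * 1e6);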
uint64_t system_timestamp = 0;
err = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &system_timestamp);
// sleep(10);
// uint64_t system_timestamp2 = 0;
// err = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &system_timestamp2);
uint64_t kernel_time = ((uint64_t)buffer[0]) << 32 | buffer[1];
// use kmt to query system/gpu time
// can use to adjust like GpuAgent::TranslateTime
// https://sourcegraph.com/github.com/ROCm/ROCR-Runtime@909b82d4632b86dff0faadcb19488a43d2108686/-/blob/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp?L2048
static const char kfd_device_name[] = "/dev/kfd";
int hsakmt_kfd_fd = open(kfd_device_name, O_RDWR | O_CLOEXEC);
if (hsakmt_kfd_fd == -1) {
// result = HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED;
abort();
}
// hsaKmtGetClockCounters
struct kfd_ioctl_get_clock_counters_args args;
memset(&args, 0, sizeof(args));
args.gpu_id = 0;  // should be the KFD gpu_id of the target GPU (see topology)
int kmt_err =
hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_GET_CLOCK_COUNTERS, &args);
if (hsakmt_kfd_fd >= 0) {
close(hsakmt_kfd_fd);
hsakmt_kfd_fd = -1;
}
uint64_t srcloc = ((uint64_t)buffer[2]) << 32 | buffer[3];
typedef struct iree_tracing_location_s {
const char* name;
const char* function;
const char* file;
uint32_t line;
uint32_t color;
} iree_tracing_location_t;
const iree_tracing_location_t* srcloc_p =
(const iree_tracing_location_t*)srcloc;
assert(srcloc >= code_base && srcloc < code_base + code_size);
const uint8_t* host_code_base = NULL;
err = loader.hsa_ven_amd_loader_query_host_address(
(void*)code_base, (const void**)&host_code_base);
const iree_tracing_location_t* srcloc_shadow3 =
(iree_tracing_location_t*)(srcloc - code_base + host_code_base);
const iree_tracing_location_t* srcloc_shadow2 = NULL;
err = loader.hsa_ven_amd_loader_query_host_address(
(void*)srcloc, (const void**)&srcloc_shadow2);
const char* shadow_name = NULL;
err = loader.hsa_ven_amd_loader_query_host_address(
(void*)srcloc_shadow2->name, (const void**)&shadow_name);
uint8_t* code_shadow = (uint8_t*)malloc(code_size);
memcpy(code_shadow, (const void*)code_base, code_size);
iree_tracing_location_t* srcloc_shadow =
(iree_tracing_location_t*)(srcloc - code_base + (uint64_t)code_shadow);
srcloc_shadow->name = (const char*)((uint64_t)srcloc_shadow->name -
code_base + (uint64_t)code_shadow);
hsa_amd_pointer_info_t srcloc_info;
srcloc_info.size = sizeof(srcloc_info);
uint32_t num_agents_accessible = 0;
hsa_agent_t* accessible = NULL;
err = hsa_amd_pointer_info(srcloc_p, &srcloc_info, malloc,
&num_agents_accessible, &accessible);
// owned by gpu_agent, global_flags=5 (coarse | kernarg)
free(accessible);
for (uint32_t i = 0; i < element_count; ++i) {
fprintf(stderr, "%u ", buffer[i]);
}
fprintf(stderr, "\n");
err = hsa_amd_profiling_set_profiler_enabled(gpu_queue, 0);
err = hsa_amd_memory_pool_free(buffer);
err = hsa_signal_destroy(dispatch_signal);
err = hsa_amd_memory_pool_free(kernarg_storage);
err = hsa_queue_destroy(gpu_queue);
err = hsa_executable_destroy(executable);
err = hsa_code_object_reader_destroy(object_reader);
err = hsa_shut_down();
return 0;
}
// pc_sampling?
// doesn't work
#include <alloca.h>
#include <errno.h>
#include <fcntl.h> // open
#include <memory.h>
#include <stdatomic.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include "third_party/hsa-runtime-headers/include/hsa/hsa.h"
#include "third_party/hsa-runtime-headers/include/hsa/hsa_ext_amd.h"
static inline size_t iree_host_align(size_t value, size_t alignment) {
return (value + (alignment - 1)) & ~(alignment - 1);
}
__attribute__((always_inline)) static inline void nontemporalMemcpy(
void* __restrict dst, const void* __restrict src, size_t size) {
memcpy(dst, src, size);
}
typedef struct {
uint32_t agent_count;
hsa_agent_t all_agents[32];
hsa_agent_t cpu_agent;
hsa_agent_t gpu_agent;
} agents_t;
static hsa_status_t iterate_agent(hsa_agent_t agent, void* user_data) {
agents_t* agents = (agents_t*)user_data;
agents->all_agents[agents->agent_count++] = agent;
return HSA_STATUS_SUCCESS;
}
typedef struct {
uint32_t count;
hsa_amd_memory_pool_t pools[32];
} memory_pools_t;
static hsa_status_t iterate_memory_pool(hsa_amd_memory_pool_t memory_pool,
void* user_data) {
memory_pools_t* memory_pools = (memory_pools_t*)user_data;
memory_pools->pools[memory_pools->count++] = memory_pool;
return HSA_STATUS_SUCCESS;
}
typedef struct {
uint32_t count;
hsa_region_t regions[32];
} regions_t;
static hsa_status_t iterate_regions(hsa_region_t region, void* user_data) {
regions_t* regions = (regions_t*)user_data;
regions->regions[regions->count++] = region;
return HSA_STATUS_SUCCESS;
}
static void gpu_queue_callback(hsa_status_t status, hsa_queue_t* queue,
void* user_data) {
const char* status_str = NULL;
hsa_status_string(status, &status_str);
fprintf(stderr, "gpu_queue_callback %s\n", status_str);
}
static hsa_status_t pcs_config_callback(
const hsa_ven_amd_pcs_configuration_t* configuration, void* callback_data) {
//
return HSA_STATUS_SUCCESS;
}
#include "/home/nod/src/ROCR-Runtime/libhsakmt/include/hsakmt/hsakmt.h"
#include "/home/nod/src/ROCR-Runtime/libhsakmt/include/hsakmt/linux/kfd_ioctl.h"
void pcs_data_ready(void* client_callback_data, size_t data_size,
size_t lost_sample_count,
hsa_ven_amd_pcs_data_copy_callback_t data_copy_callback,
void* hsa_callback_data) {
perf_sample_snapshot_v1_t* sample_buffer =
(perf_sample_snapshot_v1_t*)client_callback_data;
fprintf(stderr, "PCS; data size = %zu, lost samples = %zu\n", data_size,
lost_sample_count);
hsa_status_t err =
data_copy_callback(hsa_callback_data, data_size, sample_buffer);
}
int main(int argc, char** argv) {
hsa_status_t err;
err = hsa_init();
agents_t agents;
memset(&agents, 0, sizeof(agents));
err = hsa_iterate_agents(iterate_agent, &agents);
for (uint32_t i = 0; i < agents.agent_count; ++i) {
hsa_device_type_t device_type = 0;
err = hsa_agent_get_info(agents.all_agents[i], HSA_AGENT_INFO_DEVICE,
&device_type);
if (device_type == HSA_DEVICE_TYPE_GPU) {
agents.gpu_agent = agents.all_agents[i];
err = hsa_agent_get_info(agents.gpu_agent,
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_NEAREST_CPU,
&agents.cpu_agent);
break;
}
}
memory_pools_t gpu_memory_pools;
memset(&gpu_memory_pools, 0, sizeof(gpu_memory_pools));
err = hsa_amd_agent_iterate_memory_pools(
agents.gpu_agent, iterate_memory_pool, &gpu_memory_pools);
regions_t gpu_regions;
memset(&gpu_regions, 0, sizeof(gpu_regions));
err = hsa_agent_iterate_regions(agents.gpu_agent, iterate_regions,
&gpu_regions);
memory_pools_t cpu_memory_pools;
memset(&cpu_memory_pools, 0, sizeof(cpu_memory_pools));
err = hsa_amd_agent_iterate_memory_pools(
agents.cpu_agent, iterate_memory_pool, &cpu_memory_pools);
regions_t cpu_regions;
memset(&cpu_regions, 0, sizeof(cpu_regions));
err = hsa_agent_iterate_regions(agents.cpu_agent, iterate_regions,
&cpu_regions);
hsa_amd_memory_pool_t cpu_fine_pool = {0};
hsa_amd_memory_pool_t cpu_coarse_pool = {0};
hsa_amd_memory_pool_t kernarg_pool = {0};
for (uint32_t i = 0; i < cpu_memory_pools.count; ++i) {
hsa_amd_memory_pool_t pool = cpu_memory_pools.pools[i];
hsa_region_segment_t segment;
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT,
&segment);
if (segment == HSA_REGION_SEGMENT_GLOBAL) {
hsa_region_global_flag_t global_flag;
err = hsa_amd_memory_pool_get_info(
pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag);
if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) {
kernarg_pool = pool;
} else if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
cpu_coarse_pool = pool;
} else if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
cpu_fine_pool = pool;
}
}
}
uint32_t gpu_queue_min_size = 0;
err = hsa_agent_get_info(agents.gpu_agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE,
&gpu_queue_min_size);
uint32_t gpu_queue_max_size = 0;
err = hsa_agent_get_info(agents.gpu_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
&gpu_queue_max_size);
uint32_t gpu_queue_size = gpu_queue_max_size;
hsa_queue_t* gpu_queue = NULL;
err = hsa_queue_create(agents.gpu_agent, gpu_queue_size, HSA_QUEUE_TYPE_MULTI,
gpu_queue_callback,
/*callback_data=*/NULL,
/*private_segment_size=*/UINT32_MAX,
/*group_segment_size=*/UINT32_MAX, &gpu_queue);
uint32_t gpu_node_id = 0;
err = hsa_agent_get_info(agents.gpu_agent, HSA_AGENT_INFO_NODE, &gpu_node_id);
int kmtfd = open("/dev/kfd", O_RDWR);
struct kfd_ioctl_pc_sample_args args;
memset(&args, 0, sizeof(args));
args.op = KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES;
// cat /sys/devices/virtual/kfd/kfd/topology/nodes/1/gpu_id
args.gpu_id = 56588;
args.sample_info_ptr = 0; //(uint64_t)sample_info;
args.num_sample_info = 0; // sample_info_sz;
args.flags = 0;
int ioctl_ret = 0;
do {
ioctl_ret = ioctl(kmtfd, AMDKFD_IOC_PC_SAMPLE, &args);
} while (ioctl_ret == -1 && (errno == EINTR || errno == EAGAIN));
fprintf(stderr, "ioctl %d\n", ioctl_ret);
// uint32_t size = 0;
// HSAKMT_STATUS ret =
// hsaKmtPcSamplingQueryCapabilities(gpu_node_id, NULL, 0, &size);
// if (ret != HSAKMT_STATUS_SUCCESS || size == 0) {
// fprintf(stderr, "KMT FAIL\n");
// }
// HsaPcSamplingInfo* sample_info_list =
// alloca(size * sizeof(HsaPcSamplingInfo));
// ret = hsaKmtPcSamplingQueryCapabilities(gpu_node_id, sample_info_list,
// size,
// &size);
// iterate configs not found?
// HSA_EXTENSION_AMD_PC_SAMPLING extension bit
// err = hsa_ven_amd_pcs_iterate_configuration(agents.gpu_agent,
// pcs_config_callback, NULL);
perf_sample_snapshot_v1_t sample_buffer[16 * 1024];
hsa_ven_amd_pcs_t pc_sampling;
err = hsa_ven_amd_pcs_create(
agents.gpu_agent, HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1,
HSA_VEN_AMD_PCS_INTERVAL_UNITS_MICRO_SECONDS, 100, 0,
sizeof(sample_buffer), pcs_data_ready, sample_buffer, &pc_sampling);
//
hsa_file_t object_file =
open("experimental/hsa_tests/kernels_cl.elf", O_RDONLY);
// hsa_file_t object_file =
// open("experimental/hsa_tests/kernels_hip.elf", O_RDONLY);
hsa_code_object_reader_t object_reader;
err = hsa_code_object_reader_create_from_file(object_file, &object_reader);
hsa_executable_t executable;
err = hsa_executable_create_alt(HSA_PROFILE_FULL,
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL,
&executable);
err = hsa_executable_load_agent_code_object(executable, agents.gpu_agent,
object_reader, NULL, NULL);
err = hsa_executable_freeze(executable, NULL);
struct kernel_info_t {
uint64_t handle;
uint32_t private_size;
uint32_t group_size;
uint32_t kernarg_alignment;
uint32_t kernarg_size;
} kernel_info;
hsa_executable_symbol_t symbol;
err = hsa_executable_get_symbol_by_name(executable, "add_one.kd",
&agents.gpu_agent, &symbol);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_info.handle);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
&kernel_info.private_size);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
&kernel_info.group_size);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT,
&kernel_info.kernarg_alignment);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
&kernel_info.kernarg_size);
err = hsa_ven_amd_pcs_start(pc_sampling);
void* kernarg_storage = NULL;
err = hsa_amd_memory_pool_allocate(kernarg_pool, kernel_info.kernarg_size,
HSA_AMD_MEMORY_POOL_STANDARD_FLAG,
&kernarg_storage);
err =
hsa_amd_agents_allow_access(1, &agents.gpu_agent, NULL, kernarg_storage);
hsa_signal_t dispatch_signal;
err = hsa_signal_create(1, 0, NULL, &dispatch_signal);
typedef struct implicit_kernargs_t {
uint32_t block_count[3]; // + 0/4/8
uint16_t group_size[3]; // + 12/14/16
uint16_t remainder[3]; // + 18/20/22
uint64_t reserved0; // + 24 hidden_tool_correlation_id
uint64_t reserved1; // + 32
uint64_t global_offset[3]; // + 40/48/56
uint16_t grid_dims; // + 64
} implicit_kernargs_t;
uint32_t element_count = 65;
uint32_t* buffer = NULL;
err = hsa_amd_memory_pool_allocate(
cpu_coarse_pool, element_count * sizeof(uint32_t),
HSA_AMD_MEMORY_POOL_STANDARD_FLAG, (void**)&buffer);
for (uint32_t i = 0; i < element_count; ++i) {
buffer[i] = i;
}
err = hsa_amd_agents_allow_access(1, &agents.gpu_agent, NULL, buffer);
uint32_t grid_size[3] = {element_count, 1, 1};
uint16_t workgroup_size[3] = {32, 1, 1};
typedef struct add_one_args_t {
uint32_t n;
void* buffer;
} add_one_args_t;
add_one_args_t* explicit_kernargs = (add_one_args_t*)kernarg_storage;
explicit_kernargs->n = element_count;
explicit_kernargs->buffer = buffer;
implicit_kernargs_t* implicit_kernargs =
(implicit_kernargs_t*)((uint8_t*)kernarg_storage +
iree_host_align(sizeof(add_one_args_t), 8));
implicit_kernargs->block_count[0] = grid_size[0] / workgroup_size[0];
implicit_kernargs->block_count[1] = grid_size[1] / workgroup_size[1];
implicit_kernargs->block_count[2] = grid_size[2] / workgroup_size[2];
implicit_kernargs->group_size[0] = workgroup_size[0];
implicit_kernargs->group_size[1] = workgroup_size[1];
implicit_kernargs->group_size[2] = workgroup_size[2];
implicit_kernargs->remainder[0] =
(uint16_t)(grid_size[0] % workgroup_size[0]);
implicit_kernargs->remainder[1] =
(uint16_t)(grid_size[1] % workgroup_size[1]);
implicit_kernargs->remainder[2] =
(uint16_t)(grid_size[2] % workgroup_size[2]);
implicit_kernargs->reserved0 = 0;
implicit_kernargs->reserved1 = 0;
implicit_kernargs->global_offset[0] = 0; // newOffset[0];
implicit_kernargs->global_offset[1] = 0; // newOffset[1];
implicit_kernargs->global_offset[2] = 0; // newOffset[2];
implicit_kernargs->grid_dims = 3;
// DO NOT SUBMIT should do nontemporal kernarg update?
hsa_kernel_dispatch_packet_t packet;
packet.header = HSA_PACKET_TYPE_INVALID;
packet.setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
packet.workgroup_size_x = workgroup_size[0];
packet.workgroup_size_y = workgroup_size[1];
packet.workgroup_size_z = workgroup_size[2];
packet.reserved0 = 0;
packet.grid_size_x = grid_size[0];
packet.grid_size_y = grid_size[1];
packet.grid_size_z = grid_size[2];
packet.private_segment_size = kernel_info.private_size;
packet.group_segment_size = kernel_info.group_size;
packet.kernel_object = kernel_info.handle;
packet.kernarg_address = kernarg_storage;
packet.reserved2 = 0;
packet.completion_signal = dispatch_signal;
uint16_t packet_header =
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
uint32_t packet_header_setup = packet_header | (packet.setup << 16);
uint64_t packet_id = hsa_queue_add_write_index_screlease(gpu_queue, 1);
while ((packet_id - hsa_queue_load_read_index_acquire(gpu_queue)) >=
gpu_queue->size) {
sleep(0);
}
hsa_kernel_dispatch_packet_t* packet_ptr =
(hsa_kernel_dispatch_packet_t*)((uint8_t*)gpu_queue->base_address +
(packet_id & (gpu_queue->size - 1)) * 64);
nontemporalMemcpy(packet_ptr, &packet, sizeof(packet));
atomic_store_explicit((volatile atomic_uint*)packet_ptr, packet_header_setup,
memory_order_release);
// value ignored in MULTI cases
hsa_signal_store_relaxed(gpu_queue->doorbell_signal, packet_id);
hsa_signal_value_t wait_value =
hsa_signal_wait_scacquire(dispatch_signal, HSA_SIGNAL_CONDITION_EQ, 0,
UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
if (wait_value != 0) {
fprintf(stderr, "wait failed\n");
}
err = hsa_ven_amd_pcs_stop(pc_sampling);
err = hsa_ven_amd_pcs_flush(pc_sampling);
for (uint32_t i = 0; i < element_count; ++i) {
fprintf(stderr, "%u ", buffer[i]);
}
fprintf(stderr, "\n");
err = hsa_amd_memory_pool_free(buffer);
err = hsa_signal_destroy(dispatch_signal);
err = hsa_amd_memory_pool_free(kernarg_storage);
err = hsa_ven_amd_pcs_destroy(pc_sampling);
err = hsa_queue_destroy(gpu_queue);
err = hsa_executable_destroy(executable);
err = hsa_code_object_reader_destroy(object_reader);
err = hsa_shut_down();
return 0;
}
// multi-device from host
#include <fcntl.h> // open
#include <memory.h>
#include <stdatomic.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include "third_party/hsa-runtime-headers/include/hsa/hsa.h"
#include "third_party/hsa-runtime-headers/include/hsa/hsa_ext_amd.h"
static inline size_t iree_host_align(size_t value, size_t alignment) {
return (value + (alignment - 1)) & ~(alignment - 1);
}
__attribute__((always_inline)) static inline void nontemporalMemcpy(
void* __restrict dst, const void* __restrict src, size_t size) {
memcpy(dst, src, size);
}
typedef struct {
uint32_t agent_count;
hsa_agent_t all_agents[32];
hsa_agent_t cpu_agent;
hsa_agent_t gpu_agents[2];
} agents_t;
static hsa_status_t iterate_agent(hsa_agent_t agent, void* user_data) {
agents_t* agents = (agents_t*)user_data;
agents->all_agents[agents->agent_count++] = agent;
return HSA_STATUS_SUCCESS;
}
typedef struct {
uint32_t count;
hsa_amd_memory_pool_t pools[32];
} memory_pools_t;
static hsa_status_t iterate_memory_pool(hsa_amd_memory_pool_t memory_pool,
void* user_data) {
memory_pools_t* memory_pools = (memory_pools_t*)user_data;
memory_pools->pools[memory_pools->count++] = memory_pool;
return HSA_STATUS_SUCCESS;
}
typedef struct {
uint32_t count;
hsa_region_t regions[32];
} regions_t;
static hsa_status_t iterate_regions(hsa_region_t region, void* user_data) {
regions_t* regions = (regions_t*)user_data;
regions->regions[regions->count++] = region;
return HSA_STATUS_SUCCESS;
}
static void gpu_queue_callback(hsa_status_t status, hsa_queue_t* queue,
void* user_data) {
const char* status_str = NULL;
hsa_status_string(status, &status_str);
fprintf(stderr, "gpu_queue_callback %s\n", status_str);
}
int main(int argc, char** argv) {
hsa_status_t err;
err = hsa_init();
agents_t agents;
memset(&agents, 0, sizeof(agents));
err = hsa_iterate_agents(iterate_agent, &agents);
uint32_t gpu_count = 0;
for (uint32_t i = 0; i < agents.agent_count; ++i) {
hsa_device_type_t device_type = 0;
err = hsa_agent_get_info(agents.all_agents[i], HSA_AGENT_INFO_DEVICE,
&device_type);
if (device_type == HSA_DEVICE_TYPE_GPU && gpu_count < 2) {
agents.gpu_agents[gpu_count++] = agents.all_agents[i];
err = hsa_agent_get_info(agents.all_agents[i],
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_NEAREST_CPU,
&agents.cpu_agent);
}
}
memory_pools_t gpu_memory_pools[2];
memset(&gpu_memory_pools, 0, sizeof(gpu_memory_pools));
err = hsa_amd_agent_iterate_memory_pools(
agents.gpu_agents[0], iterate_memory_pool, &gpu_memory_pools[0]);
err = hsa_amd_agent_iterate_memory_pools(
agents.gpu_agents[1], iterate_memory_pool, &gpu_memory_pools[1]);
memory_pools_t cpu_memory_pools;
memset(&cpu_memory_pools, 0, sizeof(cpu_memory_pools));
err = hsa_amd_agent_iterate_memory_pools(
agents.cpu_agent, iterate_memory_pool, &cpu_memory_pools);
regions_t cpu_regions;
memset(&cpu_regions, 0, sizeof(cpu_regions));
err = hsa_agent_iterate_regions(agents.cpu_agent, iterate_regions,
&cpu_regions);
hsa_amd_memory_pool_t cpu_fine_pool = {0};
hsa_amd_memory_pool_t cpu_coarse_pool = {0};
hsa_amd_memory_pool_t kernarg_pool = {0};
for (uint32_t i = 0; i < cpu_memory_pools.count; ++i) {
hsa_amd_memory_pool_t pool = cpu_memory_pools.pools[i];
hsa_region_segment_t segment;
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT,
&segment);
if (segment == HSA_REGION_SEGMENT_GLOBAL) {
hsa_region_global_flag_t global_flag;
err = hsa_amd_memory_pool_get_info(
pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag);
if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) {
kernarg_pool = pool;
} else if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
cpu_coarse_pool = pool;
} else if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
cpu_fine_pool = pool;
}
}
}
// assumes same params
uint32_t gpu_queue_min_size = 0;
err = hsa_agent_get_info(agents.gpu_agents[0], HSA_AGENT_INFO_QUEUE_MIN_SIZE,
&gpu_queue_min_size);
uint32_t gpu_queue_max_size = 0;
err = hsa_agent_get_info(agents.gpu_agents[0], HSA_AGENT_INFO_QUEUE_MAX_SIZE,
&gpu_queue_max_size);
uint32_t gpu_queue_size = gpu_queue_max_size;
hsa_queue_t* gpu_queues[2] = {NULL, NULL};
err = hsa_queue_create(agents.gpu_agents[0], gpu_queue_size,
HSA_QUEUE_TYPE_MULTI, gpu_queue_callback,
/*callback_data=*/NULL,
/*private_segment_size=*/UINT32_MAX,
/*group_segment_size=*/UINT32_MAX, &gpu_queues[0]);
err = hsa_queue_create(agents.gpu_agents[1], gpu_queue_size,
HSA_QUEUE_TYPE_MULTI, gpu_queue_callback,
/*callback_data=*/NULL,
/*private_segment_size=*/UINT32_MAX,
/*group_segment_size=*/UINT32_MAX, &gpu_queues[1]);
//
hsa_file_t object_file =
open("experimental/hsa_tests/kernels_cl.elf", O_RDONLY);
// hsa_file_t object_file =
// open("experimental/hsa_tests/kernels_hip.elf", O_RDONLY);
hsa_code_object_reader_t object_reader;
err = hsa_code_object_reader_create_from_file(object_file, &object_reader);
hsa_executable_t executable;
err = hsa_executable_create_alt(HSA_PROFILE_FULL,
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL,
&executable);
err = hsa_executable_load_agent_code_object(executable, agents.gpu_agents[0],
object_reader, NULL, NULL);
err = hsa_executable_load_agent_code_object(executable, agents.gpu_agents[1],
object_reader, NULL, NULL);
err = hsa_executable_freeze(executable, NULL);
struct kernel_info_t {
uint64_t handle;
uint32_t private_size;
uint32_t group_size;
uint32_t kernarg_alignment;
uint32_t kernarg_size;
} kernel_info[2];
hsa_executable_symbol_t symbol;
err = hsa_executable_get_symbol_by_name(executable, "add_one.kd",
&agents.gpu_agents[0], &symbol);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_info[0].handle);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
&kernel_info[0].private_size);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
&kernel_info[0].group_size);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT,
&kernel_info[0].kernarg_alignment);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
&kernel_info[0].kernarg_size);
err = hsa_executable_get_symbol_by_name(executable, "mul_x.kd",
&agents.gpu_agents[1], &symbol);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_info[1].handle);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
&kernel_info[1].private_size);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
&kernel_info[1].group_size);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT,
&kernel_info[1].kernarg_alignment);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
&kernel_info[1].kernarg_size);
void* kernarg_storages[2] = {NULL, NULL};
err = hsa_amd_memory_pool_allocate(kernarg_pool, kernel_info[0].kernarg_size,
HSA_AMD_MEMORY_POOL_STANDARD_FLAG,
&kernarg_storages[0]);
err = hsa_amd_agents_allow_access(1, &agents.gpu_agents[0], NULL,
kernarg_storages[0]);
err = hsa_amd_memory_pool_allocate(kernarg_pool, kernel_info[1].kernarg_size,
HSA_AMD_MEMORY_POOL_STANDARD_FLAG,
&kernarg_storages[1]);
err = hsa_amd_agents_allow_access(1, &agents.gpu_agents[1], NULL,
kernarg_storages[1]);
hsa_signal_t dispatch_signals[2];
// if device->device then can use AMD_GPU_ONLY to avoid interrupt signals
// err = hsa_signal_create(1, 0, NULL, &dispatch_signals[0]);
err = hsa_amd_signal_create(1, 0, NULL, HSA_AMD_SIGNAL_AMD_GPU_ONLY, &dispatch_signals[0]);
err = hsa_signal_create(1, 0, NULL, &dispatch_signals[1]);
typedef struct implicit_kernargs_t {
uint32_t block_count[3]; // + 0/4/8
uint16_t group_size[3]; // + 12/14/16
uint16_t remainder[3]; // + 18/20/22
uint64_t reserved0; // + 24 hidden_tool_correlation_id
uint64_t reserved1; // + 32
uint64_t global_offset[3]; // + 40/48/56
uint16_t grid_dims; // + 64
} implicit_kernargs_t;
uint32_t element_count = 65;
uint32_t* buffer = NULL;
err = hsa_amd_memory_pool_allocate(
cpu_coarse_pool, element_count * sizeof(uint32_t),
HSA_AMD_MEMORY_POOL_STANDARD_FLAG, (void**)&buffer);
for (uint32_t i = 0; i < element_count; ++i) {
buffer[i] = i;
}
err = hsa_amd_agents_allow_access(2, agents.gpu_agents, NULL, buffer);
uint32_t grid_size[3] = {element_count, 1, 1};
uint16_t workgroup_size[3] = {32, 1, 1};
// gpu1: barrier (wait) -> dispatch2
// gpu0: dispatch1 (signal)
// enqueue gpu1 before gpu0 to ensure wait happens
{
hsa_barrier_and_packet_t barrier;
memset(&barrier, 0, sizeof(barrier));
barrier.header = HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE;
barrier.dep_signal[0] = dispatch_signals[0];
// note uint16_t high is reserved0
uint32_t barrier_header =
(HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
uint64_t barrier_id = hsa_queue_add_write_index_screlease(gpu_queues[1], 1);
while ((barrier_id - hsa_queue_load_read_index_acquire(gpu_queues[1])) >=
gpu_queues[1]->size) {
sleep(0);
}
hsa_barrier_and_packet_t* barrier_ptr =
(hsa_barrier_and_packet_t*)((uint8_t*)gpu_queues[1]->base_address +
(barrier_id & (gpu_queues[1]->size - 1)) *
64);
nontemporalMemcpy(barrier_ptr, &barrier, sizeof(barrier));
atomic_store_explicit((volatile atomic_uint*)barrier_ptr, barrier_header,
memory_order_release);
hsa_signal_store_relaxed(gpu_queues[1]->doorbell_signal, barrier_id);
}
typedef struct mul_x_args_t {
uint32_t x;
uint32_t n;
void* buffer;
} mul_x_args_t;
mul_x_args_t* mul_x_kernargs = (mul_x_args_t*)kernarg_storages[1];
mul_x_kernargs->x = 2;
mul_x_kernargs->n = element_count;
mul_x_kernargs->buffer = buffer;
implicit_kernargs_t* implicit1_kernargs =
(implicit_kernargs_t*)((uint8_t*)kernarg_storages[1] +
iree_host_align(sizeof(mul_x_args_t), 8));
implicit1_kernargs->block_count[0] = grid_size[0] / workgroup_size[0];
implicit1_kernargs->block_count[1] = grid_size[1] / workgroup_size[1];
implicit1_kernargs->block_count[2] = grid_size[2] / workgroup_size[2];
implicit1_kernargs->group_size[0] = workgroup_size[0];
implicit1_kernargs->group_size[1] = workgroup_size[1];
implicit1_kernargs->group_size[2] = workgroup_size[2];
implicit1_kernargs->remainder[0] =
(uint16_t)(grid_size[0] % workgroup_size[0]);
implicit1_kernargs->remainder[1] =
(uint16_t)(grid_size[1] % workgroup_size[1]);
implicit1_kernargs->remainder[2] =
(uint16_t)(grid_size[2] % workgroup_size[2]);
implicit1_kernargs->reserved0 = 0;
implicit1_kernargs->reserved1 = 0;
implicit1_kernargs->global_offset[0] = 0; // newOffset[0];
implicit1_kernargs->global_offset[1] = 0; // newOffset[1];
implicit1_kernargs->global_offset[2] = 0; // newOffset[2];
implicit1_kernargs->grid_dims = 3;
// DO NOT SUBMIT should do nontemporal kernarg update?
hsa_kernel_dispatch_packet_t packet1;
packet1.header = HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE;
packet1.setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
packet1.workgroup_size_x = workgroup_size[0];
packet1.workgroup_size_y = workgroup_size[1];
packet1.workgroup_size_z = workgroup_size[2];
packet1.reserved0 = 0;
packet1.grid_size_x = grid_size[0];
packet1.grid_size_y = grid_size[1];
packet1.grid_size_z = grid_size[2];
packet1.private_segment_size = kernel_info[1].private_size;
packet1.group_segment_size = kernel_info[1].group_size;
packet1.kernel_object = kernel_info[1].handle;
packet1.kernarg_address = kernarg_storages[1];
packet1.reserved2 = 0;
packet1.completion_signal = dispatch_signals[1];
uint16_t packet1_header =
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(0 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
uint32_t packet1_header_setup = packet1_header | (packet1.setup << 16);
uint64_t packet1_id = hsa_queue_add_write_index_screlease(gpu_queues[1], 1);
while ((packet1_id - hsa_queue_load_read_index_acquire(gpu_queues[1])) >=
gpu_queues[1]->size) {
sleep(0);
}
hsa_kernel_dispatch_packet_t* packet1_ptr =
(hsa_kernel_dispatch_packet_t*)((uint8_t*)gpu_queues[1]->base_address +
(packet1_id & (gpu_queues[1]->size - 1)) *
64);
nontemporalMemcpy(packet1_ptr, &packet1, sizeof(packet1));
atomic_store_explicit((volatile atomic_uint*)packet1_ptr,
packet1_header_setup, memory_order_release);
// value ignored in MULTI cases
hsa_signal_store_relaxed(gpu_queues[1]->doorbell_signal, packet1_id);
// "ensure" gpu1 is waiting
sleep(1);
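  // gpu0: populate the add_one kernargs and dispatch; packet0 below signals
  // dispatch_signals[0] on completion.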
typedef struct add_one_args_t {
uint32_t n;
void* buffer;
} add_one_args_t;
add_one_args_t* add_one_kernargs = (add_one_args_t*)kernarg_storages[0];
add_one_kernargs->n = element_count;
add_one_kernargs->buffer = buffer;
implicit_kernargs_t* implicit_kernargs =
(implicit_kernargs_t*)((uint8_t*)kernarg_storages[0] +
iree_host_align(sizeof(add_one_args_t), 8));
implicit_kernargs->block_count[0] = grid_size[0] / workgroup_size[0];
implicit_kernargs->block_count[1] = grid_size[1] / workgroup_size[1];
implicit_kernargs->block_count[2] = grid_size[2] / workgroup_size[2];
implicit_kernargs->group_size[0] = workgroup_size[0];
implicit_kernargs->group_size[1] = workgroup_size[1];
implicit_kernargs->group_size[2] = workgroup_size[2];
implicit_kernargs->remainder[0] =
(uint16_t)(grid_size[0] % workgroup_size[0]);
implicit_kernargs->remainder[1] =
(uint16_t)(grid_size[1] % workgroup_size[1]);
implicit_kernargs->remainder[2] =
(uint16_t)(grid_size[2] % workgroup_size[2]);
implicit_kernargs->reserved0 = 0;
implicit_kernargs->reserved1 = 0;
implicit_kernargs->global_offset[0] = 0; // newOffset[0];
implicit_kernargs->global_offset[1] = 0; // newOffset[1];
implicit_kernargs->global_offset[2] = 0; // newOffset[2];
implicit_kernargs->grid_dims = 3;
// DO NOT SUBMIT should do nontemporal kernarg update?
hsa_kernel_dispatch_packet_t packet0;
packet0.header = HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE;
packet0.setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
packet0.workgroup_size_x = workgroup_size[0];
packet0.workgroup_size_y = workgroup_size[1];
packet0.workgroup_size_z = workgroup_size[2];
packet0.reserved0 = 0;
packet0.grid_size_x = grid_size[0];
packet0.grid_size_y = grid_size[1];
packet0.grid_size_z = grid_size[2];
packet0.private_segment_size = kernel_info[0].private_size;
packet0.group_segment_size = kernel_info[0].group_size;
packet0.kernel_object = kernel_info[0].handle;
packet0.kernarg_address = kernarg_storages[0];
packet0.reserved2 = 0;
packet0.completion_signal = dispatch_signals[0];
uint16_t packet0_header =
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
uint32_t packet0_header_setup = packet0_header | (packet0.setup << 16);
uint64_t packet0_id = hsa_queue_add_write_index_screlease(gpu_queues[0], 1);
while ((packet0_id - hsa_queue_load_read_index_acquire(gpu_queues[0])) >=
gpu_queues[0]->size) {
sleep(0);
}
hsa_kernel_dispatch_packet_t* packet0_ptr =
(hsa_kernel_dispatch_packet_t*)((uint8_t*)gpu_queues[0]->base_address +
(packet0_id & (gpu_queues[0]->size - 1)) *
64);
nontemporalMemcpy(packet0_ptr, &packet0, sizeof(packet0));
atomic_store_explicit((volatile atomic_uint*)packet0_ptr,
packet0_header_setup, memory_order_release);
// value ignored in MULTI cases
hsa_signal_store_relaxed(gpu_queues[0]->doorbell_signal, packet0_id);
hsa_signal_value_t wait_value =
hsa_signal_wait_scacquire(dispatch_signals[1], HSA_SIGNAL_CONDITION_EQ, 0,
UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
if (wait_value != 0) {
fprintf(stderr, "wait failed\n");
}
for (uint32_t i = 0; i < element_count; ++i) {
fprintf(stderr, "%u ", buffer[i]);
}
fprintf(stderr, "\n");
err = hsa_amd_memory_pool_free(buffer);
err = hsa_signal_destroy(dispatch_signals[0]);
err = hsa_signal_destroy(dispatch_signals[1]);
err = hsa_amd_memory_pool_free(kernarg_storages[0]);
err = hsa_amd_memory_pool_free(kernarg_storages[1]);
err = hsa_queue_destroy(gpu_queues[0]);
err = hsa_queue_destroy(gpu_queues[1]);
err = hsa_executable_destroy(executable);
err = hsa_code_object_reader_destroy(object_reader);
err = hsa_shut_down();
return 0;
}
// device to host call soft queue
// cpu_set
#define _GNU_SOURCE
#include <sched.h>
//
#include <errno.h>
#include <fcntl.h> // open
#include <inttypes.h>
#include <memory.h>
#include <numaif.h> // mbind
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#ifndef MPOL_F_STATIC_NODES
/* Bug in numaif.h, this should be defined in there. Definition copied
* from linux/mempolicy.h.
*/
#define MPOL_F_STATIC_NODES (1 << 15)
#endif
#include "third_party/hsa-runtime-headers/include/hsa/hsa.h"
#include "third_party/hsa-runtime-headers/include/hsa/hsa_ext_amd.h"
#include "third_party/hsa-runtime-headers/include/hsa/hsa_ven_amd_loader.h"
static inline size_t iree_host_align(size_t value, size_t alignment) {
return (value + (alignment - 1)) & ~(alignment - 1);
}
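// NOTE: placeholder for these prototypes; a production path would use
// nontemporal/streaming stores when writing AQL packets to avoid cache
// pollution, but a plain memcpy is sufficient here.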
__attribute__((always_inline)) static inline void nontemporalMemcpy(
void* __restrict dst, const void* __restrict src, size_t size) {
memcpy(dst, src, size);
}
typedef struct {
uint32_t agent_count;
hsa_agent_t all_agents[32];
hsa_agent_t cpu_agent;
hsa_agent_t gpu_agents[2];
} agents_t;
static hsa_status_t iterate_agent(hsa_agent_t agent, void* user_data) {
agents_t* agents = (agents_t*)user_data;
agents->all_agents[agents->agent_count++] = agent;
return HSA_STATUS_SUCCESS;
}
typedef struct {
uint32_t count;
hsa_amd_memory_pool_t pools[32];
} memory_pools_t;
static hsa_status_t iterate_memory_pool(hsa_amd_memory_pool_t memory_pool,
void* user_data) {
memory_pools_t* memory_pools = (memory_pools_t*)user_data;
memory_pools->pools[memory_pools->count++] = memory_pool;
return HSA_STATUS_SUCCESS;
}
typedef struct {
uint32_t count;
hsa_region_t regions[32];
} regions_t;
static hsa_status_t iterate_regions(hsa_region_t region, void* user_data) {
regions_t* regions = (regions_t*)user_data;
regions->regions[regions->count++] = region;
return HSA_STATUS_SUCCESS;
}
static void gpu_queue_callback(hsa_status_t status, hsa_queue_t* queue,
void* user_data) {
const char* status_str = NULL;
hsa_status_string(status, &status_str);
fprintf(stderr, "gpu_queue_callback %s", status_str);
}
static bool parse_cpumap_bitmask(char* line, cpu_set_t* cpu_set) {
char* str = line;
size_t base = 0;
while (strlen(str) > 1 && str[0] != '\n') {
unsigned long mask = strtoul(str, &str, 16);
if (str[0] == ',') ++str;
for (size_t i = 0; i < 32; ++i) {
if (mask & (1ul << i)) {
CPU_SET(base + i, cpu_set);
}
}
base += 32;
}
return true;
}
static void set_affinity(uint32_t cpu_node_id) {
// numa_node_to_cpus
// https://sourcegraph.com/github.com/stranded-fish/XtraBackup-Source-Code-Analysis/-/blob/storage/ndb/src/common/portlib/NdbNuma.cpp?L132-134
// libnuma.so (.1)
// https://sourcegraph.com/github.com/numactl/numactl/-/blob/libnuma.c?L1471
  // /sys/devices/system/node/node%d/cpumap (node_id)
  // e.g. `cat /sys/devices/system/node/node0/cpumap` ->
  //   ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff
// numa_parse_bitmap_v2
// https://sourcegraph.com/github.com/numactl/numactl/-/blob/libnuma.c?L1324:1-1324:21
cpu_set_t cpu_set;
CPU_ZERO(&cpu_set);
char cpumap_name[256];
snprintf(cpumap_name, sizeof(cpumap_name),
"/sys/devices/system/node/node%u/cpumap", cpu_node_id);
FILE* f = fopen(cpumap_name, "r");
size_t len = 0;
char* line = NULL;
bool did_read = f && getdelim(&line, &len, '\n', f) >= 1;
bool did_parse = false;
if (did_read) {
did_parse = parse_cpumap_bitmask(line, &cpu_set);
}
if (line) free(line);
  if (f) fclose(f);
if (did_parse) {
int ret = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set);
fprintf(stderr, "pinned affinity: %d\n", ret);
}
}
static bool is_cond_met(hsa_signal_condition32_t cond,
hsa_signal_value_t desired_value,
hsa_signal_value_t current_value) {
switch (cond) {
default:
case HSA_SIGNAL_CONDITION_EQ:
return current_value == desired_value;
case HSA_SIGNAL_CONDITION_NE:
return current_value != desired_value;
case HSA_SIGNAL_CONDITION_LT:
return current_value < desired_value;
case HSA_SIGNAL_CONDITION_GTE:
return current_value >= desired_value;
}
}
typedef struct {
uint32_t cpu_node_id;
pthread_t handle;
hsa_queue_t* queue;
hsa_signal_t doorbell;
} cpu_thread_t;
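// Soft-queue worker: pins itself to the CPU agent's NUMA node, then blocks on
// the doorbell signal and drains packets (barrier-AND/OR, barrier-value, agent
// dispatch) until the doorbell is rung with UINT64_MAX.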
static void* cpu_thread_main(void* user_data) {
cpu_thread_t* cpu_thread = (cpu_thread_t*)user_data;
set_affinity(cpu_thread->cpu_node_id);
// loop over doorbell
uint64_t queue_mask = cpu_thread->queue->size - 1;
uint64_t last_packet_id = 0;
uint64_t read_index = 0;
while (true) {
uint64_t new_packet_id = (uint64_t)hsa_signal_wait_scacquire(
cpu_thread->doorbell, HSA_SIGNAL_CONDITION_NE, last_packet_id,
UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
fprintf(stderr, "CPU INCOMING WAKE %lu\n", new_packet_id);
last_packet_id = new_packet_id;
if (new_packet_id == UINT64_MAX) break;
while (read_index !=
hsa_queue_load_write_index_scacquire(cpu_thread->queue)) {
hsa_agent_dispatch_packet_t* packet_ptr =
(hsa_agent_dispatch_packet_t*)cpu_thread->queue->base_address +
(read_index & queue_mask);
// wait until packet populated
uint32_t packet_type = HSA_PACKET_TYPE_INVALID;
do {
// NOTE: we assume this is waiting for at most a few cycles and spin.
uint32_t packet_header = atomic_load_explicit(
(volatile atomic_uint*)packet_ptr, memory_order_acquire);
packet_type = (packet_header >> HSA_PACKET_HEADER_TYPE) &
((1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1);
} while (packet_type == HSA_PACKET_TYPE_INVALID);
fprintf(stderr, "agent packet %lu read\n", read_index);
// copy packet locally
union {
uint8_t data[64];
hsa_barrier_and_packet_t barrier_and;
hsa_barrier_or_packet_t barrier_or;
hsa_amd_barrier_value_packet_t barrier_value;
hsa_agent_dispatch_packet_t agent_dispatch;
} packet;
memcpy(packet.data, packet_ptr, sizeof(packet.data));
// swap packet back to invalid so that it can be reused immediately
atomic_store_explicit((volatile atomic_uint*)packet_ptr,
(HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE),
memory_order_relaxed);
hsa_queue_store_read_index_screlease(cpu_thread->queue, ++read_index);
switch (packet_type) {
case HSA_PACKET_TYPE_BARRIER_AND: {
// TODO: propagate failures according to spec?
// https://github.com/ROCm/ROCR-Runtime/issues/241
// need hsa_amd_signal_wait_all
for (uint32_t i = 0; i < 5; ++i) {
if (packet.barrier_and.dep_signal[i].handle != 0) {
// may have to handle spurious wakes?
while (hsa_signal_wait_scacquire(packet.barrier_and.dep_signal[i],
HSA_SIGNAL_CONDITION_EQ, 0u,
UINT64_MAX,
HSA_WAIT_STATE_BLOCKED) != 0) {
// retry
}
}
}
if (packet.barrier_and.completion_signal.handle != 0) {
hsa_signal_subtract_screlease(packet.barrier_and.completion_signal,
1);
}
break;
}
case HSA_PACKET_TYPE_BARRIER_OR: {
// TODO: propagate failures according to spec?
// hsa_amd_signal_wait_any does not accept 0 signals, have to filter
hsa_signal_t signals[5];
uint32_t signal_count = 0;
for (uint32_t i = 0; i < 5; ++i) {
if (packet.barrier_or.dep_signal[i].handle != 0) {
signals[signal_count++] = packet.barrier_or.dep_signal[i];
}
}
hsa_signal_condition_t conds[5] = {
HSA_SIGNAL_CONDITION_EQ, HSA_SIGNAL_CONDITION_EQ,
HSA_SIGNAL_CONDITION_EQ, HSA_SIGNAL_CONDITION_EQ,
HSA_SIGNAL_CONDITION_EQ,
};
hsa_signal_value_t values[5] = {
0, 0, 0, 0, 0,
};
hsa_signal_value_t satisfying_value;
// relaxed semantics
// satisfying_index == UINT32_MAX if failed
uint32_t satisfying_index = hsa_amd_signal_wait_any(
signal_count, signals, conds, values, UINT64_MAX,
HSA_WAIT_STATE_BLOCKED, &satisfying_value);
// acquire on satisfying signal
if (satisfying_index != UINT32_MAX) {
hsa_signal_load_scacquire(signals[satisfying_index]);
}
if (packet.barrier_or.completion_signal.handle != 0) {
hsa_signal_subtract_screlease(packet.barrier_or.completion_signal,
1);
}
break;
}
case HSA_AMD_PACKET_TYPE_BARRIER_VALUE: {
// TODO: propagate failures according to spec?
if (packet.barrier_value.mask == UINT64_MAX) {
// default path for when we can use a normal wait
// may have to handle spurious wakes?
hsa_signal_wait_scacquire(
packet.barrier_value.signal, packet.barrier_value.cond,
packet.barrier_value.value, UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
} else {
// mask needs to be in the loop
uint64_t value =
hsa_signal_load_scacquire(packet.barrier_value.signal);
while (!is_cond_met(packet.barrier_value.cond,
packet.barrier_value.value,
value & packet.barrier_value.mask)) {
// wait until it changes value and try again
hsa_signal_wait_scacquire(packet.barrier_value.signal,
HSA_SIGNAL_CONDITION_NE, value,
UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
}
}
if (packet.barrier_value.completion_signal.handle != 0) {
hsa_signal_subtract_screlease(
packet.barrier_value.completion_signal, 1);
}
break;
}
case HSA_PACKET_TYPE_AGENT_DISPATCH: {
// TODO: propagate failures according to spec?
fprintf(stderr,
"agent dispatch %u: %p %" PRIx64 " (%" PRId64 ") %" PRIx64
" (%" PRId64 ") %" PRIx64 " (%" PRId64 ") %" PRIx64
" (%" PRId64 ")\n",
packet.agent_dispatch.type,
packet.agent_dispatch.return_address,
packet.agent_dispatch.arg[0], packet.agent_dispatch.arg[0],
packet.agent_dispatch.arg[1], packet.agent_dispatch.arg[1],
packet.agent_dispatch.arg[2], packet.agent_dispatch.arg[2],
packet.agent_dispatch.arg[3], packet.agent_dispatch.arg[3]);
if (packet.agent_dispatch.completion_signal.handle != 0) {
hsa_signal_subtract_screlease(
packet.agent_dispatch.completion_signal, 1);
}
break;
}
}
}
}
return NULL;
}
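// Mirrors what is believed to be ROCR's amd_signal_t layout (amd_hsa_signal.h)
// so signal internals can be inspected or hand-built without going through
// hsa_signal_create.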
// AMD Signal Kind Enumeration Values.
enum iree_amd_signal_kind_t {
IREE_AMD_SIGNAL_KIND_INVALID = 0,
IREE_AMD_SIGNAL_KIND_USER = 1,
IREE_AMD_SIGNAL_KIND_DOORBELL = -1,
IREE_AMD_SIGNAL_KIND_LEGACY_DOORBELL = -2
};
typedef int64_t iree_amd_signal_kind64_t;
typedef struct iree_amd_signal_s {
iree_amd_signal_kind64_t kind;
union {
volatile int64_t value;
volatile uint32_t* legacy_hardware_doorbell_ptr;
volatile uint64_t* hardware_doorbell_ptr;
};
uint64_t event_mailbox_ptr;
uint32_t event_id;
uint32_t reserved1;
uint64_t start_ts;
uint64_t end_ts;
union {
/*iree_amd_queue_t*/ void* queue_ptr;
uint64_t reserved2;
};
uint32_t reserved3[2];
} iree_amd_signal_t;
int main(int argc, char** argv) {
hsa_status_t err;
err = hsa_init();
agents_t agents;
memset(&agents, 0, sizeof(agents));
err = hsa_iterate_agents(iterate_agent, &agents);
uint32_t gpu_count = 0;
for (uint32_t i = 0; i < agents.agent_count; ++i) {
hsa_device_type_t device_type = 0;
err = hsa_agent_get_info(agents.all_agents[i], HSA_AGENT_INFO_DEVICE,
&device_type);
if (device_type == HSA_DEVICE_TYPE_GPU) {
agents.gpu_agents[gpu_count++] = agents.all_agents[i];
err = hsa_agent_get_info(agents.all_agents[i],
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_NEAREST_CPU,
&agents.cpu_agent);
}
}
memory_pools_t gpu_memory_pools[2];
memset(&gpu_memory_pools, 0, sizeof(gpu_memory_pools));
err = hsa_amd_agent_iterate_memory_pools(
agents.gpu_agents[0], iterate_memory_pool, &gpu_memory_pools[0]);
err = hsa_amd_agent_iterate_memory_pools(
agents.gpu_agents[1], iterate_memory_pool, &gpu_memory_pools[1]);
memory_pools_t cpu_memory_pools;
memset(&cpu_memory_pools, 0, sizeof(cpu_memory_pools));
err = hsa_amd_agent_iterate_memory_pools(
agents.cpu_agent, iterate_memory_pool, &cpu_memory_pools);
regions_t cpu_regions;
memset(&cpu_regions, 0, sizeof(cpu_regions));
err = hsa_agent_iterate_regions(agents.cpu_agent, iterate_regions,
&cpu_regions);
  hsa_region_t cpu_fine_region = {0};
for (uint32_t i = 0; i < cpu_regions.count; ++i) {
hsa_region_segment_t segment;
err = hsa_region_get_info(cpu_regions.regions[i], HSA_REGION_INFO_SEGMENT,
&segment);
if (segment == HSA_REGION_SEGMENT_GLOBAL) {
hsa_region_global_flag_t flags;
err = hsa_region_get_info(cpu_regions.regions[i],
HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) {
cpu_fine_region = cpu_regions.regions[i];
}
}
}
hsa_amd_memory_pool_t cpu_fine_pool = {0};
hsa_amd_memory_pool_t cpu_coarse_pool = {0};
hsa_amd_memory_pool_t kernarg_pool = {0};
for (uint32_t i = 0; i < cpu_memory_pools.count; ++i) {
hsa_amd_memory_pool_t pool = cpu_memory_pools.pools[i];
hsa_region_segment_t segment;
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT,
&segment);
if (segment == HSA_REGION_SEGMENT_GLOBAL) {
hsa_region_global_flag_t global_flag;
err = hsa_amd_memory_pool_get_info(
pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag);
if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) {
kernarg_pool = pool;
} else if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
cpu_coarse_pool = pool;
} else if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
cpu_fine_pool = pool;
}
}
}
// assumes same params
uint32_t gpu_queue_min_size = 0;
err = hsa_agent_get_info(agents.gpu_agents[0], HSA_AGENT_INFO_QUEUE_MIN_SIZE,
&gpu_queue_min_size);
uint32_t gpu_queue_max_size = 0;
err = hsa_agent_get_info(agents.gpu_agents[0], HSA_AGENT_INFO_QUEUE_MAX_SIZE,
&gpu_queue_max_size);
uint32_t gpu_queue_size = gpu_queue_max_size;
hsa_queue_t* gpu_queues[2] = {NULL, NULL};
err = hsa_queue_create(agents.gpu_agents[0], gpu_queue_size,
HSA_QUEUE_TYPE_MULTI, gpu_queue_callback,
/*callback_data=*/NULL,
/*private_segment_size=*/UINT32_MAX,
/*group_segment_size=*/UINT32_MAX, &gpu_queues[0]);
err = hsa_queue_create(agents.gpu_agents[1], gpu_queue_size,
HSA_QUEUE_TYPE_MULTI, gpu_queue_callback,
/*callback_data=*/NULL,
/*private_segment_size=*/UINT32_MAX,
/*group_segment_size=*/UINT32_MAX, &gpu_queues[1]);
cpu_thread_t cpu_thread;
err = hsa_agent_get_info(agents.cpu_agent, HSA_AGENT_INFO_NODE,
&cpu_thread.cpu_node_id);
err = hsa_signal_create(0u, 1, &agents.cpu_agent, &cpu_thread.doorbell);
iree_amd_signal_t* sig = (iree_amd_signal_t*)cpu_thread.doorbell.handle;
err = hsa_soft_queue_create(cpu_fine_region, 8 * 1024, HSA_QUEUE_TYPE_MULTI,
HSA_QUEUE_FEATURE_AGENT_DISPATCH,
cpu_thread.doorbell, &cpu_thread.queue);
// doesn't work - EINVAL?
// maybe because queue memory already pinned?
// https://sourcegraph.com/github.com/ROCm/ROCT-Thunk-Interface/-/blob/src/fmm.c?L1743
unsigned long node_mask[1] = {0};
node_mask[0] = 1u << cpu_thread.cpu_node_id;
int mbind_ret = mbind(
cpu_thread.queue->base_address,
iree_host_align(
cpu_thread.queue->size * sizeof(hsa_agent_dispatch_packet_t), 4096),
MPOL_BIND, node_mask, cpu_thread.cpu_node_id + 1, MPOL_MF_MOVE);
if (mbind_ret) {
// docker blocks
if (errno == EPERM) {
fprintf(stderr, "mbind blocked\n");
} else {
fprintf(stderr, "mbind failed\n");
}
}
pthread_attr_t pthread_attr;
pthread_attr_init(&pthread_attr);
int pthread_ret = pthread_create(&cpu_thread.handle, &pthread_attr,
cpu_thread_main, &cpu_thread);
pthread_attr_destroy(&pthread_attr);
//
hsa_file_t object_file =
open("experimental/hsa_tests/kernels_cl.elf", O_RDONLY);
// hsa_file_t object_file =
// open("experimental/hsa_tests/kernels_hip.elf", O_RDONLY);
hsa_code_object_reader_t object_reader;
err = hsa_code_object_reader_create_from_file(object_file, &object_reader);
hsa_executable_t executable;
err = hsa_executable_create_alt(HSA_PROFILE_FULL,
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL,
&executable);
hsa_loaded_code_object_t gpu0_object;
err = hsa_executable_load_agent_code_object(
executable, agents.gpu_agents[0], object_reader, NULL, &gpu0_object);
hsa_loaded_code_object_t gpu1_object;
err = hsa_executable_load_agent_code_object(
executable, agents.gpu_agents[1], object_reader, NULL, &gpu1_object);
err = hsa_executable_freeze(executable, NULL);
hsa_ven_amd_loader_1_03_pfn_t loader;
err = hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1,
sizeof(loader), &loader);
uint64_t code_base0 = 0;
err = loader.hsa_ven_amd_loader_loaded_code_object_get_info(
gpu0_object, HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_BASE,
&code_base0);
uint64_t code_base1 = 0;
err = loader.hsa_ven_amd_loader_loaded_code_object_get_info(
gpu1_object, HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_BASE,
&code_base1);
struct kernel_info_t {
uint64_t handle;
uint32_t private_size;
uint32_t group_size;
uint32_t kernarg_alignment;
uint32_t kernarg_size;
} kernel_info[2];
hsa_executable_symbol_t symbol;
err = hsa_executable_get_symbol_by_name(executable, "issue_host_call.kd",
&agents.gpu_agents[0], &symbol);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_info[0].handle);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
&kernel_info[0].private_size);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
&kernel_info[0].group_size);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT,
&kernel_info[0].kernarg_alignment);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
&kernel_info[0].kernarg_size);
err = hsa_executable_get_symbol_by_name(executable, "mul_x.kd",
&agents.gpu_agents[1], &symbol);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_info[1].handle);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
&kernel_info[1].private_size);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
&kernel_info[1].group_size);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT,
&kernel_info[1].kernarg_alignment);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
&kernel_info[1].kernarg_size);
void* kernarg_storages[2] = {NULL, NULL};
err = hsa_amd_memory_pool_allocate(kernarg_pool, kernel_info[0].kernarg_size,
HSA_AMD_MEMORY_POOL_STANDARD_FLAG,
&kernarg_storages[0]);
err = hsa_amd_agents_allow_access(1, &agents.gpu_agents[0], NULL,
kernarg_storages[0]);
err = hsa_amd_memory_pool_allocate(kernarg_pool, kernel_info[1].kernarg_size,
HSA_AMD_MEMORY_POOL_STANDARD_FLAG,
&kernarg_storages[1]);
err = hsa_amd_agents_allow_access(1, &agents.gpu_agents[1], NULL,
kernarg_storages[1]);
hsa_signal_t dispatch_signals[2];
err = hsa_signal_create(1, 0, NULL, &dispatch_signals[0]);
err = hsa_signal_create(1, 0, NULL, &dispatch_signals[1]);
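  // Implicit (hidden) kernargs appended after the explicit args; the layout
  // and offsets below are assumed to match the hidden-argument block recent
  // ROCm compilers emit (code object v5 style).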
typedef struct implicit_kernargs_t {
uint32_t block_count[3]; // + 0/4/8
uint16_t group_size[3]; // + 12/14/16
uint16_t remainder[3]; // + 18/20/22
uint64_t reserved0; // + 24 hidden_tool_correlation_id
uint64_t reserved1; // + 32
uint64_t global_offset[3]; // + 40/48/56
uint16_t grid_dims; // + 64
} implicit_kernargs_t;
uint32_t element_count = 65;
uint32_t* buffer = NULL;
err = hsa_amd_memory_pool_allocate(
cpu_coarse_pool, element_count * sizeof(uint32_t),
HSA_AMD_MEMORY_POOL_STANDARD_FLAG, (void**)&buffer);
for (uint32_t i = 0; i < element_count; ++i) {
buffer[i] = i;
}
err = hsa_amd_agents_allow_access(2, agents.gpu_agents, NULL, buffer);
uint32_t grid_size[3] = {element_count, 1, 1};
uint16_t workgroup_size[3] = {32, 1, 1};
// gpu1: barrier (wait) -> dispatch1
// gpu0: dispatch0 (signal)
// enqueue gpu1 before gpu0 to ensure wait happens
{
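    // Build a barrier-AND packet that stalls gpu1's queue until
    // dispatch_signals[0] reaches 0; the system-scope acquire makes prior
    // work's writes visible to gpu1, and no release is needed since the
    // barrier itself publishes nothing.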
hsa_barrier_and_packet_t barrier;
memset(&barrier, 0, sizeof(barrier));
barrier.header = HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE;
barrier.dep_signal[0] = dispatch_signals[0];
    // note: the upper 16 bits of this 32-bit header word overlap the packet's
    // reserved0 field and must stay 0
uint32_t barrier_header =
(HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
uint64_t barrier_id = hsa_queue_add_write_index_screlease(gpu_queues[1], 1);
while ((barrier_id - hsa_queue_load_read_index_acquire(gpu_queues[1])) >=
gpu_queues[1]->size) {
sleep(0);
}
hsa_barrier_and_packet_t* barrier_ptr =
(hsa_barrier_and_packet_t*)((uint8_t*)gpu_queues[1]->base_address +
(barrier_id & (gpu_queues[1]->size - 1)) *
64);
nontemporalMemcpy(barrier_ptr, &barrier, sizeof(barrier));
atomic_store_explicit((volatile atomic_uint*)barrier_ptr, barrier_header,
memory_order_release);
hsa_signal_store_relaxed(gpu_queues[1]->doorbell_signal, barrier_id);
}
typedef struct mul_x_args_t {
uint32_t x;
uint32_t n;
void* buffer;
} mul_x_args_t;
mul_x_args_t* mul_x_kernargs = (mul_x_args_t*)kernarg_storages[1];
mul_x_kernargs->x = 2;
mul_x_kernargs->n = element_count;
mul_x_kernargs->buffer = buffer;
implicit_kernargs_t* implicit1_kernargs =
(implicit_kernargs_t*)((uint8_t*)kernarg_storages[1] +
iree_host_align(sizeof(mul_x_args_t), 8));
implicit1_kernargs->block_count[0] = grid_size[0] / workgroup_size[0];
implicit1_kernargs->block_count[1] = grid_size[1] / workgroup_size[1];
implicit1_kernargs->block_count[2] = grid_size[2] / workgroup_size[2];
implicit1_kernargs->group_size[0] = workgroup_size[0];
implicit1_kernargs->group_size[1] = workgroup_size[1];
implicit1_kernargs->group_size[2] = workgroup_size[2];
implicit1_kernargs->remainder[0] =
(uint16_t)(grid_size[0] % workgroup_size[0]);
implicit1_kernargs->remainder[1] =
(uint16_t)(grid_size[1] % workgroup_size[1]);
implicit1_kernargs->remainder[2] =
(uint16_t)(grid_size[2] % workgroup_size[2]);
implicit1_kernargs->reserved0 = 0;
implicit1_kernargs->reserved1 = 0;
implicit1_kernargs->global_offset[0] = 0; // newOffset[0];
implicit1_kernargs->global_offset[1] = 0; // newOffset[1];
implicit1_kernargs->global_offset[2] = 0; // newOffset[2];
implicit1_kernargs->grid_dims = 3;
// DO NOT SUBMIT should do nontemporal kernarg update?
hsa_kernel_dispatch_packet_t packet1;
packet1.header = HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE;
packet1.setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
packet1.workgroup_size_x = workgroup_size[0];
packet1.workgroup_size_y = workgroup_size[1];
packet1.workgroup_size_z = workgroup_size[2];
packet1.reserved0 = 0;
packet1.grid_size_x = grid_size[0];
packet1.grid_size_y = grid_size[1];
packet1.grid_size_z = grid_size[2];
packet1.private_segment_size = kernel_info[1].private_size;
packet1.group_segment_size = kernel_info[1].group_size;
packet1.kernel_object = kernel_info[1].handle;
packet1.kernarg_address = kernarg_storages[1];
packet1.reserved2 = 0;
packet1.completion_signal = dispatch_signals[1];
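  // Header for the mul_x dispatch below: no acquire fence (the barrier above
  // already acquired at system scope); release at system scope so the host
  // sees the buffer writes once dispatch_signals[1] reaches 0.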
uint16_t packet1_header =
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(0 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
uint32_t packet1_header_setup = packet1_header | (packet1.setup << 16);
uint64_t packet1_id = hsa_queue_add_write_index_screlease(gpu_queues[1], 1);
while ((packet1_id - hsa_queue_load_read_index_acquire(gpu_queues[1])) >=
gpu_queues[1]->size) {
sleep(0);
}
hsa_kernel_dispatch_packet_t* packet1_ptr =
(hsa_kernel_dispatch_packet_t*)((uint8_t*)gpu_queues[1]->base_address +
(packet1_id & (gpu_queues[1]->size - 1)) *
64);
nontemporalMemcpy(packet1_ptr, &packet1, sizeof(packet1));
atomic_store_explicit((volatile atomic_uint*)packet1_ptr,
packet1_header_setup, memory_order_release);
// value ignored in MULTI cases
hsa_signal_store_relaxed(gpu_queues[1]->doorbell_signal, packet1_id);
// "ensure" gpu1 is waiting
sleep(1);
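  // gpu0: dispatch issue_host_call; given these kernargs (the soft queue +
  // dispatch_signals[0]) the kernel is expected to enqueue an agent dispatch
  // packet that the CPU worker above completes, releasing the gpu1 barrier.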
typedef struct issue_host_call_args_t {
hsa_queue_t* queue;
hsa_signal_t completion_signal;
uint32_t arg;
} issue_host_call_args_t;
issue_host_call_args_t* issue_host_call_kernargs =
(issue_host_call_args_t*)kernarg_storages[0];
issue_host_call_kernargs->queue = cpu_thread.queue;
issue_host_call_kernargs->completion_signal = dispatch_signals[0];
issue_host_call_kernargs->arg = 100;
implicit_kernargs_t* implicit_kernargs =
(implicit_kernargs_t*)((uint8_t*)kernarg_storages[0] +
iree_host_align(sizeof(issue_host_call_args_t),
8));
implicit_kernargs->block_count[0] = 1;
implicit_kernargs->block_count[1] = 1;
implicit_kernargs->block_count[2] = 1;
implicit_kernargs->group_size[0] = 1;
implicit_kernargs->group_size[1] = 1;
implicit_kernargs->group_size[2] = 1;
implicit_kernargs->remainder[0] = 0;
implicit_kernargs->remainder[1] = 0;
implicit_kernargs->remainder[2] = 0;
implicit_kernargs->reserved0 = 0;
implicit_kernargs->reserved1 = 0;
implicit_kernargs->global_offset[0] = 0;
implicit_kernargs->global_offset[1] = 0;
implicit_kernargs->global_offset[2] = 0;
implicit_kernargs->grid_dims = 3;
// DO NOT SUBMIT should do nontemporal kernarg update?
hsa_kernel_dispatch_packet_t packet0;
packet0.header = HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE;
packet0.setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
packet0.workgroup_size_x = 1;
packet0.workgroup_size_y = 1;
packet0.workgroup_size_z = 1;
packet0.reserved0 = 0;
packet0.grid_size_x = 1;
packet0.grid_size_y = 1;
packet0.grid_size_z = 1;
packet0.private_segment_size = kernel_info[0].private_size;
packet0.group_segment_size = kernel_info[0].group_size;
packet0.kernel_object = kernel_info[0].handle;
packet0.kernarg_address = kernarg_storages[0];
packet0.reserved2 = 0;
packet0.completion_signal.handle = 0;
uint16_t packet0_header =
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
uint32_t packet0_header_setup = packet0_header | (packet0.setup << 16);
uint64_t packet0_id = hsa_queue_add_write_index_screlease(gpu_queues[0], 1);
while ((packet0_id - hsa_queue_load_read_index_acquire(gpu_queues[0])) >=
gpu_queues[0]->size) {
sleep(0);
}
hsa_kernel_dispatch_packet_t* packet0_ptr =
(hsa_kernel_dispatch_packet_t*)((uint8_t*)gpu_queues[0]->base_address +
(packet0_id & (gpu_queues[0]->size - 1)) *
64);
nontemporalMemcpy(packet0_ptr, &packet0, sizeof(packet0));
atomic_store_explicit((volatile atomic_uint*)packet0_ptr,
packet0_header_setup, memory_order_release);
// value ignored in MULTI cases
hsa_signal_store_relaxed(gpu_queues[0]->doorbell_signal, packet0_id);
hsa_signal_value_t wait_value =
hsa_signal_wait_scacquire(dispatch_signals[1], HSA_SIGNAL_CONDITION_EQ, 0,
UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
if (wait_value != 0) {
fprintf(stderr, "wait failed\n");
}
for (uint32_t i = 0; i < element_count; ++i) {
fprintf(stderr, "%u ", buffer[i]);
}
fprintf(stderr, "\n");
// signal exit
hsa_signal_store_screlease(cpu_thread.doorbell, UINT64_MAX);
err = hsa_amd_memory_pool_free(buffer);
err = hsa_signal_destroy(dispatch_signals[0]);
err = hsa_signal_destroy(dispatch_signals[1]);
err = hsa_amd_memory_pool_free(kernarg_storages[0]);
err = hsa_amd_memory_pool_free(kernarg_storages[1]);
pthread_join(cpu_thread.handle, NULL);
err = hsa_queue_destroy(cpu_thread.queue);
err = hsa_signal_destroy(cpu_thread.doorbell);
err = hsa_queue_destroy(gpu_queues[0]);
err = hsa_queue_destroy(gpu_queues[1]);
err = hsa_executable_destroy(executable);
err = hsa_code_object_reader_destroy(object_reader);
err = hsa_shut_down();
return 0;
}
// device to device dispatch enqueue
#include <errno.h>
#include <fcntl.h> // open
#include <inttypes.h>
#include <memory.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "third_party/hsa-runtime-headers/include/hsa/hsa.h"
#include "third_party/hsa-runtime-headers/include/hsa/hsa_ext_amd.h"
static inline size_t iree_host_align(size_t value, size_t alignment) {
return (value + (alignment - 1)) & ~(alignment - 1);
}
__attribute__((always_inline)) static inline void nontemporalMemcpy(
void* __restrict dst, const void* __restrict src, size_t size) {
memcpy(dst, src, size);
}
// AMD Signal Kind Enumeration Values.
enum iree_amd_signal_kind_t {
IREE_AMD_SIGNAL_KIND_INVALID = 0,
IREE_AMD_SIGNAL_KIND_USER = 1,
IREE_AMD_SIGNAL_KIND_DOORBELL = -1,
IREE_AMD_SIGNAL_KIND_LEGACY_DOORBELL = -2
};
typedef int64_t iree_amd_signal_kind64_t;
typedef struct __attribute__((aligned(64))) iree_amd_signal_s {
iree_amd_signal_kind64_t kind;
union {
volatile int64_t value;
volatile uint32_t* legacy_hardware_doorbell_ptr;
volatile uint64_t* hardware_doorbell_ptr;
};
uint64_t event_mailbox_ptr;
uint32_t event_id;
uint32_t reserved1;
uint64_t start_ts;
uint64_t end_ts;
union {
/*iree_amd_queue_t*/ void* queue_ptr;
uint64_t reserved2;
};
uint32_t reserved3[2];
} iree_amd_signal_t;
typedef struct {
uint32_t agent_count;
hsa_agent_t all_agents[32];
hsa_agent_t cpu_agent;
hsa_agent_t gpu_agents[2];
} agents_t;
static hsa_status_t iterate_agent(hsa_agent_t agent, void* user_data) {
agents_t* agents = (agents_t*)user_data;
agents->all_agents[agents->agent_count++] = agent;
return HSA_STATUS_SUCCESS;
}
typedef struct {
uint32_t count;
hsa_amd_memory_pool_t pools[32];
} memory_pools_t;
static hsa_status_t iterate_memory_pool(hsa_amd_memory_pool_t memory_pool,
void* user_data) {
memory_pools_t* memory_pools = (memory_pools_t*)user_data;
memory_pools->pools[memory_pools->count++] = memory_pool;
return HSA_STATUS_SUCCESS;
}
typedef struct {
uint32_t count;
hsa_region_t regions[32];
} regions_t;
static hsa_status_t iterate_regions(hsa_region_t region, void* user_data) {
regions_t* regions = (regions_t*)user_data;
regions->regions[regions->count++] = region;
return HSA_STATUS_SUCCESS;
}
static void gpu_queue_callback(hsa_status_t status, hsa_queue_t* queue,
void* user_data) {
const char* status_str = NULL;
hsa_status_string(status, &status_str);
fprintf(stderr, "gpu_queue_callback %s", status_str);
}
int main(int argc, char** argv) {
hsa_status_t err;
err = hsa_init();
agents_t agents;
memset(&agents, 0, sizeof(agents));
err = hsa_iterate_agents(iterate_agent, &agents);
uint32_t gpu_count = 0;
for (uint32_t i = 0; i < agents.agent_count; ++i) {
hsa_device_type_t device_type = 0;
err = hsa_agent_get_info(agents.all_agents[i], HSA_AGENT_INFO_DEVICE,
&device_type);
if (device_type == HSA_DEVICE_TYPE_GPU) {
agents.gpu_agents[gpu_count++] = agents.all_agents[i];
err = hsa_agent_get_info(agents.all_agents[i],
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_NEAREST_CPU,
&agents.cpu_agent);
}
}
memory_pools_t gpu_memory_pools[2];
memset(&gpu_memory_pools, 0, sizeof(gpu_memory_pools));
err = hsa_amd_agent_iterate_memory_pools(
agents.gpu_agents[0], iterate_memory_pool, &gpu_memory_pools[0]);
err = hsa_amd_agent_iterate_memory_pools(
agents.gpu_agents[1], iterate_memory_pool, &gpu_memory_pools[1]);
memory_pools_t cpu_memory_pools;
memset(&cpu_memory_pools, 0, sizeof(cpu_memory_pools));
err = hsa_amd_agent_iterate_memory_pools(
agents.cpu_agent, iterate_memory_pool, &cpu_memory_pools);
regions_t cpu_regions;
memset(&cpu_regions, 0, sizeof(cpu_regions));
err = hsa_agent_iterate_regions(agents.cpu_agent, iterate_regions,
&cpu_regions);
hsa_region_t cpu_fine_region;
for (uint32_t i = 0; i < cpu_regions.count; ++i) {
hsa_region_segment_t segment;
err = hsa_region_get_info(cpu_regions.regions[i], HSA_REGION_INFO_SEGMENT,
&segment);
if (segment == HSA_REGION_SEGMENT_GLOBAL) {
hsa_region_global_flag_t flags;
err = hsa_region_get_info(cpu_regions.regions[i],
HSA_REGION_INFO_GLOBAL_FLAGS, &flags);
if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) {
cpu_fine_region = cpu_regions.regions[i];
}
}
}
hsa_amd_memory_pool_t cpu_fine_pool = {0};
hsa_amd_memory_pool_t cpu_coarse_pool = {0};
hsa_amd_memory_pool_t kernarg_pool = {0};
for (uint32_t i = 0; i < cpu_memory_pools.count; ++i) {
hsa_amd_memory_pool_t pool = cpu_memory_pools.pools[i];
hsa_region_segment_t segment;
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT,
&segment);
if (segment == HSA_REGION_SEGMENT_GLOBAL) {
hsa_region_global_flag_t global_flag;
err = hsa_amd_memory_pool_get_info(
pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag);
if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) {
kernarg_pool = pool;
} else if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
cpu_coarse_pool = pool;
} else if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
cpu_fine_pool = pool;
}
}
}
  hsa_amd_memory_pool_t device_fine_pools[2] = {{0}, {0}};
for (uint32_t j = 0; j < 2; ++j) {
for (uint32_t i = 0; i < gpu_memory_pools[j].count; ++i) {
hsa_amd_memory_pool_t pool = gpu_memory_pools[j].pools[i];
hsa_region_segment_t segment;
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT,
&segment);
if (segment == HSA_REGION_SEGMENT_GLOBAL) {
hsa_region_global_flag_t global_flag;
err = hsa_amd_memory_pool_get_info(
pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag);
if (global_flag &
(HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED |
HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED)) {
device_fine_pools[j] = pool;
}
}
}
}
// assumes same params
uint32_t gpu_queue_min_size = 0;
err = hsa_agent_get_info(agents.gpu_agents[0], HSA_AGENT_INFO_QUEUE_MIN_SIZE,
&gpu_queue_min_size);
uint32_t gpu_queue_max_size = 0;
err = hsa_agent_get_info(agents.gpu_agents[0], HSA_AGENT_INFO_QUEUE_MAX_SIZE,
&gpu_queue_max_size);
uint32_t gpu_queue_size = gpu_queue_max_size;
hsa_queue_t* gpu_queues[2] = {NULL, NULL};
err = hsa_queue_create(agents.gpu_agents[0], gpu_queue_size,
HSA_QUEUE_TYPE_MULTI, gpu_queue_callback,
/*callback_data=*/NULL,
/*private_segment_size=*/UINT32_MAX,
/*group_segment_size=*/UINT32_MAX, &gpu_queues[0]);
err = hsa_queue_create(agents.gpu_agents[1], gpu_queue_size,
HSA_QUEUE_TYPE_MULTI, gpu_queue_callback,
/*callback_data=*/NULL,
/*private_segment_size=*/UINT32_MAX,
/*group_segment_size=*/UINT32_MAX, &gpu_queues[1]);
//
hsa_file_t object_file =
open("experimental/hsa_tests/kernels_cl.elf", O_RDONLY);
// hsa_file_t object_file =
// open("experimental/hsa_tests/kernels_hip.elf", O_RDONLY);
hsa_code_object_reader_t object_reader;
err = hsa_code_object_reader_create_from_file(object_file, &object_reader);
hsa_executable_t executable;
err = hsa_executable_create_alt(HSA_PROFILE_FULL,
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL,
&executable);
err = hsa_executable_load_agent_code_object(executable, agents.gpu_agents[0],
object_reader, NULL, NULL);
err = hsa_executable_load_agent_code_object(executable, agents.gpu_agents[1],
object_reader, NULL, NULL);
err = hsa_executable_freeze(executable, NULL);
struct kernel_info_t {
uint64_t handle;
uint32_t private_size;
uint32_t group_size;
uint32_t kernarg_alignment;
uint32_t kernarg_size;
} kernel_info[2];
hsa_executable_symbol_t symbol;
err = hsa_executable_get_symbol_by_name(executable, "issue_dispatch.kd",
&agents.gpu_agents[0], &symbol);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_info[0].handle);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
&kernel_info[0].private_size);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
&kernel_info[0].group_size);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT,
&kernel_info[0].kernarg_alignment);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
&kernel_info[0].kernarg_size);
err = hsa_executable_get_symbol_by_name(executable, "issue_dispatch.kd",
&agents.gpu_agents[1], &symbol);
uint64_t handle1 = 0;
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &handle1);
err = hsa_executable_get_symbol_by_name(executable, "mul_x.kd",
&agents.gpu_agents[1], &symbol);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_info[1].handle);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
&kernel_info[1].private_size);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
&kernel_info[1].group_size);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT,
&kernel_info[1].kernarg_alignment);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
&kernel_info[1].kernarg_size);
void* kernarg_storages[2] = {NULL, NULL};
err = hsa_amd_memory_pool_allocate(kernarg_pool, kernel_info[0].kernarg_size,
HSA_AMD_MEMORY_POOL_STANDARD_FLAG,
&kernarg_storages[0]);
err = hsa_amd_agents_allow_access(1, &agents.gpu_agents[0], NULL,
kernarg_storages[0]);
err = hsa_amd_memory_pool_allocate(kernarg_pool, kernel_info[1].kernarg_size,
HSA_AMD_MEMORY_POOL_STANDARD_FLAG,
&kernarg_storages[1]);
  // must be accessible to both agents (gpu0 writes, gpu1 reads)
err = hsa_amd_agents_allow_access(2, agents.gpu_agents, NULL,
kernarg_storages[1]);
// allocate a fully device->device signal
// allocated on device[1] (as that'll be waiting)
// shared with both devices
// in fine-grained memory (system-wide atomics)
//
// MAYBE: point-to-point HSA_AMD_MEMORY_POOL_PCIE_FLAG? forces fine-grained
// even if pool is not? kmt_alloc_flags.ui32.CoarseGrain forced 0
iree_amd_signal_t* device1_dispatch_signal = NULL;
err = hsa_amd_memory_pool_allocate(
device_fine_pools[1], sizeof(*device1_dispatch_signal),
HSA_AMD_MEMORY_POOL_STANDARD_FLAG, (void**)&device1_dispatch_signal);
err = hsa_amd_agents_allow_access(2, agents.gpu_agents, NULL,
device1_dispatch_signal);
iree_amd_signal_t device1_dispatch_signal_init;
memset(&device1_dispatch_signal_init, 0,
sizeof(device1_dispatch_signal_init));
device1_dispatch_signal_init.kind = IREE_AMD_SIGNAL_KIND_USER;
device1_dispatch_signal_init.value = 1;
device1_dispatch_signal_init.queue_ptr = gpu_queues[1];
err = hsa_memory_copy(device1_dispatch_signal, &device1_dispatch_signal_init,
sizeof(*device1_dispatch_signal));
hsa_signal_t dispatch_signals[2];
dispatch_signals[0].handle = (uint64_t)device1_dispatch_signal;
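  // The hand-initialized amd_signal_t now doubles as an hsa_signal_t handle
  // that lives entirely in device memory; agents granted access can update and
  // wait on it with no host runtime bookkeeping.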
// err = hsa_amd_signal_create(1, 0, NULL, HSA_AMD_SIGNAL_AMD_GPU_ONLY,
// &dispatch_signals[0]);
err = hsa_signal_create(1, 0, NULL, &dispatch_signals[1]);
typedef struct implicit_kernargs_t {
uint32_t block_count[3]; // + 0/4/8
uint16_t group_size[3]; // + 12/14/16
uint16_t remainder[3]; // + 18/20/22
uint64_t reserved0; // + 24 hidden_tool_correlation_id
uint64_t reserved1; // + 32
uint64_t global_offset[3]; // + 40/48/56
uint16_t grid_dims; // + 64
} implicit_kernargs_t;
uint32_t element_count = 65;
uint32_t* buffer = NULL;
err = hsa_amd_memory_pool_allocate(
cpu_coarse_pool, element_count * sizeof(uint32_t),
HSA_AMD_MEMORY_POOL_STANDARD_FLAG, (void**)&buffer);
for (uint32_t i = 0; i < element_count; ++i) {
buffer[i] = i;
}
// buffer only used on 1 (and host)
err = hsa_amd_agents_allow_access(1, &agents.gpu_agents[1], NULL, buffer);
// gpu0: issue_dispatch
// gpu1: mul_x
// enqueue gpu1 before gpu0 to ensure wait happens
typedef struct issue_dispatch_args_t {
hsa_queue_t* queue;
uint64_t mul_x_object;
uint32_t mul_x_private_size;
uint32_t mul_x_group_size;
hsa_signal_t completion_signal;
void* buffer;
void* kernarg_storage;
uint32_t element_count;
uint32_t mul_by;
} issue_dispatch_args_t;
issue_dispatch_args_t* issue_dispatch_kernargs =
(issue_dispatch_args_t*)kernarg_storages[0];
issue_dispatch_kernargs->queue = gpu_queues[1];
issue_dispatch_kernargs->mul_x_object = kernel_info[1].handle;
issue_dispatch_kernargs->mul_x_private_size = kernel_info[1].private_size;
issue_dispatch_kernargs->mul_x_group_size = kernel_info[1].group_size;
issue_dispatch_kernargs->completion_signal = dispatch_signals[1];
issue_dispatch_kernargs->buffer = buffer;
issue_dispatch_kernargs->kernarg_storage = kernarg_storages[1];
issue_dispatch_kernargs->element_count = element_count;
issue_dispatch_kernargs->mul_by = 4;
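  // gpu0: issue_dispatch is expected to build a mul_x dispatch packet from
  // these kernargs and enqueue it onto gpu_queues[1] from device code, with
  // mul_x signaling dispatch_signals[1] when done.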
// DO NOT SUBMIT no implicit needed?
#if 0
implicit_kernargs_t* implicit_kernargs =
(implicit_kernargs_t*)((uint8_t*)kernarg_storages[0] +
iree_host_align(sizeof(issue_dispatch_args_t), 8));
implicit_kernargs->block_count[0] = 1;
implicit_kernargs->block_count[1] = 1;
implicit_kernargs->block_count[2] = 1;
implicit_kernargs->group_size[0] = 1;
implicit_kernargs->group_size[1] = 1;
implicit_kernargs->group_size[2] = 1;
implicit_kernargs->remainder[0] = 0;
implicit_kernargs->remainder[1] = 0;
implicit_kernargs->remainder[2] = 0;
implicit_kernargs->reserved0 = 0;
implicit_kernargs->reserved1 = 0;
implicit_kernargs->global_offset[0] = 0;
implicit_kernargs->global_offset[1] = 0;
implicit_kernargs->global_offset[2] = 0;
implicit_kernargs->grid_dims = 3;
#endif
// DO NOT SUBMIT should do nontemporal kernarg update?
hsa_kernel_dispatch_packet_t packet0;
packet0.header = HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE;
packet0.setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
packet0.workgroup_size_x = 1;
packet0.workgroup_size_y = 1;
packet0.workgroup_size_z = 1;
packet0.reserved0 = 0;
packet0.grid_size_x = 1;
packet0.grid_size_y = 1;
packet0.grid_size_z = 1;
packet0.private_segment_size = kernel_info[0].private_size;
packet0.group_segment_size = kernel_info[0].group_size;
packet0.kernel_object = kernel_info[0].handle;
packet0.kernarg_address = kernarg_storages[0];
packet0.reserved2 = 0;
packet0.completion_signal = dispatch_signals[0];
uint16_t packet0_header =
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
uint32_t packet0_header_setup = packet0_header | (packet0.setup << 16);
uint64_t packet0_id = hsa_queue_add_write_index_screlease(gpu_queues[0], 1);
while ((packet0_id - hsa_queue_load_read_index_acquire(gpu_queues[0])) >=
gpu_queues[0]->size) {
sleep(0);
}
hsa_kernel_dispatch_packet_t* packet0_ptr =
(hsa_kernel_dispatch_packet_t*)((uint8_t*)gpu_queues[0]->base_address +
(packet0_id & (gpu_queues[0]->size - 1)) *
64);
nontemporalMemcpy(packet0_ptr, &packet0, sizeof(packet0));
atomic_store_explicit((volatile atomic_uint*)packet0_ptr,
packet0_header_setup, memory_order_release);
// value ignored in MULTI cases
hsa_signal_store_relaxed(gpu_queues[0]->doorbell_signal, packet0_id);
hsa_signal_value_t wait_value =
hsa_signal_wait_scacquire(dispatch_signals[1], HSA_SIGNAL_CONDITION_EQ, 0,
UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
if (wait_value != 0) {
fprintf(stderr, "wait failed\n");
}
for (uint32_t i = 0; i < element_count; ++i) {
fprintf(stderr, "%u ", buffer[i]);
}
fprintf(stderr, "\n");
err = hsa_amd_memory_pool_free(buffer);
// user allocated signal
// err = hsa_signal_destroy(dispatch_signals[0]);
err = hsa_signal_destroy(dispatch_signals[1]);
err = hsa_amd_memory_pool_free(kernarg_storages[0]);
err = hsa_amd_memory_pool_free(kernarg_storages[1]);
err = hsa_queue_destroy(gpu_queues[0]);
err = hsa_queue_destroy(gpu_queues[1]);
err = hsa_executable_destroy(executable);
err = hsa_code_object_reader_destroy(object_reader);
err = hsa_shut_down();
return 0;
}
// hip + hsa interop
#include <fcntl.h> // open
#include <memory.h>
#include <stdatomic.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include "third_party/hsa-runtime-headers/include/hsa/hsa.h"
#include "third_party/hsa-runtime-headers/include/hsa/hsa_ext_amd.h"
#define __HIP_PLATFORM_AMD__
#include <hip/hip_runtime.h>
static inline size_t iree_host_align(size_t value, size_t alignment) {
return (value + (alignment - 1)) & ~(alignment - 1);
}
__attribute__((always_inline)) static inline void nontemporalMemcpy(
void* __restrict dst, const void* __restrict src, size_t size) {
memcpy(dst, src, size);
}
typedef struct {
uint32_t agent_count;
hsa_agent_t all_agents[32];
hsa_agent_t cpu_agent;
hsa_agent_t gpu_agent;
} agents_t;
static hsa_status_t iterate_agent(hsa_agent_t agent, void* user_data) {
agents_t* agents = (agents_t*)user_data;
agents->all_agents[agents->agent_count++] = agent;
return HSA_STATUS_SUCCESS;
}
typedef struct {
uint32_t count;
hsa_amd_memory_pool_t pools[32];
} memory_pools_t;
static hsa_status_t iterate_memory_pool(hsa_amd_memory_pool_t memory_pool,
void* user_data) {
memory_pools_t* memory_pools = (memory_pools_t*)user_data;
memory_pools->pools[memory_pools->count++] = memory_pool;
return HSA_STATUS_SUCCESS;
}
typedef struct {
uint32_t count;
hsa_region_t regions[32];
} regions_t;
static hsa_status_t iterate_regions(hsa_region_t region, void* user_data) {
regions_t* regions = (regions_t*)user_data;
regions->regions[regions->count++] = region;
return HSA_STATUS_SUCCESS;
}
static void gpu_queue_callback(hsa_status_t status, hsa_queue_t* queue,
void* user_data) {
const char* status_str = NULL;
hsa_status_string(status, &status_str);
fprintf(stderr, "gpu_queue_callback %s", status_str);
}
int main(int argc, char** argv) {
hsa_status_t err;
err = hsa_init();
agents_t agents;
memset(&agents, 0, sizeof(agents));
err = hsa_iterate_agents(iterate_agent, &agents);
for (uint32_t i = 0; i < agents.agent_count; ++i) {
hsa_device_type_t device_type = 0;
err = hsa_agent_get_info(agents.all_agents[i], HSA_AGENT_INFO_DEVICE,
&device_type);
if (device_type == HSA_DEVICE_TYPE_GPU) {
agents.gpu_agent = agents.all_agents[i];
err = hsa_agent_get_info(agents.gpu_agent,
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_NEAREST_CPU,
&agents.cpu_agent);
break;
}
}
hipError_t herr = hipInit(0);
// device_props[0].uuid; "0e12865a3bf5b7ab"
// HSA_AMD_AGENT_INFO_UUID "GPU-0e12865a3bf5b7ab"
hipDeviceProp_t device_props[2];
herr = hipGetDeviceProperties(&device_props[0], 0);
herr = hipGetDeviceProperties(&device_props[1], 1);
hipModule_t hip_module;
herr = hipModuleLoad(&hip_module, "experimental/hsa_tests/kernels_hip.elf");
// no way to interop events/signals?
// ipc events in hip are multiple signals with a ringbuffer
// external semaphores not supported
// regular events
// https://github.com/ROCm/clr/blob/d6193a2f23d08f24bdc2a26f9501ae6cfc73ccf0/hipamd/src/hip_event.cpp#L420
// event record bottoms out on ProfilingSignal:
// https://github.com/ROCm/clr/blob/d6193a2f23d08f24bdc2a26f9501ae6cfc73ccf0/rocclr/device/rocm/rocdevice.hpp#L79
// which has hsa_signal_t
memory_pools_t gpu_memory_pools;
memset(&gpu_memory_pools, 0, sizeof(gpu_memory_pools));
err = hsa_amd_agent_iterate_memory_pools(
agents.gpu_agent, iterate_memory_pool, &gpu_memory_pools);
regions_t gpu_regions;
memset(&gpu_regions, 0, sizeof(gpu_regions));
err = hsa_agent_iterate_regions(agents.gpu_agent, iterate_regions,
&gpu_regions);
memory_pools_t cpu_memory_pools;
memset(&cpu_memory_pools, 0, sizeof(cpu_memory_pools));
err = hsa_amd_agent_iterate_memory_pools(
agents.cpu_agent, iterate_memory_pool, &cpu_memory_pools);
regions_t cpu_regions;
memset(&cpu_regions, 0, sizeof(cpu_regions));
err = hsa_agent_iterate_regions(agents.cpu_agent, iterate_regions,
&cpu_regions);
hsa_amd_memory_pool_t cpu_coarse_pool = {0};
hsa_amd_memory_pool_t kernarg_pool = {0};
for (uint32_t i = 0; i < cpu_memory_pools.count; ++i) {
hsa_amd_memory_pool_t pool = cpu_memory_pools.pools[i];
hsa_region_segment_t segment;
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT,
&segment);
if (segment == HSA_REGION_SEGMENT_GLOBAL) {
hsa_region_global_flag_t global_flag;
err = hsa_amd_memory_pool_get_info(
pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag);
if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) {
kernarg_pool = pool;
} else if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
cpu_coarse_pool = pool;
}
}
}
uint32_t gpu_queue_min_size = 0;
err = hsa_agent_get_info(agents.gpu_agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE,
&gpu_queue_min_size);
uint32_t gpu_queue_max_size = 0;
err = hsa_agent_get_info(agents.gpu_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
&gpu_queue_max_size);
uint32_t gpu_queue_size = gpu_queue_max_size;
hsa_queue_t* gpu_queue = NULL;
err = hsa_queue_create(agents.gpu_agent, gpu_queue_size, HSA_QUEUE_TYPE_MULTI,
gpu_queue_callback,
/*callback_data=*/NULL,
/*private_segment_size=*/UINT32_MAX,
/*group_segment_size=*/UINT32_MAX, &gpu_queue);
//
hsa_file_t object_file =
open("experimental/hsa_tests/kernels_cl.elf", O_RDONLY);
// hsa_file_t object_file =
// open("experimental/hsa_tests/kernels_hip.elf", O_RDONLY);
hsa_code_object_reader_t object_reader;
err = hsa_code_object_reader_create_from_file(object_file, &object_reader);
hsa_executable_t executable;
err = hsa_executable_create_alt(HSA_PROFILE_FULL,
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL,
&executable);
err = hsa_executable_load_agent_code_object(executable, agents.gpu_agent,
object_reader, NULL, NULL);
err = hsa_executable_freeze(executable, NULL);
struct kernel_info_t {
uint64_t handle;
uint32_t private_size;
uint32_t group_size;
uint32_t kernarg_alignment;
uint32_t kernarg_size;
} kernel_info;
hsa_executable_symbol_t symbol;
err = hsa_executable_get_symbol_by_name(executable, "add_one.kd",
&agents.gpu_agent, &symbol);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_info.handle);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
&kernel_info.private_size);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
&kernel_info.group_size);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT,
&kernel_info.kernarg_alignment);
err = hsa_executable_symbol_get_info(
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
&kernel_info.kernarg_size);
void* kernarg_storage = NULL;
err = hsa_amd_memory_pool_allocate(kernarg_pool, kernel_info.kernarg_size,
HSA_AMD_MEMORY_POOL_STANDARD_FLAG,
&kernarg_storage);
err =
hsa_amd_agents_allow_access(1, &agents.gpu_agent, NULL, kernarg_storage);
hsa_signal_t dispatch_signal;
err = hsa_signal_create(1, 0, NULL, &dispatch_signal);
typedef struct implicit_kernargs_t {
uint32_t block_count[3]; // + 0/4/8
uint16_t group_size[3]; // + 12/14/16
uint16_t remainder[3]; // + 18/20/22
uint64_t reserved0; // + 24 hidden_tool_correlation_id
uint64_t reserved1; // + 32
uint64_t global_offset[3]; // + 40/48/56
uint16_t grid_dims; // + 64
} implicit_kernargs_t;
uint32_t element_count = 65;
uint32_t* buffer = NULL;
// err = hsa_amd_memory_pool_allocate(
// cpu_coarse_pool, element_count * sizeof(uint32_t),
// HSA_AMD_MEMORY_POOL_STANDARD_FLAG, (void**)&buffer);
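  // Interop experiment: allocate the buffer with hipHostMalloc instead of an
  // HSA pool and check that it can still be shared with the raw HSA dispatch
  // below (allow_access + kernarg pointer).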
herr = hipHostMalloc(&buffer, element_count * sizeof(uint32_t), 0);
for (uint32_t i = 0; i < element_count; ++i) {
buffer[i] = i;
}
err = hsa_amd_agents_allow_access(1, &agents.gpu_agent, NULL, buffer);
uint32_t grid_size[3] = {element_count, 1, 1};
uint16_t workgroup_size[3] = {32, 1, 1};
typedef struct add_one_args_t {
uint32_t n;
void* buffer;
} add_one_args_t;
add_one_args_t* explicit_kernargs = (add_one_args_t*)kernarg_storage;
explicit_kernargs->n = element_count;
explicit_kernargs->buffer = buffer;
implicit_kernargs_t* implicit_kernargs =
(implicit_kernargs_t*)((uint8_t*)kernarg_storage +
iree_host_align(sizeof(add_one_args_t), 8));
implicit_kernargs->block_count[0] = grid_size[0] / workgroup_size[0];
implicit_kernargs->block_count[1] = grid_size[1] / workgroup_size[1];
implicit_kernargs->block_count[2] = grid_size[2] / workgroup_size[2];
implicit_kernargs->group_size[0] = workgroup_size[0];
implicit_kernargs->group_size[1] = workgroup_size[1];
implicit_kernargs->group_size[2] = workgroup_size[2];
implicit_kernargs->remainder[0] =
(uint16_t)(grid_size[0] % workgroup_size[0]);
implicit_kernargs->remainder[1] =
(uint16_t)(grid_size[1] % workgroup_size[1]);
implicit_kernargs->remainder[2] =
(uint16_t)(grid_size[2] % workgroup_size[2]);
implicit_kernargs->reserved0 = 0;
implicit_kernargs->reserved1 = 0;
implicit_kernargs->global_offset[0] = 0; // newOffset[0];
implicit_kernargs->global_offset[1] = 0; // newOffset[1];
implicit_kernargs->global_offset[2] = 0; // newOffset[2];
implicit_kernargs->grid_dims = 3;
// DO NOT SUBMIT should do nontemporal kernarg update?
hsa_kernel_dispatch_packet_t packet;
  packet.header = HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE;
packet.setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
packet.workgroup_size_x = workgroup_size[0];
packet.workgroup_size_y = workgroup_size[1];
packet.workgroup_size_z = workgroup_size[2];
packet.reserved0 = 0;
packet.grid_size_x = grid_size[0];
packet.grid_size_y = grid_size[1];
packet.grid_size_z = grid_size[2];
packet.private_segment_size = kernel_info.private_size;
packet.group_segment_size = kernel_info.group_size;
packet.kernel_object = kernel_info.handle;
  packet.kernarg_address = kernarg_storage;
  packet.reserved2 = 0;
  packet.completion_signal = dispatch_signal;
uint16_t packet_header =
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
uint32_t packet_header_setup = packet_header | (packet.setup << 16);
uint64_t packet_id = hsa_queue_add_write_index_screlease(gpu_queue, 1);
while ((packet_id - hsa_queue_load_read_index_acquire(gpu_queue)) >=
gpu_queue->size) {
sleep(0);
}
hsa_kernel_dispatch_packet_t* packet_ptr =
(hsa_kernel_dispatch_packet_t*)((uint8_t*)gpu_queue->base_address +
(packet_id & (gpu_queue->size - 1)) * 64);
nontemporalMemcpy(packet_ptr, &packet, sizeof(packet));
atomic_store_explicit((volatile atomic_uint*)packet_ptr, packet_header_setup,
memory_order_release);
// value ignored in MULTI cases
hsa_signal_store_relaxed(gpu_queue->doorbell_signal, packet_id);
hsa_signal_value_t wait_value =
hsa_signal_wait_scacquire(dispatch_signal, HSA_SIGNAL_CONDITION_EQ, 0,
UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
if (wait_value != 0) {
fprintf(stderr, "wait failed\n");
}
for (uint32_t i = 0; i < element_count; ++i) {
fprintf(stderr, "%u ", buffer[i]);
}
fprintf(stderr, "\n");
// err = hsa_amd_memory_pool_free(buffer);
herr = hipHostFree(buffer);
err = hsa_signal_destroy(dispatch_signal);
err = hsa_amd_memory_pool_free(kernarg_storage);
err = hsa_queue_destroy(gpu_queue);
err = hsa_executable_destroy(executable);
err = hsa_code_object_reader_destroy(object_reader);
hipModuleUnload(hip_module);
err = hsa_shut_down();
return 0;
}
// Copyright 2024 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#ifndef IREE_HAL_DRIVERS_AMDGPU_DEVICE_SUPPORT_OPENCL_H_
#define IREE_HAL_DRIVERS_AMDGPU_DEVICE_SUPPORT_OPENCL_H_
//===----------------------------------------------------------------------===//
// Compiler Configuration
//===----------------------------------------------------------------------===//
#if defined(__AMDGPU__)
#define IREE_AMDGPU_TARGET_DEVICE 1
#else
#define IREE_AMDGPU_TARGET_HOST 1
#endif // __AMDGPU__
//===----------------------------------------------------------------------===//
// OpenCL Attributes
//===----------------------------------------------------------------------===//
#if defined(IREE_AMDGPU_TARGET_DEVICE)
#define __kernel [[clang::amdgpu_kernel, gnu::visibility("protected")]]
#define IREE_AMDGPU_RESTRICT __restrict__
#define IREE_AMDGPU_ALIGNAS(x) __attribute__((aligned(x)))
#define IREE_AMDGPU_GLOBAL
#define IREE_AMDGPU_CONSTANT
#define IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE __attribute__((always_inline))
#define IREE_AMDGPU_ATTRIBUTE_CONST
#define IREE_AMDGPU_ATTRIBUTE_SINGLE_WORK_ITEM \
  __attribute__((work_group_size_hint(1, 1, 1)))
#define IREE_AMDGPU_ATTRIBUTE_PACKED __attribute__((__packed__))
#define IREE_AMDGPU_LIKELY(x) (__builtin_expect(!!(x), 1))
#define IREE_AMDGPU_UNLIKELY(x) (__builtin_expect(!!(x), 0))
#define IREE_AMDGPU_STATIC_ASSERT(x, y) \
IREE_AMDGPU_STATIC_ASSERT__(x, __COUNTER__)
#define IREE_AMDGPU_STATIC_ASSERT__(x, y) IREE_AMDGPU_STATIC_ASSERT___(x, y)
#define IREE_AMDGPU_STATIC_ASSERT___(x, y) \
typedef char __assert_##y[(x) ? 1 : -1] __attribute__((__unused__))
#else
#define IREE_AMDGPU_RESTRICT IREE_RESTRICT
#define IREE_AMDGPU_ALIGNAS(x) iree_alignas(x)
#define IREE_AMDGPU_GLOBAL
#define IREE_AMDGPU_CONSTANT
#define IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE IREE_ATTRIBUTE_ALWAYS_INLINE
#define IREE_AMDGPU_ATTRIBUTE_CONST
#define IREE_AMDGPU_ATTRIBUTE_SINGLE_WORK_ITEM
#define IREE_AMDGPU_ATTRIBUTE_PACKED IREE_ATTRIBUTE_PACKED
#define IREE_AMDGPU_LIKELY(x) IREE_LIKELY(x)
#define IREE_AMDGPU_UNLIKELY(x) IREE_UNLIKELY(x)
#define IREE_AMDGPU_STATIC_ASSERT(x, y) static_assert(x, y)
#endif // IREE_AMDGPU_TARGET_DEVICE
//===----------------------------------------------------------------------===//
// Types
//===----------------------------------------------------------------------===//
#if defined(IREE_AMDGPU_TARGET_DEVICE)
typedef char int8_t;
typedef unsigned char uint8_t;
typedef short int16_t;
typedef unsigned short uint16_t;
typedef int int32_t;
typedef unsigned int uint32_t;
typedef long int64_t;
typedef unsigned long uint64_t;
typedef int64_t ssize_t;
typedef uint64_t size_t;
#else
#include <stdint.h>
#endif // IREE_AMDGPU_TARGET_DEVICE
//===----------------------------------------------------------------------===//
// Alignment
//===----------------------------------------------------------------------===//
#define IREE_AMDGPU_MIN(a, b) (((a) < (b)) ? (a) : (b))
#define IREE_AMDGPU_MAX(a, b) (((a) > (b)) ? (a) : (b))
#define IREE_AMDGPU_CEIL_DIV(lhs, rhs) (((lhs) + (rhs) - 1) / (rhs))
//===----------------------------------------------------------------------===//
// OpenCL Atomics
//===----------------------------------------------------------------------===//
#define iree_hal_amdgpu_device_destructive_interference_size 64
#define iree_hal_amdgpu_device_constructive_interference_size 64
#if defined(IREE_AMDGPU_TARGET_DEVICE)
// #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
// #pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
#define memory_scope_all_svm_devices __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES
#define memory_scope_all_devices memory_scope_all_svm_devices
typedef uint32_t iree_hal_amdgpu_device_memory_order_t;
#define iree_hal_amdgpu_device_memory_order_relaxed __ATOMIC_RELAXED
#define iree_hal_amdgpu_device_memory_order_acquire __ATOMIC_ACQUIRE
#define iree_hal_amdgpu_device_memory_order_release __ATOMIC_RELEASE
#define iree_hal_amdgpu_device_memory_order_acq_rel __ATOMIC_ACQ_REL
#define iree_hal_amdgpu_device_memory_order_seq_cst __ATOMIC_SEQ_CST
#define iree_hal_amdgpu_device_memory_scope_work_item memory_scope_work_item
#define iree_hal_amdgpu_device_memory_scope_work_group memory_scope_work_group
#define iree_hal_amdgpu_device_memory_scope_device memory_scope_device
#define iree_hal_amdgpu_device_memory_scope_system memory_scope_all_svm_devices
#define iree_hal_amdgpu_device_memory_scope_sub_group memory_scope_sub_group
#define IREE_HAL_AMDGPU_DEVICE_ATOMIC_INIT(object, value) \
atomic_init((object), (value))
typedef _Atomic int32_t iree_hal_amdgpu_device_atomic_int32_t;
typedef _Atomic int64_t iree_hal_amdgpu_device_atomic_int64_t;
typedef _Atomic uint32_t iree_hal_amdgpu_device_atomic_uint32_t;
typedef _Atomic uint64_t iree_hal_amdgpu_device_atomic_uint64_t;
#define iree_hal_amdgpu_device_atomic_load_explicit(object, memory_order, \
memory_scope) \
__opencl_atomic_load((object), (memory_order), (memory_scope))
#define iree_hal_amdgpu_device_atomic_store_explicit( \
object, desired, memory_order, memory_scope) \
__opencl_atomic_store((object), (desired), (memory_order), (memory_scope))
#define iree_hal_amdgpu_device_atomic_fetch_add_explicit( \
object, operand, memory_order, memory_scope) \
__opencl_atomic_fetch_add((object), (operand), (memory_order), (memory_scope))
#define iree_hal_amdgpu_device_atomic_fetch_sub_explicit( \
object, operand, memory_order, memory_scope) \
__opencl_atomic_fetch_sub((object), (operand), (memory_order), (memory_scope))
#define iree_hal_amdgpu_device_atomic_fetch_and_explicit( \
object, operand, memory_order, memory_scope) \
__opencl_atomic_fetch_and((object), (operand), (memory_order), (memory_scope))
#define iree_hal_amdgpu_device_atomic_fetch_or_explicit( \
object, operand, memory_order, memory_scope) \
__opencl_atomic_fetch_or((object), (operand), (memory_order), (memory_scope))
#define iree_hal_amdgpu_device_atomic_fetch_xor_explicit( \
object, operand, memory_order, memory_scope) \
__opencl_atomic_fetch_xor((object), (operand), (memory_order), (memory_scope))
#define iree_hal_amdgpu_device_atomic_exchange_explicit( \
object, desired, memory_order, memory_scope) \
__opencl_atomic_exchange((object), (desired), (memory_order), (memory_scope))
#define iree_hal_amdgpu_device_atomic_compare_exchange_weak_explicit( \
object, expected, desired, memory_order_success, memory_order_fail, \
memory_scope) \
__opencl_atomic_compare_exchange_weak((object), (expected), (desired), \
(memory_order_success), \
(memory_order_fail), (memory_scope))
#define iree_hal_amdgpu_device_atomic_compare_exchange_strong_explicit( \
object, expected, desired, memory_order_success, memory_order_fail, \
memory_scope) \
__opencl_atomic_compare_exchange_strong((object), (expected), (desired), \
(memory_order_success), \
(memory_order_fail), (memory_scope))
#else
#define IREE_HAL_AMDGPU_DEVICE_ATOMIC_INIT(object, value) \
*(object) = IREE_ATOMIC_VAR_INIT(value)
typedef iree_atomic_int32_t iree_hal_amdgpu_device_atomic_int32_t;
typedef iree_atomic_int64_t iree_hal_amdgpu_device_atomic_int64_t;
typedef iree_atomic_uint32_t iree_hal_amdgpu_device_atomic_uint32_t;
typedef iree_atomic_uint64_t iree_hal_amdgpu_device_atomic_uint64_t;
#endif // IREE_AMDGPU_TARGET_DEVICE
//===----------------------------------------------------------------------===//
// OpenCL Dispatch ABI
//===----------------------------------------------------------------------===//
typedef struct iree_hsa_signal_s {
// Opaque handle. The value 0 is reserved.
uint64_t handle;
} iree_hsa_signal_t;
#define iree_hsa_signal_null() \
(iree_hsa_signal_t) { 0 }
#define iree_hsa_signal_is_null(signal) ((signal).handle == 0)
typedef enum {
HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS = 0
} hsa_kernel_dispatch_packet_setup_t;
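// The grid dimension count lives in the low bits of the packet setup word
// (a 2-bit field starting at bit 0 per the HSA AQL packet format).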
typedef struct hsa_kernel_dispatch_packet_s {
uint16_t header;
uint16_t setup;
uint16_t workgroup_size_x;
uint16_t workgroup_size_y;
uint16_t workgroup_size_z;
uint16_t reserved0;
uint32_t grid_size_x;
uint32_t grid_size_y;
uint32_t grid_size_z;
uint32_t private_segment_size;
uint32_t group_segment_size;
uint64_t kernel_object;
IREE_AMDGPU_GLOBAL void* kernarg_address;
uint64_t reserved2;
iree_hsa_signal_t completion_signal;
} hsa_kernel_dispatch_packet_t;
typedef enum {
HSA_PACKET_TYPE_VENDOR_SPECIFIC = 0,
HSA_PACKET_TYPE_INVALID = 1,
HSA_PACKET_TYPE_KERNEL_DISPATCH = 2,
HSA_PACKET_TYPE_BARRIER_AND = 3,
HSA_PACKET_TYPE_AGENT_DISPATCH = 4,
HSA_PACKET_TYPE_BARRIER_OR = 5
} hsa_packet_type_t;
typedef enum {
HSA_PACKET_HEADER_TYPE = 0,
HSA_PACKET_HEADER_BARRIER = 8,
HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE = 9,
HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE = 11,
} hsa_packet_header_t;
typedef enum {
HSA_FENCE_SCOPE_NONE = 0,
HSA_FENCE_SCOPE_AGENT = 1,
HSA_FENCE_SCOPE_SYSTEM = 2
} hsa_fence_scope_t;
typedef struct hsa_agent_dispatch_packet_s {
uint16_t header;
uint16_t type;
uint32_t reserved0;
void* return_address;
uint64_t arg[4];
uint64_t reserved2;
iree_hsa_signal_t completion_signal;
} iree_hsa_agent_dispatch_packet_t;
// AMD Signal Kind Enumeration Values.
enum iree_amd_signal_kind_t {
IREE_AMD_SIGNAL_KIND_INVALID = 0,
IREE_AMD_SIGNAL_KIND_USER = 1,
IREE_AMD_SIGNAL_KIND_DOORBELL = -1,
IREE_AMD_SIGNAL_KIND_LEGACY_DOORBELL = -2
};
typedef int64_t iree_amd_signal_kind64_t;
typedef struct IREE_AMDGPU_ALIGNAS(64) iree_amd_signal_s {
iree_amd_signal_kind64_t kind;
union {
volatile int64_t value;
IREE_AMDGPU_GLOBAL volatile uint32_t* legacy_hardware_doorbell_ptr;
IREE_AMDGPU_GLOBAL volatile uint64_t* hardware_doorbell_ptr;
};
uint64_t event_mailbox_ptr;
uint32_t event_id;
uint32_t reserved1;
uint64_t start_ts;
uint64_t end_ts;
union {
IREE_AMDGPU_GLOBAL /*iree_amd_queue_t*/ void* queue_ptr;
uint64_t reserved2;
};
uint32_t reserved3[2];
} iree_amd_signal_t;
#if defined(IREE_AMDGPU_TARGET_DEVICE)
extern IREE_AMDGPU_ATTRIBUTE_CONST size_t __ockl_get_global_id(unsigned dim);
extern IREE_AMDGPU_ATTRIBUTE_CONST size_t __ockl_get_local_id(unsigned dim);
extern IREE_AMDGPU_ATTRIBUTE_CONST size_t __ockl_get_group_id(unsigned dim);
extern IREE_AMDGPU_ATTRIBUTE_CONST size_t __ockl_get_local_size(unsigned dim);
extern IREE_AMDGPU_ATTRIBUTE_CONST size_t __ockl_get_num_groups(unsigned dim);
// #define iree_hal_amdgpu_device_global_id_x() __ockl_get_global_id(0)
#define iree_hal_amdgpu_device_global_id_y() __ockl_get_global_id(1)
#define iree_hal_amdgpu_device_global_id_z() __ockl_get_global_id(2)
#define iree_hal_amdgpu_device_group_id_x() __ockl_get_group_id(0)
#define iree_hal_amdgpu_device_group_id_y() __ockl_get_group_id(1)
#define iree_hal_amdgpu_device_group_id_z() __ockl_get_group_id(2)
#define iree_hal_amdgpu_device_group_count_x() __ockl_get_num_groups(0)
#define iree_hal_amdgpu_device_group_count_y() __ockl_get_num_groups(1)
#define iree_hal_amdgpu_device_group_count_z() __ockl_get_num_groups(2)
#define iree_hal_amdgpu_device_local_id_x() __ockl_get_local_id(0)
#define iree_hal_amdgpu_device_local_id_y() __ockl_get_local_id(1)
#define iree_hal_amdgpu_device_local_id_z() __ockl_get_local_id(2)
#define iree_hal_amdgpu_device_workgroup_size_x() __ockl_get_local_size(0)
#define iree_hal_amdgpu_device_workgroup_size_y() __ockl_get_local_size(1)
#define iree_hal_amdgpu_device_workgroup_size_z() __ockl_get_local_size(2)
extern IREE_AMDGPU_ATTRIBUTE_CONST IREE_AMDGPU_CONSTANT void*
iree_amdgcn_implicitarg_ptr(void) __asm("llvm.amdgcn.implicitarg.ptr");
// Returns a pointer to the hsa_kernel_dispatch_packet_t being executed.
#define iree_amdgcn_dispatch_ptr __builtin_amdgcn_dispatch_ptr
// __ockl_get_global_id(0) / get_global_id_x using OLD_ABI
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE size_t
iree_hal_amdgpu_device_global_id_x(void) {
const uint32_t local_id = __builtin_amdgcn_workitem_id_x();
const uint32_t group_id = __builtin_amdgcn_workgroup_id_x();
const uint32_t group_size =
((hsa_kernel_dispatch_packet_t*)iree_amdgcn_dispatch_ptr())
->workgroup_size_x;
return (group_id * group_size + local_id);
}
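// NOTE: unlike __ockl_get_global_id(0) this presumably skips adding the hidden
// global offset from the implicit kernargs; it assumes global_offset == 0,
// which is what the dispatch code in this prototype always writes.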
#endif // IREE_AMDGPU_TARGET_DEVICE
//===----------------------------------------------------------------------===//
// Sleep
//===----------------------------------------------------------------------===//
#if defined(IREE_AMDGPU_TARGET_DEVICE)
extern void __builtin_amdgcn_s_sleep(int);
// Sleeps the current thread for some "short" amount of time.
// This maps to the S_SLEEP instruction that varies on different architectures
// in how long it can delay execution. The behavior cannot be mapped to wall
// time as it suspends for 64*arg + 1-64 clocks but archs have different limits,
// clock speed can vary over the course of execution, etc. This is mostly only
// useful as a "yield for a few instructions to stop hammering a memory
// location" primitive.
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE void
iree_hal_amdgpu_device_yield(void) {
__builtin_amdgcn_s_sleep(1);
}
#endif // IREE_AMDGPU_TARGET_DEVICE
#endif // IREE_HAL_DRIVERS_AMDGPU_DEVICE_SUPPORT_OPENCL_H_
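// Device-library control variables, normally provided by ROCm's oclc_*.bc
// bitcode; defined inline here since the kernels are presumably built without
// the device libs (-nogpulib).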
// const IREE_AMDGPU_CONSTANT int __oclc_ABI_version = 500;
const IREE_AMDGPU_CONSTANT bool __oclc_unsafe_math_opt = false;
const IREE_AMDGPU_CONSTANT bool __oclc_daz_opt = false;
const IREE_AMDGPU_CONSTANT bool __oclc_correctly_rounded_sqrt32 = true;
const IREE_AMDGPU_CONSTANT bool __oclc_finite_only_opt = false;
const IREE_AMDGPU_CONSTANT bool __oclc_wavefrontsize64 =
__AMDGCN_WAVEFRONT_SIZE__ == 64 ? 1 : 0;
#if defined(__gfx700__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 7000;
#elif defined(__gfx701__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 7001;
#elif defined(__gfx702__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 7002;
#elif defined(__gfx703__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 7003;
#elif defined(__gfx704__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 7004;
#elif defined(__gfx705__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 7005;
#elif defined(__gfx801__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 8001;
#elif defined(__gfx802__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 8002;
#elif defined(__gfx803__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 8003;
#elif defined(__gfx805__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 8005;
#elif defined(__gfx810__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 8100;
#elif defined(__gfx900__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 9000;
#elif defined(__gfx902__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 9002;
#elif defined(__gfx904__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 9004;
#elif defined(__gfx906__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 9006;
#elif defined(__gfx908__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 9008;
#elif defined(__gfx909__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 9009;
#elif defined(__gfx90a__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 9010;
#elif defined(__gfx90c__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 9012;
#elif defined(__gfx940__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 9400;
#elif defined(__gfx941__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 9401;
#elif defined(__gfx942__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 9402;
#elif defined(__gfx1010__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 10100;
#elif defined(__gfx1011__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 10101;
#elif defined(__gfx1012__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 10102;
#elif defined(__gfx1013__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 10103;
#elif defined(__gfx1030__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 10300;
#elif defined(__gfx1031__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 10301;
#elif defined(__gfx1032__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 10302;
#elif defined(__gfx1033__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 10303;
#elif defined(__gfx1034__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 10304;
#elif defined(__gfx1035__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 10305;
#elif defined(__gfx1036__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 10306;
#elif defined(__gfx1100__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 11000;
#elif defined(__gfx1101__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 11001;
#elif defined(__gfx1102__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 11002;
#elif defined(__gfx1103__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 11003;
#elif defined(__gfx1150__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 11500;
#elif defined(__gfx1151__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 11501;
#elif defined(__gfx1200__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 12000;
#elif defined(__gfx1201__)
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 12001;
#else
#error "Unknown AMDGPU architecture"
#endif
__kernel void add_one(
uint32_t n, IREE_AMDGPU_GLOBAL uint32_t* IREE_AMDGPU_RESTRICT buffer) {
const size_t idx = iree_hal_amdgpu_device_global_id_x();
if (idx < n) {
buffer[idx] += 1;
}
}
typedef struct iree_tracing_location_s {
IREE_AMDGPU_CONSTANT const char* name;
IREE_AMDGPU_CONSTANT const char* function;
IREE_AMDGPU_CONSTANT const char* file;
uint32_t line;
uint32_t color;
} iree_tracing_location_t;
__kernel void add_one_with_timestamp(
uint32_t n, IREE_AMDGPU_GLOBAL uint32_t* IREE_AMDGPU_RESTRICT buffer) {
const size_t idx = iree_hal_amdgpu_device_global_id_x();
if (idx == 0) {
// correlates with signal start_ts/end_ts
// can be converted with hsa_amd_profiling_convert_tick_to_system_domain
uint64_t t = __builtin_readsteadycounter();
buffer[0] = (uint32_t)(t >> 32);
buffer[1] = (uint32_t)t;
} else if (idx == 2) {
static const IREE_AMDGPU_CONSTANT iree_tracing_location_t
__tracy_source_location = {
"helloworld", __FUNCTION__, __FILE__,
(uint32_t)__LINE__, 0xCDCDCDCD,
};
uint64_t t = (uint64_t)&__tracy_source_location;
buffer[2] = (uint32_t)(t >> 32);
buffer[3] = (uint32_t)t;
} else if (idx > 3) {
buffer[idx] += 1;
}
}
__kernel void mul_x(uint32_t x, uint32_t n,
IREE_AMDGPU_GLOBAL uint32_t* IREE_AMDGPU_RESTRICT buffer) {
const size_t idx = iree_hal_amdgpu_device_global_id_x();
if (idx < n) {
buffer[idx] *= x;
}
}
//===----------------------------------------------------------------------===//
// Device Library Externs
//===----------------------------------------------------------------------===//
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE void
iree_hsa_signal_update_mailbox(
const IREE_AMDGPU_GLOBAL iree_amd_signal_t* IREE_AMDGPU_RESTRICT signal) {
IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_uint64_t* mailbox =
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_uint64_t*)
signal->event_mailbox_ptr;
if (mailbox) {
const uint32_t event_id = signal->event_id;
iree_hal_amdgpu_device_atomic_store_explicit(
mailbox, event_id, iree_hal_amdgpu_device_memory_order_release,
iree_hal_amdgpu_device_memory_scope_system);
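    // s_sendmsg(MSG_INTERRUPT) with the low 8 bits of the event id so any
    // host-side waiters blocked on the signal's event wake up (mirrors the
    // ROCm ockl hsaqs implementation).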
__builtin_amdgcn_s_sendmsg(1 | (0 << 4),
__builtin_amdgcn_readfirstlane(event_id) & 0xff);
}
}
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE int64_t
iree_hsa_signal_load(const iree_hsa_signal_t signal,
iree_hal_amdgpu_device_memory_order_t memory_order) {
const IREE_AMDGPU_GLOBAL iree_amd_signal_t* s =
(const IREE_AMDGPU_GLOBAL iree_amd_signal_t*)signal.handle;
return iree_hal_amdgpu_device_atomic_load_explicit(
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_int64_t*)&s->value,
memory_order, iree_hal_amdgpu_device_memory_scope_system);
}
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE void iree_hsa_signal_add(
iree_hsa_signal_t signal, int64_t value,
iree_hal_amdgpu_device_memory_order_t memory_order) {
IREE_AMDGPU_GLOBAL iree_amd_signal_t* s =
(IREE_AMDGPU_GLOBAL iree_amd_signal_t*)signal.handle;
iree_hal_amdgpu_device_atomic_fetch_add_explicit(
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_int64_t*)&s->value,
value, memory_order, iree_hal_amdgpu_device_memory_scope_system);
iree_hsa_signal_update_mailbox(s);
}
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE void iree_hsa_signal_and(
iree_hsa_signal_t signal, int64_t value,
iree_hal_amdgpu_device_memory_order_t memory_order) {
IREE_AMDGPU_GLOBAL iree_amd_signal_t* s =
(IREE_AMDGPU_GLOBAL iree_amd_signal_t*)signal.handle;
iree_hal_amdgpu_device_atomic_fetch_and_explicit(
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_int64_t*)&s->value,
value, memory_order, iree_hal_amdgpu_device_memory_scope_system);
iree_hsa_signal_update_mailbox(s);
}
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE void iree_hsa_signal_or(
iree_hsa_signal_t signal, int64_t value,
iree_hal_amdgpu_device_memory_order_t memory_order) {
IREE_AMDGPU_GLOBAL iree_amd_signal_t* s =
(IREE_AMDGPU_GLOBAL iree_amd_signal_t*)signal.handle;
iree_hal_amdgpu_device_atomic_fetch_or_explicit(
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_int64_t*)&s->value,
value, memory_order, iree_hal_amdgpu_device_memory_scope_system);
iree_hsa_signal_update_mailbox(s);
}
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE void iree_hsa_signal_xor(
iree_hsa_signal_t signal, int64_t value,
iree_hal_amdgpu_device_memory_order_t memory_order) {
IREE_AMDGPU_GLOBAL iree_amd_signal_t* s =
(IREE_AMDGPU_GLOBAL iree_amd_signal_t*)signal.handle;
iree_hal_amdgpu_device_atomic_fetch_xor_explicit(
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_int64_t*)&s->value,
value, memory_order, iree_hal_amdgpu_device_memory_scope_system);
iree_hsa_signal_update_mailbox(s);
}
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE int64_t
iree_hsa_signal_exchange(iree_hsa_signal_t signal, int64_t value,
iree_hal_amdgpu_device_memory_order_t memory_order) {
IREE_AMDGPU_GLOBAL iree_amd_signal_t* s =
(IREE_AMDGPU_GLOBAL iree_amd_signal_t*)signal.handle;
int64_t existing = iree_hal_amdgpu_device_atomic_exchange_explicit(
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_int64_t*)&s->value,
value, memory_order, iree_hal_amdgpu_device_memory_scope_system);
iree_hsa_signal_update_mailbox(s);
return existing;
}
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE void iree_hsa_signal_subtract(
iree_hsa_signal_t signal, int64_t value,
iree_hal_amdgpu_device_memory_order_t memory_order) {
IREE_AMDGPU_GLOBAL iree_amd_signal_t* s =
(IREE_AMDGPU_GLOBAL iree_amd_signal_t*)signal.handle;
iree_hal_amdgpu_device_atomic_fetch_sub_explicit(
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_int64_t*)&s->value,
value, memory_order, iree_hal_amdgpu_device_memory_scope_system);
iree_hsa_signal_update_mailbox(s);
}
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE int64_t
iree_hsa_signal_cas(iree_hsa_signal_t signal, int64_t expected, int64_t value,
iree_hal_amdgpu_device_memory_order_t memory_order) {
IREE_AMDGPU_GLOBAL iree_amd_signal_t* s =
(IREE_AMDGPU_GLOBAL iree_amd_signal_t*)signal.handle;
int64_t existing = expected;
if (iree_hal_amdgpu_device_atomic_compare_exchange_strong_explicit(
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_int64_t*)&s->value,
&existing, value, memory_order,
iree_hal_amdgpu_device_memory_order_relaxed,
iree_hal_amdgpu_device_memory_scope_system)) {
iree_hsa_signal_update_mailbox(s);
}
return existing;
}
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE void iree_hsa_signal_store(
iree_hsa_signal_t signal, int64_t value,
iree_hal_amdgpu_device_memory_order_t memory_order) {
IREE_AMDGPU_GLOBAL iree_amd_signal_t* s =
(IREE_AMDGPU_GLOBAL iree_amd_signal_t*)signal.handle;
if (s->kind == IREE_AMD_SIGNAL_KIND_USER) {
// User signal may need a mailbox poke.
iree_hal_amdgpu_device_atomic_store_explicit(
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_int64_t*)&s->value,
value, memory_order, iree_hal_amdgpu_device_memory_scope_system);
iree_hsa_signal_update_mailbox(s);
} else {
// Hardware doorbell supports AQL semantics.
    // NOTE: this requires __oclc_ISA_version >= 9000; older hardware doesn't
    // support 64-bit doorbell stores and needs the legacy doorbell emulation
    // path.
iree_hal_amdgpu_device_atomic_store_explicit(
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_uint64_t*)
s->hardware_doorbell_ptr,
(uint64_t)value, iree_hal_amdgpu_device_memory_order_release,
iree_hal_amdgpu_device_memory_scope_system);
}
}
//===----------------------------------------------------------------------===//
// HSA/AMDGPU AQL Queue
//===----------------------------------------------------------------------===//
typedef enum {
// Queue supports multiple producers.
IREE_HSA_QUEUE_TYPE_MULTI = 0,
// Queue only supports a single producer.
IREE_HSA_QUEUE_TYPE_SINGLE = 1
} iree_hsa_queue_type_t;
typedef struct iree_hsa_queue_s {
// Queue type.
iree_hsa_queue_type_t type;
// Queue features mask. This is a bit-field of hsa_queue_feature_t
// values. Applications should ignore any unknown set bits.
uint32_t features;
IREE_AMDGPU_GLOBAL void* base_address;
// Signal object used by the application to indicate the ID of a packet that
// is ready to be processed. The HSA runtime manages the doorbell signal. If
// the application tries to replace or destroy this signal, the behavior is
// undefined.
//
// If type is HSA_QUEUE_TYPE_SINGLE the doorbell signal value must be
// updated in a monotonically increasing fashion. If type is
// HSA_QUEUE_TYPE_MULTI the doorbell signal value can be updated with any
// value.
iree_hsa_signal_t doorbell_signal;
// Maximum number of packets the queue can hold. Must be a power of 2.
uint32_t size;
// Reserved. Must be 0.
uint32_t reserved1;
// Queue identifier, which is unique over the lifetime of the application.
uint64_t id;
} iree_hsa_queue_t;
#define IREE_AMD_HSA_BITS_CREATE_ENUM_ENTRIES(name, shift, width) \
name##_SHIFT = (shift), name##_WIDTH = (width), \
name = (((1 << (width)) - 1) << (shift))
enum iree_amd_queue_properties_t {
IREE_AMD_HSA_BITS_CREATE_ENUM_ENTRIES(
IREE_AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER, 0, 1),
// All devices we care about are 64-bit.
IREE_AMD_HSA_BITS_CREATE_ENUM_ENTRIES(IREE_AMD_QUEUE_PROPERTIES_IS_PTR64, 1,
1),
IREE_AMD_HSA_BITS_CREATE_ENUM_ENTRIES(
IREE_AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS, 2, 1),
// Timestamps will be stored on signals (start_ts/end_ts).
IREE_AMD_HSA_BITS_CREATE_ENUM_ENTRIES(
IREE_AMD_QUEUE_PROPERTIES_ENABLE_PROFILING, 3, 1),
IREE_AMD_HSA_BITS_CREATE_ENUM_ENTRIES(IREE_AMD_QUEUE_PROPERTIES_RESERVED1, 4,
28)
};
typedef uint32_t iree_amd_queue_properties32_t;
// An AQL packet queue.
// We generally treat these as opaque except for if we need to read queue
// properties to check modes - otherwise we just treat any queue handle as
// an iree_hsa_queue_t.
typedef struct IREE_AMDGPU_ALIGNAS(64) iree_amd_queue_s {
iree_hsa_queue_t hsa_queue;
uint32_t reserved1[4];
volatile uint64_t write_dispatch_id;
uint32_t group_segment_aperture_base_hi;
uint32_t private_segment_aperture_base_hi;
uint32_t max_cu_id;
uint32_t max_wave_id;
volatile uint64_t max_legacy_doorbell_dispatch_id_plus_1;
volatile uint32_t legacy_doorbell_lock;
uint32_t reserved2[9];
volatile uint64_t read_dispatch_id;
uint32_t read_dispatch_id_field_base_byte_offset;
uint32_t compute_tmpring_size;
uint32_t scratch_resource_descriptor[4];
uint64_t scratch_backing_memory_location;
uint64_t scratch_backing_memory_byte_size;
uint32_t scratch_workitem_byte_size;
iree_amd_queue_properties32_t queue_properties;
uint32_t reserved3[2];
iree_hsa_signal_t queue_inactive_signal;
uint32_t reserved4[14];
} iree_amd_queue_t;
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE uint64_t
iree_hsa_queue_load_read_index(
const IREE_AMDGPU_GLOBAL iree_hsa_queue_t* IREE_AMDGPU_RESTRICT queue,
iree_hal_amdgpu_device_memory_order_t memory_order) {
const IREE_AMDGPU_GLOBAL iree_amd_queue_t* q =
(const IREE_AMDGPU_GLOBAL iree_amd_queue_t*)queue;
return iree_hal_amdgpu_device_atomic_load_explicit(
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_uint64_t*)&q
->read_dispatch_id,
memory_order, iree_hal_amdgpu_device_memory_scope_system);
}
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE uint64_t
iree_hsa_queue_load_write_index(
const IREE_AMDGPU_GLOBAL iree_hsa_queue_t* IREE_AMDGPU_RESTRICT queue,
iree_hal_amdgpu_device_memory_order_t memory_order) {
const IREE_AMDGPU_GLOBAL iree_amd_queue_t* q =
(const IREE_AMDGPU_GLOBAL iree_amd_queue_t*)queue;
return iree_hal_amdgpu_device_atomic_load_explicit(
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_uint64_t*)&q
->write_dispatch_id,
memory_order, iree_hal_amdgpu_device_memory_scope_system);
}
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE uint64_t
iree_hsa_queue_add_write_index(
IREE_AMDGPU_GLOBAL iree_hsa_queue_t* IREE_AMDGPU_RESTRICT queue,
uint64_t value, iree_hal_amdgpu_device_memory_order_t memory_order) {
IREE_AMDGPU_GLOBAL iree_amd_queue_t* q =
(IREE_AMDGPU_GLOBAL iree_amd_queue_t*)queue;
return iree_hal_amdgpu_device_atomic_fetch_add_explicit(
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_uint64_t*)&q
->write_dispatch_id,
value, memory_order, iree_hal_amdgpu_device_memory_scope_system);
}
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE uint64_t
iree_hsa_queue_cas_write_index(
IREE_AMDGPU_GLOBAL iree_hsa_queue_t* IREE_AMDGPU_RESTRICT queue,
uint64_t expected, uint64_t value,
iree_hal_amdgpu_device_memory_order_t memory_order) {
IREE_AMDGPU_GLOBAL iree_amd_queue_t* q =
(IREE_AMDGPU_GLOBAL iree_amd_queue_t*)queue;
uint64_t e = expected;
iree_hal_amdgpu_device_atomic_compare_exchange_strong_explicit(
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_uint64_t*)&q
->write_dispatch_id,
&e, value, memory_order, iree_hal_amdgpu_device_memory_order_relaxed,
iree_hal_amdgpu_device_memory_scope_system);
return e;
}
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE void
iree_hsa_queue_store_write_index(
IREE_AMDGPU_GLOBAL iree_hsa_queue_t* IREE_AMDGPU_RESTRICT queue,
uint64_t value, iree_hal_amdgpu_device_memory_order_t memory_order) {
IREE_AMDGPU_GLOBAL iree_amd_queue_t* q =
(IREE_AMDGPU_GLOBAL iree_amd_queue_t*)queue;
iree_hal_amdgpu_device_atomic_store_explicit(
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_uint64_t*)&q
->write_dispatch_id,
value, memory_order, iree_hal_amdgpu_device_memory_scope_system);
}
void iree_hal_amdgpu_device_host_enqueue(
IREE_AMDGPU_GLOBAL iree_hsa_queue_t* IREE_AMDGPU_RESTRICT queue,
uint16_t type, uint64_t return_address, uint64_t arg0, uint64_t arg1,
uint64_t arg2, uint64_t arg3, iree_hsa_signal_t completion_signal) {
// Reserve a packet write index and wait for it to become available in cases
// where the queue is exhausted.
uint64_t packet_id = iree_hsa_queue_add_write_index(
queue, 1u, iree_hal_amdgpu_device_memory_order_relaxed);
while (packet_id - iree_hsa_queue_load_read_index(
queue, iree_hal_amdgpu_device_memory_order_acquire) >=
queue->size) {
iree_hal_amdgpu_device_yield(); // spinning
}
const uint64_t queue_mask = queue->size - 1; // power of two
IREE_AMDGPU_GLOBAL iree_hsa_agent_dispatch_packet_t* agent_packet =
queue->base_address + (packet_id & queue_mask) * 64;
// Populate all of the packet besides the header.
agent_packet->reserved0 = 0;
agent_packet->return_address = (void*)return_address;
agent_packet->arg[0] = arg0;
agent_packet->arg[1] = arg1;
agent_packet->arg[2] = arg2;
agent_packet->arg[3] = arg3;
agent_packet->reserved2 = 0;
agent_packet->completion_signal = completion_signal;
// Populate the header and release the packet to the queue.
// Note that we need to release to all devices so that the host can see it.
uint16_t header = HSA_PACKET_TYPE_AGENT_DISPATCH << HSA_PACKET_HEADER_TYPE;
header |= 0 << HSA_PACKET_HEADER_BARRIER;
header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE;
header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE;
uint32_t header_type = header | (type << 16);
iree_hal_amdgpu_device_atomic_store_explicit(
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_uint32_t*)agent_packet,
header_type, iree_hal_amdgpu_device_memory_order_release,
iree_hal_amdgpu_device_memory_scope_system);
// Signal the queue doorbell.
// https://sourcegraph.com/github.com/ROCm/rocMLIR/-/blob/external/llvm-project/amd/device-libs/ockl/src/hsaqs.cl?L69
iree_hsa_signal_store(queue->doorbell_signal, packet_id,
iree_hal_amdgpu_device_memory_order_relaxed);
}
__kernel void issue_host_call(
IREE_AMDGPU_GLOBAL iree_hsa_queue_t* IREE_AMDGPU_RESTRICT queue,
iree_hsa_signal_t completion_signal, uint32_t arg) {
iree_hal_amdgpu_device_host_enqueue(queue, 123, 0x100, arg, 0x201, 0x202,
0x203, iree_hsa_signal_null());
iree_hal_amdgpu_device_host_enqueue(queue, 456, 0x100, arg, 0x201, 0x202,
0x203, iree_hsa_signal_null());
iree_hal_amdgpu_device_host_enqueue(queue, 789, 0x100, arg, 0x201, 0x202,
0x203, completion_signal);
}
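// Rounds |value| up to the next multiple of |alignment|, which must be a
// power of two.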
static inline size_t iree_host_align(size_t value, size_t alignment) {
return (value + (alignment - 1)) & ~(alignment - 1);
}
typedef struct implicit_kernargs_t {
uint32_t block_count[3]; // + 0/4/8
uint16_t group_size[3]; // + 12/14/16
uint16_t remainder[3]; // + 18/20/22
uint64_t reserved0; // + 24 hidden_tool_correlation_id
uint64_t reserved1; // + 32
uint64_t global_offset[3]; // + 40/48/56
uint16_t grid_dims; // + 64
} implicit_kernargs_t;
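// Same hidden-kernarg layout as the host-side implicit_kernargs_t above.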
__kernel void issue_dispatch(
IREE_AMDGPU_GLOBAL iree_hsa_queue_t* IREE_AMDGPU_RESTRICT queue,
uint64_t mul_x_object, uint32_t mul_x_private_size,
uint32_t mul_x_group_size, iree_hsa_signal_t completion_signal,
IREE_AMDGPU_GLOBAL void* buffer,
IREE_AMDGPU_GLOBAL void* IREE_AMDGPU_RESTRICT kernarg_storage,
uint32_t element_count, uint32_t mul_by) {
typedef struct mul_x_args_t {
uint32_t x;
uint32_t n;
IREE_AMDGPU_GLOBAL void* buffer;
} mul_x_args_t;
IREE_AMDGPU_GLOBAL mul_x_args_t* mul_x_kernargs =
(IREE_AMDGPU_GLOBAL mul_x_args_t*)kernarg_storage;
mul_x_kernargs->x = mul_by;
mul_x_kernargs->n = element_count;
mul_x_kernargs->buffer = buffer;
uint32_t grid_size[3] = {element_count, 1, 1};
uint16_t workgroup_size[3] = {32, 1, 1};
IREE_AMDGPU_GLOBAL implicit_kernargs_t* implicit_kernargs =
(IREE_AMDGPU_GLOBAL implicit_kernargs_t*)((uint8_t*)kernarg_storage +
iree_host_align(
sizeof(mul_x_args_t), 8));
implicit_kernargs->block_count[0] = grid_size[0] / workgroup_size[0];
implicit_kernargs->block_count[1] = grid_size[1] / workgroup_size[1];
implicit_kernargs->block_count[2] = grid_size[2] / workgroup_size[2];
implicit_kernargs->group_size[0] = workgroup_size[0];
implicit_kernargs->group_size[1] = workgroup_size[1];
implicit_kernargs->group_size[2] = workgroup_size[2];
implicit_kernargs->remainder[0] =
(uint16_t)(grid_size[0] % workgroup_size[0]);
implicit_kernargs->remainder[1] =
(uint16_t)(grid_size[1] % workgroup_size[1]);
implicit_kernargs->remainder[2] =
(uint16_t)(grid_size[2] % workgroup_size[2]);
implicit_kernargs->reserved0 = 0;
implicit_kernargs->reserved1 = 0;
implicit_kernargs->global_offset[0] = 0; // newOffset[0];
implicit_kernargs->global_offset[1] = 0; // newOffset[1];
implicit_kernargs->global_offset[2] = 0; // newOffset[2];
implicit_kernargs->grid_dims = 3;
// DO NOT SUBMIT should do nontemporal kernarg update?
hsa_kernel_dispatch_packet_t packet;
packet.header = HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE;
packet.setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
packet.workgroup_size_x = workgroup_size[0];
packet.workgroup_size_y = workgroup_size[1];
packet.workgroup_size_z = workgroup_size[2];
packet.reserved0 = 0;
packet.grid_size_x = grid_size[0];
packet.grid_size_y = grid_size[1];
packet.grid_size_z = grid_size[2];
packet.private_segment_size = mul_x_private_size;
packet.group_segment_size = mul_x_group_size;
packet.kernel_object = mul_x_object;
packet.kernarg_address = kernarg_storage;
packet.reserved2 = 0;
packet.completion_signal = completion_signal;
uint16_t packet_header =
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
(1 << HSA_PACKET_HEADER_BARRIER) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
uint32_t packet_header_setup = packet_header | (packet.setup << 16);
uint64_t packet_id = iree_hsa_queue_add_write_index(
queue, 1, iree_hal_amdgpu_device_memory_order_release);
while (
(packet_id - iree_hsa_queue_load_read_index(
queue, iree_hal_amdgpu_device_memory_order_acquire)) >=
queue->size) {
iree_hal_amdgpu_device_yield();
}
  IREE_AMDGPU_GLOBAL hsa_kernel_dispatch_packet_t* packet_ptr =
      (IREE_AMDGPU_GLOBAL hsa_kernel_dispatch_packet_t*)
          ((IREE_AMDGPU_GLOBAL uint8_t*)queue->base_address +
           (packet_id & (queue->size - 1)) * 64);
// memcpy(packet_ptr, &packet, sizeof(packet));
*packet_ptr = packet;
iree_hal_amdgpu_device_atomic_store_explicit(
(volatile iree_hal_amdgpu_device_atomic_uint32_t*)packet_ptr,
packet_header_setup, iree_hal_amdgpu_device_memory_order_release,
iree_hal_amdgpu_device_memory_scope_system);
  // The doorbell value is ignored for HSA_QUEUE_TYPE_MULTI queues.
iree_hsa_signal_store(queue->doorbell_signal, packet_id,
iree_hal_amdgpu_device_memory_order_relaxed);
}
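// VS Code config snippets: an lldb-dap launch entry (launch.json) plus the
// matching cppbuild task (tasks.json) used to build and run the host-side HSA
// test programs above; library/include paths are machine-specific.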
{
"type": "lldb-dap",
"request": "launch",
"name": "run-active-hsa-c-file",
"program": "${command:cmake.buildDirectory}/vscode/${fileBasenameNoExtension}",
"args": [
],
"env": [
"LD_LIBRARY_PATH=/home/nod/src/ROCR-Runtime/build/rocr/lib/",
],
"cwd": "${workspaceRoot}",
"preLaunchTask": "build-active-hsa-c-file",
"presentation": {
"clear": true,
"reveal": "silent",
"revealProblems": "never",
"panel": "shared",
"showReuseMessage": false
},
},
{
"type": "cppbuild",
"label": "build-active-hsa-c-file",
"command": "/usr/bin/clang",
"args": [
"-fcolor-diagnostics",
"-fansi-escape-codes",
"-g",
"${file}",
"-I${workspaceFolder}",
"-I${workspaceFolder}/runtime/src/",
"-I/opt/rocm/include/",
"-L/home/nod/src/ROCR-Runtime/build/rocr/lib/",
"-L/opt/rocm/lib/",
"-lhsa-runtime64",
"-lhsakmt",
"-lamdhip64",
"-ldl",
"-lpthread",
"-lnuma", // mbind
"-lrt",
"-o",
"${command:cmake.buildDirectory}/vscode/${fileBasenameNoExtension}"
],
"options": {
"cwd": "${fileDirname}"
},
"problemMatcher": ["$gcc"],
"group": {
"kind": "build",
"isDefault": true
},
"presentation": {
"clear": true,
"reveal": "silent",
"revealProblems": "never",
"panel": "shared",
"showReuseMessage": false
},
},