hsa prototypes
set -x

~/src/iree-build/llvm-project/bin/clang \
  -x c -std=c23 \
  -target amdgcn-amd-amdhsa -march=gfx1100 \
  -nogpulib \
  -fgpu-rdc \
  -fno-short-wchar \
  -fno-ident \
  -Xclang -finclude-default-header \
  -fvisibility=hidden \
  -O3 \
  kernels.c \
  -c -emit-llvm -o kernels_c.bc
~/src/iree-build/llvm-project/bin/llvm-dis kernels_c.bc

~/src/iree-build/llvm-project/bin/llvm-link \
  -internalize \
  -only-needed \
  kernels_c.bc \
  -o kernels_c_linked.bc
# /opt/rocm/lib/llvm/lib/clang/17/lib/amdgcn/bitcode/ockl.bc \
# ~/src/iree-build/tools/iree_platform_libs/rocm/ockl.bc \
~/src/iree-build/llvm-project/bin/llvm-dis kernels_c_linked.bc
~/src/iree-build/llvm-project/bin/opt -O3 kernels_c_linked.bc -o kernels_c_opt.bc
~/src/iree-build/llvm-project/bin/llvm-dis kernels_c_opt.bc

~/src/iree-build/llvm-project/bin/lld \
  -flavor gnu \
  -m elf64_amdgpu \
  --build-id=none \
  --no-undefined \
  -shared \
  -plugin-opt=mcpu=gfx1100 \
  -plugin-opt=O3 \
  --lto-CGO3 \
  --no-whole-archive \
  --gc-sections \
  --strip-debug \
  --discard-all \
  --discard-locals \
  -o kernels_c.elf \
  kernels_c_linked.bc
#--strip-all \
# -save-temps \

~/src/iree-build/llvm-project/bin/llvm-readelf \
  kernels_c.elf --all \
  >kernels_c.txt

# rm kernels_cl_linked.bc

# ~/src/iree-build/llvm-project/bin/clang \
#   -x hip \
#   --offload-device-only \
#   --offload-arch=gfx1100 \
#   -fuse-cuid=none \
#   -nogpulib \
#   -fgpu-rdc \
#   -fvisibility=hidden \
#   -O3 \
#   kernels.cpp \
#   -c -emit-llvm -o kernels_hip.bc
# ~/src/iree-build/llvm-project/bin/llvm-link \
#   -internalize \
#   -only-needed \
#   kernels_hip.bc \
#   /opt/rocm/lib/llvm/lib/clang/17/lib/amdgcn/bitcode/ockl.bc \
#   -o kernels_hip_linked.bc
# # ~/src/iree-build/tools/iree_platform_libs/rocm/ockl.bc \
# ~/src/iree-build/llvm-project/bin/lld \
#   -flavor gnu \
#   -m elf64_amdgpu \
#   --build-id=none \
#   --no-undefined \
#   -shared \
#   -plugin-opt=mcpu=gfx1100 \
#   -plugin-opt=O3 \
#   --lto-CGO3 \
#   --no-whole-archive \
#   -o kernels_hip.elf \
#   kernels_hip_linked.bc
# # -save-temps \
# ~/src/iree-build/llvm-project/bin/llvm-readelf \
#   kernels_hip.elf --all \
#   >kernels_hip.txt
# rm kernels_hip_linked.bc
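
kernels.c itself is not part of the gist. A minimal sketch of what the script above could be compiling follows; this is hypothetical source, not the original. It matches the kernarg layout the host code below writes for "add_one" (uint32_t n at offset 0, a global buffer pointer at offset 8) and the hardcoded workgroup size of 32:

// kernels.c (hypothetical sketch, not from the gist)
#include <stdint.h>

// Kernarg ABI matches the host-side add_one_args_t below: n, then buffer.
// Workgroup size 32 matches the host dispatch (workgroup_size = {32, 1, 1}).
__attribute__((amdgpu_kernel, amdgpu_flat_work_group_size(32, 32))) void
add_one(uint32_t n, __attribute__((address_space(1))) uint32_t* buffer) {
  uint32_t gid = __builtin_amdgcn_workgroup_id_x() * 32u +
                 __builtin_amdgcn_workitem_id_x();
  if (gid < n) buffer[gid] += 1;
}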
// info queries
#include <memory.h>
#include <stdio.h>
#include <string.h>

#include "third_party/hsa-runtime-headers/include/hsa/hsa.h"
#include "third_party/hsa-runtime-headers/include/hsa/hsa_ext_amd.h"

typedef struct {
  uint32_t agent_count;
  hsa_agent_t all_agents[32];
  hsa_agent_t cpu_agent;
  hsa_agent_t gpu_agent;
} agents_t;

static hsa_status_t iterate_agent(hsa_agent_t agent, void* user_data) {
  agents_t* agents = (agents_t*)user_data;
  hsa_status_t err;
  char product_name[64];
  err = hsa_agent_get_info(
      agent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_PRODUCT_NAME, product_name);
  (void)err;
  agents->all_agents[agents->agent_count++] = agent;
  return HSA_STATUS_SUCCESS;
}

typedef struct {
  uint32_t count;
  hsa_amd_memory_pool_t pools[32];
} memory_pools_t;

static hsa_status_t iterate_memory_pool(hsa_amd_memory_pool_t memory_pool,
                                        void* user_data) {
  memory_pools_t* memory_pools = (memory_pools_t*)user_data;
  memory_pools->pools[memory_pools->count++] = memory_pool;
  return HSA_STATUS_SUCCESS;
}

static hsa_status_t pcs_config_callback(
    const hsa_ven_amd_pcs_configuration_t* configuration, void* callback_data) {
  fprintf(stderr, "HAVE CONFIG\n");
  return HSA_STATUS_SUCCESS;
}

int main(int argc, char** argv) {
  hsa_status_t err;
  err = hsa_init();
  uint16_t version_major = 0;
  err = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MAJOR, &version_major);
  uint16_t version_minor = 0;
  err = hsa_system_get_info(HSA_SYSTEM_INFO_VERSION_MINOR, &version_minor);
  uint64_t timestamp_frequency = 0;
  err = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY,
                            &timestamp_frequency);
  bool svm_by_default = false;
  err = hsa_system_get_info(HSA_AMD_SYSTEM_INFO_SVM_ACCESSIBLE_BY_DEFAULT,
                            &svm_by_default);
  uint64_t max_wait = 0;
  err = hsa_system_get_info(HSA_SYSTEM_INFO_SIGNAL_MAX_WAIT, &max_wait);
  hsa_endianness_t endianness = HSA_ENDIANNESS_LITTLE;
  err = hsa_system_get_info(HSA_SYSTEM_INFO_ENDIANNESS, &endianness);
  hsa_machine_model_t machine_model = HSA_MACHINE_MODEL_SMALL;
  err = hsa_system_get_info(HSA_SYSTEM_INFO_MACHINE_MODEL, &machine_model);

  agents_t agents;
  memset(&agents, 0, sizeof(agents));
  err = hsa_iterate_agents(iterate_agent, &agents);
  for (uint32_t i = 0; i < agents.agent_count; ++i) {
    hsa_device_type_t device_type = 0;
    err = hsa_agent_get_info(agents.all_agents[i], HSA_AGENT_INFO_DEVICE,
                             &device_type);
    if (device_type == HSA_DEVICE_TYPE_GPU) {
      agents.gpu_agent = agents.all_agents[i];
      err = hsa_agent_get_info(agents.gpu_agent,
                               (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NEAREST_CPU,
                               &agents.cpu_agent);
      break;
    }
  }

  char gpu_name[64];
  err = hsa_agent_get_info(agents.gpu_agent, HSA_AGENT_INFO_NAME, gpu_name);
  uint8_t gpu_extensions[128] = {0};
  err = hsa_agent_get_info(agents.gpu_agent, HSA_AGENT_INFO_EXTENSIONS,
                           gpu_extensions);
  // HSA_AGENT_INFO_EXTENSIONS is a 1024-bit mask: extension N lives at byte
  // N / 8, bit N % 8 (the AMD extensions start at 0x200, i.e. byte 64).
  bool has_amd_profiler = gpu_extensions[HSA_EXTENSION_AMD_PROFILER / 8] &
                          (1 << (HSA_EXTENSION_AMD_PROFILER % 8));
  if (has_amd_profiler) fprintf(stderr, "HSA_EXTENSION_AMD_PROFILER\n");
  bool has_amd_loader = gpu_extensions[HSA_EXTENSION_AMD_LOADER / 8] &
                        (1 << (HSA_EXTENSION_AMD_LOADER % 8));
  if (has_amd_loader) fprintf(stderr, "HSA_EXTENSION_AMD_LOADER\n");
  bool has_amd_aqlprofile = gpu_extensions[HSA_EXTENSION_AMD_AQLPROFILE / 8] &
                            (1 << (HSA_EXTENSION_AMD_AQLPROFILE % 8));
  if (has_amd_aqlprofile) fprintf(stderr, "HSA_EXTENSION_AMD_AQLPROFILE\n");
  bool has_amd_pc_sampling =
      gpu_extensions[HSA_EXTENSION_AMD_PC_SAMPLING / 8] &
      (1 << (HSA_EXTENSION_AMD_PC_SAMPLING % 8));
  if (has_amd_pc_sampling) fprintf(stderr, "HSA_EXTENSION_AMD_PC_SAMPLING\n");

  char cpu_name[64];
  err = hsa_agent_get_info(agents.cpu_agent, HSA_AGENT_INFO_NAME, cpu_name);
  uint32_t cpu_numa_node = 0;
  err =
      hsa_agent_get_info(agents.cpu_agent, HSA_AGENT_INFO_NODE, &cpu_numa_node);

  memory_pools_t gpu_memory_pools;
  memset(&gpu_memory_pools, 0, sizeof(gpu_memory_pools));
  err = hsa_amd_agent_iterate_memory_pools(
      agents.gpu_agent, iterate_memory_pool, &gpu_memory_pools);
  memory_pools_t cpu_memory_pools;
  memset(&cpu_memory_pools, 0, sizeof(cpu_memory_pools));
  err = hsa_amd_agent_iterate_memory_pools(
      agents.cpu_agent, iterate_memory_pool, &cpu_memory_pools);

  // How does the CPU agent see each GPU pool?
  for (uint32_t i = 0; i < gpu_memory_pools.count; ++i) {
    hsa_amd_memory_pool_access_t access =
        HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED;
    err = hsa_amd_agent_memory_pool_get_info(
        agents.cpu_agent, gpu_memory_pools.pools[i],
        HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access);
    uint32_t num_link_hops = 0;
    err = hsa_amd_agent_memory_pool_get_info(
        agents.cpu_agent, gpu_memory_pools.pools[i],
        HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS, &num_link_hops);
    hsa_amd_memory_pool_link_info_t link_infos[32];
    err = hsa_amd_agent_memory_pool_get_info(
        agents.cpu_agent, gpu_memory_pools.pools[i],
        HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO, link_infos);
    (void)err;
  }
  // How does the GPU agent see each CPU pool?
  for (uint32_t i = 0; i < cpu_memory_pools.count; ++i) {
    hsa_amd_memory_pool_access_t access =
        HSA_AMD_MEMORY_POOL_ACCESS_NEVER_ALLOWED;
    err = hsa_amd_agent_memory_pool_get_info(
        agents.gpu_agent, cpu_memory_pools.pools[i],
        HSA_AMD_AGENT_MEMORY_POOL_INFO_ACCESS, &access);
    uint32_t num_link_hops = 0;
    err = hsa_amd_agent_memory_pool_get_info(
        agents.gpu_agent, cpu_memory_pools.pools[i],
        HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS, &num_link_hops);
    hsa_amd_memory_pool_link_info_t link_infos[32];
    err = hsa_amd_agent_memory_pool_get_info(
        agents.gpu_agent, cpu_memory_pools.pools[i],
        HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO, link_infos);
    (void)err;
  }

  hsa_ven_amd_pc_sampling_1_00_pfn_t pcs_table;
  err = hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_PC_SAMPLING, 1,
                                             sizeof(pcs_table), &pcs_table);
  // err = pcs_table.hsa_ven_amd_pcs_iterate_configuration(
  //     agents.gpu_agent, pcs_config_callback, NULL);
  err = hsa_shut_down();
  return 0;
}
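
Every call above funnels into a single unchecked err. A hypothetical checked wrapper (not in the gist) shows how the same calls could report failures using hsa_status_string, which the queue callbacks below already rely on:

// Hypothetical helper, not part of the original prototypes.
#define HSA_CHECK(expr)                                               \
  do {                                                                \
    hsa_status_t status_ = (expr);                                    \
    if (status_ != HSA_STATUS_SUCCESS) {                              \
      const char* str_ = NULL;                                        \
      hsa_status_string(status_, &str_);                              \
      fprintf(stderr, "%s failed: %s\n", #expr, str_ ? str_ : "?");   \
    }                                                                 \
  } while (0)
// Usage: HSA_CHECK(hsa_init());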
// dispatch hip or opencl kernel and timing
#include <assert.h>
#include <errno.h>
#include <fcntl.h>  // open
#include <memory.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>  // abort/malloc/free
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

#include "third_party/hsa-runtime-headers/include/hsa/hsa.h"
#include "third_party/hsa-runtime-headers/include/hsa/hsa_ext_amd.h"
#include "third_party/hsa-runtime-headers/include/hsa/hsa_ven_amd_loader.h"

static inline size_t iree_host_align(size_t value, size_t alignment) {
  return (value + (alignment - 1)) & ~(alignment - 1);
}

// AMD signal kind enumeration values.
enum iree_amd_signal_kind_t {
  IREE_AMD_SIGNAL_KIND_INVALID = 0,
  IREE_AMD_SIGNAL_KIND_USER = 1,
  IREE_AMD_SIGNAL_KIND_DOORBELL = -1,
  IREE_AMD_SIGNAL_KIND_LEGACY_DOORBELL = -2
};
typedef int64_t iree_amd_signal_kind64_t;

#define IREE_AMDGPU_GLOBAL
#define IREE_AMDGPU_ALIGNAS(x) __attribute__((aligned(x)))

// Mirrors the runtime's amd_signal_t layout so fields like start_ts/end_ts
// can be read directly off a signal handle (see main below).
typedef struct IREE_AMDGPU_ALIGNAS(64) iree_amd_signal_s {
  iree_amd_signal_kind64_t kind;
  union {
    volatile int64_t value;
    IREE_AMDGPU_GLOBAL volatile uint32_t* legacy_hardware_doorbell_ptr;
    IREE_AMDGPU_GLOBAL volatile uint64_t* hardware_doorbell_ptr;
  };
  uint64_t event_mailbox_ptr;
  uint32_t event_id;
  uint32_t reserved1;
  uint64_t start_ts;
  uint64_t end_ts;
  union {
    IREE_AMDGPU_GLOBAL /*iree_amd_queue_t*/ void* queue_ptr;
    uint64_t reserved2;
  };
  uint32_t reserved3[2];
} iree_amd_signal_t;

/* Call ioctl, restarting if it is interrupted. */
int hsakmt_ioctl(int fd, unsigned long request, void* arg) {
  int ret;
  do {
    ret = ioctl(fd, request, arg);
  } while (ret == -1 && (errno == EINTR || errno == EAGAIN));
  return ret;
}

#define AMDKFD_IOCTL_BASE 'K'
#define AMDKFD_IO(nr) _IO(AMDKFD_IOCTL_BASE, nr)
#define AMDKFD_IOR(nr, type) _IOR(AMDKFD_IOCTL_BASE, nr, type)
#define AMDKFD_IOW(nr, type) _IOW(AMDKFD_IOCTL_BASE, nr, type)
#define AMDKFD_IOWR(nr, type) _IOWR(AMDKFD_IOCTL_BASE, nr, type)
#define AMDKFD_IOC_GET_CLOCK_COUNTERS \
  AMDKFD_IOWR(0x05, struct kfd_ioctl_get_clock_counters_args)
struct kfd_ioctl_get_clock_counters_args {
  uint64_t gpu_clock_counter;    /* from KFD */
  uint64_t cpu_clock_counter;    /* from KFD */
  uint64_t system_clock_counter; /* from KFD */
  uint64_t system_clock_freq;    /* from KFD */
  uint32_t gpu_id;               /* to KFD */
  uint32_t pad;
};

#if defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || \
    defined(__amd64__) || defined(_M_X64) || defined(_M_AMD64)
#include <xmmintrin.h>
#define IS_X86 1
#endif

// https://gitlab.freedesktop.org/mesa/mesa/-/blob/main/src/util/streaming-load-memcpy.c
// The streaming-store variants are kept disabled below; plain memcpy is used
// until they are needed.
__attribute__((always_inline)) static inline void nontemporalMemcpy(
    void* __restrict dst, const void* __restrict src, size_t size) {
  // #if defined(IS_X86)
  // #if defined(__AVX512F__)
  // #pragma unroll
  //   for (size_t i = 0; i != size / sizeof(__m512i); ++i) {
  //     _mm512_stream_si512((__m512i* __restrict)(dst),
  //                         *(const __m512i* __restrict)(src));
  //     src += sizeof(__m512i);
  //     dst += sizeof(__m512i);
  //   }
  //   size = size % sizeof(__m512i);
  // #endif
  // #if defined(__AVX__)
  // #pragma unroll
  //   for (size_t i = 0; i != size / sizeof(__m256i); ++i) {
  //     _mm256_stream_si256((__m256i* __restrict)(dst),
  //                         *(const __m256i* __restrict)(src));
  //     src += sizeof(__m256i);
  //     dst += sizeof(__m256i);
  //   }
  //   size = size % sizeof(__m256i);
  // #endif
  // #pragma unroll
  //   for (size_t i = 0; i != size / sizeof(__m128i); ++i) {
  //     _mm_stream_si128((__m128i* __restrict)(dst),
  //                      *((const __m128i* __restrict)(src)));
  //     src += sizeof(__m128i);
  //     dst += sizeof(__m128i);
  //   }
  //   size = size % sizeof(__m128i);
  // #pragma unroll
  //   for (size_t i = 0; i != size / sizeof(long long); ++i) {
  //     _mm_stream_si64((long long* __restrict)(dst),
  //                     *(const long long* __restrict)(src));
  //     src += sizeof(long long);
  //     dst += sizeof(long long);
  //   }
  //   size = size % sizeof(long long);
  // #pragma unroll
  //   for (size_t i = 0; i != size / sizeof(int); ++i) {
  //     _mm_stream_si32((int* __restrict)(dst), *(const int* __restrict)(src));
  //     src += sizeof(int);
  //     dst += sizeof(int);
  //   }
  // #else
  memcpy(dst, src, size);
  // #endif
}

typedef struct {
  uint32_t agent_count;
  hsa_agent_t all_agents[32];
  hsa_agent_t cpu_agent;
  hsa_agent_t gpu_agent;
} agents_t;

static hsa_status_t iterate_agent(hsa_agent_t agent, void* user_data) {
  agents_t* agents = (agents_t*)user_data;
  agents->all_agents[agents->agent_count++] = agent;
  return HSA_STATUS_SUCCESS;
}

typedef struct {
  uint32_t count;
  hsa_amd_memory_pool_t pools[32];
} memory_pools_t;

static hsa_status_t iterate_memory_pool(hsa_amd_memory_pool_t memory_pool,
                                        void* user_data) {
  memory_pools_t* memory_pools = (memory_pools_t*)user_data;
  memory_pools->pools[memory_pools->count++] = memory_pool;
  return HSA_STATUS_SUCCESS;
}

typedef struct {
  uint32_t count;
  hsa_region_t regions[32];
} regions_t;

static hsa_status_t iterate_regions(hsa_region_t region, void* user_data) {
  regions_t* regions = (regions_t*)user_data;
  regions->regions[regions->count++] = region;
  return HSA_STATUS_SUCCESS;
}

static hsa_status_t iterate_symbol(hsa_executable_t executable,
                                   hsa_executable_symbol_t symbol,
                                   void* user_data) {
  uint32_t name_length = 0;
  hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH,
                                 &name_length);
  char name[128];
  hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
  fprintf(stderr, "symbol: %.*s\n", (int)name_length, name);
  return HSA_STATUS_SUCCESS;
}

static hsa_status_t iterate_program_symbol(hsa_executable_t executable,
                                           hsa_executable_symbol_t symbol,
                                           void* user_data) {
  uint32_t name_length = 0;
  hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH,
                                 &name_length);
  char name[128];
  hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
  fprintf(stderr, "program symbol: %.*s\n", (int)name_length, name);
  return HSA_STATUS_SUCCESS;
}

static hsa_status_t iterate_agent_symbol(hsa_executable_t executable,
                                         hsa_agent_t agent,
                                         hsa_executable_symbol_t symbol,
                                         void* user_data) {
  uint32_t name_length = 0;
  hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH,
                                 &name_length);
  char name[128];
  hsa_executable_symbol_get_info(symbol, HSA_EXECUTABLE_SYMBOL_INFO_NAME, name);
  fprintf(stderr, "agent symbol: %.*s\n", (int)name_length, name);
  return HSA_STATUS_SUCCESS;
}

static void gpu_queue_callback(hsa_status_t status, hsa_queue_t* queue,
                               void* user_data) {
  const char* status_str = NULL;
  hsa_status_string(status, &status_str);
  fprintf(stderr, "gpu_queue_callback %s\n", status_str);
}
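
// Not part of the original: a standalone sketch naming the AQL enqueue
// protocol main uses inline below. Reserve a write index, wait for queue
// space, write the packet body while the header word is still INVALID,
// publish the real header last with a release store, then ring the doorbell
// so the packet processor never observes a half-written packet.
static void enqueue_dispatch(hsa_queue_t* queue,
                             const hsa_kernel_dispatch_packet_t* packet,
                             uint32_t header_setup) {
  uint64_t id = hsa_queue_add_write_index_screlease(queue, 1);
  while ((id - hsa_queue_load_read_index_acquire(queue)) >= queue->size) {
    sleep(0);  // queue full; yield until the packet processor drains it
  }
  hsa_kernel_dispatch_packet_t* slot =
      (hsa_kernel_dispatch_packet_t*)((uint8_t*)queue->base_address +
                                      (id & (queue->size - 1)) * 64);
  nontemporalMemcpy(slot, packet, sizeof(*slot));  // body first, header INVALID
  atomic_store_explicit((volatile atomic_uint*)slot, header_setup,
                        memory_order_release);  // publish header+setup word
  hsa_signal_store_relaxed(queue->doorbell_signal, id);
}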
int main(int argc, char** argv) {
  hsa_status_t err;
  err = hsa_init();

  agents_t agents;
  memset(&agents, 0, sizeof(agents));
  err = hsa_iterate_agents(iterate_agent, &agents);
  for (uint32_t i = 0; i < agents.agent_count; ++i) {
    hsa_device_type_t device_type = 0;
    err = hsa_agent_get_info(agents.all_agents[i], HSA_AGENT_INFO_DEVICE,
                             &device_type);
    if (device_type == HSA_DEVICE_TYPE_GPU) {
      agents.gpu_agent = agents.all_agents[i];
      err = hsa_agent_get_info(agents.gpu_agent,
                               (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NEAREST_CPU,
                               &agents.cpu_agent);
      break;
    }
  }

  memory_pools_t gpu_memory_pools;
  memset(&gpu_memory_pools, 0, sizeof(gpu_memory_pools));
  err = hsa_amd_agent_iterate_memory_pools(
      agents.gpu_agent, iterate_memory_pool, &gpu_memory_pools);
  regions_t gpu_regions;
  memset(&gpu_regions, 0, sizeof(gpu_regions));
  err = hsa_agent_iterate_regions(agents.gpu_agent, iterate_regions,
                                  &gpu_regions);
  hsa_amd_memory_pool_t kernarg_pool = {0};
  for (uint32_t i = 0; i < gpu_memory_pools.count; ++i) {
    hsa_amd_memory_pool_t pool = gpu_memory_pools.pools[i];
    hsa_region_segment_t segment;
    err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT,
                                       &segment);
    if (segment == HSA_REGION_SEGMENT_GLOBAL) {
      hsa_region_global_flag_t global_flag;
      err = hsa_amd_memory_pool_get_info(
          pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag);
      if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
        // gpu_coarse_pool = pool;
      } else if (global_flag &
                 HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED) {
        // gpu_fine_pool = pool;
        kernarg_pool = pool;
      }
    }
  }
  // for (uint32_t i = 0; i < gpu_regions.count; ++i) {
  //   hsa_region_t region = gpu_regions.regions[i];
  //   hsa_region_segment_t segment;
  //   err = hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &segment);
  //   if (segment == HSA_REGION_SEGMENT_GLOBAL) {
  //     hsa_region_global_flag_t flags;
  //     err = hsa_region_get_info(region, HSA_REGION_INFO_GLOBAL_FLAGS,
  //                               &flags);
  //     if (flags & HSA_REGION_GLOBAL_FLAG_KERNARG) {
  //       bool host_accessible = false;
  //       err = hsa_region_get_info(
  //           region, (hsa_region_info_t)HSA_AMD_REGION_INFO_HOST_ACCESSIBLE,
  //           &host_accessible);
  //       void* base_address = NULL;
  //       err = hsa_region_get_info(
  //           region, (hsa_region_info_t)HSA_AMD_REGION_INFO_BASE,
  //           &base_address);
  //     }
  //   }
  // }

  memory_pools_t cpu_memory_pools;
  memset(&cpu_memory_pools, 0, sizeof(cpu_memory_pools));
  err = hsa_amd_agent_iterate_memory_pools(
      agents.cpu_agent, iterate_memory_pool, &cpu_memory_pools);
  regions_t cpu_regions;
  memset(&cpu_regions, 0, sizeof(cpu_regions));
  err = hsa_agent_iterate_regions(agents.cpu_agent, iterate_regions,
                                  &cpu_regions);
  hsa_amd_memory_pool_t cpu_fine_pool = {0};
  hsa_amd_memory_pool_t cpu_coarse_pool = {0};
  // hsa_amd_memory_pool_t kernarg_pool = {0};
  for (uint32_t i = 0; i < cpu_memory_pools.count; ++i) {
    hsa_amd_memory_pool_t pool = cpu_memory_pools.pools[i];
    hsa_region_segment_t segment;
    err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT,
                                       &segment);
    if (segment == HSA_REGION_SEGMENT_GLOBAL) {
      hsa_region_global_flag_t global_flag;
      err = hsa_amd_memory_pool_get_info(
          pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag);
      if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) {
        // kernarg_pool = pool;
      } else if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
        cpu_coarse_pool = pool;
      } else if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
        cpu_fine_pool = pool;
      }
    }
  }

  uint32_t gpu_queue_min_size = 0;
  err = hsa_agent_get_info(agents.gpu_agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE,
                           &gpu_queue_min_size);
  uint32_t gpu_queue_max_size = 0;
  err = hsa_agent_get_info(agents.gpu_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
                           &gpu_queue_max_size);
  uint32_t gpu_queue_size = gpu_queue_max_size;
  hsa_queue_t* gpu_queue = NULL;
  err = hsa_queue_create(agents.gpu_agent, gpu_queue_size, HSA_QUEUE_TYPE_MULTI,
                         gpu_queue_callback,
                         /*callback_data=*/NULL,
                         /*private_segment_size=*/UINT32_MAX,
                         /*group_segment_size=*/UINT32_MAX, &gpu_queue);

  hsa_file_t object_file =
      open("experimental/hsa_tests/kernels_c.elf", O_RDONLY);
  // hsa_file_t object_file =
  //     open("experimental/hsa_tests/kernels_cl.elf", O_RDONLY);
  // hsa_file_t object_file =
  //     open("experimental/hsa_tests/kernels_hip.elf", O_RDONLY);
  hsa_code_object_reader_t object_reader;
  err = hsa_code_object_reader_create_from_file(object_file, &object_reader);
  hsa_executable_t executable;
  err = hsa_executable_create_alt(HSA_PROFILE_FULL,
                                  HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL,
                                  &executable);
  hsa_loaded_code_object_t loaded_code_object;
  err = hsa_executable_load_agent_code_object(
      executable, agents.gpu_agent, object_reader, NULL, &loaded_code_object);
  err = hsa_executable_freeze(executable, NULL);

  hsa_ven_amd_loader_1_03_pfn_t loader;
  err = hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1,
                                             sizeof(loader), &loader);
  uint64_t code_base = 0;
  err = loader.hsa_ven_amd_loader_loaded_code_object_get_info(
      loaded_code_object, HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_BASE,
      &code_base);
  uint64_t code_size = 0;
  err = loader.hsa_ven_amd_loader_loaded_code_object_get_info(
      loaded_code_object, HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_SIZE,
      &code_size);
  int64_t code_delta = 0;
  err = loader.hsa_ven_amd_loader_loaded_code_object_get_info(
      loaded_code_object, HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_DELTA,
      &code_delta);
  err = hsa_executable_iterate_agent_symbols(executable, agents.gpu_agent,
                                             iterate_agent_symbol, NULL);
  err = hsa_executable_iterate_symbols(executable, iterate_symbol, NULL);
  err = hsa_executable_iterate_program_symbols(executable,
                                               iterate_program_symbol, NULL);

  struct kernel_info_t {
    uint64_t handle;
    uint32_t private_size;
    uint32_t group_size;
    uint32_t kernarg_alignment;
    uint32_t kernarg_size;
  } kernel_info;
  hsa_executable_symbol_t symbol;
  err = hsa_executable_get_symbol_by_name(
      executable, "add_one_with_timestamp.kd", &agents.gpu_agent, &symbol);
  err = hsa_executable_symbol_get_info(
      symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_info.handle);
  err = hsa_executable_symbol_get_info(
      symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
      &kernel_info.private_size);
  err = hsa_executable_symbol_get_info(
      symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
      &kernel_info.group_size);
  err = hsa_executable_symbol_get_info(
      symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT,
      &kernel_info.kernarg_alignment);
  err = hsa_executable_symbol_get_info(
      symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
      &kernel_info.kernarg_size);

  void* kernarg_storage = NULL;
  err = hsa_amd_memory_pool_allocate(kernarg_pool, kernel_info.kernarg_size,
                                     HSA_AMD_MEMORY_POOL_STANDARD_FLAG,
                                     &kernarg_storage);
  err =
      hsa_amd_agents_allow_access(1, &agents.gpu_agent, NULL, kernarg_storage);
  hsa_signal_t dispatch_signal;
  err = hsa_signal_create(1, 0, NULL, &dispatch_signal);
  // err = hsa_amd_signal_create(1, 0, NULL, HSA_AMD_SIGNAL_IPC,
  //                             &dispatch_signal);
  err = hsa_amd_profiling_set_profiler_enabled(gpu_queue, 1);

  // submit
  // hidden args
  // https://github.com/ROCm/clr/blob/amd-staging/rocclr/device/rocm/rocvirtual.cpp#L3134
  // uint8_t kind
  // uint8_t value
  // uint16_t offset
  //
  // or hardcode
  // uint16_t implicit_offset;
  // uint8_t kind_offsets[hidden enum];
  // if (kind_offsets[hidden_block_count_x] != 0xFF) {
  //   kernargs[kind_offsets[hidden_block_count_x]] = block_count_x;
  // }
  // ...
  // keeps data small, no loops, just conditional moves
  // could have a bit for all?
  //
  // or bitmap of hidden args
  // bit indicates presence
  // assume dense and ordered because of the way the compiler does things
  //
  // amdgpu-no-implicitarg-ptr is the only way to disable
  // otherwise all are required with a fixed 256-byte size
  // so just need to find the base offset
  //
  // explicit args always start at 0
  // implicit args are at the explicit size, 8-byte aligned
  //
  // third_party/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
  typedef struct implicit_kernargs_t {
    // Grid dispatch workgroup count.
    // Some languages, such as OpenCL, support a last workgroup in each
    // dimension being partial. This count only includes the non-partial
    // workgroup count. This is not the same as the value in the AQL dispatch
    // packet, which has the grid size in workitems.
    //
    // Represented in metadata as:
    //   hidden_block_count_x
    //   hidden_block_count_y
    //   hidden_block_count_z
    uint32_t block_count[3];  // + 0/4/8
    // Grid dispatch workgroup size.
    // This size only applies to the non-partial workgroups. This is the same
    // value as the AQL dispatch packet workgroup size.
    //
    // Represented in metadata as:
    //   hidden_group_size_x
    //   hidden_group_size_y
    //   hidden_group_size_z
    uint16_t group_size[3];  // + 12/14/16
    // Grid dispatch workgroup size of the partial workgroup, if it exists.
    // Any dimension that does not exist must be 0.
    //
    // Represented in metadata as:
    //   hidden_remainder_x
    //   hidden_remainder_y
    //   hidden_remainder_z
    uint16_t remainder[3];  // + 18/20/22
    uint64_t reserved0;     // + 24 hidden_tool_correlation_id
    uint64_t reserved1;     // + 32
    // OpenCL grid dispatch global offset.
    //
    // Represented in metadata as:
    //   hidden_global_offset_x
    //   hidden_global_offset_y
    //   hidden_global_offset_z
    uint64_t global_offset[3];  // + 40/48/56
    // Grid dispatch dimensionality. This is the same value as the AQL
    // dispatch packet dimensionality. Must be a value between 1 and 3.
    //
    // Represented in metadata as:
    //   hidden_grid_dims
    uint16_t grid_dims;  // + 64
  } implicit_kernargs_t;

  uint32_t element_count = 64;
  uint32_t* buffer = NULL;
  err = hsa_amd_memory_pool_allocate(
      cpu_coarse_pool, element_count * sizeof(uint32_t),
      HSA_AMD_MEMORY_POOL_STANDARD_FLAG, (void**)&buffer);
  for (uint32_t i = 0; i < element_count; ++i) {
    buffer[i] = i;
  }
  err = hsa_amd_agents_allow_access(1, &agents.gpu_agent, NULL, buffer);

  uint32_t grid_size[3] = {element_count, 1, 1};
  uint16_t workgroup_size[3] = {32, 1, 1};
  typedef struct add_one_args_t {
    uint32_t n;
    void* buffer;
  } add_one_args_t;
  add_one_args_t* explicit_kernargs = (add_one_args_t*)kernarg_storage;
  explicit_kernargs->n = element_count;
  explicit_kernargs->buffer = buffer;
  implicit_kernargs_t* implicit_kernargs =
      (implicit_kernargs_t*)((uint8_t*)kernarg_storage +
                             iree_host_align(sizeof(add_one_args_t), 8));
  implicit_kernargs->block_count[0] = grid_size[0] / workgroup_size[0];
  implicit_kernargs->block_count[1] = grid_size[1] / workgroup_size[1];
  implicit_kernargs->block_count[2] = grid_size[2] / workgroup_size[2];
  implicit_kernargs->group_size[0] = workgroup_size[0];
  implicit_kernargs->group_size[1] = workgroup_size[1];
  implicit_kernargs->group_size[2] = workgroup_size[2];
  implicit_kernargs->remainder[0] = (uint16_t)(grid_size[0] % workgroup_size[0]);
  implicit_kernargs->remainder[1] = (uint16_t)(grid_size[1] % workgroup_size[1]);
  implicit_kernargs->remainder[2] = (uint16_t)(grid_size[2] % workgroup_size[2]);
  implicit_kernargs->reserved0 = 0;
  implicit_kernargs->reserved1 = 0;
  implicit_kernargs->global_offset[0] = 0;  // newOffset[0];
  implicit_kernargs->global_offset[1] = 0;  // newOffset[1];
  implicit_kernargs->global_offset[2] = 0;  // newOffset[2];
  implicit_kernargs->grid_dims = 3;
  // DO NOT SUBMIT should do nontemporal kernarg update?

  hsa_kernel_dispatch_packet_t packet;
  packet.header = HSA_PACKET_TYPE_INVALID;
  packet.setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  packet.workgroup_size_x = workgroup_size[0];
  packet.workgroup_size_y = workgroup_size[1];
  packet.workgroup_size_z = workgroup_size[2];
  packet.reserved0 = 0;
  packet.grid_size_x = grid_size[0];
  packet.grid_size_y = grid_size[1];
  packet.grid_size_z = grid_size[2];
  packet.private_segment_size = kernel_info.private_size;
  packet.group_segment_size = kernel_info.group_size;
  packet.kernel_object = kernel_info.handle;
  packet.kernarg_address = kernarg_storage;
  packet.reserved2 = 0;
  packet.completion_signal = dispatch_signal;
  uint16_t packet_header =
      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
      (1 << HSA_PACKET_HEADER_BARRIER) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
  uint32_t packet_header_setup = packet_header | (packet.setup << 16);
  uint64_t packet_id = hsa_queue_add_write_index_screlease(gpu_queue, 1);
  while ((packet_id - hsa_queue_load_read_index_acquire(gpu_queue)) >=
         gpu_queue->size) {
    sleep(0);
  }
  hsa_kernel_dispatch_packet_t* packet_ptr =
      (hsa_kernel_dispatch_packet_t*)((uint8_t*)gpu_queue->base_address +
                                      (packet_id & (gpu_queue->size - 1)) * 64);
  nontemporalMemcpy(packet_ptr, &packet, sizeof(packet));
  atomic_store_explicit((volatile atomic_uint*)packet_ptr, packet_header_setup,
                        memory_order_release);
  // value ignored in MULTI cases
  hsa_signal_store_relaxed(gpu_queue->doorbell_signal, packet_id);
  hsa_signal_value_t wait_value =
      hsa_signal_wait_scacquire(dispatch_signal, HSA_SIGNAL_CONDITION_EQ, 0,
                                UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
  if (wait_value != 0) {
    fprintf(stderr, "wait failed\n");
  }

  // can directly access agent-specific ticks on the signal
  // could store these as part of the trace provider
  // must call hsa_amd_profiling_set_profiler_enabled to ensure they are
  // populated
  // request a batch hsa_amd_profiling_convert_tick_to_system_domain?
  // may still want to adjust, but avoid API overheads when converting 1000's
  iree_amd_signal_t* amd_dispatch_signal =
      (iree_amd_signal_t*)dispatch_signal.handle;
  // amd_dispatch_signal->start_ts;
  // amd_dispatch_signal->end_ts;
  uint64_t start_ts = 0;
  err = hsa_amd_profiling_convert_tick_to_system_domain(
      agents.gpu_agent, amd_dispatch_signal->start_ts, &start_ts);
  uint64_t end_ts = 0;
  err = hsa_amd_profiling_convert_tick_to_system_domain(
      agents.gpu_agent, amd_dispatch_signal->end_ts, &end_ts);
  // (end - start) / system_frequency = seconds
  hsa_amd_profiling_dispatch_time_t time;
  err = hsa_amd_profiling_get_dispatch_time(agents.gpu_agent, dispatch_signal,
                                            &time);
  uint64_t system_frequency = 0;
  err = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY,
                            &system_frequency);
  uint64_t system_timestamp = 0;
  err = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &system_timestamp);
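
  // Worked example of the formula noted above, using the converted
  // timestamps and the frequency just queried (not in the original):
  double dispatch_seconds =
      (double)(end_ts - start_ts) / (double)system_frequency;
  fprintf(stderr, "dispatch: %.9f s\n", dispatch_seconds);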

  // sleep(10);
  // uint64_t system_timestamp2 = 0;
  // err = hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP, &system_timestamp2);
  uint64_t kernel_time = ((uint64_t)buffer[0]) << 32 | buffer[1];

  // use kmt to query system/gpu time
  // can use it to adjust like GpuAgent::TranslateTime
  // https://sourcegraph.com/github.com/ROCm/ROCR-Runtime@909b82d4632b86dff0faadcb19488a43d2108686/-/blob/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp?L2048
  static const char kfd_device_name[] = "/dev/kfd";
  int hsakmt_kfd_fd = open(kfd_device_name, O_RDWR | O_CLOEXEC);
  if (hsakmt_kfd_fd == -1) {
    // result = HSAKMT_STATUS_KERNEL_IO_CHANNEL_NOT_OPENED;
    abort();
  }
  // hsaKmtGetClockCounters
  struct kfd_ioctl_get_clock_counters_args args;
  args.gpu_id = 0;  // node id
  int kmt_err =
      hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_GET_CLOCK_COUNTERS, &args);
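  (void)kmt_err;
  // Hedged sketch of a TranslateTime-style adjustment (see the ROCR link
  // above; not in the original): sample the paired counters twice, then map
  // a GPU tick between the two samples into the system-clock domain by
  // linear interpolation.
  struct kfd_ioctl_get_clock_counters_args t1 = args;
  (void)hsakmt_ioctl(hsakmt_kfd_fd, AMDKFD_IOC_GET_CLOCK_COUNTERS, &t1);
  // For some gpu_tick in [args.gpu_clock_counter, t1.gpu_clock_counter]:
  //   system_ts = args.system_clock_counter +
  //       (gpu_tick - args.gpu_clock_counter) *
  //       (t1.system_clock_counter - args.system_clock_counter) /
  //       (t1.gpu_clock_counter - args.gpu_clock_counter);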
  if (hsakmt_kfd_fd) {
    close(hsakmt_kfd_fd);
    hsakmt_kfd_fd = -1;
  }

  uint64_t srcloc = ((uint64_t)buffer[2]) << 32 | buffer[3];
  typedef struct iree_tracing_location_s {
    const char* name;
    const char* function;
    const char* file;
    uint32_t line;
    uint32_t color;
  } iree_tracing_location_t;
  const iree_tracing_location_t* srcloc_p =
      (const iree_tracing_location_t*)srcloc;
  assert(srcloc >= code_base && srcloc < code_base + code_size);
  const uint8_t* host_code_base = NULL;
  err = loader.hsa_ven_amd_loader_query_host_address(
      (void*)code_base, (const void**)&host_code_base);
  const iree_tracing_location_t* srcloc_shadow3 =
      (iree_tracing_location_t*)(srcloc - code_base + host_code_base);
  const iree_tracing_location_t* srcloc_shadow2 = NULL;
  err = loader.hsa_ven_amd_loader_query_host_address(
      (void*)srcloc, (const void**)&srcloc_shadow2);
  const char* shadow_name = NULL;
  err = loader.hsa_ven_amd_loader_query_host_address(
      (void*)srcloc_shadow2->name, (const void**)&shadow_name);
  uint8_t* code_shadow = (uint8_t*)malloc(code_size);
  memcpy(code_shadow, (const void*)code_base, code_size);
  iree_tracing_location_t* srcloc_shadow =
      (iree_tracing_location_t*)(srcloc - code_base + (uint64_t)code_shadow);
  srcloc_shadow->name = (const char*)((uint64_t)srcloc_shadow->name -
                                      code_base + (uint64_t)code_shadow);
  hsa_amd_pointer_info_t srcloc_info;
  srcloc_info.size = sizeof(srcloc_info);
  uint32_t num_agents_accessible = 0;
  hsa_agent_t* accessible = NULL;
  err = hsa_amd_pointer_info(srcloc_p, &srcloc_info, malloc,
                             &num_agents_accessible, &accessible);
  // owned by gpu_agent, global_flags=5 (coarse | kernarg)
  free(accessible);

  for (uint32_t i = 0; i < element_count; ++i) {
    fprintf(stderr, "%u ", buffer[i]);
  }
  fprintf(stderr, "\n");

  err = hsa_amd_profiling_set_profiler_enabled(gpu_queue, 0);
  err = hsa_amd_memory_pool_free(buffer);
  err = hsa_signal_destroy(dispatch_signal);
  err = hsa_amd_memory_pool_free(kernarg_storage);
  err = hsa_queue_destroy(gpu_queue);
  err = hsa_executable_destroy(executable);
  err = hsa_code_object_reader_destroy(object_reader);
  err = hsa_shut_down();
  return 0;
}
// pc_sampling?
// doesn't work
#include <alloca.h>
#include <errno.h>
#include <fcntl.h>  // open
#include <memory.h>
#include <stdatomic.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

#include "third_party/hsa-runtime-headers/include/hsa/hsa.h"
#include "third_party/hsa-runtime-headers/include/hsa/hsa_ext_amd.h"

static inline size_t iree_host_align(size_t value, size_t alignment) {
  return (value + (alignment - 1)) & ~(alignment - 1);
}

__attribute__((always_inline)) static inline void nontemporalMemcpy(
    void* __restrict dst, const void* __restrict src, size_t size) {
  memcpy(dst, src, size);
}

typedef struct {
  uint32_t agent_count;
  hsa_agent_t all_agents[32];
  hsa_agent_t cpu_agent;
  hsa_agent_t gpu_agent;
} agents_t;

static hsa_status_t iterate_agent(hsa_agent_t agent, void* user_data) {
  agents_t* agents = (agents_t*)user_data;
  agents->all_agents[agents->agent_count++] = agent;
  return HSA_STATUS_SUCCESS;
}

typedef struct {
  uint32_t count;
  hsa_amd_memory_pool_t pools[32];
} memory_pools_t;

static hsa_status_t iterate_memory_pool(hsa_amd_memory_pool_t memory_pool,
                                        void* user_data) {
  memory_pools_t* memory_pools = (memory_pools_t*)user_data;
  memory_pools->pools[memory_pools->count++] = memory_pool;
  return HSA_STATUS_SUCCESS;
}

typedef struct {
  uint32_t count;
  hsa_region_t regions[32];
} regions_t;

static hsa_status_t iterate_regions(hsa_region_t region, void* user_data) {
  regions_t* regions = (regions_t*)user_data;
  regions->regions[regions->count++] = region;
  return HSA_STATUS_SUCCESS;
}

static void gpu_queue_callback(hsa_status_t status, hsa_queue_t* queue,
                               void* user_data) {
  const char* status_str = NULL;
  hsa_status_string(status, &status_str);
  fprintf(stderr, "gpu_queue_callback %s\n", status_str);
}

static hsa_status_t pcs_config_callback(
    const hsa_ven_amd_pcs_configuration_t* configuration, void* callback_data) {
  //
  return HSA_STATUS_SUCCESS;
}

#include "/home/nod/src/ROCR-Runtime/libhsakmt/include/hsakmt/hsakmt.h"
#include "/home/nod/src/ROCR-Runtime/libhsakmt/include/hsakmt/linux/kfd_ioctl.h"

void pcs_data_ready(void* client_callback_data, size_t data_size,
                    size_t lost_sample_count,
                    hsa_ven_amd_pcs_data_copy_callback_t data_copy_callback,
                    void* hsa_callback_data) {
  perf_sample_snapshot_v1_t* sample_buffer =
      (perf_sample_snapshot_v1_t*)client_callback_data;
  fprintf(stderr, "PCS; data size = %zu, lost samples = %zu\n", data_size,
          lost_sample_count);
  hsa_status_t err =
      data_copy_callback(hsa_callback_data, data_size, sample_buffer);
  (void)err;
}
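
// Hypothetical helper (not in the gist): read the KFD gpu_id from sysfs
// instead of hardcoding it the way main below does (args.gpu_id = 56588).
// The node index is an assumption; it matches the `cat .../nodes/1/gpu_id`
// comment in main.
static uint32_t read_kfd_gpu_id(int node) {
  char path[128];
  snprintf(path, sizeof(path),
           "/sys/devices/virtual/kfd/kfd/topology/nodes/%d/gpu_id", node);
  FILE* f = fopen(path, "r");
  uint32_t gpu_id = 0;
  if (f) {
    fscanf(f, "%u", &gpu_id);
    fclose(f);
  }
  return gpu_id;
}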
int main(int argc, char** argv) {
  hsa_status_t err;
  err = hsa_init();

  agents_t agents;
  memset(&agents, 0, sizeof(agents));
  err = hsa_iterate_agents(iterate_agent, &agents);
  for (uint32_t i = 0; i < agents.agent_count; ++i) {
    hsa_device_type_t device_type = 0;
    err = hsa_agent_get_info(agents.all_agents[i], HSA_AGENT_INFO_DEVICE,
                             &device_type);
    if (device_type == HSA_DEVICE_TYPE_GPU) {
      agents.gpu_agent = agents.all_agents[i];
      err = hsa_agent_get_info(agents.gpu_agent,
                               (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NEAREST_CPU,
                               &agents.cpu_agent);
      break;
    }
  }

  memory_pools_t gpu_memory_pools;
  memset(&gpu_memory_pools, 0, sizeof(gpu_memory_pools));
  err = hsa_amd_agent_iterate_memory_pools(
      agents.gpu_agent, iterate_memory_pool, &gpu_memory_pools);
  regions_t gpu_regions;
  memset(&gpu_regions, 0, sizeof(gpu_regions));
  err = hsa_agent_iterate_regions(agents.gpu_agent, iterate_regions,
                                  &gpu_regions);
  memory_pools_t cpu_memory_pools;
  memset(&cpu_memory_pools, 0, sizeof(cpu_memory_pools));
  err = hsa_amd_agent_iterate_memory_pools(
      agents.cpu_agent, iterate_memory_pool, &cpu_memory_pools);
  regions_t cpu_regions;
  memset(&cpu_regions, 0, sizeof(cpu_regions));
  err = hsa_agent_iterate_regions(agents.cpu_agent, iterate_regions,
                                  &cpu_regions);
  hsa_amd_memory_pool_t cpu_fine_pool = {0};
  hsa_amd_memory_pool_t cpu_coarse_pool = {0};
  hsa_amd_memory_pool_t kernarg_pool = {0};
  for (uint32_t i = 0; i < cpu_memory_pools.count; ++i) {
    hsa_amd_memory_pool_t pool = cpu_memory_pools.pools[i];
    hsa_region_segment_t segment;
    err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT,
                                       &segment);
    if (segment == HSA_REGION_SEGMENT_GLOBAL) {
      hsa_region_global_flag_t global_flag;
      err = hsa_amd_memory_pool_get_info(
          pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag);
      if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) {
        kernarg_pool = pool;
      } else if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
        cpu_coarse_pool = pool;
      } else if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
        cpu_fine_pool = pool;
      }
    }
  }

  uint32_t gpu_queue_min_size = 0;
  err = hsa_agent_get_info(agents.gpu_agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE,
                           &gpu_queue_min_size);
  uint32_t gpu_queue_max_size = 0;
  err = hsa_agent_get_info(agents.gpu_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE,
                           &gpu_queue_max_size);
  uint32_t gpu_queue_size = gpu_queue_max_size;
  hsa_queue_t* gpu_queue = NULL;
  err = hsa_queue_create(agents.gpu_agent, gpu_queue_size, HSA_QUEUE_TYPE_MULTI,
                         gpu_queue_callback,
                         /*callback_data=*/NULL,
                         /*private_segment_size=*/UINT32_MAX,
                         /*group_segment_size=*/UINT32_MAX, &gpu_queue);

  uint32_t gpu_node_id = 0;
  err = hsa_agent_get_info(agents.gpu_agent, HSA_AGENT_INFO_NODE, &gpu_node_id);
  int kmtfd = open("/dev/kfd", O_RDWR);
  struct kfd_ioctl_pc_sample_args args;
  memset(&args, 0, sizeof(args));
  args.op = KFD_IOCTL_PCS_OP_QUERY_CAPABILITIES;
  // cat /sys/devices/virtual/kfd/kfd/topology/nodes/1/gpu_id
  args.gpu_id = 56588;
  args.sample_info_ptr = 0;  // (uint64_t)sample_info;
  args.num_sample_info = 0;  // sample_info_sz;
  args.flags = 0;
  int ioctl_ret = 0;
  do {
    ioctl_ret = ioctl(kmtfd, AMDKFD_IOC_PC_SAMPLE, &args);
  } while (ioctl_ret == -1 && (errno == EINTR || errno == EAGAIN));
  fprintf(stderr, "ioctl %d\n", ioctl_ret);
  // uint32_t size = 0;
  // HSAKMT_STATUS ret =
  //     hsaKmtPcSamplingQueryCapabilities(gpu_node_id, NULL, 0, &size);
  // if (ret != HSAKMT_STATUS_SUCCESS || size == 0) {
  //   fprintf(stderr, "KMT FAIL\n");
  // }
  // HsaPcSamplingInfo* sample_info_list =
  //     alloca(size * sizeof(HsaPcSamplingInfo));
  // ret = hsaKmtPcSamplingQueryCapabilities(gpu_node_id, sample_info_list,
  //                                         size, &size);

  // iterate configs not found?
  // HSA_EXTENSION_AMD_PC_SAMPLING extension bit
  // err = hsa_ven_amd_pcs_iterate_configuration(agents.gpu_agent,
  //                                             pcs_config_callback, NULL);
  perf_sample_snapshot_v1_t sample_buffer[16 * 1024];
  hsa_ven_amd_pcs_t pc_sampling;
  err = hsa_ven_amd_pcs_create(
      agents.gpu_agent, HSA_VEN_AMD_PCS_METHOD_STOCHASTIC_V1,
      HSA_VEN_AMD_PCS_INTERVAL_UNITS_MICRO_SECONDS, 100, 0,
      sizeof(sample_buffer), pcs_data_ready, sample_buffer, &pc_sampling);

  hsa_file_t object_file =
      open("experimental/hsa_tests/kernels_cl.elf", O_RDONLY);
  // hsa_file_t object_file =
  //     open("experimental/hsa_tests/kernels_hip.elf", O_RDONLY);
  hsa_code_object_reader_t object_reader;
  err = hsa_code_object_reader_create_from_file(object_file, &object_reader);
  hsa_executable_t executable;
  err = hsa_executable_create_alt(HSA_PROFILE_FULL,
                                  HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL,
                                  &executable);
  err = hsa_executable_load_agent_code_object(executable, agents.gpu_agent,
                                              object_reader, NULL, NULL);
  err = hsa_executable_freeze(executable, NULL);

  struct kernel_info_t {
    uint64_t handle;
    uint32_t private_size;
    uint32_t group_size;
    uint32_t kernarg_alignment;
    uint32_t kernarg_size;
  } kernel_info;
  hsa_executable_symbol_t symbol;
  err = hsa_executable_get_symbol_by_name(executable, "add_one.kd",
                                          &agents.gpu_agent, &symbol);
  err = hsa_executable_symbol_get_info(
      symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_info.handle);
  err = hsa_executable_symbol_get_info(
      symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE,
      &kernel_info.private_size);
  err = hsa_executable_symbol_get_info(
      symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE,
      &kernel_info.group_size);
  err = hsa_executable_symbol_get_info(
      symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT,
      &kernel_info.kernarg_alignment);
  err = hsa_executable_symbol_get_info(
      symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
      &kernel_info.kernarg_size);

  err = hsa_ven_amd_pcs_start(pc_sampling);

  void* kernarg_storage = NULL;
  err = hsa_amd_memory_pool_allocate(kernarg_pool, kernel_info.kernarg_size,
                                     HSA_AMD_MEMORY_POOL_STANDARD_FLAG,
                                     &kernarg_storage);
  err =
      hsa_amd_agents_allow_access(1, &agents.gpu_agent, NULL, kernarg_storage);
  hsa_signal_t dispatch_signal;
  err = hsa_signal_create(1, 0, NULL, &dispatch_signal);

  typedef struct implicit_kernargs_t {
    uint32_t block_count[3];    // + 0/4/8
    uint16_t group_size[3];     // + 12/14/16
    uint16_t remainder[3];      // + 18/20/22
    uint64_t reserved0;         // + 24 hidden_tool_correlation_id
    uint64_t reserved1;         // + 32
    uint64_t global_offset[3];  // + 40/48/56
    uint16_t grid_dims;         // + 64
  } implicit_kernargs_t;
  uint32_t element_count = 65;
  uint32_t* buffer = NULL;
  err = hsa_amd_memory_pool_allocate(
      cpu_coarse_pool, element_count * sizeof(uint32_t),
      HSA_AMD_MEMORY_POOL_STANDARD_FLAG, (void**)&buffer);
  for (uint32_t i = 0; i < element_count; ++i) {
    buffer[i] = i;
  }
  err = hsa_amd_agents_allow_access(1, &agents.gpu_agent, NULL, buffer);
  uint32_t grid_size[3] = {element_count, 1, 1};
  uint16_t workgroup_size[3] = {32, 1, 1};
  typedef struct add_one_args_t {
    uint32_t n;
    void* buffer;
  } add_one_args_t;
  add_one_args_t* explicit_kernargs = (add_one_args_t*)kernarg_storage;
  explicit_kernargs->n = element_count;
  explicit_kernargs->buffer = buffer;
  implicit_kernargs_t* implicit_kernargs =
      (implicit_kernargs_t*)((uint8_t*)kernarg_storage +
                             iree_host_align(sizeof(add_one_args_t), 8));
  implicit_kernargs->block_count[0] = grid_size[0] / workgroup_size[0];
  implicit_kernargs->block_count[1] = grid_size[1] / workgroup_size[1];
  implicit_kernargs->block_count[2] = grid_size[2] / workgroup_size[2];
  implicit_kernargs->group_size[0] = workgroup_size[0];
  implicit_kernargs->group_size[1] = workgroup_size[1];
  implicit_kernargs->group_size[2] = workgroup_size[2];
  implicit_kernargs->remainder[0] = (uint16_t)(grid_size[0] % workgroup_size[0]);
  implicit_kernargs->remainder[1] = (uint16_t)(grid_size[1] % workgroup_size[1]);
  implicit_kernargs->remainder[2] = (uint16_t)(grid_size[2] % workgroup_size[2]);
  implicit_kernargs->reserved0 = 0;
  implicit_kernargs->reserved1 = 0;
  implicit_kernargs->global_offset[0] = 0;  // newOffset[0];
  implicit_kernargs->global_offset[1] = 0;  // newOffset[1];
  implicit_kernargs->global_offset[2] = 0;  // newOffset[2];
  implicit_kernargs->grid_dims = 3;
  // DO NOT SUBMIT should do nontemporal kernarg update?

  hsa_kernel_dispatch_packet_t packet;
  packet.header = HSA_PACKET_TYPE_INVALID;
  packet.setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS;
  packet.workgroup_size_x = workgroup_size[0];
  packet.workgroup_size_y = workgroup_size[1];
  packet.workgroup_size_z = workgroup_size[2];
  packet.reserved0 = 0;
  packet.grid_size_x = grid_size[0];
  packet.grid_size_y = grid_size[1];
  packet.grid_size_z = grid_size[2];
  packet.private_segment_size = kernel_info.private_size;
  packet.group_segment_size = kernel_info.group_size;
  packet.kernel_object = kernel_info.handle;
  packet.kernarg_address = kernarg_storage;
  packet.reserved2 = 0;
  packet.completion_signal = dispatch_signal;
  uint16_t packet_header =
      (HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) |
      (1 << HSA_PACKET_HEADER_BARRIER) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) |
      (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
  uint32_t packet_header_setup = packet_header | (packet.setup << 16);
  uint64_t packet_id = hsa_queue_add_write_index_screlease(gpu_queue, 1);
  while ((packet_id - hsa_queue_load_read_index_acquire(gpu_queue)) >=
         gpu_queue->size) {
    sleep(0);
  }
  hsa_kernel_dispatch_packet_t* packet_ptr =
      (hsa_kernel_dispatch_packet_t*)((uint8_t*)gpu_queue->base_address +
                                      (packet_id & (gpu_queue->size - 1)) * 64);
  nontemporalMemcpy(packet_ptr, &packet, sizeof(packet));
  atomic_store_explicit((volatile atomic_uint*)packet_ptr, packet_header_setup,
                        memory_order_release);
  // value ignored in MULTI cases
  hsa_signal_store_relaxed(gpu_queue->doorbell_signal, packet_id);
  hsa_signal_value_t wait_value =
      hsa_signal_wait_scacquire(dispatch_signal, HSA_SIGNAL_CONDITION_EQ, 0,
                                UINT64_MAX, HSA_WAIT_STATE_BLOCKED);
  if (wait_value != 0) {
    fprintf(stderr, "wait failed\n");
  }

  err = hsa_ven_amd_pcs_stop(pc_sampling);
  err = hsa_ven_amd_pcs_flush(pc_sampling);

  for (uint32_t i = 0; i < element_count; ++i) {
    fprintf(stderr, "%u ", buffer[i]);
  }
  fprintf(stderr, "\n");

  err = hsa_amd_memory_pool_free(buffer);
  err = hsa_signal_destroy(dispatch_signal);
  err = hsa_amd_memory_pool_free(kernarg_storage);
  err = hsa_ven_amd_pcs_destroy(pc_sampling);
  err = hsa_queue_destroy(gpu_queue);
  err = hsa_executable_destroy(executable);
  err = hsa_code_object_reader_destroy(object_reader);
  err = hsa_shut_down();
  return 0;
}
// multi-device from host
#include <fcntl.h>  // open
#include <memory.h>
#include <stdatomic.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#include "third_party/hsa-runtime-headers/include/hsa/hsa.h"
#include "third_party/hsa-runtime-headers/include/hsa/hsa_ext_amd.h"

static inline size_t iree_host_align(size_t value, size_t alignment) {
  return (value + (alignment - 1)) & ~(alignment - 1);
}

__attribute__((always_inline)) static inline void nontemporalMemcpy(
    void* __restrict dst, const void* __restrict src, size_t size) {
  memcpy(dst, src, size);
}

typedef struct {
  uint32_t agent_count;
  hsa_agent_t all_agents[32];
  hsa_agent_t cpu_agent;
  hsa_agent_t gpu_agents[2];
} agents_t;

static hsa_status_t iterate_agent(hsa_agent_t agent, void* user_data) {
  agents_t* agents = (agents_t*)user_data;
  agents->all_agents[agents->agent_count++] = agent;
  return HSA_STATUS_SUCCESS;
}

typedef struct {
  uint32_t count;
  hsa_amd_memory_pool_t pools[32];
} memory_pools_t;

static hsa_status_t iterate_memory_pool(hsa_amd_memory_pool_t memory_pool,
                                        void* user_data) {
  memory_pools_t* memory_pools = (memory_pools_t*)user_data;
  memory_pools->pools[memory_pools->count++] = memory_pool;
  return HSA_STATUS_SUCCESS;
}

typedef struct {
  uint32_t count;
  hsa_region_t regions[32];
} regions_t;

static hsa_status_t iterate_regions(hsa_region_t region, void* user_data) {
  regions_t* regions = (regions_t*)user_data;
  regions->regions[regions->count++] = region;
  return HSA_STATUS_SUCCESS;
}

static void gpu_queue_callback(hsa_status_t status, hsa_queue_t* queue,
                               void* user_data) {
  const char* status_str = NULL;
  hsa_status_string(status, &status_str);
  fprintf(stderr, "gpu_queue_callback %s\n", status_str);
}
int main(int argc, char** argv) { | |
hsa_status_t err; | |
err = hsa_init(); | |
agents_t agents; | |
memset(&agents, 0, sizeof(agents)); | |
err = hsa_iterate_agents(iterate_agent, &agents); | |
uint32_t gpu_count = 0; | |
for (uint32_t i = 0; i < agents.agent_count; ++i) { | |
hsa_device_type_t device_type = 0; | |
err = hsa_agent_get_info(agents.all_agents[i], HSA_AGENT_INFO_DEVICE, | |
&device_type); | |
if (device_type == HSA_DEVICE_TYPE_GPU) { | |
agents.gpu_agents[gpu_count++] = agents.all_agents[i]; | |
err = hsa_agent_get_info(agents.all_agents[i], | |
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_NEAREST_CPU, | |
&agents.cpu_agent); | |
} | |
} | |
memory_pools_t gpu_memory_pools[2]; | |
memset(&gpu_memory_pools, 0, sizeof(gpu_memory_pools)); | |
err = hsa_amd_agent_iterate_memory_pools( | |
agents.gpu_agents[0], iterate_memory_pool, &gpu_memory_pools[0]); | |
err = hsa_amd_agent_iterate_memory_pools( | |
agents.gpu_agents[1], iterate_memory_pool, &gpu_memory_pools[1]); | |
memory_pools_t cpu_memory_pools; | |
memset(&cpu_memory_pools, 0, sizeof(cpu_memory_pools)); | |
err = hsa_amd_agent_iterate_memory_pools( | |
agents.cpu_agent, iterate_memory_pool, &cpu_memory_pools); | |
regions_t cpu_regions; | |
memset(&cpu_regions, 0, sizeof(cpu_regions)); | |
err = hsa_agent_iterate_regions(agents.cpu_agent, iterate_regions, | |
&cpu_regions); | |
hsa_amd_memory_pool_t cpu_fine_pool = {0}; | |
hsa_amd_memory_pool_t cpu_coarse_pool = {0}; | |
hsa_amd_memory_pool_t kernarg_pool = {0}; | |
for (uint32_t i = 0; i < cpu_memory_pools.count; ++i) { | |
hsa_amd_memory_pool_t pool = cpu_memory_pools.pools[i]; | |
hsa_region_segment_t segment; | |
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, | |
&segment); | |
if (segment == HSA_REGION_SEGMENT_GLOBAL) { | |
hsa_region_global_flag_t global_flag; | |
err = hsa_amd_memory_pool_get_info( | |
pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag); | |
if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) { | |
kernarg_pool = pool; | |
} else if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) { | |
cpu_coarse_pool = pool; | |
} else if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) { | |
cpu_fine_pool = pool; | |
} | |
} | |
} | |
// assumes same params | |
uint32_t gpu_queue_min_size = 0; | |
err = hsa_agent_get_info(agents.gpu_agents[0], HSA_AGENT_INFO_QUEUE_MIN_SIZE, | |
&gpu_queue_min_size); | |
uint32_t gpu_queue_max_size = 0; | |
err = hsa_agent_get_info(agents.gpu_agents[0], HSA_AGENT_INFO_QUEUE_MAX_SIZE, | |
&gpu_queue_max_size); | |
uint32_t gpu_queue_size = gpu_queue_max_size; | |
hsa_queue_t* gpu_queues[2] = {NULL, NULL}; | |
err = hsa_queue_create(agents.gpu_agents[0], gpu_queue_size, | |
HSA_QUEUE_TYPE_MULTI, gpu_queue_callback, | |
/*callback_data=*/NULL, | |
/*private_segment_size=*/UINT32_MAX, | |
/*group_segment_size=*/UINT32_MAX, &gpu_queues[0]); | |
err = hsa_queue_create(agents.gpu_agents[1], gpu_queue_size, | |
HSA_QUEUE_TYPE_MULTI, gpu_queue_callback, | |
/*callback_data=*/NULL, | |
/*private_segment_size=*/UINT32_MAX, | |
/*group_segment_size=*/UINT32_MAX, &gpu_queues[1]); | |
// | |
hsa_file_t object_file = | |
open("experimental/hsa_tests/kernels_cl.elf", O_RDONLY); | |
// hsa_file_t object_file = | |
// open("experimental/hsa_tests/kernels_hip.elf", O_RDONLY); | |
hsa_code_object_reader_t object_reader; | |
err = hsa_code_object_reader_create_from_file(object_file, &object_reader); | |
hsa_executable_t executable; | |
err = hsa_executable_create_alt(HSA_PROFILE_FULL, | |
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, | |
&executable); | |
err = hsa_executable_load_agent_code_object(executable, agents.gpu_agents[0], | |
object_reader, NULL, NULL); | |
err = hsa_executable_load_agent_code_object(executable, agents.gpu_agents[1], | |
object_reader, NULL, NULL); | |
err = hsa_executable_freeze(executable, NULL); | |
struct kernel_info_t { | |
uint64_t handle; | |
uint32_t private_size; | |
uint32_t group_size; | |
uint32_t kernarg_alignment; | |
uint32_t kernarg_size; | |
} kernel_info[2]; | |
hsa_executable_symbol_t symbol; | |
err = hsa_executable_get_symbol_by_name(executable, "add_one.kd", | |
&agents.gpu_agents[0], &symbol); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_info[0].handle); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, | |
&kernel_info[0].private_size); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, | |
&kernel_info[0].group_size); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT, | |
&kernel_info[0].kernarg_alignment); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, | |
&kernel_info[0].kernarg_size); | |
err = hsa_executable_get_symbol_by_name(executable, "mul_x.kd", | |
&agents.gpu_agents[1], &symbol); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_info[1].handle); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, | |
&kernel_info[1].private_size); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, | |
&kernel_info[1].group_size); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT, | |
&kernel_info[1].kernarg_alignment); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, | |
&kernel_info[1].kernarg_size); | |
void* kernarg_storages[2] = {NULL, NULL}; | |
err = hsa_amd_memory_pool_allocate(kernarg_pool, kernel_info[0].kernarg_size, | |
HSA_AMD_MEMORY_POOL_STANDARD_FLAG, | |
&kernarg_storages[0]); | |
err = hsa_amd_agents_allow_access(1, &agents.gpu_agents[0], NULL, | |
kernarg_storages[0]); | |
err = hsa_amd_memory_pool_allocate(kernarg_pool, kernel_info[1].kernarg_size, | |
HSA_AMD_MEMORY_POOL_STANDARD_FLAG, | |
&kernarg_storages[1]); | |
err = hsa_amd_agents_allow_access(1, &agents.gpu_agents[1], NULL, | |
kernarg_storages[1]); | |
hsa_signal_t dispatch_signals[2]; | |
  // device->device only: AMD_GPU_ONLY signals skip the interrupt-backed | |
  // event machinery, so host-side waits on them would have to spin | |
// err = hsa_signal_create(1, 0, NULL, &dispatch_signals[0]); | |
err = hsa_amd_signal_create(1, 0, NULL, HSA_AMD_SIGNAL_AMD_GPU_ONLY, &dispatch_signals[0]); | |
err = hsa_signal_create(1, 0, NULL, &dispatch_signals[1]); | |
typedef struct implicit_kernargs_t { | |
uint32_t block_count[3]; // + 0/4/8 | |
uint16_t group_size[3]; // + 12/14/16 | |
uint16_t remainder[3]; // + 18/20/22 | |
uint64_t reserved0; // + 24 hidden_tool_correlation_id | |
uint64_t reserved1; // + 32 | |
uint64_t global_offset[3]; // + 40/48/56 | |
uint16_t grid_dims; // + 64 | |
} implicit_kernargs_t; | |
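  // layout mirrors the code object v5 hidden kernargs (hidden_block_count_*, | |
  // hidden_group_size_*, hidden_remainder_*, hidden_global_offset_*, | |
  // hidden_grid_dims) that clang appends after the explicit args | |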
uint32_t element_count = 65; | |
uint32_t* buffer = NULL; | |
err = hsa_amd_memory_pool_allocate( | |
cpu_coarse_pool, element_count * sizeof(uint32_t), | |
HSA_AMD_MEMORY_POOL_STANDARD_FLAG, (void**)&buffer); | |
for (uint32_t i = 0; i < element_count; ++i) { | |
buffer[i] = i; | |
} | |
err = hsa_amd_agents_allow_access(2, agents.gpu_agents, NULL, buffer); | |
uint32_t grid_size[3] = {element_count, 1, 1}; | |
uint16_t workgroup_size[3] = {32, 1, 1}; | |
  // gpu1: barrier (wait) -> dispatch1 | |
  // gpu0: dispatch0 (signal) | |
// enqueue gpu1 before gpu0 to ensure wait happens | |
{ | |
hsa_barrier_and_packet_t barrier; | |
memset(&barrier, 0, sizeof(barrier)); | |
barrier.header = HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE; | |
barrier.dep_signal[0] = dispatch_signals[0]; | |
    // note: the high uint16_t of this 32-bit header store is the packet's | |
    // reserved0 field (left zero) | |
uint32_t barrier_header = | |
(HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | | |
(1 << HSA_PACKET_HEADER_BARRIER) | | |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | | |
(HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); | |
uint64_t barrier_id = hsa_queue_add_write_index_screlease(gpu_queues[1], 1); | |
while ((barrier_id - hsa_queue_load_read_index_acquire(gpu_queues[1])) >= | |
gpu_queues[1]->size) { | |
sleep(0); | |
} | |
hsa_barrier_and_packet_t* barrier_ptr = | |
(hsa_barrier_and_packet_t*)((uint8_t*)gpu_queues[1]->base_address + | |
(barrier_id & (gpu_queues[1]->size - 1)) * | |
64); | |
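    // publish protocol: write the packet body while the header is still | |
    // INVALID, then atomically store the first 32 bits (header + reserved0) | |
    // with release semantics so the packet processor never sees a partial | |
    // packet | |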
nontemporalMemcpy(barrier_ptr, &barrier, sizeof(barrier)); | |
atomic_store_explicit((volatile atomic_uint*)barrier_ptr, barrier_header, | |
memory_order_release); | |
hsa_signal_store_relaxed(gpu_queues[1]->doorbell_signal, barrier_id); | |
} | |
typedef struct mul_x_args_t { | |
uint32_t x; | |
uint32_t n; | |
void* buffer; | |
} mul_x_args_t; | |
mul_x_args_t* mul_x_kernargs = (mul_x_args_t*)kernarg_storages[1]; | |
mul_x_kernargs->x = 2; | |
mul_x_kernargs->n = element_count; | |
mul_x_kernargs->buffer = buffer; | |
implicit_kernargs_t* implicit1_kernargs = | |
(implicit_kernargs_t*)((uint8_t*)kernarg_storages[1] + | |
iree_host_align(sizeof(mul_x_args_t), 8)); | |
implicit1_kernargs->block_count[0] = grid_size[0] / workgroup_size[0]; | |
implicit1_kernargs->block_count[1] = grid_size[1] / workgroup_size[1]; | |
implicit1_kernargs->block_count[2] = grid_size[2] / workgroup_size[2]; | |
implicit1_kernargs->group_size[0] = workgroup_size[0]; | |
implicit1_kernargs->group_size[1] = workgroup_size[1]; | |
implicit1_kernargs->group_size[2] = workgroup_size[2]; | |
implicit1_kernargs->remainder[0] = | |
(uint16_t)(grid_size[0] % workgroup_size[0]); | |
implicit1_kernargs->remainder[1] = | |
(uint16_t)(grid_size[1] % workgroup_size[1]); | |
implicit1_kernargs->remainder[2] = | |
(uint16_t)(grid_size[2] % workgroup_size[2]); | |
implicit1_kernargs->reserved0 = 0; | |
implicit1_kernargs->reserved1 = 0; | |
implicit1_kernargs->global_offset[0] = 0; // newOffset[0]; | |
implicit1_kernargs->global_offset[1] = 0; // newOffset[1]; | |
implicit1_kernargs->global_offset[2] = 0; // newOffset[2]; | |
implicit1_kernargs->grid_dims = 3; | |
// DO NOT SUBMIT should do nontemporal kernarg update? | |
hsa_kernel_dispatch_packet_t packet1; | |
packet1.header = HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE; | |
packet1.setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; | |
packet1.workgroup_size_x = workgroup_size[0]; | |
packet1.workgroup_size_y = workgroup_size[1]; | |
packet1.workgroup_size_z = workgroup_size[2]; | |
packet1.reserved0 = 0; | |
packet1.grid_size_x = grid_size[0]; | |
packet1.grid_size_y = grid_size[1]; | |
packet1.grid_size_z = grid_size[2]; | |
packet1.private_segment_size = kernel_info[1].private_size; | |
packet1.group_segment_size = kernel_info[1].group_size; | |
packet1.kernel_object = kernel_info[1].handle; | |
packet1.kernarg_address = kernarg_storages[1]; | |
packet1.reserved2 = 0; | |
packet1.completion_signal = dispatch_signals[1]; | |
uint16_t packet1_header = | |
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | | |
(0 << HSA_PACKET_HEADER_BARRIER) | | |
(HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | | |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); | |
uint32_t packet1_header_setup = packet1_header | (packet1.setup << 16); | |
uint64_t packet1_id = hsa_queue_add_write_index_screlease(gpu_queues[1], 1); | |
while ((packet1_id - hsa_queue_load_read_index_acquire(gpu_queues[1])) >= | |
gpu_queues[1]->size) { | |
sleep(0); | |
} | |
hsa_kernel_dispatch_packet_t* packet1_ptr = | |
(hsa_kernel_dispatch_packet_t*)((uint8_t*)gpu_queues[1]->base_address + | |
(packet1_id & (gpu_queues[1]->size - 1)) * | |
64); | |
nontemporalMemcpy(packet1_ptr, &packet1, sizeof(packet1)); | |
atomic_store_explicit((volatile atomic_uint*)packet1_ptr, | |
packet1_header_setup, memory_order_release); | |
// value ignored in MULTI cases | |
hsa_signal_store_relaxed(gpu_queues[1]->doorbell_signal, packet1_id); | |
// "ensure" gpu1 is waiting | |
sleep(1); | |
typedef struct add_one_args_t { | |
uint32_t n; | |
void* buffer; | |
} add_one_args_t; | |
add_one_args_t* add_one_kernargs = (add_one_args_t*)kernarg_storages[0]; | |
add_one_kernargs->n = element_count; | |
add_one_kernargs->buffer = buffer; | |
implicit_kernargs_t* implicit_kernargs = | |
(implicit_kernargs_t*)((uint8_t*)kernarg_storages[0] + | |
iree_host_align(sizeof(add_one_args_t), 8)); | |
implicit_kernargs->block_count[0] = grid_size[0] / workgroup_size[0]; | |
implicit_kernargs->block_count[1] = grid_size[1] / workgroup_size[1]; | |
implicit_kernargs->block_count[2] = grid_size[2] / workgroup_size[2]; | |
implicit_kernargs->group_size[0] = workgroup_size[0]; | |
implicit_kernargs->group_size[1] = workgroup_size[1]; | |
implicit_kernargs->group_size[2] = workgroup_size[2]; | |
implicit_kernargs->remainder[0] = | |
(uint16_t)(grid_size[0] % workgroup_size[0]); | |
implicit_kernargs->remainder[1] = | |
(uint16_t)(grid_size[1] % workgroup_size[1]); | |
implicit_kernargs->remainder[2] = | |
(uint16_t)(grid_size[2] % workgroup_size[2]); | |
implicit_kernargs->reserved0 = 0; | |
implicit_kernargs->reserved1 = 0; | |
implicit_kernargs->global_offset[0] = 0; // newOffset[0]; | |
implicit_kernargs->global_offset[1] = 0; // newOffset[1]; | |
implicit_kernargs->global_offset[2] = 0; // newOffset[2]; | |
implicit_kernargs->grid_dims = 3; | |
// DO NOT SUBMIT should do nontemporal kernarg update? | |
hsa_kernel_dispatch_packet_t packet0; | |
packet0.header = HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE; | |
packet0.setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; | |
packet0.workgroup_size_x = workgroup_size[0]; | |
packet0.workgroup_size_y = workgroup_size[1]; | |
packet0.workgroup_size_z = workgroup_size[2]; | |
packet0.reserved0 = 0; | |
packet0.grid_size_x = grid_size[0]; | |
packet0.grid_size_y = grid_size[1]; | |
packet0.grid_size_z = grid_size[2]; | |
packet0.private_segment_size = kernel_info[0].private_size; | |
packet0.group_segment_size = kernel_info[0].group_size; | |
packet0.kernel_object = kernel_info[0].handle; | |
packet0.kernarg_address = kernarg_storages[0]; | |
packet0.reserved2 = 0; | |
packet0.completion_signal = dispatch_signals[0]; | |
uint16_t packet0_header = | |
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | | |
(1 << HSA_PACKET_HEADER_BARRIER) | | |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | | |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); | |
uint32_t packet0_header_setup = packet0_header | (packet0.setup << 16); | |
uint64_t packet0_id = hsa_queue_add_write_index_screlease(gpu_queues[0], 1); | |
while ((packet0_id - hsa_queue_load_read_index_acquire(gpu_queues[0])) >= | |
gpu_queues[0]->size) { | |
sleep(0); | |
} | |
hsa_kernel_dispatch_packet_t* packet0_ptr = | |
(hsa_kernel_dispatch_packet_t*)((uint8_t*)gpu_queues[0]->base_address + | |
(packet0_id & (gpu_queues[0]->size - 1)) * | |
64); | |
nontemporalMemcpy(packet0_ptr, &packet0, sizeof(packet0)); | |
atomic_store_explicit((volatile atomic_uint*)packet0_ptr, | |
packet0_header_setup, memory_order_release); | |
// value ignored in MULTI cases | |
hsa_signal_store_relaxed(gpu_queues[0]->doorbell_signal, packet0_id); | |
hsa_signal_value_t wait_value = | |
hsa_signal_wait_scacquire(dispatch_signals[1], HSA_SIGNAL_CONDITION_EQ, 0, | |
UINT64_MAX, HSA_WAIT_STATE_BLOCKED); | |
if (wait_value != 0) { | |
fprintf(stderr, "wait failed\n"); | |
} | |
for (uint32_t i = 0; i < element_count; ++i) { | |
fprintf(stderr, "%u ", buffer[i]); | |
} | |
fprintf(stderr, "\n"); | |
err = hsa_amd_memory_pool_free(buffer); | |
err = hsa_signal_destroy(dispatch_signals[0]); | |
err = hsa_signal_destroy(dispatch_signals[1]); | |
err = hsa_amd_memory_pool_free(kernarg_storages[0]); | |
err = hsa_amd_memory_pool_free(kernarg_storages[1]); | |
err = hsa_queue_destroy(gpu_queues[0]); | |
err = hsa_queue_destroy(gpu_queues[1]); | |
err = hsa_executable_destroy(executable); | |
err = hsa_code_object_reader_destroy(object_reader); | |
err = hsa_shut_down(); | |
return 0; | |
} |
// device to host call soft queue | |
// cpu_set | |
#define _GNU_SOURCE | |
#include <sched.h> | |
// | |
#include <errno.h> | |
#include <fcntl.h> // open | |
#include <inttypes.h> | |
#include <memory.h> | |
#include <numaif.h> // mbind | |
#include <pthread.h> | |
#include <stdatomic.h> | |
#include <stdbool.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <unistd.h> | |
#ifndef MPOL_F_STATIC_NODES | |
/* Bug in numaif.h, this should be defined in there. Definition copied | |
* from linux/mempolicy.h. | |
*/ | |
#define MPOL_F_STATIC_NODES (1 << 15) | |
#endif | |
#include "third_party/hsa-runtime-headers/include/hsa/hsa.h" | |
#include "third_party/hsa-runtime-headers/include/hsa/hsa_ext_amd.h" | |
#include "third_party/hsa-runtime-headers/include/hsa/hsa_ven_amd_loader.h" | |
static inline size_t iree_host_align(size_t value, size_t alignment) { | |
return (value + (alignment - 1)) & ~(alignment - 1); | |
} | |
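// stand-in only: a real nontemporal copy would use streaming stores so | |
// write-once packet data does not pollute the cache; memcpy suffices here | |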
__attribute__((always_inline)) static inline void nontemporalMemcpy( | |
void* __restrict dst, const void* __restrict src, size_t size) { | |
memcpy(dst, src, size); | |
} | |
typedef struct { | |
uint32_t agent_count; | |
hsa_agent_t all_agents[32]; | |
hsa_agent_t cpu_agent; | |
hsa_agent_t gpu_agents[2]; | |
} agents_t; | |
static hsa_status_t iterate_agent(hsa_agent_t agent, void* user_data) { | |
agents_t* agents = (agents_t*)user_data; | |
agents->all_agents[agents->agent_count++] = agent; | |
return HSA_STATUS_SUCCESS; | |
} | |
typedef struct { | |
uint32_t count; | |
hsa_amd_memory_pool_t pools[32]; | |
} memory_pools_t; | |
static hsa_status_t iterate_memory_pool(hsa_amd_memory_pool_t memory_pool, | |
void* user_data) { | |
memory_pools_t* memory_pools = (memory_pools_t*)user_data; | |
memory_pools->pools[memory_pools->count++] = memory_pool; | |
return HSA_STATUS_SUCCESS; | |
} | |
typedef struct { | |
uint32_t count; | |
hsa_region_t regions[32]; | |
} regions_t; | |
static hsa_status_t iterate_regions(hsa_region_t region, void* user_data) { | |
regions_t* regions = (regions_t*)user_data; | |
regions->regions[regions->count++] = region; | |
return HSA_STATUS_SUCCESS; | |
} | |
static void gpu_queue_callback(hsa_status_t status, hsa_queue_t* queue, | |
void* user_data) { | |
const char* status_str = NULL; | |
hsa_status_string(status, &status_str); | |
fprintf(stderr, "gpu_queue_callback %s", status_str); | |
} | |
static bool parse_cpumap_bitmask(char* line, cpu_set_t* cpu_set) { | |
  // sysfs cpumaps print comma-separated 32-bit groups most-significant | |
  // first, so count the groups to give each one its correct base offset | |
  size_t group_count = 1; | |
  for (char* c = line; *c && *c != '\n'; ++c) { | |
    if (*c == ',') ++group_count; | |
  } | |
  char* str = line; | |
  size_t base = (group_count - 1) * 32; | |
  while (strlen(str) > 1 && str[0] != '\n') { | |
    unsigned long mask = strtoul(str, &str, 16); | |
    if (str[0] == ',') ++str; | |
    for (size_t i = 0; i < 32; ++i) { | |
      if (mask & (1ul << i)) { | |
        CPU_SET(base + i, cpu_set); | |
      } | |
    } | |
    base -= 32; | |
  } | |
  return true; | |
} | |
static void set_affinity(uint32_t cpu_node_id) { | |
// numa_node_to_cpus | |
// https://sourcegraph.com/github.com/stranded-fish/XtraBackup-Source-Code-Analysis/-/blob/storage/ndb/src/common/portlib/NdbNuma.cpp?L132-134 | |
// libnuma.so (.1) | |
// https://sourcegraph.com/github.com/numactl/numactl/-/blob/libnuma.c?L1471 | |
// /sys/devices/system/node/node%d/cpumap (node_id) | |
// nod@Shark30:~/src/ROCR-Runtime/build$ cat | |
// /sys/devices/system/node/node0/cpumap | |
// ffffffff,ffffffff,ffffffff,ffffffff,ffffffff,ffffffff | |
// numa_parse_bitmap_v2 | |
// https://sourcegraph.com/github.com/numactl/numactl/-/blob/libnuma.c?L1324:1-1324:21 | |
cpu_set_t cpu_set; | |
CPU_ZERO(&cpu_set); | |
char cpumap_name[256]; | |
snprintf(cpumap_name, sizeof(cpumap_name), | |
"/sys/devices/system/node/node%u/cpumap", cpu_node_id); | |
FILE* f = fopen(cpumap_name, "r"); | |
size_t len = 0; | |
char* line = NULL; | |
bool did_read = f && getdelim(&line, &len, '\n', f) >= 1; | |
bool did_parse = false; | |
if (did_read) { | |
did_parse = parse_cpumap_bitmask(line, &cpu_set); | |
} | |
if (line) free(line); | |
  if (f) fclose(f);  // fclose(NULL) is undefined behavior | |
if (did_parse) { | |
int ret = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set); | |
fprintf(stderr, "pinned affinity: %d\n", ret); | |
} | |
} | |
static bool is_cond_met(hsa_signal_condition32_t cond, | |
hsa_signal_value_t desired_value, | |
hsa_signal_value_t current_value) { | |
switch (cond) { | |
default: | |
case HSA_SIGNAL_CONDITION_EQ: | |
return current_value == desired_value; | |
case HSA_SIGNAL_CONDITION_NE: | |
return current_value != desired_value; | |
case HSA_SIGNAL_CONDITION_LT: | |
return current_value < desired_value; | |
case HSA_SIGNAL_CONDITION_GTE: | |
return current_value >= desired_value; | |
} | |
} | |
typedef struct { | |
uint32_t cpu_node_id; | |
pthread_t handle; | |
hsa_queue_t* queue; | |
hsa_signal_t doorbell; | |
} cpu_thread_t; | |
static void* cpu_thread_main(void* user_data) { | |
cpu_thread_t* cpu_thread = (cpu_thread_t*)user_data; | |
set_affinity(cpu_thread->cpu_node_id); | |
// loop over doorbell | |
uint64_t queue_mask = cpu_thread->queue->size - 1; | |
uint64_t last_packet_id = 0; | |
uint64_t read_index = 0; | |
while (true) { | |
uint64_t new_packet_id = (uint64_t)hsa_signal_wait_scacquire( | |
cpu_thread->doorbell, HSA_SIGNAL_CONDITION_NE, last_packet_id, | |
UINT64_MAX, HSA_WAIT_STATE_BLOCKED); | |
fprintf(stderr, "CPU INCOMING WAKE %lu\n", new_packet_id); | |
last_packet_id = new_packet_id; | |
if (new_packet_id == UINT64_MAX) break; | |
while (read_index != | |
hsa_queue_load_write_index_scacquire(cpu_thread->queue)) { | |
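      // pointer arithmetic below is in 64-byte AQL packet units; the ring | |
      // wraps via the power-of-two queue_mask | |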
hsa_agent_dispatch_packet_t* packet_ptr = | |
(hsa_agent_dispatch_packet_t*)cpu_thread->queue->base_address + | |
(read_index & queue_mask); | |
// wait until packet populated | |
uint32_t packet_type = HSA_PACKET_TYPE_INVALID; | |
do { | |
// NOTE: we assume this is waiting for at most a few cycles and spin. | |
uint32_t packet_header = atomic_load_explicit( | |
(volatile atomic_uint*)packet_ptr, memory_order_acquire); | |
packet_type = (packet_header >> HSA_PACKET_HEADER_TYPE) & | |
((1 << HSA_PACKET_HEADER_WIDTH_TYPE) - 1); | |
} while (packet_type == HSA_PACKET_TYPE_INVALID); | |
fprintf(stderr, "agent packet %lu read\n", read_index); | |
// copy packet locally | |
union { | |
uint8_t data[64]; | |
hsa_barrier_and_packet_t barrier_and; | |
hsa_barrier_or_packet_t barrier_or; | |
hsa_amd_barrier_value_packet_t barrier_value; | |
hsa_agent_dispatch_packet_t agent_dispatch; | |
} packet; | |
memcpy(packet.data, packet_ptr, sizeof(packet.data)); | |
// swap packet back to invalid so that it can be reused immediately | |
atomic_store_explicit((volatile atomic_uint*)packet_ptr, | |
(HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE), | |
memory_order_relaxed); | |
hsa_queue_store_read_index_screlease(cpu_thread->queue, ++read_index); | |
switch (packet_type) { | |
case HSA_PACKET_TYPE_BARRIER_AND: { | |
// TODO: propagate failures according to spec? | |
// https://github.com/ROCm/ROCR-Runtime/issues/241 | |
// need hsa_amd_signal_wait_all | |
for (uint32_t i = 0; i < 5; ++i) { | |
if (packet.barrier_and.dep_signal[i].handle != 0) { | |
// may have to handle spurious wakes? | |
while (hsa_signal_wait_scacquire(packet.barrier_and.dep_signal[i], | |
HSA_SIGNAL_CONDITION_EQ, 0u, | |
UINT64_MAX, | |
HSA_WAIT_STATE_BLOCKED) != 0) { | |
// retry | |
} | |
} | |
} | |
if (packet.barrier_and.completion_signal.handle != 0) { | |
hsa_signal_subtract_screlease(packet.barrier_and.completion_signal, | |
1); | |
} | |
break; | |
} | |
case HSA_PACKET_TYPE_BARRIER_OR: { | |
// TODO: propagate failures according to spec? | |
// hsa_amd_signal_wait_any does not accept 0 signals, have to filter | |
hsa_signal_t signals[5]; | |
uint32_t signal_count = 0; | |
for (uint32_t i = 0; i < 5; ++i) { | |
if (packet.barrier_or.dep_signal[i].handle != 0) { | |
signals[signal_count++] = packet.barrier_or.dep_signal[i]; | |
} | |
} | |
hsa_signal_condition_t conds[5] = { | |
HSA_SIGNAL_CONDITION_EQ, HSA_SIGNAL_CONDITION_EQ, | |
HSA_SIGNAL_CONDITION_EQ, HSA_SIGNAL_CONDITION_EQ, | |
HSA_SIGNAL_CONDITION_EQ, | |
}; | |
hsa_signal_value_t values[5] = { | |
0, 0, 0, 0, 0, | |
}; | |
hsa_signal_value_t satisfying_value; | |
// relaxed semantics | |
// satisfying_index == UINT32_MAX if failed | |
uint32_t satisfying_index = hsa_amd_signal_wait_any( | |
signal_count, signals, conds, values, UINT64_MAX, | |
HSA_WAIT_STATE_BLOCKED, &satisfying_value); | |
// acquire on satisfying signal | |
if (satisfying_index != UINT32_MAX) { | |
hsa_signal_load_scacquire(signals[satisfying_index]); | |
} | |
if (packet.barrier_or.completion_signal.handle != 0) { | |
hsa_signal_subtract_screlease(packet.barrier_or.completion_signal, | |
1); | |
} | |
break; | |
} | |
case HSA_AMD_PACKET_TYPE_BARRIER_VALUE: { | |
// TODO: propagate failures according to spec? | |
if (packet.barrier_value.mask == UINT64_MAX) { | |
// default path for when we can use a normal wait | |
// may have to handle spurious wakes? | |
hsa_signal_wait_scacquire( | |
packet.barrier_value.signal, packet.barrier_value.cond, | |
packet.barrier_value.value, UINT64_MAX, HSA_WAIT_STATE_BLOCKED); | |
} else { | |
// mask needs to be in the loop | |
          uint64_t value = | |
              hsa_signal_load_scacquire(packet.barrier_value.signal); | |
          while (!is_cond_met(packet.barrier_value.cond, | |
                              packet.barrier_value.value, | |
                              value & packet.barrier_value.mask)) { | |
            // wait until the value changes, then re-test with mask applied | |
            value = (uint64_t)hsa_signal_wait_scacquire( | |
                packet.barrier_value.signal, HSA_SIGNAL_CONDITION_NE, value, | |
                UINT64_MAX, HSA_WAIT_STATE_BLOCKED); | |
          } | |
} | |
if (packet.barrier_value.completion_signal.handle != 0) { | |
hsa_signal_subtract_screlease( | |
packet.barrier_value.completion_signal, 1); | |
} | |
break; | |
} | |
case HSA_PACKET_TYPE_AGENT_DISPATCH: { | |
// TODO: propagate failures according to spec? | |
fprintf(stderr, | |
"agent dispatch %u: %p %" PRIx64 " (%" PRId64 ") %" PRIx64 | |
" (%" PRId64 ") %" PRIx64 " (%" PRId64 ") %" PRIx64 | |
" (%" PRId64 ")\n", | |
packet.agent_dispatch.type, | |
packet.agent_dispatch.return_address, | |
packet.agent_dispatch.arg[0], packet.agent_dispatch.arg[0], | |
packet.agent_dispatch.arg[1], packet.agent_dispatch.arg[1], | |
packet.agent_dispatch.arg[2], packet.agent_dispatch.arg[2], | |
packet.agent_dispatch.arg[3], packet.agent_dispatch.arg[3]); | |
if (packet.agent_dispatch.completion_signal.handle != 0) { | |
hsa_signal_subtract_screlease( | |
packet.agent_dispatch.completion_signal, 1); | |
} | |
break; | |
} | |
} | |
} | |
} | |
return NULL; | |
} | |
// AMD Signal Kind Enumeration Values. | |
enum iree_amd_signal_kind_t { | |
IREE_AMD_SIGNAL_KIND_INVALID = 0, | |
IREE_AMD_SIGNAL_KIND_USER = 1, | |
IREE_AMD_SIGNAL_KIND_DOORBELL = -1, | |
IREE_AMD_SIGNAL_KIND_LEGACY_DOORBELL = -2 | |
}; | |
typedef int64_t iree_amd_signal_kind64_t; | |
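// mirrors amd_signal_t from the AMD HSA runtime ABI (amd_hsa_signal.h): an | |
// hsa_signal_t handle is a pointer to one of these | |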
typedef struct iree_amd_signal_s { | |
iree_amd_signal_kind64_t kind; | |
union { | |
volatile int64_t value; | |
volatile uint32_t* legacy_hardware_doorbell_ptr; | |
volatile uint64_t* hardware_doorbell_ptr; | |
}; | |
uint64_t event_mailbox_ptr; | |
uint32_t event_id; | |
uint32_t reserved1; | |
uint64_t start_ts; | |
uint64_t end_ts; | |
union { | |
/*iree_amd_queue_t*/ void* queue_ptr; | |
uint64_t reserved2; | |
}; | |
uint32_t reserved3[2]; | |
} iree_amd_signal_t; | |
int main(int argc, char** argv) { | |
hsa_status_t err; | |
err = hsa_init(); | |
agents_t agents; | |
memset(&agents, 0, sizeof(agents)); | |
err = hsa_iterate_agents(iterate_agent, &agents); | |
uint32_t gpu_count = 0; | |
for (uint32_t i = 0; i < agents.agent_count; ++i) { | |
hsa_device_type_t device_type = 0; | |
err = hsa_agent_get_info(agents.all_agents[i], HSA_AGENT_INFO_DEVICE, | |
&device_type); | |
    if (device_type == HSA_DEVICE_TYPE_GPU && gpu_count < 2) { | |
agents.gpu_agents[gpu_count++] = agents.all_agents[i]; | |
err = hsa_agent_get_info(agents.all_agents[i], | |
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_NEAREST_CPU, | |
&agents.cpu_agent); | |
} | |
} | |
memory_pools_t gpu_memory_pools[2]; | |
memset(&gpu_memory_pools, 0, sizeof(gpu_memory_pools)); | |
err = hsa_amd_agent_iterate_memory_pools( | |
agents.gpu_agents[0], iterate_memory_pool, &gpu_memory_pools[0]); | |
err = hsa_amd_agent_iterate_memory_pools( | |
agents.gpu_agents[1], iterate_memory_pool, &gpu_memory_pools[1]); | |
memory_pools_t cpu_memory_pools; | |
memset(&cpu_memory_pools, 0, sizeof(cpu_memory_pools)); | |
err = hsa_amd_agent_iterate_memory_pools( | |
agents.cpu_agent, iterate_memory_pool, &cpu_memory_pools); | |
regions_t cpu_regions; | |
memset(&cpu_regions, 0, sizeof(cpu_regions)); | |
err = hsa_agent_iterate_regions(agents.cpu_agent, iterate_regions, | |
&cpu_regions); | |
  hsa_region_t cpu_fine_region = {0}; | |
for (uint32_t i = 0; i < cpu_regions.count; ++i) { | |
hsa_region_segment_t segment; | |
err = hsa_region_get_info(cpu_regions.regions[i], HSA_REGION_INFO_SEGMENT, | |
&segment); | |
if (segment == HSA_REGION_SEGMENT_GLOBAL) { | |
hsa_region_global_flag_t flags; | |
err = hsa_region_get_info(cpu_regions.regions[i], | |
HSA_REGION_INFO_GLOBAL_FLAGS, &flags); | |
if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) { | |
cpu_fine_region = cpu_regions.regions[i]; | |
} | |
} | |
} | |
hsa_amd_memory_pool_t cpu_fine_pool = {0}; | |
hsa_amd_memory_pool_t cpu_coarse_pool = {0}; | |
hsa_amd_memory_pool_t kernarg_pool = {0}; | |
for (uint32_t i = 0; i < cpu_memory_pools.count; ++i) { | |
hsa_amd_memory_pool_t pool = cpu_memory_pools.pools[i]; | |
    hsa_amd_segment_t segment; | |
    err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, | |
                                       &segment); | |
    if (segment == HSA_AMD_SEGMENT_GLOBAL) { | |
      hsa_amd_memory_pool_global_flag_t global_flag; | |
      err = hsa_amd_memory_pool_get_info( | |
          pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag); | |
if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) { | |
kernarg_pool = pool; | |
} else if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) { | |
cpu_coarse_pool = pool; | |
} else if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) { | |
cpu_fine_pool = pool; | |
} | |
} | |
} | |
  // assumes both GPUs report the same queue size limits | |
uint32_t gpu_queue_min_size = 0; | |
err = hsa_agent_get_info(agents.gpu_agents[0], HSA_AGENT_INFO_QUEUE_MIN_SIZE, | |
&gpu_queue_min_size); | |
uint32_t gpu_queue_max_size = 0; | |
err = hsa_agent_get_info(agents.gpu_agents[0], HSA_AGENT_INFO_QUEUE_MAX_SIZE, | |
&gpu_queue_max_size); | |
uint32_t gpu_queue_size = gpu_queue_max_size; | |
hsa_queue_t* gpu_queues[2] = {NULL, NULL}; | |
err = hsa_queue_create(agents.gpu_agents[0], gpu_queue_size, | |
HSA_QUEUE_TYPE_MULTI, gpu_queue_callback, | |
/*callback_data=*/NULL, | |
/*private_segment_size=*/UINT32_MAX, | |
/*group_segment_size=*/UINT32_MAX, &gpu_queues[0]); | |
err = hsa_queue_create(agents.gpu_agents[1], gpu_queue_size, | |
HSA_QUEUE_TYPE_MULTI, gpu_queue_callback, | |
/*callback_data=*/NULL, | |
/*private_segment_size=*/UINT32_MAX, | |
/*group_segment_size=*/UINT32_MAX, &gpu_queues[1]); | |
cpu_thread_t cpu_thread; | |
err = hsa_agent_get_info(agents.cpu_agent, HSA_AGENT_INFO_NODE, | |
&cpu_thread.cpu_node_id); | |
err = hsa_signal_create(0u, 1, &agents.cpu_agent, &cpu_thread.doorbell); | |
  // handle points at an amd_signal_t; kept around for debugger inspection | |
  iree_amd_signal_t* sig = (iree_amd_signal_t*)cpu_thread.doorbell.handle; | |
  (void)sig; | |
err = hsa_soft_queue_create(cpu_fine_region, 8 * 1024, HSA_QUEUE_TYPE_MULTI, | |
HSA_QUEUE_FEATURE_AGENT_DISPATCH, | |
cpu_thread.doorbell, &cpu_thread.queue); | |
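  // a soft queue is just a ring buffer: the runtime allocates it in | |
  // cpu_fine_region but nothing drains it automatically - the cpu_thread | |
  // above waits on the doorbell signal and consumes packets itself | |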
  // doesn't work - EINVAL? likely cause: the kernel's get_nodes() decrements | |
  // maxnode, so maxnode must be highest-node + 2 (libnuma passes bits + 1); | |
  // may also fail if the queue memory is already pinned by the driver: | |
  // https://sourcegraph.com/github.com/ROCm/ROCT-Thunk-Interface/-/blob/src/fmm.c?L1743 | |
  unsigned long node_mask[1] = {0}; | |
  node_mask[0] = 1ul << cpu_thread.cpu_node_id; | |
  int mbind_ret = mbind( | |
      cpu_thread.queue->base_address, | |
      iree_host_align( | |
          cpu_thread.queue->size * sizeof(hsa_agent_dispatch_packet_t), 4096), | |
      MPOL_BIND, node_mask, cpu_thread.cpu_node_id + 2, MPOL_MF_MOVE); | |
if (mbind_ret) { | |
// docker blocks | |
if (errno == EPERM) { | |
fprintf(stderr, "mbind blocked\n"); | |
} else { | |
fprintf(stderr, "mbind failed\n"); | |
} | |
} | |
pthread_attr_t pthread_attr; | |
pthread_attr_init(&pthread_attr); | |
int pthread_ret = pthread_create(&cpu_thread.handle, &pthread_attr, | |
cpu_thread_main, &cpu_thread); | |
pthread_attr_destroy(&pthread_attr); | |
// | |
hsa_file_t object_file = | |
open("experimental/hsa_tests/kernels_cl.elf", O_RDONLY); | |
// hsa_file_t object_file = | |
// open("experimental/hsa_tests/kernels_hip.elf", O_RDONLY); | |
hsa_code_object_reader_t object_reader; | |
err = hsa_code_object_reader_create_from_file(object_file, &object_reader); | |
hsa_executable_t executable; | |
err = hsa_executable_create_alt(HSA_PROFILE_FULL, | |
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, | |
&executable); | |
hsa_loaded_code_object_t gpu0_object; | |
err = hsa_executable_load_agent_code_object( | |
executable, agents.gpu_agents[0], object_reader, NULL, &gpu0_object); | |
hsa_loaded_code_object_t gpu1_object; | |
err = hsa_executable_load_agent_code_object( | |
executable, agents.gpu_agents[1], object_reader, NULL, &gpu1_object); | |
err = hsa_executable_freeze(executable, NULL); | |
hsa_ven_amd_loader_1_03_pfn_t loader; | |
err = hsa_system_get_major_extension_table(HSA_EXTENSION_AMD_LOADER, 1, | |
sizeof(loader), &loader); | |
uint64_t code_base0 = 0; | |
err = loader.hsa_ven_amd_loader_loaded_code_object_get_info( | |
gpu0_object, HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_BASE, | |
&code_base0); | |
uint64_t code_base1 = 0; | |
err = loader.hsa_ven_amd_loader_loaded_code_object_get_info( | |
gpu1_object, HSA_VEN_AMD_LOADER_LOADED_CODE_OBJECT_INFO_LOAD_BASE, | |
&code_base1); | |
struct kernel_info_t { | |
uint64_t handle; | |
uint32_t private_size; | |
uint32_t group_size; | |
uint32_t kernarg_alignment; | |
uint32_t kernarg_size; | |
} kernel_info[2]; | |
hsa_executable_symbol_t symbol; | |
err = hsa_executable_get_symbol_by_name(executable, "issue_host_call.kd", | |
&agents.gpu_agents[0], &symbol); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_info[0].handle); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, | |
&kernel_info[0].private_size); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, | |
&kernel_info[0].group_size); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT, | |
&kernel_info[0].kernarg_alignment); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, | |
&kernel_info[0].kernarg_size); | |
err = hsa_executable_get_symbol_by_name(executable, "mul_x.kd", | |
&agents.gpu_agents[1], &symbol); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_info[1].handle); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, | |
&kernel_info[1].private_size); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, | |
&kernel_info[1].group_size); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT, | |
&kernel_info[1].kernarg_alignment); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, | |
&kernel_info[1].kernarg_size); | |
void* kernarg_storages[2] = {NULL, NULL}; | |
err = hsa_amd_memory_pool_allocate(kernarg_pool, kernel_info[0].kernarg_size, | |
HSA_AMD_MEMORY_POOL_STANDARD_FLAG, | |
&kernarg_storages[0]); | |
err = hsa_amd_agents_allow_access(1, &agents.gpu_agents[0], NULL, | |
kernarg_storages[0]); | |
err = hsa_amd_memory_pool_allocate(kernarg_pool, kernel_info[1].kernarg_size, | |
HSA_AMD_MEMORY_POOL_STANDARD_FLAG, | |
&kernarg_storages[1]); | |
err = hsa_amd_agents_allow_access(1, &agents.gpu_agents[1], NULL, | |
kernarg_storages[1]); | |
hsa_signal_t dispatch_signals[2]; | |
err = hsa_signal_create(1, 0, NULL, &dispatch_signals[0]); | |
err = hsa_signal_create(1, 0, NULL, &dispatch_signals[1]); | |
typedef struct implicit_kernargs_t { | |
uint32_t block_count[3]; // + 0/4/8 | |
uint16_t group_size[3]; // + 12/14/16 | |
uint16_t remainder[3]; // + 18/20/22 | |
uint64_t reserved0; // + 24 hidden_tool_correlation_id | |
uint64_t reserved1; // + 32 | |
uint64_t global_offset[3]; // + 40/48/56 | |
uint16_t grid_dims; // + 64 | |
} implicit_kernargs_t; | |
uint32_t element_count = 65; | |
uint32_t* buffer = NULL; | |
err = hsa_amd_memory_pool_allocate( | |
cpu_coarse_pool, element_count * sizeof(uint32_t), | |
HSA_AMD_MEMORY_POOL_STANDARD_FLAG, (void**)&buffer); | |
for (uint32_t i = 0; i < element_count; ++i) { | |
buffer[i] = i; | |
} | |
err = hsa_amd_agents_allow_access(2, agents.gpu_agents, NULL, buffer); | |
uint32_t grid_size[3] = {element_count, 1, 1}; | |
uint16_t workgroup_size[3] = {32, 1, 1}; | |
// gpu1: barrier (wait) -> dispatch1 | |
// gpu0: dispatch0 (signal) | |
// enqueue gpu1 before gpu0 to ensure wait happens | |
{ | |
hsa_barrier_and_packet_t barrier; | |
memset(&barrier, 0, sizeof(barrier)); | |
barrier.header = HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE; | |
barrier.dep_signal[0] = dispatch_signals[0]; | |
    // note: the high uint16_t of this 32-bit header store is the packet's | |
    // reserved0 field (left zero) | |
uint32_t barrier_header = | |
(HSA_PACKET_TYPE_BARRIER_AND << HSA_PACKET_HEADER_TYPE) | | |
(1 << HSA_PACKET_HEADER_BARRIER) | | |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | | |
(HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); | |
uint64_t barrier_id = hsa_queue_add_write_index_screlease(gpu_queues[1], 1); | |
while ((barrier_id - hsa_queue_load_read_index_acquire(gpu_queues[1])) >= | |
gpu_queues[1]->size) { | |
sleep(0); | |
} | |
hsa_barrier_and_packet_t* barrier_ptr = | |
(hsa_barrier_and_packet_t*)((uint8_t*)gpu_queues[1]->base_address + | |
(barrier_id & (gpu_queues[1]->size - 1)) * | |
64); | |
nontemporalMemcpy(barrier_ptr, &barrier, sizeof(barrier)); | |
atomic_store_explicit((volatile atomic_uint*)barrier_ptr, barrier_header, | |
memory_order_release); | |
hsa_signal_store_relaxed(gpu_queues[1]->doorbell_signal, barrier_id); | |
} | |
typedef struct mul_x_args_t { | |
uint32_t x; | |
uint32_t n; | |
void* buffer; | |
} mul_x_args_t; | |
mul_x_args_t* mul_x_kernargs = (mul_x_args_t*)kernarg_storages[1]; | |
mul_x_kernargs->x = 2; | |
mul_x_kernargs->n = element_count; | |
mul_x_kernargs->buffer = buffer; | |
implicit_kernargs_t* implicit1_kernargs = | |
(implicit_kernargs_t*)((uint8_t*)kernarg_storages[1] + | |
iree_host_align(sizeof(mul_x_args_t), 8)); | |
implicit1_kernargs->block_count[0] = grid_size[0] / workgroup_size[0]; | |
implicit1_kernargs->block_count[1] = grid_size[1] / workgroup_size[1]; | |
implicit1_kernargs->block_count[2] = grid_size[2] / workgroup_size[2]; | |
implicit1_kernargs->group_size[0] = workgroup_size[0]; | |
implicit1_kernargs->group_size[1] = workgroup_size[1]; | |
implicit1_kernargs->group_size[2] = workgroup_size[2]; | |
implicit1_kernargs->remainder[0] = | |
(uint16_t)(grid_size[0] % workgroup_size[0]); | |
implicit1_kernargs->remainder[1] = | |
(uint16_t)(grid_size[1] % workgroup_size[1]); | |
implicit1_kernargs->remainder[2] = | |
(uint16_t)(grid_size[2] % workgroup_size[2]); | |
implicit1_kernargs->reserved0 = 0; | |
implicit1_kernargs->reserved1 = 0; | |
implicit1_kernargs->global_offset[0] = 0; // newOffset[0]; | |
implicit1_kernargs->global_offset[1] = 0; // newOffset[1]; | |
implicit1_kernargs->global_offset[2] = 0; // newOffset[2]; | |
implicit1_kernargs->grid_dims = 3; | |
// DO NOT SUBMIT should do nontemporal kernarg update? | |
hsa_kernel_dispatch_packet_t packet1; | |
packet1.header = HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE; | |
packet1.setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; | |
packet1.workgroup_size_x = workgroup_size[0]; | |
packet1.workgroup_size_y = workgroup_size[1]; | |
packet1.workgroup_size_z = workgroup_size[2]; | |
packet1.reserved0 = 0; | |
packet1.grid_size_x = grid_size[0]; | |
packet1.grid_size_y = grid_size[1]; | |
packet1.grid_size_z = grid_size[2]; | |
packet1.private_segment_size = kernel_info[1].private_size; | |
packet1.group_segment_size = kernel_info[1].group_size; | |
packet1.kernel_object = kernel_info[1].handle; | |
packet1.kernarg_address = kernarg_storages[1]; | |
packet1.reserved2 = 0; | |
packet1.completion_signal = dispatch_signals[1]; | |
uint16_t packet1_header = | |
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | | |
(0 << HSA_PACKET_HEADER_BARRIER) | | |
(HSA_FENCE_SCOPE_NONE << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | | |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); | |
uint32_t packet1_header_setup = packet1_header | (packet1.setup << 16); | |
uint64_t packet1_id = hsa_queue_add_write_index_screlease(gpu_queues[1], 1); | |
while ((packet1_id - hsa_queue_load_read_index_acquire(gpu_queues[1])) >= | |
gpu_queues[1]->size) { | |
sleep(0); | |
} | |
hsa_kernel_dispatch_packet_t* packet1_ptr = | |
(hsa_kernel_dispatch_packet_t*)((uint8_t*)gpu_queues[1]->base_address + | |
(packet1_id & (gpu_queues[1]->size - 1)) * | |
64); | |
nontemporalMemcpy(packet1_ptr, &packet1, sizeof(packet1)); | |
atomic_store_explicit((volatile atomic_uint*)packet1_ptr, | |
packet1_header_setup, memory_order_release); | |
// value ignored in MULTI cases | |
hsa_signal_store_relaxed(gpu_queues[1]->doorbell_signal, packet1_id); | |
// "ensure" gpu1 is waiting | |
sleep(1); | |
typedef struct issue_host_call_args_t { | |
hsa_queue_t* queue; | |
hsa_signal_t completion_signal; | |
uint32_t arg; | |
} issue_host_call_args_t; | |
issue_host_call_args_t* issue_host_call_kernargs = | |
(issue_host_call_args_t*)kernarg_storages[0]; | |
issue_host_call_kernargs->queue = cpu_thread.queue; | |
issue_host_call_kernargs->completion_signal = dispatch_signals[0]; | |
issue_host_call_kernargs->arg = 100; | |
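  // sketch of what the issue_host_call kernel (source not in this gist) is | |
  // assumed to do on the device: reserve a write index on the soft queue, | |
  // fill an hsa_agent_dispatch_packet_t (return_address/arg[0..3]) in the | |
  // ring, publish its header, then store the new index to the doorbell | |
  // signal to wake the host thread above | |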
implicit_kernargs_t* implicit_kernargs = | |
(implicit_kernargs_t*)((uint8_t*)kernarg_storages[0] + | |
iree_host_align(sizeof(issue_host_call_args_t), | |
8)); | |
implicit_kernargs->block_count[0] = 1; | |
implicit_kernargs->block_count[1] = 1; | |
implicit_kernargs->block_count[2] = 1; | |
implicit_kernargs->group_size[0] = 1; | |
implicit_kernargs->group_size[1] = 1; | |
implicit_kernargs->group_size[2] = 1; | |
implicit_kernargs->remainder[0] = 0; | |
implicit_kernargs->remainder[1] = 0; | |
implicit_kernargs->remainder[2] = 0; | |
implicit_kernargs->reserved0 = 0; | |
implicit_kernargs->reserved1 = 0; | |
implicit_kernargs->global_offset[0] = 0; | |
implicit_kernargs->global_offset[1] = 0; | |
implicit_kernargs->global_offset[2] = 0; | |
implicit_kernargs->grid_dims = 3; | |
// DO NOT SUBMIT should do nontemporal kernarg update? | |
hsa_kernel_dispatch_packet_t packet0; | |
packet0.header = HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE; | |
packet0.setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; | |
packet0.workgroup_size_x = 1; | |
packet0.workgroup_size_y = 1; | |
packet0.workgroup_size_z = 1; | |
packet0.reserved0 = 0; | |
packet0.grid_size_x = 1; | |
packet0.grid_size_y = 1; | |
packet0.grid_size_z = 1; | |
packet0.private_segment_size = kernel_info[0].private_size; | |
packet0.group_segment_size = kernel_info[0].group_size; | |
packet0.kernel_object = kernel_info[0].handle; | |
packet0.kernarg_address = kernarg_storages[0]; | |
packet0.reserved2 = 0; | |
packet0.completion_signal.handle = 0; | |
uint16_t packet0_header = | |
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | | |
(1 << HSA_PACKET_HEADER_BARRIER) | | |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | | |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); | |
uint32_t packet0_header_setup = packet0_header | (packet0.setup << 16); | |
uint64_t packet0_id = hsa_queue_add_write_index_screlease(gpu_queues[0], 1); | |
while ((packet0_id - hsa_queue_load_read_index_acquire(gpu_queues[0])) >= | |
gpu_queues[0]->size) { | |
sleep(0); | |
} | |
hsa_kernel_dispatch_packet_t* packet0_ptr = | |
(hsa_kernel_dispatch_packet_t*)((uint8_t*)gpu_queues[0]->base_address + | |
(packet0_id & (gpu_queues[0]->size - 1)) * | |
64); | |
nontemporalMemcpy(packet0_ptr, &packet0, sizeof(packet0)); | |
atomic_store_explicit((volatile atomic_uint*)packet0_ptr, | |
packet0_header_setup, memory_order_release); | |
// value ignored in MULTI cases | |
hsa_signal_store_relaxed(gpu_queues[0]->doorbell_signal, packet0_id); | |
hsa_signal_value_t wait_value = | |
hsa_signal_wait_scacquire(dispatch_signals[1], HSA_SIGNAL_CONDITION_EQ, 0, | |
UINT64_MAX, HSA_WAIT_STATE_BLOCKED); | |
if (wait_value != 0) { | |
fprintf(stderr, "wait failed\n"); | |
} | |
for (uint32_t i = 0; i < element_count; ++i) { | |
fprintf(stderr, "%u ", buffer[i]); | |
} | |
fprintf(stderr, "\n"); | |
// signal exit | |
hsa_signal_store_screlease(cpu_thread.doorbell, UINT64_MAX); | |
err = hsa_amd_memory_pool_free(buffer); | |
err = hsa_signal_destroy(dispatch_signals[0]); | |
err = hsa_signal_destroy(dispatch_signals[1]); | |
err = hsa_amd_memory_pool_free(kernarg_storages[0]); | |
err = hsa_amd_memory_pool_free(kernarg_storages[1]); | |
pthread_join(cpu_thread.handle, NULL); | |
err = hsa_queue_destroy(cpu_thread.queue); | |
err = hsa_signal_destroy(cpu_thread.doorbell); | |
err = hsa_queue_destroy(gpu_queues[0]); | |
err = hsa_queue_destroy(gpu_queues[1]); | |
err = hsa_executable_destroy(executable); | |
err = hsa_code_object_reader_destroy(object_reader); | |
err = hsa_shut_down(); | |
return 0; | |
} |
// device to device dispatch enqueue | |
#include <errno.h> | |
#include <fcntl.h> // open | |
#include <inttypes.h> | |
#include <memory.h> | |
#include <stdatomic.h> | |
#include <stdbool.h> | |
#include <stdio.h> | |
#include <stdlib.h> | |
#include <string.h> | |
#include <unistd.h> | |
#include "third_party/hsa-runtime-headers/include/hsa/hsa.h" | |
#include "third_party/hsa-runtime-headers/include/hsa/hsa_ext_amd.h" | |
static inline size_t iree_host_align(size_t value, size_t alignment) { | |
return (value + (alignment - 1)) & ~(alignment - 1); | |
} | |
__attribute__((always_inline)) static inline void nontemporalMemcpy( | |
void* __restrict dst, const void* __restrict src, size_t size) { | |
memcpy(dst, src, size); | |
} | |
// AMD Signal Kind Enumeration Values. | |
enum iree_amd_signal_kind_t { | |
IREE_AMD_SIGNAL_KIND_INVALID = 0, | |
IREE_AMD_SIGNAL_KIND_USER = 1, | |
IREE_AMD_SIGNAL_KIND_DOORBELL = -1, | |
IREE_AMD_SIGNAL_KIND_LEGACY_DOORBELL = -2 | |
}; | |
typedef int64_t iree_amd_signal_kind64_t; | |
typedef struct __attribute__((aligned(64))) iree_amd_signal_s { | |
iree_amd_signal_kind64_t kind; | |
union { | |
volatile int64_t value; | |
volatile uint32_t* legacy_hardware_doorbell_ptr; | |
volatile uint64_t* hardware_doorbell_ptr; | |
}; | |
uint64_t event_mailbox_ptr; | |
uint32_t event_id; | |
uint32_t reserved1; | |
uint64_t start_ts; | |
uint64_t end_ts; | |
union { | |
/*iree_amd_queue_t*/ void* queue_ptr; | |
uint64_t reserved2; | |
}; | |
uint32_t reserved3[2]; | |
} iree_amd_signal_t; | |
typedef struct { | |
uint32_t agent_count; | |
hsa_agent_t all_agents[32]; | |
hsa_agent_t cpu_agent; | |
hsa_agent_t gpu_agents[2]; | |
} agents_t; | |
static hsa_status_t iterate_agent(hsa_agent_t agent, void* user_data) { | |
agents_t* agents = (agents_t*)user_data; | |
agents->all_agents[agents->agent_count++] = agent; | |
return HSA_STATUS_SUCCESS; | |
} | |
typedef struct { | |
uint32_t count; | |
hsa_amd_memory_pool_t pools[32]; | |
} memory_pools_t; | |
static hsa_status_t iterate_memory_pool(hsa_amd_memory_pool_t memory_pool, | |
void* user_data) { | |
memory_pools_t* memory_pools = (memory_pools_t*)user_data; | |
memory_pools->pools[memory_pools->count++] = memory_pool; | |
return HSA_STATUS_SUCCESS; | |
} | |
typedef struct { | |
uint32_t count; | |
hsa_region_t regions[32]; | |
} regions_t; | |
static hsa_status_t iterate_regions(hsa_region_t region, void* user_data) { | |
regions_t* regions = (regions_t*)user_data; | |
regions->regions[regions->count++] = region; | |
return HSA_STATUS_SUCCESS; | |
} | |
static void gpu_queue_callback(hsa_status_t status, hsa_queue_t* queue, | |
void* user_data) { | |
const char* status_str = NULL; | |
hsa_status_string(status, &status_str); | |
fprintf(stderr, "gpu_queue_callback %s", status_str); | |
} | |
int main(int argc, char** argv) { | |
hsa_status_t err; | |
err = hsa_init(); | |
agents_t agents; | |
memset(&agents, 0, sizeof(agents)); | |
err = hsa_iterate_agents(iterate_agent, &agents); | |
uint32_t gpu_count = 0; | |
for (uint32_t i = 0; i < agents.agent_count; ++i) { | |
hsa_device_type_t device_type = 0; | |
err = hsa_agent_get_info(agents.all_agents[i], HSA_AGENT_INFO_DEVICE, | |
&device_type); | |
    if (device_type == HSA_DEVICE_TYPE_GPU && gpu_count < 2) { | |
agents.gpu_agents[gpu_count++] = agents.all_agents[i]; | |
err = hsa_agent_get_info(agents.all_agents[i], | |
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_NEAREST_CPU, | |
&agents.cpu_agent); | |
} | |
} | |
memory_pools_t gpu_memory_pools[2]; | |
memset(&gpu_memory_pools, 0, sizeof(gpu_memory_pools)); | |
err = hsa_amd_agent_iterate_memory_pools( | |
agents.gpu_agents[0], iterate_memory_pool, &gpu_memory_pools[0]); | |
err = hsa_amd_agent_iterate_memory_pools( | |
agents.gpu_agents[1], iterate_memory_pool, &gpu_memory_pools[1]); | |
memory_pools_t cpu_memory_pools; | |
memset(&cpu_memory_pools, 0, sizeof(cpu_memory_pools)); | |
err = hsa_amd_agent_iterate_memory_pools( | |
agents.cpu_agent, iterate_memory_pool, &cpu_memory_pools); | |
regions_t cpu_regions; | |
memset(&cpu_regions, 0, sizeof(cpu_regions)); | |
err = hsa_agent_iterate_regions(agents.cpu_agent, iterate_regions, | |
&cpu_regions); | |
  hsa_region_t cpu_fine_region = {0}; | |
for (uint32_t i = 0; i < cpu_regions.count; ++i) { | |
hsa_region_segment_t segment; | |
err = hsa_region_get_info(cpu_regions.regions[i], HSA_REGION_INFO_SEGMENT, | |
&segment); | |
if (segment == HSA_REGION_SEGMENT_GLOBAL) { | |
hsa_region_global_flag_t flags; | |
err = hsa_region_get_info(cpu_regions.regions[i], | |
HSA_REGION_INFO_GLOBAL_FLAGS, &flags); | |
if (flags & HSA_REGION_GLOBAL_FLAG_FINE_GRAINED) { | |
cpu_fine_region = cpu_regions.regions[i]; | |
} | |
} | |
} | |
hsa_amd_memory_pool_t cpu_fine_pool = {0}; | |
hsa_amd_memory_pool_t cpu_coarse_pool = {0}; | |
hsa_amd_memory_pool_t kernarg_pool = {0}; | |
for (uint32_t i = 0; i < cpu_memory_pools.count; ++i) { | |
hsa_amd_memory_pool_t pool = cpu_memory_pools.pools[i]; | |
      hsa_amd_segment_t segment; | |
      err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, | |
                                         &segment); | |
      if (segment == HSA_AMD_SEGMENT_GLOBAL) { | |
        hsa_amd_memory_pool_global_flag_t global_flag; | |
        err = hsa_amd_memory_pool_get_info( | |
            pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag); | |
if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) { | |
kernarg_pool = pool; | |
} else if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) { | |
cpu_coarse_pool = pool; | |
} else if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) { | |
cpu_fine_pool = pool; | |
} | |
} | |
} | |
  hsa_amd_memory_pool_t device_fine_pools[2] = {{0}, {0}}; | |
for (uint32_t j = 0; j < 2; ++j) { | |
for (uint32_t i = 0; i < gpu_memory_pools[j].count; ++i) { | |
hsa_amd_memory_pool_t pool = gpu_memory_pools[j].pools[i]; | |
hsa_region_segment_t segment; | |
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, | |
&segment); | |
if (segment == HSA_REGION_SEGMENT_GLOBAL) { | |
hsa_region_global_flag_t global_flag; | |
err = hsa_amd_memory_pool_get_info( | |
pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag); | |
if (global_flag & | |
(HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED | | |
HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_EXTENDED_SCOPE_FINE_GRAINED)) { | |
device_fine_pools[j] = pool; | |
} | |
} | |
} | |
} | |
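  // signals the GPU waits on directly need memory with system-scope atomics; | |
  // fine-grained (or extended-scope fine-grained) device pools provide that, | |
  // while plain coarse-grained VRAM does not guarantee it | |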
  // assumes both GPUs report the same queue size limits | |
uint32_t gpu_queue_min_size = 0; | |
err = hsa_agent_get_info(agents.gpu_agents[0], HSA_AGENT_INFO_QUEUE_MIN_SIZE, | |
&gpu_queue_min_size); | |
uint32_t gpu_queue_max_size = 0; | |
err = hsa_agent_get_info(agents.gpu_agents[0], HSA_AGENT_INFO_QUEUE_MAX_SIZE, | |
&gpu_queue_max_size); | |
uint32_t gpu_queue_size = gpu_queue_max_size; | |
hsa_queue_t* gpu_queues[2] = {NULL, NULL}; | |
err = hsa_queue_create(agents.gpu_agents[0], gpu_queue_size, | |
HSA_QUEUE_TYPE_MULTI, gpu_queue_callback, | |
/*callback_data=*/NULL, | |
/*private_segment_size=*/UINT32_MAX, | |
/*group_segment_size=*/UINT32_MAX, &gpu_queues[0]); | |
err = hsa_queue_create(agents.gpu_agents[1], gpu_queue_size, | |
HSA_QUEUE_TYPE_MULTI, gpu_queue_callback, | |
/*callback_data=*/NULL, | |
/*private_segment_size=*/UINT32_MAX, | |
/*group_segment_size=*/UINT32_MAX, &gpu_queues[1]); | |
// | |
hsa_file_t object_file = | |
open("experimental/hsa_tests/kernels_cl.elf", O_RDONLY); | |
// hsa_file_t object_file = | |
// open("experimental/hsa_tests/kernels_hip.elf", O_RDONLY); | |
hsa_code_object_reader_t object_reader; | |
err = hsa_code_object_reader_create_from_file(object_file, &object_reader); | |
hsa_executable_t executable; | |
err = hsa_executable_create_alt(HSA_PROFILE_FULL, | |
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, | |
&executable); | |
err = hsa_executable_load_agent_code_object(executable, agents.gpu_agents[0], | |
object_reader, NULL, NULL); | |
err = hsa_executable_load_agent_code_object(executable, agents.gpu_agents[1], | |
object_reader, NULL, NULL); | |
err = hsa_executable_freeze(executable, NULL); | |
struct kernel_info_t { | |
uint64_t handle; | |
uint32_t private_size; | |
uint32_t group_size; | |
uint32_t kernarg_alignment; | |
uint32_t kernarg_size; | |
} kernel_info[2]; | |
hsa_executable_symbol_t symbol; | |
err = hsa_executable_get_symbol_by_name(executable, "issue_dispatch.kd", | |
&agents.gpu_agents[0], &symbol); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_info[0].handle); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, | |
&kernel_info[0].private_size); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, | |
&kernel_info[0].group_size); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT, | |
&kernel_info[0].kernarg_alignment); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, | |
&kernel_info[0].kernarg_size); | |
err = hsa_executable_get_symbol_by_name(executable, "issue_dispatch.kd", | |
&agents.gpu_agents[1], &symbol); | |
uint64_t handle1 = 0; | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &handle1); | |
err = hsa_executable_get_symbol_by_name(executable, "mul_x.kd", | |
&agents.gpu_agents[1], &symbol); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_info[1].handle); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, | |
&kernel_info[1].private_size); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, | |
&kernel_info[1].group_size); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT, | |
&kernel_info[1].kernarg_alignment); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, | |
&kernel_info[1].kernarg_size); | |
void* kernarg_storages[2] = {NULL, NULL}; | |
err = hsa_amd_memory_pool_allocate(kernarg_pool, kernel_info[0].kernarg_size, | |
HSA_AMD_MEMORY_POOL_STANDARD_FLAG, | |
&kernarg_storages[0]); | |
err = hsa_amd_agents_allow_access(1, &agents.gpu_agents[0], NULL, | |
kernarg_storages[0]); | |
err = hsa_amd_memory_pool_allocate(kernarg_pool, kernel_info[1].kernarg_size, | |
HSA_AMD_MEMORY_POOL_STANDARD_FLAG, | |
&kernarg_storages[1]); | |
  // must be accessible to both agents (gpu0 writes, gpu1 reads) | |
err = hsa_amd_agents_allow_access(2, agents.gpu_agents, NULL, | |
kernarg_storages[1]); | |
// allocate a fully device->device signal | |
// allocated on device[1] (as that'll be waiting) | |
// shared with both devices | |
// in fine-grained memory (system-wide atomics) | |
// | |
// MAYBE: point-to-point HSA_AMD_MEMORY_POOL_PCIE_FLAG? forces fine-grained | |
// even if pool is not? kmt_alloc_flags.ui32.CoarseGrain forced 0 | |
iree_amd_signal_t* device1_dispatch_signal = NULL; | |
err = hsa_amd_memory_pool_allocate( | |
device_fine_pools[1], sizeof(*device1_dispatch_signal), | |
HSA_AMD_MEMORY_POOL_STANDARD_FLAG, (void**)&device1_dispatch_signal); | |
err = hsa_amd_agents_allow_access(2, agents.gpu_agents, NULL, | |
device1_dispatch_signal); | |
iree_amd_signal_t device1_dispatch_signal_init; | |
memset(&device1_dispatch_signal_init, 0, | |
sizeof(device1_dispatch_signal_init)); | |
device1_dispatch_signal_init.kind = IREE_AMD_SIGNAL_KIND_USER; | |
device1_dispatch_signal_init.value = 1; | |
device1_dispatch_signal_init.queue_ptr = gpu_queues[1]; | |
err = hsa_memory_copy(device1_dispatch_signal, &device1_dispatch_signal_init, | |
sizeof(*device1_dispatch_signal)); | |
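  // the above hand-rolls hsa_amd_signal_create: an amd_signal_t-compatible | |
  // block (kind=USER, value=1) in GPU-visible fine-grained memory that | |
  // device code can decrement and wait on without runtime involvement | |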
hsa_signal_t dispatch_signals[2]; | |
dispatch_signals[0].handle = (uint64_t)device1_dispatch_signal; | |
// err = hsa_amd_signal_create(1, 0, NULL, HSA_AMD_SIGNAL_AMD_GPU_ONLY, | |
// &dispatch_signals[0]); | |
err = hsa_signal_create(1, 0, NULL, &dispatch_signals[1]); | |
typedef struct implicit_kernargs_t { | |
uint32_t block_count[3]; // + 0/4/8 | |
uint16_t group_size[3]; // + 12/14/16 | |
uint16_t remainder[3]; // + 18/20/22 | |
uint64_t reserved0; // + 24 hidden_tool_correlation_id | |
uint64_t reserved1; // + 32 | |
uint64_t global_offset[3]; // + 40/48/56 | |
uint16_t grid_dims; // + 64 | |
} implicit_kernargs_t; | |
uint32_t element_count = 65; | |
uint32_t* buffer = NULL; | |
err = hsa_amd_memory_pool_allocate( | |
cpu_coarse_pool, element_count * sizeof(uint32_t), | |
HSA_AMD_MEMORY_POOL_STANDARD_FLAG, (void**)&buffer); | |
for (uint32_t i = 0; i < element_count; ++i) { | |
buffer[i] = i; | |
} | |
  // buffer is only accessed by gpu1 (and the host) | |
err = hsa_amd_agents_allow_access(1, &agents.gpu_agents[1], NULL, buffer); | |
// gpu0: issue_dispatch | |
// gpu1: mul_x | |
// enqueue gpu1 before gpu0 to ensure wait happens | |
typedef struct issue_dispatch_args_t { | |
hsa_queue_t* queue; | |
uint64_t mul_x_object; | |
uint32_t mul_x_private_size; | |
uint32_t mul_x_group_size; | |
hsa_signal_t completion_signal; | |
void* buffer; | |
void* kernarg_storage; | |
uint32_t element_count; | |
uint32_t mul_by; | |
} issue_dispatch_args_t; | |
issue_dispatch_args_t* issue_dispatch_kernargs = | |
(issue_dispatch_args_t*)kernarg_storages[0]; | |
issue_dispatch_kernargs->queue = gpu_queues[1]; | |
issue_dispatch_kernargs->mul_x_object = kernel_info[1].handle; | |
issue_dispatch_kernargs->mul_x_private_size = kernel_info[1].private_size; | |
issue_dispatch_kernargs->mul_x_group_size = kernel_info[1].group_size; | |
issue_dispatch_kernargs->completion_signal = dispatch_signals[1]; | |
issue_dispatch_kernargs->buffer = buffer; | |
issue_dispatch_kernargs->kernarg_storage = kernarg_storages[1]; | |
issue_dispatch_kernargs->element_count = element_count; | |
issue_dispatch_kernargs->mul_by = 4; | |
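  // sketch of what the issue_dispatch kernel (source not in this gist) is | |
  // assumed to do on gpu0: write mul_x's kernargs into kernarg_storage, | |
  // reserve a slot in queue (gpu_queues[1]), fill a kernel dispatch packet | |
  // referencing mul_x_object and completion_signal, publish its header, and | |
  // ring gpu1's hardware doorbell | |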
// DO NOT SUBMIT no implicit needed? | |
#if 0 | |
implicit_kernargs_t* implicit_kernargs = | |
(implicit_kernargs_t*)((uint8_t*)kernarg_storages[0] + | |
iree_host_align(sizeof(issue_dispatch_args_t), 8)); | |
implicit_kernargs->block_count[0] = 1; | |
implicit_kernargs->block_count[1] = 1; | |
implicit_kernargs->block_count[2] = 1; | |
implicit_kernargs->group_size[0] = 1; | |
implicit_kernargs->group_size[1] = 1; | |
implicit_kernargs->group_size[2] = 1; | |
implicit_kernargs->remainder[0] = 0; | |
implicit_kernargs->remainder[1] = 0; | |
implicit_kernargs->remainder[2] = 0; | |
implicit_kernargs->reserved0 = 0; | |
implicit_kernargs->reserved1 = 0; | |
implicit_kernargs->global_offset[0] = 0; | |
implicit_kernargs->global_offset[1] = 0; | |
implicit_kernargs->global_offset[2] = 0; | |
implicit_kernargs->grid_dims = 3; | |
#endif | |
// DO NOT SUBMIT should do nontemporal kernarg update? | |
hsa_kernel_dispatch_packet_t packet0; | |
packet0.header = HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE; | |
packet0.setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; | |
packet0.workgroup_size_x = 1; | |
packet0.workgroup_size_y = 1; | |
packet0.workgroup_size_z = 1; | |
packet0.reserved0 = 0; | |
packet0.grid_size_x = 1; | |
packet0.grid_size_y = 1; | |
packet0.grid_size_z = 1; | |
packet0.private_segment_size = kernel_info[0].private_size; | |
packet0.group_segment_size = kernel_info[0].group_size; | |
packet0.kernel_object = kernel_info[0].handle; | |
packet0.kernarg_address = kernarg_storages[0]; | |
packet0.reserved2 = 0; | |
packet0.completion_signal = dispatch_signals[0]; | |
uint16_t packet0_header = | |
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | | |
(1 << HSA_PACKET_HEADER_BARRIER) | | |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | | |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); | |
uint32_t packet0_header_setup = packet0_header | (packet0.setup << 16); | |
  // Reserve a packet slot; if the ring is full, spin until space frees up. | |
  uint64_t packet0_id = hsa_queue_add_write_index_screlease(gpu_queues[0], 1); | |
  while ((packet0_id - hsa_queue_load_read_index_acquire(gpu_queues[0])) >= | |
         gpu_queues[0]->size) { | |
    sleep(0);  // yield to the OS while the GPU drains the queue | |
  } | |
hsa_kernel_dispatch_packet_t* packet0_ptr = | |
(hsa_kernel_dispatch_packet_t*)((uint8_t*)gpu_queues[0]->base_address + | |
(packet0_id & (gpu_queues[0]->size - 1)) * | |
64); | |
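  // Write the packet body first, then publish it by storing the combined | |
  // header+setup word with release semantics so the packet processor never | |
  // observes a partially written packet. | |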
nontemporalMemcpy(packet0_ptr, &packet0, sizeof(packet0)); | |
atomic_store_explicit((volatile atomic_uint*)packet0_ptr, | |
packet0_header_setup, memory_order_release); | |
  // the stored value is ignored for HSA_QUEUE_TYPE_MULTI queues | |
hsa_signal_store_relaxed(gpu_queues[0]->doorbell_signal, packet0_id); | |
hsa_signal_value_t wait_value = | |
hsa_signal_wait_scacquire(dispatch_signals[1], HSA_SIGNAL_CONDITION_EQ, 0, | |
UINT64_MAX, HSA_WAIT_STATE_BLOCKED); | |
if (wait_value != 0) { | |
fprintf(stderr, "wait failed\n"); | |
} | |
for (uint32_t i = 0; i < element_count; ++i) { | |
fprintf(stderr, "%u ", buffer[i]); | |
} | |
fprintf(stderr, "\n"); | |
err = hsa_amd_memory_pool_free(buffer); | |
// user allocated signal | |
// err = hsa_signal_destroy(dispatch_signals[0]); | |
err = hsa_signal_destroy(dispatch_signals[1]); | |
err = hsa_amd_memory_pool_free(kernarg_storages[0]); | |
err = hsa_amd_memory_pool_free(kernarg_storages[1]); | |
err = hsa_queue_destroy(gpu_queues[0]); | |
err = hsa_queue_destroy(gpu_queues[1]); | |
err = hsa_executable_destroy(executable); | |
err = hsa_code_object_reader_destroy(object_reader); | |
err = hsa_shut_down(); | |
return 0; | |
} |
// hip + hsa interop | |
#include <fcntl.h> // open | |
#include <memory.h> | |
#include <stdatomic.h> | |
#include <stdio.h> | |
#include <string.h> | |
#include <unistd.h> | |
#include "third_party/hsa-runtime-headers/include/hsa/hsa.h" | |
#include "third_party/hsa-runtime-headers/include/hsa/hsa_ext_amd.h" | |
#define __HIP_PLATFORM_AMD__ | |
#include <hip/hip_runtime.h> | |
static inline size_t iree_host_align(size_t value, size_t alignment) { | |
return (value + (alignment - 1)) & ~(alignment - 1); | |
} | |
__attribute__((always_inline)) static inline void nontemporalMemcpy( | |
void* __restrict dst, const void* __restrict src, size_t size) { | |
memcpy(dst, src, size); | |
} | |
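// Hedged sketch (unused below): a true nontemporal copy on clang could stream | |
// 32-bit words with __builtin_nontemporal_store to avoid polluting the cache | |
// with packet writes; assumes dst/src/size are 4-byte aligned multiples. | |
__attribute__((always_inline)) static inline void nontemporalMemcpyStreaming( | |
    uint32_t* __restrict dst, const uint32_t* __restrict src, size_t size) { | |
  for (size_t i = 0; i < size / sizeof(uint32_t); ++i) { | |
    __builtin_nontemporal_store(src[i], &dst[i]); | |
  } | |
} | |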
typedef struct { | |
uint32_t agent_count; | |
hsa_agent_t all_agents[32]; | |
hsa_agent_t cpu_agent; | |
hsa_agent_t gpu_agent; | |
} agents_t; | |
static hsa_status_t iterate_agent(hsa_agent_t agent, void* user_data) { | |
agents_t* agents = (agents_t*)user_data; | |
agents->all_agents[agents->agent_count++] = agent; | |
return HSA_STATUS_SUCCESS; | |
} | |
typedef struct { | |
uint32_t count; | |
hsa_amd_memory_pool_t pools[32]; | |
} memory_pools_t; | |
static hsa_status_t iterate_memory_pool(hsa_amd_memory_pool_t memory_pool, | |
void* user_data) { | |
memory_pools_t* memory_pools = (memory_pools_t*)user_data; | |
memory_pools->pools[memory_pools->count++] = memory_pool; | |
return HSA_STATUS_SUCCESS; | |
} | |
typedef struct { | |
uint32_t count; | |
hsa_region_t regions[32]; | |
} regions_t; | |
static hsa_status_t iterate_regions(hsa_region_t region, void* user_data) { | |
regions_t* regions = (regions_t*)user_data; | |
regions->regions[regions->count++] = region; | |
return HSA_STATUS_SUCCESS; | |
} | |
static void gpu_queue_callback(hsa_status_t status, hsa_queue_t* queue, | |
void* user_data) { | |
const char* status_str = NULL; | |
hsa_status_string(status, &status_str); | |
  fprintf(stderr, "gpu_queue_callback %s\n", status_str); | |
} | |
int main(int argc, char** argv) { | |
hsa_status_t err; | |
err = hsa_init(); | |
agents_t agents; | |
memset(&agents, 0, sizeof(agents)); | |
err = hsa_iterate_agents(iterate_agent, &agents); | |
for (uint32_t i = 0; i < agents.agent_count; ++i) { | |
hsa_device_type_t device_type = 0; | |
err = hsa_agent_get_info(agents.all_agents[i], HSA_AGENT_INFO_DEVICE, | |
&device_type); | |
if (device_type == HSA_DEVICE_TYPE_GPU) { | |
agents.gpu_agent = agents.all_agents[i]; | |
err = hsa_agent_get_info(agents.gpu_agent, | |
(hsa_agent_info_t)HSA_AMD_AGENT_INFO_NEAREST_CPU, | |
&agents.cpu_agent); | |
break; | |
} | |
} | |
hipError_t herr = hipInit(0); | |
// device_props[0].uuid; "0e12865a3bf5b7ab" | |
// HSA_AMD_AGENT_INFO_UUID "GPU-0e12865a3bf5b7ab" | |
hipDeviceProp_t device_props[2]; | |
herr = hipGetDeviceProperties(&device_props[0], 0); | |
herr = hipGetDeviceProperties(&device_props[1], 1); | |
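  // Hedged sketch, per the UUID comment above: correlate the HIP device with | |
  // the HSA agent by comparing UUID strings. Assumes HSA reports | |
  // "GPU-<16 hex chars>" while hipDeviceProp_t::uuid holds the bare hex | |
  // digits, and that a 24-byte buffer is large enough for the query. | |
  char agent_uuid[24] = {0}; | |
  err = hsa_agent_get_info(agents.gpu_agent, | |
                           (hsa_agent_info_t)HSA_AMD_AGENT_INFO_UUID, | |
                           agent_uuid); | |
  int uuid_matches = | |
      strncmp(agent_uuid + 4, device_props[0].uuid.bytes, 16) == 0; | |
  (void)uuid_matches; | |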
hipModule_t hip_module; | |
herr = hipModuleLoad(&hip_module, "experimental/hsa_tests/kernels_hip.elf"); | |
  // No obvious way to interop HIP events/signals with HSA directly: | |
  // - HIP IPC events are implemented as multiple signals with a ring buffer. | |
  // - External semaphores are not supported on this path. | |
  // - Regular events: | |
  //   https://github.com/ROCm/clr/blob/d6193a2f23d08f24bdc2a26f9501ae6cfc73ccf0/hipamd/src/hip_event.cpp#L420 | |
  //   event record bottoms out on ProfilingSignal: | |
  //   https://github.com/ROCm/clr/blob/d6193a2f23d08f24bdc2a26f9501ae6cfc73ccf0/rocclr/device/rocm/rocdevice.hpp#L79 | |
  //   which holds an hsa_signal_t. | |
memory_pools_t gpu_memory_pools; | |
memset(&gpu_memory_pools, 0, sizeof(gpu_memory_pools)); | |
err = hsa_amd_agent_iterate_memory_pools( | |
agents.gpu_agent, iterate_memory_pool, &gpu_memory_pools); | |
regions_t gpu_regions; | |
memset(&gpu_regions, 0, sizeof(gpu_regions)); | |
err = hsa_agent_iterate_regions(agents.gpu_agent, iterate_regions, | |
&gpu_regions); | |
memory_pools_t cpu_memory_pools; | |
memset(&cpu_memory_pools, 0, sizeof(cpu_memory_pools)); | |
err = hsa_amd_agent_iterate_memory_pools( | |
agents.cpu_agent, iterate_memory_pool, &cpu_memory_pools); | |
regions_t cpu_regions; | |
memset(&cpu_regions, 0, sizeof(cpu_regions)); | |
err = hsa_agent_iterate_regions(agents.cpu_agent, iterate_regions, | |
&cpu_regions); | |
hsa_amd_memory_pool_t cpu_coarse_pool = {0}; | |
hsa_amd_memory_pool_t kernarg_pool = {0}; | |
for (uint32_t i = 0; i < cpu_memory_pools.count; ++i) { | |
hsa_amd_memory_pool_t pool = cpu_memory_pools.pools[i]; | |
hsa_region_segment_t segment; | |
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, | |
&segment); | |
if (segment == HSA_REGION_SEGMENT_GLOBAL) { | |
hsa_region_global_flag_t global_flag; | |
err = hsa_amd_memory_pool_get_info( | |
pool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &global_flag); | |
if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) { | |
kernarg_pool = pool; | |
} else if (global_flag & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) { | |
cpu_coarse_pool = pool; | |
} | |
} | |
} | |
uint32_t gpu_queue_min_size = 0; | |
err = hsa_agent_get_info(agents.gpu_agent, HSA_AGENT_INFO_QUEUE_MIN_SIZE, | |
&gpu_queue_min_size); | |
uint32_t gpu_queue_max_size = 0; | |
err = hsa_agent_get_info(agents.gpu_agent, HSA_AGENT_INFO_QUEUE_MAX_SIZE, | |
&gpu_queue_max_size); | |
uint32_t gpu_queue_size = gpu_queue_max_size; | |
hsa_queue_t* gpu_queue = NULL; | |
err = hsa_queue_create(agents.gpu_agent, gpu_queue_size, HSA_QUEUE_TYPE_MULTI, | |
gpu_queue_callback, | |
/*callback_data=*/NULL, | |
/*private_segment_size=*/UINT32_MAX, | |
/*group_segment_size=*/UINT32_MAX, &gpu_queue); | |
// | |
hsa_file_t object_file = | |
open("experimental/hsa_tests/kernels_cl.elf", O_RDONLY); | |
// hsa_file_t object_file = | |
// open("experimental/hsa_tests/kernels_hip.elf", O_RDONLY); | |
hsa_code_object_reader_t object_reader; | |
err = hsa_code_object_reader_create_from_file(object_file, &object_reader); | |
hsa_executable_t executable; | |
err = hsa_executable_create_alt(HSA_PROFILE_FULL, | |
HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, NULL, | |
&executable); | |
err = hsa_executable_load_agent_code_object(executable, agents.gpu_agent, | |
object_reader, NULL, NULL); | |
err = hsa_executable_freeze(executable, NULL); | |
struct kernel_info_t { | |
uint64_t handle; | |
uint32_t private_size; | |
uint32_t group_size; | |
uint32_t kernarg_alignment; | |
uint32_t kernarg_size; | |
} kernel_info; | |
hsa_executable_symbol_t symbol; | |
err = hsa_executable_get_symbol_by_name(executable, "add_one.kd", | |
&agents.gpu_agent, &symbol); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, &kernel_info.handle); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, | |
&kernel_info.private_size); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, | |
&kernel_info.group_size); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_ALIGNMENT, | |
&kernel_info.kernarg_alignment); | |
err = hsa_executable_symbol_get_info( | |
symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, | |
&kernel_info.kernarg_size); | |
void* kernarg_storage = NULL; | |
err = hsa_amd_memory_pool_allocate(kernarg_pool, kernel_info.kernarg_size, | |
HSA_AMD_MEMORY_POOL_STANDARD_FLAG, | |
&kernarg_storage); | |
err = | |
hsa_amd_agents_allow_access(1, &agents.gpu_agent, NULL, kernarg_storage); | |
hsa_signal_t dispatch_signal; | |
err = hsa_signal_create(1, 0, NULL, &dispatch_signal); | |
typedef struct implicit_kernargs_t { | |
uint32_t block_count[3]; // + 0/4/8 | |
uint16_t group_size[3]; // + 12/14/16 | |
uint16_t remainder[3]; // + 18/20/22 | |
uint64_t reserved0; // + 24 hidden_tool_correlation_id | |
uint64_t reserved1; // + 32 | |
uint64_t global_offset[3]; // + 40/48/56 | |
uint16_t grid_dims; // + 64 | |
} implicit_kernargs_t; | |
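  // Hedged sanity check: the offset comments above assume natural C layout, | |
  // which puts grid_dims at byte offset 64. | |
  _Static_assert(__builtin_offsetof(implicit_kernargs_t, grid_dims) == 64, | |
                 "implicit kernarg layout mismatch"); | |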
uint32_t element_count = 65; | |
uint32_t* buffer = NULL; | |
// err = hsa_amd_memory_pool_allocate( | |
// cpu_coarse_pool, element_count * sizeof(uint32_t), | |
// HSA_AMD_MEMORY_POOL_STANDARD_FLAG, (void**)&buffer); | |
  herr = hipHostMalloc((void**)&buffer, element_count * sizeof(uint32_t), 0); | |
for (uint32_t i = 0; i < element_count; ++i) { | |
buffer[i] = i; | |
} | |
err = hsa_amd_agents_allow_access(1, &agents.gpu_agent, NULL, buffer); | |
uint32_t grid_size[3] = {element_count, 1, 1}; | |
uint16_t workgroup_size[3] = {32, 1, 1}; | |
typedef struct add_one_args_t { | |
uint32_t n; | |
void* buffer; | |
} add_one_args_t; | |
add_one_args_t* explicit_kernargs = (add_one_args_t*)kernarg_storage; | |
explicit_kernargs->n = element_count; | |
explicit_kernargs->buffer = buffer; | |
implicit_kernargs_t* implicit_kernargs = | |
(implicit_kernargs_t*)((uint8_t*)kernarg_storage + | |
iree_host_align(sizeof(add_one_args_t), 8)); | |
implicit_kernargs->block_count[0] = grid_size[0] / workgroup_size[0]; | |
implicit_kernargs->block_count[1] = grid_size[1] / workgroup_size[1]; | |
implicit_kernargs->block_count[2] = grid_size[2] / workgroup_size[2]; | |
implicit_kernargs->group_size[0] = workgroup_size[0]; | |
implicit_kernargs->group_size[1] = workgroup_size[1]; | |
implicit_kernargs->group_size[2] = workgroup_size[2]; | |
implicit_kernargs->remainder[0] = | |
(uint16_t)(grid_size[0] % workgroup_size[0]); | |
implicit_kernargs->remainder[1] = | |
(uint16_t)(grid_size[1] % workgroup_size[1]); | |
implicit_kernargs->remainder[2] = | |
(uint16_t)(grid_size[2] % workgroup_size[2]); | |
implicit_kernargs->reserved0 = 0; | |
implicit_kernargs->reserved1 = 0; | |
implicit_kernargs->global_offset[0] = 0; // newOffset[0]; | |
implicit_kernargs->global_offset[1] = 0; // newOffset[1]; | |
implicit_kernargs->global_offset[2] = 0; // newOffset[2]; | |
implicit_kernargs->grid_dims = 3; | |
// DO NOT SUBMIT should do nontemporal kernarg update? | |
hsa_kernel_dispatch_packet_t packet; | |
  packet.header = HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE; | |
packet.setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; | |
packet.workgroup_size_x = workgroup_size[0]; | |
packet.workgroup_size_y = workgroup_size[1]; | |
packet.workgroup_size_z = workgroup_size[2]; | |
packet.reserved0 = 0; | |
packet.grid_size_x = grid_size[0]; | |
packet.grid_size_y = grid_size[1]; | |
packet.grid_size_z = grid_size[2]; | |
packet.private_segment_size = kernel_info.private_size; | |
packet.group_segment_size = kernel_info.group_size; | |
packet.kernel_object = kernel_info.handle; | |
packet.kernarg_address = kernarg_storage; | |
  packet.reserved2 = 0; | |
  packet.completion_signal = dispatch_signal; | |
uint16_t packet_header = | |
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | | |
(1 << HSA_PACKET_HEADER_BARRIER) | | |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | | |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); | |
uint32_t packet_header_setup = packet_header | (packet.setup << 16); | |
uint64_t packet_id = hsa_queue_add_write_index_screlease(gpu_queue, 1); | |
while ((packet_id - hsa_queue_load_read_index_acquire(gpu_queue)) >= | |
gpu_queue->size) { | |
sleep(0); | |
} | |
hsa_kernel_dispatch_packet_t* packet_ptr = | |
(hsa_kernel_dispatch_packet_t*)((uint8_t*)gpu_queue->base_address + | |
(packet_id & (gpu_queue->size - 1)) * 64); | |
nontemporalMemcpy(packet_ptr, &packet, sizeof(packet)); | |
atomic_store_explicit((volatile atomic_uint*)packet_ptr, packet_header_setup, | |
memory_order_release); | |
  // the stored value is ignored for HSA_QUEUE_TYPE_MULTI queues | |
hsa_signal_store_relaxed(gpu_queue->doorbell_signal, packet_id); | |
hsa_signal_value_t wait_value = | |
hsa_signal_wait_scacquire(dispatch_signal, HSA_SIGNAL_CONDITION_EQ, 0, | |
UINT64_MAX, HSA_WAIT_STATE_BLOCKED); | |
if (wait_value != 0) { | |
fprintf(stderr, "wait failed\n"); | |
} | |
for (uint32_t i = 0; i < element_count; ++i) { | |
fprintf(stderr, "%u ", buffer[i]); | |
} | |
fprintf(stderr, "\n"); | |
// err = hsa_amd_memory_pool_free(buffer); | |
herr = hipHostFree(buffer); | |
err = hsa_signal_destroy(dispatch_signal); | |
err = hsa_amd_memory_pool_free(kernarg_storage); | |
err = hsa_queue_destroy(gpu_queue); | |
err = hsa_executable_destroy(executable); | |
err = hsa_code_object_reader_destroy(object_reader); | |
hipModuleUnload(hip_module); | |
err = hsa_shut_down(); | |
return 0; | |
} |
// Copyright 2024 The IREE Authors | |
// | |
// Licensed under the Apache License v2.0 with LLVM Exceptions. | |
// See https://llvm.org/LICENSE.txt for license information. | |
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |
#ifndef IREE_HAL_DRIVERS_AMDGPU_DEVICE_SUPPORT_OPENCL_H_ | |
#define IREE_HAL_DRIVERS_AMDGPU_DEVICE_SUPPORT_OPENCL_H_ | |
//===----------------------------------------------------------------------===// | |
// Compiler Configuration | |
//===----------------------------------------------------------------------===// | |
#if defined(__AMDGPU__) | |
#define IREE_AMDGPU_TARGET_DEVICE 1 | |
#else | |
#define IREE_AMDGPU_TARGET_HOST 1 | |
#endif // __AMDGPU__ | |
//===----------------------------------------------------------------------===// | |
// OpenCL Attributes | |
//===----------------------------------------------------------------------===// | |
#if defined(IREE_AMDGPU_TARGET_DEVICE) | |
#define __kernel [[clang::amdgpu_kernel, gnu::visibility("protected")]] | |
#define IREE_AMDGPU_RESTRICT __restrict__ | |
#define IREE_AMDGPU_ALIGNAS(x) __attribute__((aligned(x))) | |
#define IREE_AMDGPU_GLOBAL | |
#define IREE_AMDGPU_CONSTANT | |
#define IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE __attribute__((always_inline)) | |
#define IREE_AMDGPU_ATTRIBUTE_CONST | |
#define IREE_AMDGPU_ATTRIBUTE_SINGLE_WORK_ITEM __attribute__((work_group_size_hint(1, 1, 1))) | |
#define IREE_AMDGPU_ATTRIBUTE_PACKED __attribute__((__packed__)) | |
#define IREE_AMDGPU_LIKELY(x) (__builtin_expect(!!(x), 1)) | |
#define IREE_AMDGPU_UNLIKELY(x) (__builtin_expect(!!(x), 0)) | |
#define IREE_AMDGPU_STATIC_ASSERT(x, y) \ | |
IREE_AMDGPU_STATIC_ASSERT__(x, __COUNTER__) | |
#define IREE_AMDGPU_STATIC_ASSERT__(x, y) IREE_AMDGPU_STATIC_ASSERT___(x, y) | |
#define IREE_AMDGPU_STATIC_ASSERT___(x, y) \ | |
typedef char __assert_##y[(x) ? 1 : -1] __attribute__((__unused__)) | |
#else | |
#define IREE_AMDGPU_RESTRICT IREE_RESTRICT | |
#define IREE_AMDGPU_ALIGNAS(x) iree_alignas(x) | |
#define IREE_AMDGPU_GLOBAL | |
#define IREE_AMDGPU_CONSTANT | |
#define IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE IREE_ATTRIBUTE_ALWAYS_INLINE | |
#define IREE_AMDGPU_ATTRIBUTE_CONST | |
#define IREE_AMDGPU_ATTRIBUTE_SINGLE_WORK_ITEM | |
#define IREE_AMDGPU_ATTRIBUTE_PACKED IREE_ATTRIBUTE_PACKED | |
#define IREE_AMDGPU_LIKELY(x) IREE_LIKELY(x) | |
#define IREE_AMDGPU_UNLIKELY(x) IREE_UNLIKELY(x) | |
#define IREE_AMDGPU_STATIC_ASSERT(x, y) static_assert(x, y) | |
#endif // IREE_AMDGPU_TARGET_DEVICE | |
//===----------------------------------------------------------------------===// | |
// Types | |
//===----------------------------------------------------------------------===// | |
#if defined(IREE_AMDGPU_TARGET_DEVICE) | |
typedef char int8_t; | |
typedef unsigned char uint8_t; | |
typedef short int16_t; | |
typedef unsigned short uint16_t; | |
typedef int int32_t; | |
typedef unsigned int uint32_t; | |
typedef long int64_t; | |
typedef unsigned long uint64_t; | |
typedef int64_t ssize_t; | |
typedef uint64_t size_t; | |
#else | |
#include <stdint.h> | |
#endif // IREE_AMDGPU_TARGET_DEVICE | |
//===----------------------------------------------------------------------===// | |
// Alignment | |
//===----------------------------------------------------------------------===// | |
#define IREE_AMDGPU_MIN(a, b) (((a) < (b)) ? (a) : (b)) | |
#define IREE_AMDGPU_MAX(a, b) (((a) > (b)) ? (a) : (b)) | |
#define IREE_AMDGPU_CEIL_DIV(lhs, rhs) (((lhs) + (rhs) - 1) / (rhs)) | |
//===----------------------------------------------------------------------===// | |
// OpenCL Atomics | |
//===----------------------------------------------------------------------===// | |
#define iree_hal_amdgpu_device_destructive_interference_size 64 | |
#define iree_hal_amdgpu_device_constructive_interference_size 64 | |
#if defined(IREE_AMDGPU_TARGET_DEVICE) | |
// #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable | |
// #pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable | |
#define memory_scope_all_svm_devices __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES | |
#define memory_scope_all_devices memory_scope_all_svm_devices | |
typedef uint32_t iree_hal_amdgpu_device_memory_order_t; | |
#define iree_hal_amdgpu_device_memory_order_relaxed __ATOMIC_RELAXED | |
#define iree_hal_amdgpu_device_memory_order_acquire __ATOMIC_ACQUIRE | |
#define iree_hal_amdgpu_device_memory_order_release __ATOMIC_RELEASE | |
#define iree_hal_amdgpu_device_memory_order_acq_rel __ATOMIC_ACQ_REL | |
#define iree_hal_amdgpu_device_memory_order_seq_cst __ATOMIC_SEQ_CST | |
#define iree_hal_amdgpu_device_memory_scope_work_item memory_scope_work_item | |
#define iree_hal_amdgpu_device_memory_scope_work_group memory_scope_work_group | |
#define iree_hal_amdgpu_device_memory_scope_device memory_scope_device | |
#define iree_hal_amdgpu_device_memory_scope_system memory_scope_all_svm_devices | |
#define iree_hal_amdgpu_device_memory_scope_sub_group memory_scope_sub_group | |
#define IREE_HAL_AMDGPU_DEVICE_ATOMIC_INIT(object, value) \ | |
atomic_init((object), (value)) | |
typedef _Atomic int32_t iree_hal_amdgpu_device_atomic_int32_t; | |
typedef _Atomic int64_t iree_hal_amdgpu_device_atomic_int64_t; | |
typedef _Atomic uint32_t iree_hal_amdgpu_device_atomic_uint32_t; | |
typedef _Atomic uint64_t iree_hal_amdgpu_device_atomic_uint64_t; | |
#define iree_hal_amdgpu_device_atomic_load_explicit(object, memory_order, \ | |
memory_scope) \ | |
__opencl_atomic_load((object), (memory_order), (memory_scope)) | |
#define iree_hal_amdgpu_device_atomic_store_explicit( \ | |
object, desired, memory_order, memory_scope) \ | |
__opencl_atomic_store((object), (desired), (memory_order), (memory_scope)) | |
#define iree_hal_amdgpu_device_atomic_fetch_add_explicit( \ | |
object, operand, memory_order, memory_scope) \ | |
__opencl_atomic_fetch_add((object), (operand), (memory_order), (memory_scope)) | |
#define iree_hal_amdgpu_device_atomic_fetch_sub_explicit( \ | |
object, operand, memory_order, memory_scope) \ | |
__opencl_atomic_fetch_sub((object), (operand), (memory_order), (memory_scope)) | |
#define iree_hal_amdgpu_device_atomic_fetch_and_explicit( \ | |
object, operand, memory_order, memory_scope) \ | |
__opencl_atomic_fetch_and((object), (operand), (memory_order), (memory_scope)) | |
#define iree_hal_amdgpu_device_atomic_fetch_or_explicit( \ | |
object, operand, memory_order, memory_scope) \ | |
__opencl_atomic_fetch_or((object), (operand), (memory_order), (memory_scope)) | |
#define iree_hal_amdgpu_device_atomic_fetch_xor_explicit( \ | |
object, operand, memory_order, memory_scope) \ | |
__opencl_atomic_fetch_xor((object), (operand), (memory_order), (memory_scope)) | |
#define iree_hal_amdgpu_device_atomic_exchange_explicit( \ | |
object, desired, memory_order, memory_scope) \ | |
__opencl_atomic_exchange((object), (desired), (memory_order), (memory_scope)) | |
#define iree_hal_amdgpu_device_atomic_compare_exchange_weak_explicit( \ | |
object, expected, desired, memory_order_success, memory_order_fail, \ | |
memory_scope) \ | |
__opencl_atomic_compare_exchange_weak((object), (expected), (desired), \ | |
(memory_order_success), \ | |
(memory_order_fail), (memory_scope)) | |
#define iree_hal_amdgpu_device_atomic_compare_exchange_strong_explicit( \ | |
object, expected, desired, memory_order_success, memory_order_fail, \ | |
memory_scope) \ | |
__opencl_atomic_compare_exchange_strong((object), (expected), (desired), \ | |
(memory_order_success), \ | |
(memory_order_fail), (memory_scope)) | |
#else | |
#define IREE_HAL_AMDGPU_DEVICE_ATOMIC_INIT(object, value) \ | |
*(object) = IREE_ATOMIC_VAR_INIT(value) | |
typedef iree_atomic_int32_t iree_hal_amdgpu_device_atomic_int32_t; | |
typedef iree_atomic_int64_t iree_hal_amdgpu_device_atomic_int64_t; | |
typedef iree_atomic_uint32_t iree_hal_amdgpu_device_atomic_uint32_t; | |
typedef iree_atomic_uint64_t iree_hal_amdgpu_device_atomic_uint64_t; | |
#endif // IREE_AMDGPU_TARGET_DEVICE | |
//===----------------------------------------------------------------------===// | |
// OpenCL Dispatch ABI | |
//===----------------------------------------------------------------------===// | |
typedef struct iree_hsa_signal_s { | |
// Opaque handle. The value 0 is reserved. | |
uint64_t handle; | |
} iree_hsa_signal_t; | |
#define iree_hsa_signal_null() \ | |
(iree_hsa_signal_t) { 0 } | |
#define iree_hsa_signal_is_null(signal) ((signal).handle == 0) | |
typedef enum { | |
HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS = 0 | |
} hsa_kernel_dispatch_packet_setup_t; | |
typedef struct hsa_kernel_dispatch_packet_s { | |
uint16_t header; | |
uint16_t setup; | |
uint16_t workgroup_size_x; | |
uint16_t workgroup_size_y; | |
uint16_t workgroup_size_z; | |
uint16_t reserved0; | |
uint32_t grid_size_x; | |
uint32_t grid_size_y; | |
uint32_t grid_size_z; | |
uint32_t private_segment_size; | |
uint32_t group_segment_size; | |
uint64_t kernel_object; | |
IREE_AMDGPU_GLOBAL void* kernarg_address; | |
uint64_t reserved2; | |
iree_hsa_signal_t completion_signal; | |
} hsa_kernel_dispatch_packet_t; | |
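// All AQL packets are 64 bytes; the (packet_id & mask) * 64 queue indexing | |
// used throughout relies on this. | |
IREE_AMDGPU_STATIC_ASSERT(sizeof(hsa_kernel_dispatch_packet_t) == 64, | |
                          "AQL packets must be 64 bytes"); | |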
typedef enum { | |
HSA_PACKET_TYPE_VENDOR_SPECIFIC = 0, | |
HSA_PACKET_TYPE_INVALID = 1, | |
HSA_PACKET_TYPE_KERNEL_DISPATCH = 2, | |
HSA_PACKET_TYPE_BARRIER_AND = 3, | |
HSA_PACKET_TYPE_AGENT_DISPATCH = 4, | |
HSA_PACKET_TYPE_BARRIER_OR = 5 | |
} hsa_packet_type_t; | |
typedef enum { | |
HSA_PACKET_HEADER_TYPE = 0, | |
HSA_PACKET_HEADER_BARRIER = 8, | |
HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE = 9, | |
HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE = 11, | |
} hsa_packet_header_t; | |
typedef enum { | |
HSA_FENCE_SCOPE_NONE = 0, | |
HSA_FENCE_SCOPE_AGENT = 1, | |
HSA_FENCE_SCOPE_SYSTEM = 2 | |
} hsa_fence_scope_t; | |
typedef struct hsa_agent_dispatch_packet_s { | |
uint16_t header; | |
uint16_t type; | |
uint32_t reserved0; | |
void* return_address; | |
uint64_t arg[4]; | |
uint64_t reserved2; | |
iree_hsa_signal_t completion_signal; | |
} iree_hsa_agent_dispatch_packet_t; | |
// AMD Signal Kind Enumeration Values. | |
enum iree_amd_signal_kind_t { | |
IREE_AMD_SIGNAL_KIND_INVALID = 0, | |
IREE_AMD_SIGNAL_KIND_USER = 1, | |
IREE_AMD_SIGNAL_KIND_DOORBELL = -1, | |
IREE_AMD_SIGNAL_KIND_LEGACY_DOORBELL = -2 | |
}; | |
typedef int64_t iree_amd_signal_kind64_t; | |
typedef struct IREE_AMDGPU_ALIGNAS(64) iree_amd_signal_s { | |
iree_amd_signal_kind64_t kind; | |
union { | |
volatile int64_t value; | |
IREE_AMDGPU_GLOBAL volatile uint32_t* legacy_hardware_doorbell_ptr; | |
IREE_AMDGPU_GLOBAL volatile uint64_t* hardware_doorbell_ptr; | |
}; | |
uint64_t event_mailbox_ptr; | |
uint32_t event_id; | |
uint32_t reserved1; | |
uint64_t start_ts; | |
uint64_t end_ts; | |
union { | |
IREE_AMDGPU_GLOBAL /*iree_amd_queue_t*/ void* queue_ptr; | |
uint64_t reserved2; | |
}; | |
uint32_t reserved3[2]; | |
} iree_amd_signal_t; | |
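// The AMD signal ABI is a single 64-byte cache line; the runtime and hardware | |
// assume this layout. | |
IREE_AMDGPU_STATIC_ASSERT(sizeof(iree_amd_signal_t) == 64, | |
                          "amd_signal_t must be one 64-byte cache line"); | |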
#if defined(IREE_AMDGPU_TARGET_DEVICE) | |
extern IREE_AMDGPU_ATTRIBUTE_CONST size_t __ockl_get_global_id(unsigned dim); | |
extern IREE_AMDGPU_ATTRIBUTE_CONST size_t __ockl_get_local_id(unsigned dim); | |
extern IREE_AMDGPU_ATTRIBUTE_CONST size_t __ockl_get_group_id(unsigned dim); | |
extern IREE_AMDGPU_ATTRIBUTE_CONST size_t __ockl_get_local_size(unsigned dim); | |
extern IREE_AMDGPU_ATTRIBUTE_CONST size_t __ockl_get_num_groups(unsigned dim); | |
// #define iree_hal_amdgpu_device_global_id_x() __ockl_get_global_id(0) | |
#define iree_hal_amdgpu_device_global_id_y() __ockl_get_global_id(1) | |
#define iree_hal_amdgpu_device_global_id_z() __ockl_get_global_id(2) | |
#define iree_hal_amdgpu_device_group_id_x() __ockl_get_group_id(0) | |
#define iree_hal_amdgpu_device_group_id_y() __ockl_get_group_id(1) | |
#define iree_hal_amdgpu_device_group_id_z() __ockl_get_group_id(2) | |
#define iree_hal_amdgpu_device_group_count_x() __ockl_get_num_groups(0) | |
#define iree_hal_amdgpu_device_group_count_y() __ockl_get_num_groups(1) | |
#define iree_hal_amdgpu_device_group_count_z() __ockl_get_num_groups(2) | |
#define iree_hal_amdgpu_device_local_id_x() __ockl_get_local_id(0) | |
#define iree_hal_amdgpu_device_local_id_y() __ockl_get_local_id(1) | |
#define iree_hal_amdgpu_device_local_id_z() __ockl_get_local_id(2) | |
#define iree_hal_amdgpu_device_workgroup_size_x() __ockl_get_local_size(0) | |
#define iree_hal_amdgpu_device_workgroup_size_y() __ockl_get_local_size(1) | |
#define iree_hal_amdgpu_device_workgroup_size_z() __ockl_get_local_size(2) | |
extern IREE_AMDGPU_ATTRIBUTE_CONST IREE_AMDGPU_CONSTANT void* | |
iree_amdgcn_implicitarg_ptr(void) __asm("llvm.amdgcn.implicitarg.ptr"); | |
// Returns the pointer to the iree_hsa_kernel_dispatch_packet_t being executed. | |
#define iree_amdgcn_dispatch_ptr __builtin_amdgcn_dispatch_ptr | |
// Equivalent of __ockl_get_global_id(0) / get_global_id_x under the old ABI. | |
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE size_t | |
iree_hal_amdgpu_device_global_id_x(void) { | |
const uint32_t local_id = __builtin_amdgcn_workitem_id_x(); | |
const uint32_t group_id = __builtin_amdgcn_workgroup_id_x(); | |
const uint32_t group_size = | |
((hsa_kernel_dispatch_packet_t*)iree_amdgcn_dispatch_ptr()) | |
->workgroup_size_x; | |
return (group_id * group_size + local_id); | |
} | |
#endif // IREE_AMDGPU_TARGET_DEVICE | |
//===----------------------------------------------------------------------===// | |
// Sleep | |
//===----------------------------------------------------------------------===// | |
#if defined(IREE_AMDGPU_TARGET_DEVICE) | |
extern void __builtin_amdgcn_s_sleep(int); | |
// Sleeps the current wave for some "short" amount of time. | |
// This maps to the S_SLEEP instruction, which varies across architectures in | |
// how long it can delay execution. The behavior cannot be mapped to wall time: | |
// the wave suspends for 64*arg plus 1-64 clocks, architectures have different | |
// limits, and clock speed can vary over the course of execution. This is | |
// mostly only useful as a "yield for a few instructions to stop hammering a | |
// memory location" primitive. | |
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE void | |
iree_hal_amdgpu_device_yield(void) { | |
__builtin_amdgcn_s_sleep(1); | |
} | |
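// Hedged usage sketch of the yield primitive above: back off between polls of | |
// a device-visible flag instead of hammering memory. 'flag' is a hypothetical | |
// location; this helper is not used elsewhere in this file. | |
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE void | |
iree_hal_amdgpu_device_spin_until_nonzero( | |
    IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_uint32_t* flag) { | |
  while (iree_hal_amdgpu_device_atomic_load_explicit( | |
             flag, iree_hal_amdgpu_device_memory_order_acquire, | |
             iree_hal_amdgpu_device_memory_scope_system) == 0) { | |
    iree_hal_amdgpu_device_yield(); | |
  } | |
} | |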
#endif // IREE_AMDGPU_TARGET_DEVICE | |
#endif // IREE_HAL_DRIVERS_AMDGPU_DEVICE_SUPPORT_OPENCL_H_ | |
// const IREE_AMDGPU_CONSTANT int __oclc_ABI_version = 500; | |
const IREE_AMDGPU_CONSTANT bool __oclc_unsafe_math_opt = false; | |
const IREE_AMDGPU_CONSTANT bool __oclc_daz_opt = false; | |
const IREE_AMDGPU_CONSTANT bool __oclc_correctly_rounded_sqrt32 = true; | |
const IREE_AMDGPU_CONSTANT bool __oclc_finite_only_opt = false; | |
const IREE_AMDGPU_CONSTANT bool __oclc_wavefrontsize64 = | |
__AMDGCN_WAVEFRONT_SIZE__ == 64 ? 1 : 0; | |
#if defined(__gfx700__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 7000; | |
#elif defined(__gfx701__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 7001; | |
#elif defined(__gfx702__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 7002; | |
#elif defined(__gfx703__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 7003; | |
#elif defined(__gfx704__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 7004; | |
#elif defined(__gfx705__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 7005; | |
#elif defined(__gfx801__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 8001; | |
#elif defined(__gfx802__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 8002; | |
#elif defined(__gfx803__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 8003; | |
#elif defined(__gfx805__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 8005; | |
#elif defined(__gfx810__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 8100; | |
#elif defined(__gfx900__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 9000; | |
#elif defined(__gfx902__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 9002; | |
#elif defined(__gfx904__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 9004; | |
#elif defined(__gfx906__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 9006; | |
#elif defined(__gfx908__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 9008; | |
#elif defined(__gfx909__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 9009; | |
#elif defined(__gfx90a__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 9010; | |
#elif defined(__gfx90c__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 9012; | |
#elif defined(__gfx940__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 9400; | |
#elif defined(__gfx941__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 9401; | |
#elif defined(__gfx942__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 9402; | |
#elif defined(__gfx1010__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 10100; | |
#elif defined(__gfx1011__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 10101; | |
#elif defined(__gfx1012__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 10102; | |
#elif defined(__gfx1013__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 10103; | |
#elif defined(__gfx1030__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 10300; | |
#elif defined(__gfx1031__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 10301; | |
#elif defined(__gfx1032__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 10302; | |
#elif defined(__gfx1033__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 10303; | |
#elif defined(__gfx1034__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 10304; | |
#elif defined(__gfx1035__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 10305; | |
#elif defined(__gfx1036__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 10306; | |
#elif defined(__gfx1100__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 11000; | |
#elif defined(__gfx1101__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 11001; | |
#elif defined(__gfx1102__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 11002; | |
#elif defined(__gfx1103__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 11003; | |
#elif defined(__gfx1150__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 11500; | |
#elif defined(__gfx1151__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 11501; | |
#elif defined(__gfx1200__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 12000; | |
#elif defined(__gfx1201__) | |
const IREE_AMDGPU_CONSTANT unsigned __oclc_ISA_version = 12001; | |
#else | |
#error "Unknown AMDGPU architecture" | |
#endif | |
__kernel void add_one( | |
uint32_t n, IREE_AMDGPU_GLOBAL uint32_t* IREE_AMDGPU_RESTRICT buffer) { | |
const size_t idx = iree_hal_amdgpu_device_global_id_x(); | |
if (idx < n) { | |
buffer[idx] += 1; | |
} | |
} | |
typedef struct iree_tracing_location_s { | |
IREE_AMDGPU_CONSTANT const char* name; | |
IREE_AMDGPU_CONSTANT const char* function; | |
IREE_AMDGPU_CONSTANT const char* file; | |
uint32_t line; | |
uint32_t color; | |
} iree_tracing_location_t; | |
__kernel void add_one_with_timestamp( | |
uint32_t n, IREE_AMDGPU_GLOBAL uint32_t* IREE_AMDGPU_RESTRICT buffer) { | |
const size_t idx = iree_hal_amdgpu_device_global_id_x(); | |
if (idx == 0) { | |
// correlates with signal start_ts/end_ts | |
// can be converted with hsa_amd_profiling_convert_tick_to_system_domain | |
uint64_t t = __builtin_readsteadycounter(); | |
buffer[0] = (uint32_t)(t >> 32); | |
buffer[1] = (uint32_t)t; | |
} else if (idx == 2) { | |
static const IREE_AMDGPU_CONSTANT iree_tracing_location_t | |
__tracy_source_location = { | |
"helloworld", __FUNCTION__, __FILE__, | |
(uint32_t)__LINE__, 0xCDCDCDCD, | |
}; | |
uint64_t t = (uint64_t)&__tracy_source_location; | |
buffer[2] = (uint32_t)(t >> 32); | |
buffer[3] = (uint32_t)t; | |
} else if (idx > 3) { | |
buffer[idx] += 1; | |
} | |
} | |
__kernel void mul_x(uint32_t x, uint32_t n, | |
IREE_AMDGPU_GLOBAL uint32_t* IREE_AMDGPU_RESTRICT buffer) { | |
const size_t idx = iree_hal_amdgpu_device_global_id_x(); | |
if (idx < n) { | |
buffer[idx] *= x; | |
} | |
} | |
//===----------------------------------------------------------------------===// | |
// Device Library Externs | |
//===----------------------------------------------------------------------===// | |
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE void | |
iree_hsa_signal_update_mailbox( | |
const IREE_AMDGPU_GLOBAL iree_amd_signal_t* IREE_AMDGPU_RESTRICT signal) { | |
IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_uint64_t* mailbox = | |
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_uint64_t*) | |
signal->event_mailbox_ptr; | |
if (mailbox) { | |
const uint32_t event_id = signal->event_id; | |
iree_hal_amdgpu_device_atomic_store_explicit( | |
mailbox, event_id, iree_hal_amdgpu_device_memory_order_release, | |
iree_hal_amdgpu_device_memory_scope_system); | |
__builtin_amdgcn_s_sendmsg(1 | (0 << 4), | |
__builtin_amdgcn_readfirstlane(event_id) & 0xff); | |
} | |
} | |
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE int64_t | |
iree_hsa_signal_load(const iree_hsa_signal_t signal, | |
iree_hal_amdgpu_device_memory_order_t memory_order) { | |
const IREE_AMDGPU_GLOBAL iree_amd_signal_t* s = | |
(const IREE_AMDGPU_GLOBAL iree_amd_signal_t*)signal.handle; | |
return iree_hal_amdgpu_device_atomic_load_explicit( | |
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_int64_t*)&s->value, | |
memory_order, iree_hal_amdgpu_device_memory_scope_system); | |
} | |
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE void iree_hsa_signal_add( | |
iree_hsa_signal_t signal, int64_t value, | |
iree_hal_amdgpu_device_memory_order_t memory_order) { | |
IREE_AMDGPU_GLOBAL iree_amd_signal_t* s = | |
(IREE_AMDGPU_GLOBAL iree_amd_signal_t*)signal.handle; | |
iree_hal_amdgpu_device_atomic_fetch_add_explicit( | |
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_int64_t*)&s->value, | |
value, memory_order, iree_hal_amdgpu_device_memory_scope_system); | |
iree_hsa_signal_update_mailbox(s); | |
} | |
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE void iree_hsa_signal_and( | |
iree_hsa_signal_t signal, int64_t value, | |
iree_hal_amdgpu_device_memory_order_t memory_order) { | |
IREE_AMDGPU_GLOBAL iree_amd_signal_t* s = | |
(IREE_AMDGPU_GLOBAL iree_amd_signal_t*)signal.handle; | |
iree_hal_amdgpu_device_atomic_fetch_and_explicit( | |
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_int64_t*)&s->value, | |
value, memory_order, iree_hal_amdgpu_device_memory_scope_system); | |
iree_hsa_signal_update_mailbox(s); | |
} | |
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE void iree_hsa_signal_or( | |
iree_hsa_signal_t signal, int64_t value, | |
iree_hal_amdgpu_device_memory_order_t memory_order) { | |
IREE_AMDGPU_GLOBAL iree_amd_signal_t* s = | |
(IREE_AMDGPU_GLOBAL iree_amd_signal_t*)signal.handle; | |
iree_hal_amdgpu_device_atomic_fetch_or_explicit( | |
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_int64_t*)&s->value, | |
value, memory_order, iree_hal_amdgpu_device_memory_scope_system); | |
iree_hsa_signal_update_mailbox(s); | |
} | |
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE void iree_hsa_signal_xor( | |
iree_hsa_signal_t signal, int64_t value, | |
iree_hal_amdgpu_device_memory_order_t memory_order) { | |
IREE_AMDGPU_GLOBAL iree_amd_signal_t* s = | |
(IREE_AMDGPU_GLOBAL iree_amd_signal_t*)signal.handle; | |
iree_hal_amdgpu_device_atomic_fetch_xor_explicit( | |
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_int64_t*)&s->value, | |
value, memory_order, iree_hal_amdgpu_device_memory_scope_system); | |
iree_hsa_signal_update_mailbox(s); | |
} | |
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE int64_t | |
iree_hsa_signal_exchange(iree_hsa_signal_t signal, int64_t value, | |
iree_hal_amdgpu_device_memory_order_t memory_order) { | |
IREE_AMDGPU_GLOBAL iree_amd_signal_t* s = | |
(IREE_AMDGPU_GLOBAL iree_amd_signal_t*)signal.handle; | |
int64_t existing = iree_hal_amdgpu_device_atomic_exchange_explicit( | |
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_int64_t*)&s->value, | |
value, memory_order, iree_hal_amdgpu_device_memory_scope_system); | |
iree_hsa_signal_update_mailbox(s); | |
return existing; | |
} | |
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE void iree_hsa_signal_subtract( | |
iree_hsa_signal_t signal, int64_t value, | |
iree_hal_amdgpu_device_memory_order_t memory_order) { | |
IREE_AMDGPU_GLOBAL iree_amd_signal_t* s = | |
(IREE_AMDGPU_GLOBAL iree_amd_signal_t*)signal.handle; | |
iree_hal_amdgpu_device_atomic_fetch_sub_explicit( | |
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_int64_t*)&s->value, | |
value, memory_order, iree_hal_amdgpu_device_memory_scope_system); | |
iree_hsa_signal_update_mailbox(s); | |
} | |
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE int64_t | |
iree_hsa_signal_cas(iree_hsa_signal_t signal, int64_t expected, int64_t value, | |
iree_hal_amdgpu_device_memory_order_t memory_order) { | |
IREE_AMDGPU_GLOBAL iree_amd_signal_t* s = | |
(IREE_AMDGPU_GLOBAL iree_amd_signal_t*)signal.handle; | |
int64_t existing = expected; | |
if (iree_hal_amdgpu_device_atomic_compare_exchange_strong_explicit( | |
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_int64_t*)&s->value, | |
&existing, value, memory_order, | |
iree_hal_amdgpu_device_memory_order_relaxed, | |
iree_hal_amdgpu_device_memory_scope_system)) { | |
iree_hsa_signal_update_mailbox(s); | |
} | |
return existing; | |
} | |
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE void iree_hsa_signal_store( | |
iree_hsa_signal_t signal, int64_t value, | |
iree_hal_amdgpu_device_memory_order_t memory_order) { | |
IREE_AMDGPU_GLOBAL iree_amd_signal_t* s = | |
(IREE_AMDGPU_GLOBAL iree_amd_signal_t*)signal.handle; | |
if (s->kind == IREE_AMD_SIGNAL_KIND_USER) { | |
// User signal may need a mailbox poke. | |
iree_hal_amdgpu_device_atomic_store_explicit( | |
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_int64_t*)&s->value, | |
value, memory_order, iree_hal_amdgpu_device_memory_scope_system); | |
iree_hsa_signal_update_mailbox(s); | |
} else { | |
// Hardware doorbell supports AQL semantics. | |
// NOTE: this requires __oclc_ISA_version >= 9000; older hardware doesn't | |
// support the atomic store knocks and needs emulation. | |
iree_hal_amdgpu_device_atomic_store_explicit( | |
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_uint64_t*) | |
s->hardware_doorbell_ptr, | |
(uint64_t)value, iree_hal_amdgpu_device_memory_order_release, | |
iree_hal_amdgpu_device_memory_scope_system); | |
} | |
} | |
//===----------------------------------------------------------------------===// | |
// HSA/AMDGPU AQL Queue | |
//===----------------------------------------------------------------------===// | |
typedef enum { | |
// Queue supports multiple producers. | |
IREE_HSA_QUEUE_TYPE_MULTI = 0, | |
// Queue only supports a single producer. | |
IREE_HSA_QUEUE_TYPE_SINGLE = 1 | |
} iree_hsa_queue_type_t; | |
typedef struct iree_hsa_queue_s { | |
// Queue type. | |
iree_hsa_queue_type_t type; | |
// Queue features mask. This is a bit-field of hsa_queue_feature_t | |
// values. Applications should ignore any unknown set bits. | |
uint32_t features; | |
IREE_AMDGPU_GLOBAL void* base_address; | |
// Signal object used by the application to indicate the ID of a packet that | |
// is ready to be processed. The HSA runtime manages the doorbell signal. If | |
// the application tries to replace or destroy this signal, the behavior is | |
// undefined. | |
// | |
  // If type is IREE_HSA_QUEUE_TYPE_SINGLE the doorbell signal value must be | |
  // updated in a monotonically increasing fashion. If type is | |
  // IREE_HSA_QUEUE_TYPE_MULTI the doorbell signal value can be updated with | |
  // any value. | |
iree_hsa_signal_t doorbell_signal; | |
// Maximum number of packets the queue can hold. Must be a power of 2. | |
uint32_t size; | |
// Reserved. Must be 0. | |
uint32_t reserved1; | |
// Queue identifier, which is unique over the lifetime of the application. | |
uint64_t id; | |
} iree_hsa_queue_t; | |
#define IREE_AMD_HSA_BITS_CREATE_ENUM_ENTRIES(name, shift, width) \ | |
name##_SHIFT = (shift), name##_WIDTH = (width), \ | |
name = (((1 << (width)) - 1) << (shift)) | |
enum iree_amd_queue_properties_t { | |
IREE_AMD_HSA_BITS_CREATE_ENUM_ENTRIES( | |
IREE_AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER, 0, 1), | |
// All devices we care about are 64-bit. | |
IREE_AMD_HSA_BITS_CREATE_ENUM_ENTRIES(IREE_AMD_QUEUE_PROPERTIES_IS_PTR64, 1, | |
1), | |
IREE_AMD_HSA_BITS_CREATE_ENUM_ENTRIES( | |
IREE_AMD_QUEUE_PROPERTIES_ENABLE_TRAP_HANDLER_DEBUG_SGPRS, 2, 1), | |
// Timestamps will be stored on signals (start_ts/end_ts). | |
IREE_AMD_HSA_BITS_CREATE_ENUM_ENTRIES( | |
IREE_AMD_QUEUE_PROPERTIES_ENABLE_PROFILING, 3, 1), | |
IREE_AMD_HSA_BITS_CREATE_ENUM_ENTRIES(IREE_AMD_QUEUE_PROPERTIES_RESERVED1, 4, | |
28) | |
}; | |
typedef uint32_t iree_amd_queue_properties32_t; | |
// An AQL packet queue. | |
// We generally treat these as opaque except for if we need to read queue | |
// properties to check modes - otherwise we just treat any queue handle as | |
// an iree_hsa_queue_t. | |
typedef struct IREE_AMDGPU_ALIGNAS(64) iree_amd_queue_s { | |
iree_hsa_queue_t hsa_queue; | |
uint32_t reserved1[4]; | |
volatile uint64_t write_dispatch_id; | |
uint32_t group_segment_aperture_base_hi; | |
uint32_t private_segment_aperture_base_hi; | |
uint32_t max_cu_id; | |
uint32_t max_wave_id; | |
volatile uint64_t max_legacy_doorbell_dispatch_id_plus_1; | |
volatile uint32_t legacy_doorbell_lock; | |
uint32_t reserved2[9]; | |
volatile uint64_t read_dispatch_id; | |
uint32_t read_dispatch_id_field_base_byte_offset; | |
uint32_t compute_tmpring_size; | |
uint32_t scratch_resource_descriptor[4]; | |
uint64_t scratch_backing_memory_location; | |
uint64_t scratch_backing_memory_byte_size; | |
uint32_t scratch_workitem_byte_size; | |
iree_amd_queue_properties32_t queue_properties; | |
uint32_t reserved3[2]; | |
iree_hsa_signal_t queue_inactive_signal; | |
uint32_t reserved4[14]; | |
} iree_amd_queue_t; | |
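// The read index intentionally sits on a different cache line (+128) than the | |
// write index so producers and consumers don't false-share; this matches the | |
// AMD queue ABI. | |
IREE_AMDGPU_STATIC_ASSERT( | |
    __builtin_offsetof(iree_amd_queue_t, read_dispatch_id) == 128, | |
    "amd_queue_t ABI layout mismatch"); | |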
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE uint64_t | |
iree_hsa_queue_load_read_index( | |
const IREE_AMDGPU_GLOBAL iree_hsa_queue_t* IREE_AMDGPU_RESTRICT queue, | |
iree_hal_amdgpu_device_memory_order_t memory_order) { | |
const IREE_AMDGPU_GLOBAL iree_amd_queue_t* q = | |
(const IREE_AMDGPU_GLOBAL iree_amd_queue_t*)queue; | |
return iree_hal_amdgpu_device_atomic_load_explicit( | |
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_uint64_t*)&q | |
->read_dispatch_id, | |
memory_order, iree_hal_amdgpu_device_memory_scope_system); | |
} | |
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE uint64_t | |
iree_hsa_queue_load_write_index( | |
const IREE_AMDGPU_GLOBAL iree_hsa_queue_t* IREE_AMDGPU_RESTRICT queue, | |
iree_hal_amdgpu_device_memory_order_t memory_order) { | |
const IREE_AMDGPU_GLOBAL iree_amd_queue_t* q = | |
(const IREE_AMDGPU_GLOBAL iree_amd_queue_t*)queue; | |
return iree_hal_amdgpu_device_atomic_load_explicit( | |
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_uint64_t*)&q | |
->write_dispatch_id, | |
memory_order, iree_hal_amdgpu_device_memory_scope_system); | |
} | |
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE uint64_t | |
iree_hsa_queue_add_write_index( | |
IREE_AMDGPU_GLOBAL iree_hsa_queue_t* IREE_AMDGPU_RESTRICT queue, | |
uint64_t value, iree_hal_amdgpu_device_memory_order_t memory_order) { | |
IREE_AMDGPU_GLOBAL iree_amd_queue_t* q = | |
(IREE_AMDGPU_GLOBAL iree_amd_queue_t*)queue; | |
return iree_hal_amdgpu_device_atomic_fetch_add_explicit( | |
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_uint64_t*)&q | |
->write_dispatch_id, | |
value, memory_order, iree_hal_amdgpu_device_memory_scope_system); | |
} | |
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE uint64_t | |
iree_hsa_queue_cas_write_index( | |
IREE_AMDGPU_GLOBAL iree_hsa_queue_t* IREE_AMDGPU_RESTRICT queue, | |
uint64_t expected, uint64_t value, | |
iree_hal_amdgpu_device_memory_order_t memory_order) { | |
IREE_AMDGPU_GLOBAL iree_amd_queue_t* q = | |
(IREE_AMDGPU_GLOBAL iree_amd_queue_t*)queue; | |
uint64_t e = expected; | |
iree_hal_amdgpu_device_atomic_compare_exchange_strong_explicit( | |
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_uint64_t*)&q | |
->write_dispatch_id, | |
&e, value, memory_order, iree_hal_amdgpu_device_memory_order_relaxed, | |
iree_hal_amdgpu_device_memory_scope_system); | |
return e; | |
} | |
static inline IREE_AMDGPU_ATTRIBUTE_ALWAYS_INLINE void | |
iree_hsa_queue_store_write_index( | |
IREE_AMDGPU_GLOBAL iree_hsa_queue_t* IREE_AMDGPU_RESTRICT queue, | |
uint64_t value, iree_hal_amdgpu_device_memory_order_t memory_order) { | |
IREE_AMDGPU_GLOBAL iree_amd_queue_t* q = | |
(IREE_AMDGPU_GLOBAL iree_amd_queue_t*)queue; | |
iree_hal_amdgpu_device_atomic_store_explicit( | |
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_uint64_t*)&q | |
->write_dispatch_id, | |
value, memory_order, iree_hal_amdgpu_device_memory_scope_system); | |
} | |
void iree_hal_amdgpu_device_host_enqueue( | |
IREE_AMDGPU_GLOBAL iree_hsa_queue_t* IREE_AMDGPU_RESTRICT queue, | |
uint16_t type, uint64_t return_address, uint64_t arg0, uint64_t arg1, | |
uint64_t arg2, uint64_t arg3, iree_hsa_signal_t completion_signal) { | |
// Reserve a packet write index and wait for it to become available in cases | |
// where the queue is exhausted. | |
uint64_t packet_id = iree_hsa_queue_add_write_index( | |
queue, 1u, iree_hal_amdgpu_device_memory_order_relaxed); | |
while (packet_id - iree_hsa_queue_load_read_index( | |
queue, iree_hal_amdgpu_device_memory_order_acquire) >= | |
queue->size) { | |
iree_hal_amdgpu_device_yield(); // spinning | |
} | |
const uint64_t queue_mask = queue->size - 1; // power of two | |
  IREE_AMDGPU_GLOBAL iree_hsa_agent_dispatch_packet_t* agent_packet = | |
      (IREE_AMDGPU_GLOBAL iree_hsa_agent_dispatch_packet_t*) | |
          ((IREE_AMDGPU_GLOBAL uint8_t*)queue->base_address + | |
           (packet_id & queue_mask) * 64); | |
// Populate all of the packet besides the header. | |
agent_packet->reserved0 = 0; | |
agent_packet->return_address = (void*)return_address; | |
agent_packet->arg[0] = arg0; | |
agent_packet->arg[1] = arg1; | |
agent_packet->arg[2] = arg2; | |
agent_packet->arg[3] = arg3; | |
agent_packet->reserved2 = 0; | |
agent_packet->completion_signal = completion_signal; | |
// Populate the header and release the packet to the queue. | |
// Note that we need to release to all devices so that the host can see it. | |
uint16_t header = HSA_PACKET_TYPE_AGENT_DISPATCH << HSA_PACKET_HEADER_TYPE; | |
header |= 0 << HSA_PACKET_HEADER_BARRIER; | |
header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE; | |
header |= HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE; | |
uint32_t header_type = header | (type << 16); | |
iree_hal_amdgpu_device_atomic_store_explicit( | |
(IREE_AMDGPU_GLOBAL iree_hal_amdgpu_device_atomic_uint32_t*)agent_packet, | |
header_type, iree_hal_amdgpu_device_memory_order_release, | |
iree_hal_amdgpu_device_memory_scope_system); | |
// Signal the queue doorbell. | |
// https://sourcegraph.com/github.com/ROCm/rocMLIR/-/blob/external/llvm-project/amd/device-libs/ockl/src/hsaqs.cl?L69 | |
iree_hsa_signal_store(queue->doorbell_signal, packet_id, | |
iree_hal_amdgpu_device_memory_order_relaxed); | |
} | |
__kernel void issue_host_call( | |
IREE_AMDGPU_GLOBAL iree_hsa_queue_t* IREE_AMDGPU_RESTRICT queue, | |
iree_hsa_signal_t completion_signal, uint32_t arg) { | |
iree_hal_amdgpu_device_host_enqueue(queue, 123, 0x100, arg, 0x201, 0x202, | |
0x203, iree_hsa_signal_null()); | |
iree_hal_amdgpu_device_host_enqueue(queue, 456, 0x100, arg, 0x201, 0x202, | |
0x203, iree_hsa_signal_null()); | |
iree_hal_amdgpu_device_host_enqueue(queue, 789, 0x100, arg, 0x201, 0x202, | |
0x203, completion_signal); | |
} | |
static inline size_t iree_host_align(size_t value, size_t alignment) { | |
return (value + (alignment - 1)) & ~(alignment - 1); | |
} | |
typedef struct implicit_kernargs_t { | |
uint32_t block_count[3]; // + 0/4/8 | |
uint16_t group_size[3]; // + 12/14/16 | |
uint16_t remainder[3]; // + 18/20/22 | |
uint64_t reserved0; // + 24 hidden_tool_correlation_id | |
uint64_t reserved1; // + 32 | |
uint64_t global_offset[3]; // + 40/48/56 | |
uint16_t grid_dims; // + 64 | |
} implicit_kernargs_t; | |
__kernel void issue_dispatch( | |
IREE_AMDGPU_GLOBAL iree_hsa_queue_t* IREE_AMDGPU_RESTRICT queue, | |
uint64_t mul_x_object, uint32_t mul_x_private_size, | |
uint32_t mul_x_group_size, iree_hsa_signal_t completion_signal, | |
IREE_AMDGPU_GLOBAL void* buffer, | |
IREE_AMDGPU_GLOBAL void* IREE_AMDGPU_RESTRICT kernarg_storage, | |
uint32_t element_count, uint32_t mul_by) { | |
typedef struct mul_x_args_t { | |
uint32_t x; | |
uint32_t n; | |
IREE_AMDGPU_GLOBAL void* buffer; | |
} mul_x_args_t; | |
IREE_AMDGPU_GLOBAL mul_x_args_t* mul_x_kernargs = | |
(IREE_AMDGPU_GLOBAL mul_x_args_t*)kernarg_storage; | |
mul_x_kernargs->x = mul_by; | |
mul_x_kernargs->n = element_count; | |
mul_x_kernargs->buffer = buffer; | |
uint32_t grid_size[3] = {element_count, 1, 1}; | |
uint16_t workgroup_size[3] = {32, 1, 1}; | |
IREE_AMDGPU_GLOBAL implicit_kernargs_t* implicit_kernargs = | |
(IREE_AMDGPU_GLOBAL implicit_kernargs_t*)((uint8_t*)kernarg_storage + | |
iree_host_align( | |
sizeof(mul_x_args_t), 8)); | |
implicit_kernargs->block_count[0] = grid_size[0] / workgroup_size[0]; | |
implicit_kernargs->block_count[1] = grid_size[1] / workgroup_size[1]; | |
implicit_kernargs->block_count[2] = grid_size[2] / workgroup_size[2]; | |
implicit_kernargs->group_size[0] = workgroup_size[0]; | |
implicit_kernargs->group_size[1] = workgroup_size[1]; | |
implicit_kernargs->group_size[2] = workgroup_size[2]; | |
implicit_kernargs->remainder[0] = | |
(uint16_t)(grid_size[0] % workgroup_size[0]); | |
implicit_kernargs->remainder[1] = | |
(uint16_t)(grid_size[1] % workgroup_size[1]); | |
implicit_kernargs->remainder[2] = | |
(uint16_t)(grid_size[2] % workgroup_size[2]); | |
implicit_kernargs->reserved0 = 0; | |
implicit_kernargs->reserved1 = 0; | |
implicit_kernargs->global_offset[0] = 0; // newOffset[0]; | |
implicit_kernargs->global_offset[1] = 0; // newOffset[1]; | |
implicit_kernargs->global_offset[2] = 0; // newOffset[2]; | |
implicit_kernargs->grid_dims = 3; | |
// DO NOT SUBMIT should do nontemporal kernarg update? | |
hsa_kernel_dispatch_packet_t packet; | |
packet.header = HSA_PACKET_TYPE_INVALID << HSA_PACKET_HEADER_TYPE; | |
packet.setup = 3 << HSA_KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS; | |
packet.workgroup_size_x = workgroup_size[0]; | |
packet.workgroup_size_y = workgroup_size[1]; | |
packet.workgroup_size_z = workgroup_size[2]; | |
packet.reserved0 = 0; | |
packet.grid_size_x = grid_size[0]; | |
packet.grid_size_y = grid_size[1]; | |
packet.grid_size_z = grid_size[2]; | |
packet.private_segment_size = mul_x_private_size; | |
packet.group_segment_size = mul_x_group_size; | |
packet.kernel_object = mul_x_object; | |
packet.kernarg_address = kernarg_storage; | |
packet.reserved2 = 0; | |
packet.completion_signal = completion_signal; | |
uint16_t packet_header = | |
(HSA_PACKET_TYPE_KERNEL_DISPATCH << HSA_PACKET_HEADER_TYPE) | | |
(1 << HSA_PACKET_HEADER_BARRIER) | | |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) | | |
(HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE); | |
uint32_t packet_header_setup = packet_header | (packet.setup << 16); | |
uint64_t packet_id = iree_hsa_queue_add_write_index( | |
queue, 1, iree_hal_amdgpu_device_memory_order_release); | |
while ( | |
(packet_id - iree_hsa_queue_load_read_index( | |
queue, iree_hal_amdgpu_device_memory_order_acquire)) >= | |
queue->size) { | |
iree_hal_amdgpu_device_yield(); | |
} | |
  IREE_AMDGPU_GLOBAL hsa_kernel_dispatch_packet_t* packet_ptr = | |
      (IREE_AMDGPU_GLOBAL hsa_kernel_dispatch_packet_t*) | |
          ((IREE_AMDGPU_GLOBAL uint8_t*)queue->base_address + | |
           (packet_id & (queue->size - 1)) * 64); | |
// memcpy(packet_ptr, &packet, sizeof(packet)); | |
*packet_ptr = packet; | |
iree_hal_amdgpu_device_atomic_store_explicit( | |
(volatile iree_hal_amdgpu_device_atomic_uint32_t*)packet_ptr, | |
packet_header_setup, iree_hal_amdgpu_device_memory_order_release, | |
iree_hal_amdgpu_device_memory_scope_system); | |
  // the stored value is ignored for IREE_HSA_QUEUE_TYPE_MULTI queues | |
iree_hsa_signal_store(queue->doorbell_signal, packet_id, | |
iree_hal_amdgpu_device_memory_order_relaxed); | |
} |
{ | |
"type": "lldb-dap", | |
"request": "launch", | |
"name": "run-active-hsa-c-file", | |
"program": "${command:cmake.buildDirectory}/vscode/${fileBasenameNoExtension}", | |
"args": [ | |
], | |
"env": [ | |
"LD_LIBRARY_PATH=/home/nod/src/ROCR-Runtime/build/rocr/lib/", | |
], | |
"cwd": "${workspaceRoot}", | |
"preLaunchTask": "build-active-hsa-c-file", | |
"presentation": { | |
"clear": true, | |
"reveal": "silent", | |
"revealProblems": "never", | |
"panel": "shared", | |
"showReuseMessage": false | |
}, | |
}, |
{ | |
"type": "cppbuild", | |
"label": "build-active-hsa-c-file", | |
"command": "/usr/bin/clang", | |
"args": [ | |
"-fcolor-diagnostics", | |
"-fansi-escape-codes", | |
"-g", | |
"${file}", | |
"-I${workspaceFolder}", | |
"-I${workspaceFolder}/runtime/src/", | |
"-I/opt/rocm/include/", | |
"-L/home/nod/src/ROCR-Runtime/build/rocr/lib/", | |
"-L/opt/rocm/lib/", | |
"-lhsa-runtime64", | |
"-lhsakmt", | |
"-lamdhip64", | |
"-ldl", | |
"-lpthread", | |
"-lnuma", // mbind | |
"-lrt", | |
"-o", | |
"${command:cmake.buildDirectory}/vscode/${fileBasenameNoExtension}" | |
], | |
"options": { | |
"cwd": "${fileDirname}" | |
}, | |
"problemMatcher": ["$gcc"], | |
"group": { | |
"kind": "build", | |
"isDefault": true | |
}, | |
"presentation": { | |
"clear": true, | |
"reveal": "silent", | |
"revealProblems": "never", | |
"panel": "shared", | |
"showReuseMessage": false | |
}, | |
}, |