Created
September 19, 2024 01:10
-
-
Save makslevental/597293ac5d1e36ac21fd43b6b23fb825 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import ctypes | |
import fcntl | |
from ctypes import CDLL, c_int, c_void_p, c_size_t, c_long, c_uint32 | |
from mmap import PROT_READ, PROT_WRITE, MAP_SHARED, MAP_PRIVATE | |
from pathlib import Path | |
import numpy as np | |
from ioctlpy.amdxdna_accel import ( | |
AMDXDNA_BO_CMD, | |
AMDXDNA_BO_DEV, | |
AMDXDNA_BO_DEV_HEAP, | |
DRM_AMDXDNA_HWCTX_CONFIG_CU, | |
struct_amdxdna_cmd, | |
struct_amdxdna_cmd_chain, | |
struct_amdxdna_cu_config, | |
struct_amdxdna_drm_config_hwctx, | |
struct_amdxdna_drm_create_bo, | |
struct_amdxdna_drm_create_hwctx, | |
struct_amdxdna_drm_exec_cmd, | |
struct_amdxdna_drm_get_bo_info, | |
struct_amdxdna_drm_sync_bo, | |
struct_amdxdna_hwctx_param_config_cu, | |
struct_amdxdna_qos_info, | |
AMDXDNA_CMD_SUBMIT_EXEC_BUF, | |
struct_amdxdna_drm_wait_cmd, | |
) | |
from ioctlpy.amdxdna_ioctl import ( | |
print_aie_metadata, | |
ioctls, | |
get_void_ptr_to_struct, | |
) | |
from rocrtst.suites.aie.ioctlpy.amdxdna_ioctl import format_struct | |
HEAP_SIZE = 64 << 20  # 64MB

# Handle to the C library; the raw mmap/memcpy/munmap calls in this file go
# through it rather than through Python's own mmap objects.
libc = CDLL("libc.so.6")

# __errno_location() hands back a pointer to the calling thread's errno.
get_errno_loc = libc.__errno_location
get_errno_loc.restype = ctypes.POINTER(c_int)

libc.getpagesize.argtypes = []
libc.getpagesize.restype = c_int

libc.mmap.argtypes = [c_void_p, c_size_t, c_int, c_int, c_int, c_long]
libc.mmap.restype = c_void_p

libc.memcpy.argtypes = [c_void_p, c_void_p, c_size_t]
libc.memcpy.restype = c_void_p

libc.munmap.argtypes = [c_void_p, c_size_t]
libc.munmap.restype = c_int

# mmap reports failure with MAP_FAILED, i.e. (void *)-1.  ctypes converts a
# c_void_p result to an *unsigned* Python int, so that value is never == -1
# and has to be matched explicitly.
MAP_FAILED = c_void_p(-1).value


def errcheck(ret, _func, _args):
    """ctypes ``errcheck`` hook that turns a libc failure into ``OSError``.

    Args:
        ret: The converted return value of the foreign call.
        _func: The foreign function object (unused).
        _args: The arguments of the call (unused).

    Returns:
        ``ret`` unchanged when it does not indicate failure.

    Raises:
        OSError: If ``ret`` is ``-1`` (int-returning calls) or ``MAP_FAILED``
            (pointer-returning calls such as ``mmap``).  The exception carries
            the thread's current errno and its message.
    """
    if ret != -1 and ret != MAP_FAILED:
        return ret
    # Read errno before doing anything else that might disturb it.
    err = get_errno_loc()[0]
    import os

    raise OSError(err, os.strerror(err))


libc.mmap.errcheck = errcheck
def alloc_heap(drv_fd):
    """Create the device heap buffer object and map it into this process.

    Issues CREATE_BO with type ``AMDXDNA_BO_DEV_HEAP`` and size ``HEAP_SIZE``,
    queries the new object with GET_BO_INFO, and maps ``HEAP_SIZE`` bytes of
    the driver file at the reported ``map_offset``.

    Args:
        drv_fd: Open file object for the accelerator device node.

    Returns:
        The address returned by ``mmap`` for the heap mapping.

    Raises:
        OSError: If an ioctl fails or ``posix_memalign`` cannot provide an
            aligned address.
    """
    import os

    create_bo_params = struct_amdxdna_drm_create_bo()
    create_bo_params.type = AMDXDNA_BO_DEV_HEAP
    create_bo_params.size = HEAP_SIZE
    fcntl.ioctl(drv_fd, ioctls.DRM_IOCTL_AMDXDNA_CREATE_BO, create_bo_params)

    get_bo_info = struct_amdxdna_drm_get_bo_info()
    get_bo_info.handle = create_bo_params.handle
    fcntl.ioctl(drv_fd, ioctls.DRM_IOCTL_AMDXDNA_GET_BO_INFO, get_bo_info)

    # Borrow a HEAP_SIZE-aligned address from the allocator, give the memory
    # back, and reuse the address as the mmap placement hint.
    # NOTE(review): presumably the heap mapping must be HEAP_SIZE-aligned -
    # confirm against the driver.
    aligned_hint = c_void_p(None)
    # Sizes are wrapped in c_size_t because posix_memalign has no prototype
    # registered and plain Python ints would be passed as C ints.
    rc = libc.posix_memalign(
        ctypes.byref(aligned_hint), c_size_t(HEAP_SIZE), c_size_t(HEAP_SIZE)
    )
    if rc != 0:
        # posix_memalign returns the error number directly, not via errno.
        raise OSError(rc, os.strerror(rc))
    libc.free(aligned_hint)

    return libc.mmap(
        aligned_hint,
        HEAP_SIZE,
        PROT_READ | PROT_WRITE,
        MAP_SHARED,
        drv_fd.fileno(),
        get_bo_info.map_offset,
    )
def create_bo(drv_fd, size_in_bytes, type):
    """Create a buffer object and return the driver's description of it.

    Args:
        drv_fd: Open file object for the accelerator device node.
        size_in_bytes: Requested size of the buffer object.
        type: Buffer-object type constant written to the CREATE_BO request.

    Returns:
        The ``struct_amdxdna_drm_get_bo_info`` filled in by GET_BO_INFO for
        the newly created handle.
    """
    request = struct_amdxdna_drm_create_bo()
    request.size = size_in_bytes
    request.type = type
    fcntl.ioctl(drv_fd, ioctls.DRM_IOCTL_AMDXDNA_CREATE_BO, request)

    bo_info = struct_amdxdna_drm_get_bo_info()
    bo_info.handle = request.handle
    fcntl.ioctl(drv_fd, ioctls.DRM_IOCTL_AMDXDNA_GET_BO_INFO, bo_info)
    # The info query must leave the handle it was asked about untouched.
    assert bo_info.handle == request.handle
    return bo_info
def create_dev_bo(drv_fd, size_in_bytes):
    """Create a buffer object of type ``AMDXDNA_BO_DEV``.

    Args:
        drv_fd: Open file object for the accelerator device node.
        size_in_bytes: Requested size of the buffer object.

    Returns:
        Whatever ``create_bo`` returns for the new object.
    """
    bo_info = create_bo(drv_fd, size_in_bytes, AMDXDNA_BO_DEV)
    return bo_info
def load_pdi(drv_fd, pdi_path: Path):
    """Copy the contents of a PDI file into a freshly created device BO.

    Args:
        drv_fd: Open file object for the accelerator device node.
        pdi_path: Path of the PDI file to load.

    Returns:
        The buffer-object info (as returned by ``create_dev_bo``) of the
        buffer that now holds the file's bytes.
    """
    pdi_file_size_in_bytes = pdi_path.stat().st_size
    # Binary mode: the contents are only ever touched as raw bytes via mmap.
    # The context manager releases the descriptor, which was previously leaked.
    with open(pdi_path, "rb") as pdi_file:
        pdi_file_data = libc.mmap(
            0,
            pdi_file_size_in_bytes,
            PROT_READ,
            MAP_PRIVATE,
            pdi_file.fileno(),
            0,
        )
        try:
            dev_bo = create_dev_bo(drv_fd, pdi_file_size_in_bytes)
            libc.memcpy(dev_bo.vaddr, pdi_file_data, pdi_file_size_in_bytes)
        finally:
            # Drop the temporary file mapping even if the BO creation or the
            # copy raises.
            libc.munmap(pdi_file_data, pdi_file_size_in_bytes)
    return dev_bo
def load_ipu_instructions(drv_fd, ipu_instrs_path: Path):
    """Load a text file of hexadecimal words into a new device BO.

    Each non-blank line of the file is parsed as one base-16 integer and
    stored as a 32-bit unsigned word.

    Args:
        drv_fd: Open file object for the accelerator device node.
        ipu_instrs_path: Path of the instruction text file.

    Returns:
        The buffer-object info (as returned by ``create_dev_bo``) of the
        buffer holding the instruction words.

    Raises:
        ValueError: If a non-blank line is not a valid hexadecimal number.
    """
    # The context manager closes the file; blank lines (e.g. a trailing
    # newline at end of file) are skipped instead of crashing int().
    with open(ipu_instrs_path, "r") as instr_file:
        ipu_instrs = [
            int(text, 16) for text in map(str.strip, instr_file) if text
        ]

    arr = (c_uint32 * len(ipu_instrs))(*ipu_instrs)
    ipu_instrs_size_in_bytes = len(ipu_instrs) * ctypes.sizeof(c_uint32)
    dev_bo = create_dev_bo(drv_fd, ipu_instrs_size_in_bytes)
    libc.memcpy(dev_bo.vaddr, arr, ipu_instrs_size_in_bytes)
    return dev_bo
def create_hw_ctx(drv_fd):
    """Create a hardware context through the CREATE_HWCTX ioctl.

    The request asks for 4 tiles and a ``max_opc`` of 0x800 and points
    ``qos_p`` at a default-initialised QoS structure.

    Args:
        drv_fd: Open file object for the accelerator device node.

    Returns:
        The ``struct_amdxdna_drm_create_hwctx`` request after the ioctl.
    """
    # Must stay referenced until the ioctl has run: the request only stores
    # its address.
    qos_info = struct_amdxdna_qos_info()

    hw_ctx = struct_amdxdna_drm_create_hwctx()
    hw_ctx.num_tiles = 4
    hw_ctx.max_opc = 0x800
    hw_ctx.qos_p = get_void_ptr_to_struct(qos_info).value

    fcntl.ioctl(drv_fd, ioctls.DRM_IOCTL_AMDXDNA_CREATE_HWCTX, hw_ctx)
    return hw_ctx
def config_hw_ctx(drv_fd, pdi_handle, hw_ctx_handle):
    """Configure a hardware context with a single CU backed by a PDI buffer.

    Args:
        drv_fd: Open file object for the accelerator device node.
        pdi_handle: Handle of the buffer object holding the PDI.
        hw_ctx_handle: Handle of the hardware context to configure.
    """
    cu = struct_amdxdna_cu_config(pdi_handle, 0)
    cu_param = struct_amdxdna_hwctx_param_config_cu(1, [cu])
    cu_param_size = ctypes.sizeof(cu_param)
    # Sanity check on the layout of the one-CU parameter block.
    assert cu_param_size == 16

    request = struct_amdxdna_drm_config_hwctx(
        hw_ctx_handle,
        DRM_AMDXDNA_HWCTX_CONFIG_CU,
        get_void_ptr_to_struct(cu_param).value,
        cu_param_size,
    )
    fcntl.ioctl(drv_fd, ioctls.DRM_IOCTL_AMDXDNA_CONFIG_HWCTX, request)
def sync_bo(drv_fd, handle):
    """Issue the SYNC_BO ioctl for one buffer object.

    Args:
        drv_fd: Open file object for the accelerator device node.
        handle: Handle of the buffer object to synchronise.
    """
    request = struct_amdxdna_drm_sync_bo()
    request.handle = handle
    fcntl.ioctl(drv_fd, ioctls.DRM_IOCTL_AMDXDNA_SYNC_BO, request)
# Size in bytes used for each single-command buffer object.
PACKET_SIZE = 64


def create_command_bo(drv_fd, size):
    """Create a buffer object of type ``AMDXDNA_BO_CMD``.

    Args:
        drv_fd: Open file object for the accelerator device node.
        size: Requested size of the buffer object in bytes.

    Returns:
        Whatever ``create_bo`` returns for the new object.
    """
    bo_info = create_bo(drv_fd, size, AMDXDNA_BO_CMD)
    return bo_info
def create_command(
    drv_fd, command_bo, size, state, extra_cu_masks, count, opcode, data
):
    """Map a command buffer object and fill in its command header.

    Args:
        drv_fd: Open file object for the accelerator device node.
        command_bo: Buffer-object info whose ``map_offset`` is mapped.
        size: Number of bytes to map.
        state: Value for the command's ``state`` field.
        extra_cu_masks: Value for the command's ``extra_cu_masks`` field.
        count: Value for the ``count`` field; also the length of the payload
            array the command type is built with.
        opcode: Value for the command's ``opcode`` field.
        data: Payload words; when empty the payload is left as mapped.

    Returns:
        A ctypes pointer to the command structure inside the mapping.
    """
    mapping = libc.mmap(
        0,
        size,
        PROT_READ | PROT_WRITE,
        MAP_SHARED,
        drv_fd.fileno(),
        command_bo.map_offset,
    )
    cmd_type = struct_amdxdna_cmd(count)
    cmd_ptr = ctypes.cast(mapping, ctypes.POINTER(cmd_type))

    # ``contents`` is a view onto the mapped memory, so these writes land in
    # the buffer object itself.
    cmd = cmd_ptr.contents
    cmd.state = state
    cmd.extra_cu_masks = extra_cu_masks
    cmd.count = count
    cmd.opcode = opcode
    if len(data) > 0:
        cmd.data = (c_uint32 * count)(*data)
    return cmd_ptr
def create_command_chain(
    drv_fd, command_bo, state, extra_cu_masks, count, opcode, data
):
    """Map 4096 bytes of a command buffer object and fill in its header.

    Unlike ``create_command`` the mapping size is fixed and the payload is
    always written, even when ``data`` is empty.

    Args:
        drv_fd: Open file object for the accelerator device node.
        command_bo: Buffer-object info whose ``map_offset`` is mapped.
        state: Value for the command's ``state`` field.
        extra_cu_masks: Value for the command's ``extra_cu_masks`` field.
        count: Value for the ``count`` field; also the payload array length.
        opcode: Value for the command's ``opcode`` field.
        data: Payload words.

    Returns:
        A ctypes pointer to the command structure inside the mapping.
    """
    mapping = libc.mmap(
        0,
        4096,
        PROT_READ | PROT_WRITE,
        MAP_SHARED,
        drv_fd.fileno(),
        command_bo.map_offset,
    )
    cmd_type = struct_amdxdna_cmd(count)
    cmd_ptr = ctypes.cast(mapping, ctypes.POINTER(cmd_type))

    cmd = cmd_ptr.contents
    cmd.state = state
    cmd.extra_cu_masks = extra_cu_masks
    cmd.count = count
    cmd.opcode = opcode
    cmd.data = (c_uint32 * count)(*data)
    return cmd_ptr
def exec_cmd(drv_fd, hw_ctx, cmd_chain_bo, arg_handles):
    """Submit one command buffer to a hardware context via EXEC_CMD.

    Args:
        drv_fd: Open file object for the accelerator device node.
        hw_ctx: Hardware context object; its ``handle`` is used.
        cmd_chain_bo: Buffer-object info of the command to submit.
        arg_handles: Buffer-object handles passed along as arguments.

    Returns:
        The ``struct_amdxdna_drm_exec_cmd`` request after the ioctl.
    """
    # Stays referenced until after the ioctl: the request stores only its
    # address.
    handle_array = (c_uint32 * len(arg_handles))(*arg_handles)

    request = struct_amdxdna_drm_exec_cmd()
    request.ext = 0
    request.ext_flags = 0
    request.hwctx = hw_ctx.handle
    request.type = AMDXDNA_CMD_SUBMIT_EXEC_BUF
    request.cmd_handles = cmd_chain_bo.handle
    request.args = ctypes.addressof(handle_array)
    request.cmd_count = 1
    request.arg_count = len(handle_array)

    print(format_struct(request))
    fcntl.ioctl(drv_fd, ioctls.DRM_IOCTL_AMDXDNA_EXEC_CMD, request)
    return request
def wait_cmd(drv_fd, hw_ctx, exec_cmd, timeout=50):
    """Issue the WAIT_CMD ioctl for a previously submitted command.

    Args:
        drv_fd: Open file object for the accelerator device node.
        hw_ctx: Hardware context object; its ``handle`` is used.
        exec_cmd: Submitted exec request; its ``seq`` identifies the command.
        timeout: Timeout value passed to the driver (units not shown here).
    """
    request = struct_amdxdna_drm_wait_cmd(hw_ctx.handle, timeout, exec_cmd.seq)
    fcntl.ioctl(drv_fd, ioctls.DRM_IOCTL_AMDXDNA_WAIT_CMD, request)
if __name__ == "__main__":
    # End-to-end check: run the "add one" design on two input buffers through
    # a two-entry command chain and verify both outputs.
    drv_path = "/dev/accel/accel0"
    # The context manager closes the device node even when a check fails.
    with open(drv_path, "r+") as drv_fd:
        print("aie metadata:")
        print_aie_metadata(drv_fd)

        # Kept referenced for the lifetime of the run.
        heap_buf = alloc_heap(drv_fd)

        pdi_dev_bo = load_pdi(drv_fd, Path(__file__).parent / "add_one.pdi")
        ipu_instr_0_dev_bo = load_ipu_instructions(
            drv_fd, Path(__file__).parent / "add_one_insts.txt"
        )
        ipu_instr_1_dev_bo = load_ipu_instructions(
            drv_fd, Path(__file__).parent / "add_one_insts.txt"
        )

        DATA_BUFFER_SIZE = 4 << 10
        input_0_dev_bo = create_dev_bo(drv_fd, DATA_BUFFER_SIZE)
        output_0_dev_bo = create_dev_bo(drv_fd, DATA_BUFFER_SIZE)
        input_1_dev_bo = create_dev_bo(drv_fd, DATA_BUFFER_SIZE)
        output_1_dev_bo = create_dev_bo(drv_fd, DATA_BUFFER_SIZE)

        # View each data buffer as a flat uint32 numpy array (no copy).
        arr_size = DATA_BUFFER_SIZE // ctypes.sizeof(c_uint32)
        input_0_arr, output_0_arr, input_1_arr, output_1_arr = (
            np.ctypeslib.as_array(
                ctypes.cast(bo.vaddr, ctypes.POINTER(c_uint32)),
                shape=(arr_size,),
            )
            for bo in (
                input_0_dev_bo,
                output_0_dev_bo,
                input_1_dev_bo,
                output_1_dev_bo,
            )
        )

        # Fill inputs with known patterns and outputs with sentinels that
        # cannot be mistaken for a correct result.
        indices = np.arange(arr_size, dtype=np.uint32)
        input_0_arr[:] = indices
        input_1_arr[:] = indices + np.uint32(0xFEEDED1E)
        output_0_arr[:] = 0xDEFACE
        output_1_arr[:] = 0xDEADBEEF
        assert not np.any(output_0_arr == input_0_arr + 1)
        assert not np.any(output_1_arr == input_1_arr + 1)

        # NOTE(review): input_0/output_0 are synced twice, as in the original
        # sequence - confirm whether the repetition is required.
        for bo in (
            input_0_dev_bo,
            output_0_dev_bo,
            input_1_dev_bo,
            output_1_dev_bo,
            pdi_dev_bo,
            ipu_instr_0_dev_bo,
            ipu_instr_1_dev_bo,
            input_0_dev_bo,
            output_0_dev_bo,
        ):
            sync_bo(drv_fd, bo.handle)

        hw_ctx = create_hw_ctx(drv_fd)
        config_hw_ctx(drv_fd, pdi_dev_bo.handle, hw_ctx.handle)

        def _exec_buf_payload(instr_bo, in_bo, out_bo):
            """Build the payload words for one command.

            The leading constants are reproduced verbatim; their meaning is
            not documented in this file.  Buffer addresses are split into
            low and high 32-bit halves.
            """
            return [
                0x3,
                0x3,
                0x0,
                instr_bo.xdna_addr,
                0x0,
                0x44,
                in_bo.vaddr & 0xFFFFFFFF,
                (in_bo.vaddr >> 32) & 0xFFFFFFFF,
                out_bo.vaddr & 0xFFFFFFFF,
                (out_bo.vaddr >> 32) & 0xFFFFFFFF,
            ]

        # NOTE(review): cmd_0 uses count=0xF while cmd_1 uses count=10 for a
        # payload of the same length - confirm which one is intended.
        cmd_bo_0 = create_command_bo(drv_fd, PACKET_SIZE)
        cmd_0 = create_command(
            drv_fd,
            cmd_bo_0,
            size=PACKET_SIZE,
            state=1,
            extra_cu_masks=0,
            count=0xF,
            opcode=0x0,
            data=_exec_buf_payload(
                ipu_instr_0_dev_bo, input_0_dev_bo, output_0_dev_bo
            ),
        )

        cmd_bo_1 = create_command_bo(drv_fd, PACKET_SIZE)
        cmd_1 = create_command(
            drv_fd,
            cmd_bo_1,
            size=PACKET_SIZE,
            state=1,
            extra_cu_masks=0,
            count=10,
            opcode=0x0,
            data=_exec_buf_payload(
                ipu_instr_1_dev_bo, input_1_dev_bo, output_1_dev_bo
            ),
        )

        cmd_chain_bo = create_command_bo(drv_fd, 4096)
        cmd_chain = create_command(
            drv_fd,
            cmd_chain_bo,
            size=4096,
            state=1,
            extra_cu_masks=0,
            count=0xA,
            opcode=0x13,
            data=[],
        )

        # Overlay the chain descriptor on the chain command's payload.  A
        # separate local name avoids rebinding the imported factory.
        command_count = 2
        cmd_chain_type = struct_amdxdna_cmd_chain(command_count)
        cmd_chain_payload = ctypes.cast(
            cmd_chain.contents.data, ctypes.POINTER(cmd_chain_type)
        )
        cmd_chain_payload.contents.command_count = command_count
        cmd_chain_payload.contents.submit_index = 0
        cmd_chain_payload.contents.error_index = 0
        cmd_chain_payload.contents.data = (ctypes.c_uint64 * command_count)(
            cmd_bo_0.handle, cmd_bo_1.handle
        )

        exec_cmd_0 = exec_cmd(
            drv_fd,
            hw_ctx,
            cmd_chain_bo,
            arg_handles=[
                ipu_instr_0_dev_bo.handle,
                ipu_instr_1_dev_bo.handle,
                input_0_dev_bo.handle,
                output_0_dev_bo.handle,
                input_1_dev_bo.handle,
                output_1_dev_bo.handle,
            ],
        )
        wait_cmd(drv_fd, hw_ctx, exec_cmd_0)

        for bo in (
            input_0_dev_bo,
            output_0_dev_bo,
            input_1_dev_bo,
            output_1_dev_bo,
        ):
            sync_bo(drv_fd, bo.handle)

        # Every output word must be its input word plus one.
        assert np.array_equal(output_0_arr, input_0_arr + 1)
        assert np.array_equal(output_1_arr, input_1_arr + 1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment