Skip to content

Instantly share code, notes, and snippets.

@makslevental
Created September 19, 2024 01:10
Show Gist options
  • Save makslevental/597293ac5d1e36ac21fd43b6b23fb825 to your computer and use it in GitHub Desktop.
Save makslevental/597293ac5d1e36ac21fd43b6b23fb825 to your computer and use it in GitHub Desktop.
import ctypes
import fcntl
from ctypes import CDLL, c_int, c_void_p, c_size_t, c_long, c_uint32
from mmap import PROT_READ, PROT_WRITE, MAP_SHARED, MAP_PRIVATE
from pathlib import Path
import numpy as np
from ioctlpy.amdxdna_accel import (
AMDXDNA_BO_CMD,
AMDXDNA_BO_DEV,
AMDXDNA_BO_DEV_HEAP,
DRM_AMDXDNA_HWCTX_CONFIG_CU,
struct_amdxdna_cmd,
struct_amdxdna_cmd_chain,
struct_amdxdna_cu_config,
struct_amdxdna_drm_config_hwctx,
struct_amdxdna_drm_create_bo,
struct_amdxdna_drm_create_hwctx,
struct_amdxdna_drm_exec_cmd,
struct_amdxdna_drm_get_bo_info,
struct_amdxdna_drm_sync_bo,
struct_amdxdna_hwctx_param_config_cu,
struct_amdxdna_qos_info,
AMDXDNA_CMD_SUBMIT_EXEC_BUF,
struct_amdxdna_drm_wait_cmd,
)
from ioctlpy.amdxdna_ioctl import (
print_aie_metadata,
ioctls,
get_void_ptr_to_struct,
)
from rocrtst.suites.aie.ioctlpy.amdxdna_ioctl import format_struct
# Size in bytes requested for the device heap buffer object (see alloc_heap).
HEAP_SIZE = 64 << 20  # 64MB

# Handle to the C library. Every libc function used in this file gets an
# explicit prototype below: without argtypes/restype ctypes converts Python
# ints to C ``int`` and assumes an ``int`` return, which is wrong for
# pointer- and size_t-sized values.
libc = CDLL("libc.so.6")

# glibc accessor for errno; declared as returning ``int *`` so the value can
# be read with ``get_errno_loc()[0]``.
get_errno_loc = libc.__errno_location
get_errno_loc.restype = ctypes.POINTER(c_int)

libc.getpagesize.argtypes = []
libc.getpagesize.restype = c_int

libc.mmap.argtypes = [c_void_p, c_size_t, c_int, c_int, c_int, c_long]
libc.mmap.restype = c_void_p

libc.memcpy.argtypes = [c_void_p, c_void_p, c_size_t]
libc.memcpy.restype = c_void_p

libc.munmap.argtypes = [c_void_p, c_size_t]
libc.munmap.restype = c_int

# posix_memalign(void **memptr, size_t alignment, size_t size) returns an
# error number directly (0 on success).
libc.posix_memalign.argtypes = [ctypes.POINTER(c_void_p), c_size_t, c_size_t]
libc.posix_memalign.restype = c_int

libc.free.argtypes = [c_void_p]
libc.free.restype = None
def errcheck(ret, _func, _args):
    """ctypes ``errcheck`` hook that raises ``OSError`` for a failed libc call.

    Args:
        ret: The already-converted return value of the foreign function.
        _func: The foreign function object (unused).
        _args: The arguments the function was called with (unused).

    Returns:
        ``ret`` unchanged when the call did not report failure.

    Raises:
        OSError: Carrying the current ``errno`` when ``ret`` is ``-1`` or the
            pointer-typed equivalent ``(void *)-1``.
    """
    import os  # local import: only used to build the error message

    # With restype=c_void_p ctypes hands over an unsigned Python int, so
    # mmap's MAP_FAILED ((void *)-1) arrives as 2**N - 1 and never compares
    # equal to -1. Check both spellings so int- and pointer-returning
    # functions are covered.
    pointer_minus_one = ctypes.c_void_p(-1).value
    if ret == -1 or ret == pointer_minus_one:
        e = get_errno_loc()[0]
        raise OSError(e, os.strerror(e))
    return ret
# Install errcheck as the ctypes result hook for libc.mmap.
libc.mmap.errcheck = errcheck
def alloc_heap(drv_fd):
    """Create the device heap buffer object and map it into this process.

    Args:
        drv_fd: Open file object for the accel device node. It is passed to
            ``fcntl.ioctl`` and its ``fileno()`` is used for ``mmap``.

    Returns:
        The address returned by ``libc.mmap`` for the heap mapping.

    Raises:
        OSError: If an ioctl fails or ``posix_memalign`` reports an error.
    """
    import os  # local import: only used to build the error message

    heap_bo = struct_amdxdna_drm_create_bo()
    heap_bo.type = AMDXDNA_BO_DEV_HEAP
    heap_bo.size = HEAP_SIZE
    fcntl.ioctl(drv_fd, ioctls.DRM_IOCTL_AMDXDNA_CREATE_BO, heap_bo)

    heap_info = struct_amdxdna_drm_get_bo_info()
    heap_info.handle = heap_bo.handle
    fcntl.ioctl(drv_fd, ioctls.DRM_IOCTL_AMDXDNA_GET_BO_INFO, heap_info)

    # Borrow a HEAP_SIZE-aligned address from the allocator, release it, and
    # offer it to mmap as the placement hint. No MAP_FIXED is passed, so the
    # kernel may still choose another address.
    # NOTE(review): presumably the driver wants the heap mapping HEAP_SIZE
    # aligned -- confirm against the driver's requirements.
    addr_hint = c_void_p()
    rc = libc.posix_memalign(ctypes.byref(addr_hint), HEAP_SIZE, HEAP_SIZE)
    if rc != 0:
        # posix_memalign returns the error number instead of setting errno.
        raise OSError(rc, os.strerror(rc))
    libc.free(addr_hint)

    return libc.mmap(
        addr_hint,
        HEAP_SIZE,
        PROT_READ | PROT_WRITE,
        MAP_SHARED,
        drv_fd.fileno(),
        heap_info.map_offset,
    )
def create_bo(drv_fd, size_in_bytes, type):
    """Create a buffer object and return the driver's info record for it.

    Args:
        drv_fd: Open file object for the accel device node.
        size_in_bytes: Value stored in the create request's ``size`` field.
        type: Value stored in the create request's ``type`` field (one of the
            ``AMDXDNA_BO_*`` constants at the call sites in this file).

    Returns:
        The ``struct_amdxdna_drm_get_bo_info`` filled in by the GET_BO_INFO
        ioctl for the newly created handle.
    """
    request = struct_amdxdna_drm_create_bo()
    request.size = size_in_bytes
    request.type = type
    fcntl.ioctl(drv_fd, ioctls.DRM_IOCTL_AMDXDNA_CREATE_BO, request)

    bo_info = struct_amdxdna_drm_get_bo_info()
    bo_info.handle = request.handle
    fcntl.ioctl(drv_fd, ioctls.DRM_IOCTL_AMDXDNA_GET_BO_INFO, bo_info)

    # The info ioctl must not have rewritten the handle we asked about.
    assert bo_info.handle == request.handle
    return bo_info
def create_dev_bo(drv_fd, size_in_bytes):
    """Create a buffer object of type ``AMDXDNA_BO_DEV`` via ``create_bo``."""
    return create_bo(drv_fd, size_in_bytes, type=AMDXDNA_BO_DEV)
def load_pdi(drv_fd, pdi_path: Path):
    """Copy the contents of a PDI file into a freshly created device BO.

    Args:
        drv_fd: Open file object for the accel device node.
        pdi_path: Path of the file to load.

    Returns:
        The BO info record returned by ``create_dev_bo``; the file's bytes
        have been copied to its ``vaddr``.
    """
    pdi_file_size_in_bytes = pdi_path.stat().st_size

    # Binary mode: only the descriptor is used, and the payload is not text.
    # The context manager closes the file on every path.
    with open(pdi_path, "rb") as pdi_file:
        pdi_file_data = libc.mmap(
            0,
            pdi_file_size_in_bytes,
            PROT_READ,
            MAP_PRIVATE,
            pdi_file.fileno(),
            0,
        )
        try:
            dev_bo = create_dev_bo(drv_fd, pdi_file_size_in_bytes)
            libc.memcpy(dev_bo.vaddr, pdi_file_data, pdi_file_size_in_bytes)
        finally:
            # Drop the temporary file mapping even if BO creation or the
            # copy raised.
            libc.munmap(pdi_file_data, pdi_file_size_in_bytes)
    return dev_bo
def load_ipu_instructions(drv_fd, ipu_instrs_path: Path):
    """Load a text file of hexadecimal words into a new device BO.

    Each non-blank line of the file is parsed as one base-16 integer and
    stored as a ``c_uint32``.

    Args:
        drv_fd: Open file object for the accel device node.
        ipu_instrs_path: Path of the instruction text file.

    Returns:
        The BO info record returned by ``create_dev_bo``; the parsed words
        have been copied to its ``vaddr``.

    Raises:
        ValueError: If a non-blank line is not a valid hexadecimal number.
    """
    # The context manager closes the file. Blank lines (for example a
    # trailing empty line) are skipped instead of crashing int('', 16).
    with open(ipu_instrs_path, "r") as instrs_file:
        ipu_instrs = [
            int(word, 16) for word in map(str.strip, instrs_file) if word
        ]

    arr = (c_uint32 * len(ipu_instrs))(*ipu_instrs)
    ipu_instrs_size_in_bytes = ctypes.sizeof(arr)
    dev_bo = create_dev_bo(drv_fd, ipu_instrs_size_in_bytes)
    libc.memcpy(dev_bo.vaddr, arr, ipu_instrs_size_in_bytes)
    return dev_bo
def create_hw_ctx(drv_fd):
    """Issue the CREATE_HWCTX ioctl and return the filled-in request struct.

    The request points at a default-constructed QoS struct and asks for
    ``num_tiles = 4`` and ``max_opc = 0x800``.

    Args:
        drv_fd: Open file object for the accel device node.

    Returns:
        The ``struct_amdxdna_drm_create_hwctx`` after the ioctl.
    """
    hwctx_request = struct_amdxdna_drm_create_hwctx()
    hwctx_request.num_tiles = 4
    hwctx_request.max_opc = 0x800

    # qos_info stays referenced by this local until the ioctl has returned,
    # so the raw pointer stored in qos_p is valid for the call.
    qos_info = struct_amdxdna_qos_info()
    hwctx_request.qos_p = get_void_ptr_to_struct(qos_info).value

    fcntl.ioctl(drv_fd, ioctls.DRM_IOCTL_AMDXDNA_CREATE_HWCTX, hwctx_request)
    return hwctx_request
def config_hw_ctx(drv_fd, pdi_handle, hw_ctx_handle):
    """Configure a hardware context with a single CU backed by ``pdi_handle``.

    Args:
        drv_fd: Open file object for the accel device node.
        pdi_handle: First constructor argument of the CU config struct.
        hw_ctx_handle: Handle of the hardware context to configure.
    """
    cu = struct_amdxdna_cu_config(pdi_handle, 0)
    cu_param = struct_amdxdna_hwctx_param_config_cu(1, [cu])
    cu_param_size = ctypes.sizeof(cu_param)
    # Layout sanity check on the parameter struct carrying one CU entry.
    assert cu_param_size == 16

    request = struct_amdxdna_drm_config_hwctx(
        hw_ctx_handle,
        DRM_AMDXDNA_HWCTX_CONFIG_CU,
        get_void_ptr_to_struct(cu_param).value,
        cu_param_size,
    )
    fcntl.ioctl(drv_fd, ioctls.DRM_IOCTL_AMDXDNA_CONFIG_HWCTX, request)
def sync_bo(drv_fd, handle):
    """Issue the SYNC_BO ioctl for the buffer object identified by ``handle``.

    Only the ``handle`` field of the request is set; every other field keeps
    its default value.
    """
    request = struct_amdxdna_drm_sync_bo()
    request.handle = handle
    fcntl.ioctl(drv_fd, ioctls.DRM_IOCTL_AMDXDNA_SYNC_BO, request)
# Byte size the script entry point uses for each single-command BO: it is
# passed both to create_command_bo and as the mmap length in create_command.
PACKET_SIZE = 64
def create_command_bo(drv_fd, size):
    """Create a buffer object of type ``AMDXDNA_BO_CMD`` with the given size."""
    return create_bo(drv_fd, size_in_bytes=size, type=AMDXDNA_BO_CMD)
def create_command(
    drv_fd, command_bo, size, state, extra_cu_masks, count, opcode, data
):
    """Map a command BO and fill in its command header and optional payload.

    Args:
        drv_fd: Open file object for the accel device node.
        command_bo: BO info record whose ``map_offset`` is mapped.
        size: Length in bytes of the mapping.
        state: Value for the command's ``state`` field.
        extra_cu_masks: Value for the command's ``extra_cu_masks`` field.
        count: Value for the ``count`` field; also the number of ``c_uint32``
            slots in the command's ``data`` array.
        opcode: Value for the command's ``opcode`` field.
        data: Payload words. When empty, the ``data`` field is left as mapped.

    Returns:
        A ctypes pointer to the command struct inside the mapping.
    """
    mapping = libc.mmap(
        0,
        size,
        PROT_READ | PROT_WRITE,
        MAP_SHARED,
        drv_fd.fileno(),
        command_bo.map_offset,
    )
    # struct_amdxdna_cmd(count) yields the struct type sized for this count.
    cmd_type = struct_amdxdna_cmd(count)
    cmd_ptr = ctypes.cast(mapping, ctypes.POINTER(cmd_type))

    cmd = cmd_ptr.contents  # view onto the mapped memory, not a copy
    cmd.state = state
    cmd.extra_cu_masks = extra_cu_masks
    cmd.count = count
    cmd.opcode = opcode
    if len(data) != 0:
        cmd.data = (c_uint32 * count)(*data)
    return cmd_ptr
def create_command_chain(
    drv_fd, command_bo, state, extra_cu_masks, count, opcode, data
):
    """Map 4096 bytes of a command BO and fill in header plus payload.

    Unlike ``create_command`` the mapping length is fixed at 4096 bytes and
    the ``data`` field is always overwritten, even when ``data`` is empty.

    Args:
        drv_fd: Open file object for the accel device node.
        command_bo: BO info record whose ``map_offset`` is mapped.
        state: Value for the command's ``state`` field.
        extra_cu_masks: Value for the command's ``extra_cu_masks`` field.
        count: Value for the ``count`` field; also the number of ``c_uint32``
            slots in the command's ``data`` array.
        opcode: Value for the command's ``opcode`` field.
        data: Payload words copied into the ``data`` field.

    Returns:
        A ctypes pointer to the command struct inside the mapping.
    """
    mapping = libc.mmap(
        0,
        4096,
        PROT_READ | PROT_WRITE,
        MAP_SHARED,
        drv_fd.fileno(),
        command_bo.map_offset,
    )
    cmd_type = struct_amdxdna_cmd(count)
    cmd_ptr = ctypes.cast(mapping, ctypes.POINTER(cmd_type))

    cmd = cmd_ptr.contents  # view onto the mapped memory, not a copy
    cmd.state = state
    cmd.extra_cu_masks = extra_cu_masks
    cmd.count = count
    cmd.opcode = opcode
    cmd.data = (c_uint32 * count)(*data)
    return cmd_ptr
def exec_cmd(drv_fd, hw_ctx, cmd_chain_bo, arg_handles):
    """Submit one command BO for execution on a hardware context.

    Args:
        drv_fd: Open file object for the accel device node.
        hw_ctx: Object whose ``handle`` identifies the hardware context.
        cmd_chain_bo: BO info record whose ``handle`` is submitted.
        arg_handles: Sequence of BO handles passed as the command's arguments.

    Returns:
        The ``struct_amdxdna_drm_exec_cmd`` after the ioctl. Its ``args``
        field holds the address of a temporary array that is not kept alive
        past this call.
    """
    # Packed uint32 copy of the handles; it must outlive the ioctl below.
    handle_array = (c_uint32 * len(arg_handles))(*arg_handles)

    request = struct_amdxdna_drm_exec_cmd()
    request.ext = 0
    request.ext_flags = 0
    request.hwctx = hw_ctx.handle
    request.type = AMDXDNA_CMD_SUBMIT_EXEC_BUF
    request.cmd_handles = cmd_chain_bo.handle
    request.args = ctypes.addressof(handle_array)
    request.cmd_count = 1
    request.arg_count = len(handle_array)

    print(format_struct(request))
    fcntl.ioctl(drv_fd, ioctls.DRM_IOCTL_AMDXDNA_EXEC_CMD, request)
    return request
def wait_cmd(drv_fd, hw_ctx, exec_cmd, timeout=50):
    """Issue the WAIT_CMD ioctl for a previously submitted command.

    Args:
        drv_fd: Open file object for the accel device node.
        hw_ctx: Object whose ``handle`` identifies the hardware context.
        exec_cmd: Submission record whose ``seq`` identifies the command.
        timeout: Passed through as the wait struct's second constructor
            argument (units are defined by the driver -- TODO confirm).
    """
    request = struct_amdxdna_drm_wait_cmd(
        hw_ctx.handle, timeout, exec_cmd.seq
    )
    fcntl.ioctl(drv_fd, ioctls.DRM_IOCTL_AMDXDNA_WAIT_CMD, request)
def _as_u32_array(dev_bo, length):
    """View ``length`` uint32 words starting at ``dev_bo.vaddr`` as a NumPy array."""
    return np.ctypeslib.as_array(
        ctypes.cast(dev_bo.vaddr, ctypes.POINTER(c_uint32)),
        shape=(length,),
    )


def _exec_buf_payload(instr_bo, input_bo, output_bo):
    """Build the ten payload words used by each add-one command.

    The input and output addresses are split into low/high 32-bit halves.
    NOTE(review): the meaning of the literal words (0x3, 0x3, 0x0, 0x0, 0x44)
    is not visible in this file -- confirm against the firmware interface.
    """
    return [
        0x3,
        0x3,
        0x0,
        instr_bo.xdna_addr,
        0x0,
        0x44,
        input_bo.vaddr & 0xFFFFFFFF,
        (input_bo.vaddr >> 32) & 0xFFFFFFFF,
        output_bo.vaddr & 0xFFFFFFFF,
        (output_bo.vaddr >> 32) & 0xFFFFFFFF,
    ]


def main():
    """Run the add-one kernel twice through one command chain and verify it.

    Opens the accel device, loads the PDI and instruction files that sit next
    to this script, submits two commands as a chain, waits for completion and
    asserts that every output word equals the matching input word plus one.
    """
    drv_path = "/dev/accel/accel0"
    script_dir = Path(__file__).parent

    # The context manager closes the device file on every exit path.
    with open(drv_path, "r+") as drv_fd:
        print("aie metadata:")
        print_aie_metadata(drv_fd)

        # The returned address is not used below; the call itself creates and
        # maps the device heap.
        alloc_heap(drv_fd)

        pdi_dev_bo = load_pdi(drv_fd, script_dir / "add_one.pdi")
        ipu_instr_0_dev_bo = load_ipu_instructions(
            drv_fd, script_dir / "add_one_insts.txt"
        )
        ipu_instr_1_dev_bo = load_ipu_instructions(
            drv_fd, script_dir / "add_one_insts.txt"
        )

        DATA_BUFFER_SIZE = 4 << 10
        input_0_dev_bo = create_dev_bo(drv_fd, DATA_BUFFER_SIZE)
        output_0_dev_bo = create_dev_bo(drv_fd, DATA_BUFFER_SIZE)
        input_1_dev_bo = create_dev_bo(drv_fd, DATA_BUFFER_SIZE)
        output_1_dev_bo = create_dev_bo(drv_fd, DATA_BUFFER_SIZE)

        arr_size = DATA_BUFFER_SIZE // ctypes.sizeof(c_uint32)
        input_0_arr = _as_u32_array(input_0_dev_bo, arr_size)
        input_1_arr = _as_u32_array(input_1_dev_bo, arr_size)
        output_0_arr = _as_u32_array(output_0_dev_bo, arr_size)
        output_1_arr = _as_u32_array(output_1_dev_bo, arr_size)

        # Fill inputs with a ramp and outputs with sentinels, then check the
        # sentinels cannot already look like a correct result.
        ramp = np.arange(arr_size, dtype=np.uint32)
        input_0_arr[:] = ramp
        input_1_arr[:] = ramp + np.uint32(0xFEEDED1E)
        output_0_arr[:] = 0xDEFACE
        output_1_arr[:] = 0xDEADBEEF
        assert not np.any(output_0_arr == input_0_arr + 1)
        assert not np.any(output_1_arr == input_1_arr + 1)

        sync_bo(drv_fd, input_0_dev_bo.handle)
        sync_bo(drv_fd, output_0_dev_bo.handle)
        sync_bo(drv_fd, input_1_dev_bo.handle)
        sync_bo(drv_fd, output_1_dev_bo.handle)
        sync_bo(drv_fd, pdi_dev_bo.handle)
        sync_bo(drv_fd, ipu_instr_0_dev_bo.handle)
        sync_bo(drv_fd, ipu_instr_1_dev_bo.handle)
        # NOTE(review): these two repeat the syncs of input_0/output_0 above
        # and look redundant; kept so the ioctl sequence is unchanged.
        sync_bo(drv_fd, input_0_dev_bo.handle)
        sync_bo(drv_fd, output_0_dev_bo.handle)

        hw_ctx = create_hw_ctx(drv_fd)
        config_hw_ctx(drv_fd, pdi_dev_bo.handle, hw_ctx.handle)

        # create_command writes straight into the mapped BO, so the returned
        # pointers for the two leaf commands are not needed afterwards.
        # NOTE(review): the first command uses count=0xF and the second
        # count=10 for the same ten-word payload -- confirm which is intended.
        cmd_bo_0 = create_command_bo(drv_fd, PACKET_SIZE)
        create_command(
            drv_fd,
            cmd_bo_0,
            size=PACKET_SIZE,
            state=1,
            extra_cu_masks=0,
            count=0xF,
            opcode=0x0,
            data=_exec_buf_payload(
                ipu_instr_0_dev_bo, input_0_dev_bo, output_0_dev_bo
            ),
        )

        cmd_bo_1 = create_command_bo(drv_fd, PACKET_SIZE)
        create_command(
            drv_fd,
            cmd_bo_1,
            size=PACKET_SIZE,
            state=1,
            extra_cu_masks=0,
            count=10,
            opcode=0x0,
            data=_exec_buf_payload(
                ipu_instr_1_dev_bo, input_1_dev_bo, output_1_dev_bo
            ),
        )

        cmd_chain_bo = create_command_bo(drv_fd, 4096)
        cmd_chain = create_command(
            drv_fd,
            cmd_chain_bo,
            size=4096,
            state=1,
            extra_cu_masks=0,
            count=0xA,
            opcode=0x13,
            data=[],
        )

        # Reinterpret the chain command's data area as a command-chain struct
        # and list the two leaf command BOs in it. A distinct local name keeps
        # the imported struct_amdxdna_cmd_chain factory intact.
        command_count = 2
        chain_type = struct_amdxdna_cmd_chain(command_count)
        cmd_chain_payload = ctypes.cast(
            cmd_chain.contents.data, ctypes.POINTER(chain_type)
        )
        chain = cmd_chain_payload.contents
        chain.command_count = command_count
        chain.submit_index = 0
        chain.error_index = 0
        chain.data = (ctypes.c_uint64 * command_count)(
            cmd_bo_0.handle, cmd_bo_1.handle
        )

        exec_cmd_0 = exec_cmd(
            drv_fd,
            hw_ctx,
            cmd_chain_bo,
            arg_handles=[
                ipu_instr_0_dev_bo.handle,
                ipu_instr_1_dev_bo.handle,
                input_0_dev_bo.handle,
                output_0_dev_bo.handle,
                input_1_dev_bo.handle,
                output_1_dev_bo.handle,
            ],
        )
        wait_cmd(drv_fd, hw_ctx, exec_cmd_0)

        sync_bo(drv_fd, input_0_dev_bo.handle)
        sync_bo(drv_fd, output_0_dev_bo.handle)
        sync_bo(drv_fd, input_1_dev_bo.handle)
        sync_bo(drv_fd, output_1_dev_bo.handle)

        # Every output word must now be its input word plus one.
        assert np.array_equal(output_0_arr, input_0_arr + 1)
        assert np.array_equal(output_1_arr, input_1_arr + 1)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment