Skip to content

Instantly share code, notes, and snippets.

@juniorprincewang
Created February 21, 2020 14:22
Show Gist options
  • Select an option

  • Save juniorprincewang/e4f6bc4b6b0c2a18c7b1eb7b4a48f6cd to your computer and use it in GitHub Desktop.

Select an option

Save juniorprincewang/e4f6bc4b6b0c2a18c7b1eb7b4a48f6cd to your computer and use it in GitHub Desktop.
NVIDIA NVC0 compute process using MMIO
// https://people.freedesktop.org/~chrisbmr/90c0.c
#include <errno.h>
#include <float.h>
#include <math.h>
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <libdrm/nouveau.h>
#include "nvc0_push.h"
#include "nv_object.xml.h"
#include "nvc0_compute.xml.h"
#include "nvc0_m2mf.xml.h"
/* Device, FIFO channel and engine objects; all four are created in init(). */
static struct nouveau_device *dev = NULL;
static struct nouveau_object *chan = NULL;
/* Object of class NVC0_M2MF_CLASS; main() uses it to upload the kernel. */
static struct nouveau_object *m2mf = NULL;
/* Object of class NVC0_COMPUTE_CLASS; main() binds it to the compute subchannel. */
static struct nouveau_object *djka = NULL;
/*
 * Client and push buffer are created in init(), the buffer context in
 * alloc_buffers(); main() submits every command through `push`.
 */
struct nouveau_client *client = NULL;
struct nouveau_bufctx *bufctx = NULL;
struct nouveau_pushbuf *push = NULL;
/*
 * Buffer objects, all allocated in alloc_buffers():
 *   code - receives the kernel words; also used as the code segment address
 *   parm - backing store of the constant buffer bound in main()
 *   heap - CPU-mapped; its address is passed to the kernel through the
 *          constant buffer and its first 8 words are printed by main()
 *   misc - allocated and mapped, but not referenced anywhere else in this file
 *   temp - local (tls) memory area
 *   qery - CPU-mapped query buffer that main() polls for the sequence number
 */
static struct nouveau_bo *code;
static struct nouveau_bo *parm;
static struct nouveau_bo *heap;
static struct nouveau_bo *misc;
static struct nouveau_bo *temp;
static struct nouveau_bo *qery;
/* Opens the device and creates channel, client, pushbuf and engine objects,
 * then calls alloc_buffers(). Returns 0 on success, non-zero on failure. */
static int init();
/* Allocates and maps the buffers above and fills `bufctx`.
 * Returns 0 on success, non-zero on failure. */
static int alloc_buffers();
/*
 * The program we're going to execute.
 *
 * Each source line holds the two 32-bit words of one instruction, with its
 * disassembly on the right. According to that disassembly the kernel:
 *   - loads c0[0x0] and c0[0x4] into $r0/$r1 (main() writes the low and high
 *     halves of heap->offset to those constant buffer slots),
 *   - reads $clock and $clockhi into $r2/$r3,
 *   - stores the 64-bit pair $r2d to global memory at the address in $r0d,
 *   - exits.
 * main() uploads the array word by word, so its layout must not change.
 */
static uint32_t kernel[] =
{
0x00001de4, 0x28004000, // mov b32 $r0 c0[0x0]
0x10005de4, 0x28004000, // mov b32 $r1 c0[0x4]
0x40009c04, 0x2c000001, // mov b32 $r2 $clock
0x4400dc04, 0x2c000001, // mov b32 $r3 $clockhi
0x00009fa5, 0x94000000, // st b64 wt g[$r0d] $r2d
0x00001de7, 0x80000000 // exit
};
/*
 * Uploads `kernel` to the GPU, programs the compute engine, launches a single
 * thread and prints the first eight 32-bit words of the heap buffer.
 *
 * Steps:
 *   1. init() opens the device and allocates every object and buffer.
 *   2. The kernel words are copied into `code` through the M2MF object.
 *   3. The compute object is configured (local/shared memory, grid and block
 *      dimensions, code address, constant buffer holding heap's address).
 *   4. The kernel is launched and a query write of `sequence` is queued
 *      behind it; the CPU polls the query buffer until that value appears.
 *   5. The heap buffer is mapped for reading and dumped to stdout.
 *
 * The command line arguments are ignored.
 *
 * Returns 0 on success, -1 if init() or pushbuf validation fails, and the
 * nouveau_bo_map() error code if one of the read-back mappings fails.
 *
 * NOTE(review): the fence poll has no timeout; if the GPU never writes the
 * sequence number the program spins forever.
 */
int main(int argc, char **argv)
{
    /* thread group / block dimensions (see CUDA or OpenCL): */
    const int gdx = 1, gdy = 1, gdz = 1, bdx = 1, bdy = 1, bdz = 1;
    const size_t kernel_words = sizeof(kernel) / sizeof(kernel[0]);
    /* The fence word is written by the GPU, so every read must hit memory. */
    const volatile uint32_t *fence;
    const uint32_t *result;
    uint32_t sequence = 0xfe4ce;
    size_t w;
    int i, ret;

    (void)argc;
    (void)argv;

    ret = init();
    if (ret) {
        fprintf(stderr, "init failed\n");
        return -1;
    }

    nouveau_pushbuf_bufctx(push, bufctx);
    ret = nouveau_pushbuf_validate(push);
    if (ret) {
        fprintf(stderr, "failed to validate pushbuf: %s\n", strerror(-ret));
        return -1;
    }

    /* bind the engine objects to their subchannels */
    BEGIN_NVC0(push, SUBC_M2MF(NV01_SUBCHAN_OBJECT), 1);
    PUSH_DATA (push, m2mf->oclass);
    BEGIN_NVC0(push, SUBC_COMPUTE(NV01_SUBCHAN_OBJECT), 1);
    PUSH_DATA (push, djka->oclass);

    /* upload the GPU code */
    BEGIN_NVC0(push, NVC0_M2MF(OFFSET_OUT_HIGH), 2);
    PUSH_DATAh(push, code->offset);
    PUSH_DATA (push, code->offset);
    BEGIN_NVC0(push, NVC0_M2MF(LINE_LENGTH_IN), 2);
    PUSH_DATA (push, sizeof(kernel));
    PUSH_DATA (push, 1);
    BEGIN_NVC0(push, NVC0_M2MF(EXEC), 1);
    PUSH_DATA (push, 0x100111);
    BEGIN_NIC0(push, NVC0_M2MF(DATA), kernel_words);
    for (w = 0; w < kernel_words; ++w)
        PUSH_DATA(push, kernel[w]);
    PUSH_KICK (push);

    BEGIN_NVC0(push, NVC0_COMPUTE(MEM_BARRIER), 1);
    PUSH_DATA (push, 0x1111);

    /* first query write; only the second one further down is waited on */
    BEGIN_NVC0(push, NVC0_COMPUTE(QUERY_ADDRESS_HIGH), 4);
    PUSH_DATAh(push, qery->offset);
    PUSH_DATA (push, qery->offset);
    PUSH_DATA (push, ++sequence);
    PUSH_DATA (push, 0);

    /* various magic commands and setup of tls (local) memory area */
    BEGIN_NVC0(push, SUBC_COMPUTE(0x0758), 1);
    PUSH_DATA (push, 0xe);
    BEGIN_NVC0(push, SUBC_COMPUTE(0x0d64), 1);
    PUSH_DATA (push, 0xf);
    BEGIN_NVC0(push, NVC0_COMPUTE(LOCAL_ADDRESS_HIGH), 5);
    PUSH_DATAh(push, temp->offset);
    PUSH_DATA (push, temp->offset);
    PUSH_DATA (push, temp->size >> 32);
    PUSH_DATA (push, temp->size);
    PUSH_DATA (push, 0);
    BEGIN_NVC0(push, NVC0_COMPUTE(LOCAL_BASE), 1);
    PUSH_DATA (push, 0x10000000);
    BEGIN_NVC0(push, SUBC_COMPUTE(0x0204), 3);
    PUSH_DATA (push, 0); // f0
    PUSH_DATA (push, 0); // 600
    PUSH_DATA (push, 0); // 1000
    BEGIN_NVC0(push, NVC0_COMPUTE(CACHE_SPLIT), 1);
    PUSH_DATA (push, NVC0_COMPUTE_CACHE_SPLIT_16K_SHARED_48K_L1);
    /* if this gets near the g[] address I use, CPU doesn't see the writes */
    BEGIN_NVC0(push, NVC0_COMPUTE(SHARED_BASE), 1);
    PUSH_DATA (push, 0x80000000);
    BEGIN_NVC0(push, NVC0_COMPUTE(SHARED_SIZE), 1);
    PUSH_DATA (push, 0);
    BEGIN_NVC0(push, SUBC_COMPUTE(0x02a0), 1);
    PUSH_DATA (push, 0x8000);

    /* grid and block dimensions */
    BEGIN_NVC0(push, NVC0_COMPUTE(GRIDDIM_YX), 2);
    PUSH_DATA (push, (gdy << 16) | gdx);
    PUSH_DATA (push, gdz);
    BEGIN_NVC0(push, NVC0_COMPUTE(BLOCKDIM_YX), 2);
    PUSH_DATA (push, (bdy << 16) | bdx);
    PUSH_DATA (push, bdz);

    BEGIN_NVC0(push, SUBC_COMPUTE(0x02c4), 1);
    PUSH_DATA (push, 0);
    BEGIN_NVC0(push, NVC0_COMPUTE(GLOBAL_BASE), 1);
    PUSH_DATA (push, (0xc << 28) | (0 << 16) | 0);
    BEGIN_NVC0(push, NVC0_COMPUTE(GLOBAL_BASE), 1);
    PUSH_DATA (push, (0xc << 28) | (1 << 16) | 1);
    BEGIN_NVC0(push, SUBC_COMPUTE(0x02c4), 1);
    PUSH_DATA (push, 1);
    BEGIN_NVC0(push, SUBC_COMPUTE(0x0210), 1);
    PUSH_DATA (push, 0x33);
    BEGIN_NVC0(push, SUBC_COMPUTE(0x1698), 1);
    PUSH_DATA (push, 1);

    /* setup the address of the code segment */
    BEGIN_NVC0(push, NVC0_COMPUTE(CODE_ADDRESS_HIGH), 2);
    PUSH_DATAh(push, code->offset);
    PUSH_DATA (push, code->offset);

    /*
     * upload some data to the constant buffer: the first two words are the
     * low and high halves of heap's address, which the kernel reads from
     * c0[0x0] and c0[0x4]
     */
    BEGIN_NVC0(push, NVC0_COMPUTE(CB_SIZE), 3);
    PUSH_DATA (push, 256);
    PUSH_DATAh(push, parm->offset);
    PUSH_DATA (push, parm->offset);
    BEGIN_NVC0(push, NVC0_COMPUTE(CB_POS), 1);
    PUSH_DATA (push, 0);
    BEGIN_NIC0(push, NVC0_COMPUTE(CB_DATA(0)), 8);
    PUSH_DATA (push, heap->offset);
    PUSH_DATAh(push, heap->offset);
    PUSH_DATA (push, 0);
    PUSH_DATA (push, 0);
    PUSH_DATA (push, 0);
    PUSH_DATA (push, 0);
    PUSH_DATA (push, 0);
    PUSH_DATA (push, 0);

    /* bind constant buffer to c0[] and c1[] */
    BEGIN_NVC0(push, NVC0_COMPUTE(CB_SIZE), 3);
    PUSH_DATA (push, 256);
    PUSH_DATAh(push, parm->offset);
    PUSH_DATA (push, parm->offset);
    BEGIN_NVC0(push, NVC0_COMPUTE(CB_BIND), 1);
    PUSH_DATA (push, (0 << 8) | 1);
    BEGIN_NVC0(push, SUBC_COMPUTE(0x1698), 1);
    PUSH_DATA (push, 0x1000);

    /* program entry point and per-launch resources */
    BEGIN_NVC0(push, NVC0_COMPUTE(CP_START_ID), 1);
    PUSH_DATA (push, 0);
    BEGIN_NVC0(push, NVC0_COMPUTE(THREADS_ALLOC), 2);
    PUSH_DATA (push, bdx * bdy * bdz);
    PUSH_DATA (push, 0);
    BEGIN_NVC0(push, NVC0_COMPUTE(CP_GPR_ALLOC), 1);
    PUSH_DATA (push, 24);
    BEGIN_NVC0(push, SUBC_COMPUTE(0x0780), 1);
    PUSH_DATA (push, 0);
    BEGIN_NVC0(push, SUBC_COMPUTE(0x036c), 1);
    PUSH_DATA (push, 0);
    BEGIN_NVC0(push, SUBC_COMPUTE(0x1698), 1);
    PUSH_DATA (push, 0x110);
    BEGIN_NVC0(push, SUBC_COMPUTE(0x029c), 1);
    PUSH_DATA (push, 0);
    BEGIN_NVC0(push, SUBC_COMPUTE(0x0a08), 1);
    PUSH_DATA (push, 0);

    /* launch the kernel */
    BEGIN_NVC0(push, NVC0_COMPUTE(LAUNCH), 1);
    PUSH_DATA (push, 0);
    BEGIN_NVC0(push, SUBC_COMPUTE(0x0a04), 1);
    PUSH_DATA (push, 0);
    BEGIN_NVC0(push, SUBC_COMPUTE(0x0360), 1);
    PUSH_DATA (push, 1);
    BEGIN_NVC0(push, NVC0_COMPUTE(MEM_BARRIER), 1);
    PUSH_DATA (push, 0x1111);
    BEGIN_NVC0(push, NVC0_COMPUTE(MEM_BARRIER), 1);
    PUSH_DATA (push, 0x0004);
    BEGIN_NVC0(push, SUBC_COMPUTE(0x0110), 1);
    PUSH_DATA (push, 0);

    /* second query write: this is the sequence value polled for below */
    BEGIN_NVC0(push, NVC0_COMPUTE(QUERY_ADDRESS_HIGH), 4);
    PUSH_DATAh(push, qery->offset);
    PUSH_DATA (push, qery->offset);
    PUSH_DATA (push, ++sequence);
    PUSH_DATA (push, 0);
    PUSH_KICK (push);

    ret = nouveau_bo_map(qery, NOUVEAU_BO_RD, client);
    if (ret) {
        fprintf(stderr, "fence map failed: %s\n", strerror(-ret));
        return ret;
    }
    fence = qery->map;
    while (fence[0] != sequence)
        sched_yield();

    ret = nouveau_bo_map(heap, NOUVEAU_BO_RD, client);
    if (ret) {
        fprintf(stderr, "heap map failed: %s\n", strerror(-ret));
        return ret;
    }
    result = heap->map;
    for (i = 0; i < 8; ++i)
        printf("HEAP[%i] = %08x\n", i * 4, result[i]);

    return 0;
}
/*
 * Allocates one buffer object of `size` bytes with the given placement
 * `flags` and stores it in *pbo. When `flags` contains NOUVEAU_BO_MAP the
 * buffer is also mapped into the CPU address space.
 *
 * `name` is only used in the error messages. Returns 0 on success or the
 * error code of the failing libdrm call.
 */
static int
new_bo(const char *name, uint32_t flags, uint64_t size,
       struct nouveau_bo **pbo)
{
    int ret;

    /* 256 is the third argument (alignment) used for every buffer here */
    ret = nouveau_bo_new(dev, flags, 256, size, NULL, pbo);
    if (ret) {
        fprintf(stderr, "nouveau_bo_alloc(%s): %s\n", name, strerror(-ret));
        return ret;
    }
    if (!(flags & NOUVEAU_BO_MAP))
        return 0;

    ret = nouveau_bo_map(*pbo, 0, NULL);
    if (ret)
        fprintf(stderr, "%s map failed: %s\n", name, strerror(-ret));
    return ret;
}

/*
 * Allocates all buffer objects used by main(), zeroes the heap buffer,
 * creates the buffer context and adds the code, parm, heap, temp and qery
 * buffers to it. `misc` is allocated and mapped but not added.
 *
 * Requires `dev` and `client` to be initialised (init() does this before
 * calling here).
 *
 * Returns 0 on success, the libdrm error code of the first failing call, or
 * -ENOMEM if a buffer could not be added to the buffer context. Buffers
 * allocated before a failure are not released; the caller exits on error.
 */
static int
alloc_buffers(void)
{
    enum { HEAP_SIZE = 1 << 20 };
    int ret;

    ret = new_bo("code", NOUVEAU_BO_VRAM, 1 << 20, &code);
    if (ret)
        return ret;

    ret = new_bo("parm", NOUVEAU_BO_VRAM, 1 << 16, &parm);
    if (ret)
        return ret;

    ret = new_bo("heap", NOUVEAU_BO_VRAM | NOUVEAU_BO_MAP, HEAP_SIZE, &heap);
    if (ret)
        return ret;
    /* main() prints heap contents, so start from a known state */
    memset(heap->map, 0, HEAP_SIZE);

    ret = new_bo("temp", NOUVEAU_BO_VRAM, 1 << 20, &temp);
    if (ret)
        return ret;

    /* don't add the 'u', has to have 4 letters to look nice */
    ret = new_bo("qery", NOUVEAU_BO_GART | NOUVEAU_BO_MAP, 1 << 12, &qery);
    if (ret)
        return ret;

    ret = new_bo("misc", NOUVEAU_BO_GART | NOUVEAU_BO_MAP, 1 << 12, &misc);
    if (ret)
        return ret;

    ret = nouveau_bufctx_new(client, 1, &bufctx);
    if (ret) {
        fprintf(stderr, "failed to create bufctx: %s\n", strerror(-ret));
        return ret;
    }

    /*
     * nouveau_bufctx_refn() returns NULL when it cannot allocate a
     * reference; the short-circuit keeps the original insertion order.
     */
    if (!nouveau_bufctx_refn(bufctx, 0, code, NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR) ||
        !nouveau_bufctx_refn(bufctx, 0, parm, NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR) ||
        !nouveau_bufctx_refn(bufctx, 0, heap, NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR) ||
        !nouveau_bufctx_refn(bufctx, 0, temp, NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR) ||
        !nouveau_bufctx_refn(bufctx, 0, qery, NOUVEAU_BO_GART | NOUVEAU_BO_RDWR)) {
        fprintf(stderr, "failed to add buffers to bufctx: %s\n",
                strerror(ENOMEM));
        return -ENOMEM;
    }

    return 0;
}
/*
 * Opens the nouveau device and creates everything main() needs, in this
 * order: FIFO channel, client, push buffer, compute object (`djka`), M2MF
 * object (`m2mf`), and finally the buffers via alloc_buffers().
 *
 * Returns 0 on success or the error code of the first failing call, after
 * printing a message to stderr. Objects created before a failure are not
 * released; main() exits immediately when this function fails.
 */
static int
init(void)
{
    struct nvc0_fifo data;
    int ret;

    /* zero the whole request, then select the engine */
    memset(&data, 0, sizeof(data));
    data.engine = 1; /* PGRAPH */

    ret = nouveau_device_open(NULL, &dev);
    if (ret) {
        fprintf(stderr, "device open failed: %s\n", strerror(-ret));
        return ret;
    }

    ret = nouveau_object_new(&dev->object, 0, NOUVEAU_FIFO_CHANNEL_CLASS,
                             &data, sizeof(data), &chan);
    if (ret) {
        fprintf(stderr, "channel alloc failed: %s\n", strerror(-ret));
        return ret;
    }

    ret = nouveau_client_new(dev, &client);
    if (ret) {
        fprintf(stderr, "failed to allocate client: %s\n", strerror(-ret));
        return ret;
    }

    ret = nouveau_pushbuf_new(client, chan, 4, 32 * 1024, 1, &push);
    if (ret) {
        fprintf(stderr, "failed to allocate pushbuf: %s\n", strerror(-ret));
        return ret;
    }

    /* engine objects live on the channel; the second argument is the handle */
    ret = nouveau_object_new(chan, 0xbeef90c0, NVC0_COMPUTE_CLASS, NULL, 0,
                             &djka);
    if (ret) {
        fprintf(stderr, "failed to allocate compute object: %s\n", strerror(-ret));
        return ret;
    }

    ret = nouveau_object_new(chan, 0xbeefa040, NVC0_M2MF_CLASS, NULL, 0,
                             &m2mf);
    if (ret) {
        fprintf(stderr, "failed to allocate m2mf object: %s\n", strerror(-ret));
        return ret;
    }

    return alloc_buffers();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment