Created
February 21, 2020 14:22
-
-
Save juniorprincewang/e4f6bc4b6b0c2a18c7b1eb7b4a48f6cd to your computer and use it in GitHub Desktop.
NVIDIA NVC0 compute process using MMIO
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| // https://people.freedesktop.org/~chrisbmr/90c0.c | |
#include <float.h>
#include <math.h>
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <libdrm/nouveau.h>

#include "nvc0_push.h"
#include "nv_object.xml.h"
#include "nvc0_compute.xml.h"
#include "nvc0_m2mf.xml.h"
/* Device, FIFO channel and the two engine objects; all are created in init(). */
static struct nouveau_device *dev = NULL;
static struct nouveau_object *chan = NULL;
static struct nouveau_object *m2mf = NULL; /* NVC0_M2MF_CLASS instance */
static struct nouveau_object *djka = NULL; /* NVC0_COMPUTE_CLASS instance */

/* Client, buffer context and push buffer used to submit commands. */
struct nouveau_client *client = NULL;
struct nouveau_bufctx *bufctx = NULL;
struct nouveau_pushbuf *push = NULL;

/*
 * Buffer objects, allocated in alloc_buffers():
 *   code - kernel code segment        parm - constant buffer
 *   heap - memory dumped by main()    misc - allocated and mapped only
 *   temp - local (tls) memory area    qery - query/fence sequence words
 */
static struct nouveau_bo *code, *parm, *heap, *misc, *temp, *qery;

/* Forward declarations of the setup helpers defined after main(). */
static int init();
static int alloc_buffers();
/*
 * Compute program executed on the GPU. Every instruction is one pair of
 * 32-bit words; the mnemonic of each pair is given in the comment above it.
 */
static uint32_t kernel[] = {
    /* mov b32 $r0 c0[0x0] */
    0x00001de4, 0x28004000,
    /* mov b32 $r1 c0[0x4] */
    0x10005de4, 0x28004000,
    /* mov b32 $r2 $clock */
    0x40009c04, 0x2c000001,
    /* mov b32 $r3 $clockhi */
    0x4400dc04, 0x2c000001,
    /* st b64 wt g[$r0d] $r2d */
    0x00009fa5, 0x94000000,
    /* exit */
    0x00001de7, 0x80000000
};
| int main(int argc, char **argv) | |
| { | |
| uint32_t sequence = 0xfe4ce; | |
| int i, ret; | |
| ret = init(); | |
| if (ret) { | |
| fprintf(stderr, "init failed\n"); | |
| return -1; | |
| } | |
| nouveau_pushbuf_bufctx(push, bufctx); | |
| ret = nouveau_pushbuf_validate(push); | |
| if (ret) { | |
| fprintf(stderr, "failed to validate pushbuf: %s\n", strerror(-ret)); | |
| return -1; | |
| } | |
| /* thread group / block dimensions (see CUDA or OpenCL): */ | |
| const int gdx = 1, gdy = 1, gdz = 1, bdx = 1, bdy = 1, bdz = 1; | |
| BEGIN_NVC0(push, SUBC_M2MF(NV01_SUBCHAN_OBJECT), 1); | |
| PUSH_DATA (push, m2mf->oclass); | |
| BEGIN_NVC0(push, SUBC_COMPUTE(NV01_SUBCHAN_OBJECT), 1); | |
| PUSH_DATA (push, djka->oclass); | |
| /* upload the GPU code */ | |
| BEGIN_NVC0(push, NVC0_M2MF(OFFSET_OUT_HIGH), 2); | |
| PUSH_DATAh(push, code->offset); | |
| PUSH_DATA (push, code->offset); | |
| BEGIN_NVC0(push, NVC0_M2MF(LINE_LENGTH_IN), 2); | |
| PUSH_DATA (push, sizeof(kernel)); | |
| PUSH_DATA (push, 1); | |
| BEGIN_NVC0(push, NVC0_M2MF(EXEC), 1); | |
| PUSH_DATA (push, 0x100111); | |
| BEGIN_NIC0(push, NVC0_M2MF(DATA), sizeof(kernel) / 4); | |
| for (i = 0; i < sizeof(kernel) / 4; ++i) | |
| PUSH_DATA(push, kernel[i]); | |
| PUSH_KICK (push); | |
| BEGIN_NVC0(push, NVC0_COMPUTE(MEM_BARRIER), 1); | |
| PUSH_DATA (push, 0x1111); | |
| BEGIN_NVC0(push, NVC0_COMPUTE(QUERY_ADDRESS_HIGH), 4); | |
| PUSH_DATAh(push, qery->offset); | |
| PUSH_DATA (push, qery->offset); | |
| PUSH_DATA (push, ++sequence); | |
| PUSH_DATA (push, 0); | |
| /* various magic commands and setup of tls (local) memory area */ | |
| BEGIN_NVC0(push, SUBC_COMPUTE(0x0758), 1); | |
| PUSH_DATA (push, 0xe); | |
| BEGIN_NVC0(push, SUBC_COMPUTE(0x0d64), 1); | |
| PUSH_DATA (push, 0xf); | |
| BEGIN_NVC0(push, NVC0_COMPUTE(LOCAL_ADDRESS_HIGH), 5); | |
| PUSH_DATAh(push, temp->offset); | |
| PUSH_DATA (push, temp->offset); | |
| PUSH_DATA (push, temp->size >> 32); | |
| PUSH_DATA (push, temp->size); | |
| PUSH_DATA (push, 0); | |
| BEGIN_NVC0(push, NVC0_COMPUTE(LOCAL_BASE), 1); | |
| PUSH_DATA (push, 0x10000000); | |
| BEGIN_NVC0(push, SUBC_COMPUTE(0x0204), 3); | |
| PUSH_DATA (push, 0); // f0 | |
| PUSH_DATA (push, 0); // 600 | |
| PUSH_DATA (push, 0); // 1000 | |
| BEGIN_NVC0(push, NVC0_COMPUTE(CACHE_SPLIT), 1); | |
| PUSH_DATA (push, NVC0_COMPUTE_CACHE_SPLIT_16K_SHARED_48K_L1); | |
| /* if this gets near the g[] address I use, CPU doesn't see the writes */ | |
| BEGIN_NVC0(push, NVC0_COMPUTE(SHARED_BASE), 1); | |
| PUSH_DATA (push, 0x80000000); | |
| BEGIN_NVC0(push, NVC0_COMPUTE(SHARED_SIZE), 1); | |
| PUSH_DATA (push, 0); | |
| BEGIN_NVC0(push, SUBC_COMPUTE(0x02a0), 1); | |
| PUSH_DATA (push, 0x8000); | |
| BEGIN_NVC0(push, NVC0_COMPUTE(GRIDDIM_YX), 2); | |
| PUSH_DATA (push, (gdy << 16) | gdx); | |
| PUSH_DATA (push, gdz); | |
| BEGIN_NVC0(push, NVC0_COMPUTE(BLOCKDIM_YX), 2); | |
| PUSH_DATA (push, (bdy << 16) | bdx); | |
| PUSH_DATA (push, bdz); | |
| BEGIN_NVC0(push, SUBC_COMPUTE(0x02c4), 1); | |
| PUSH_DATA (push, 0); | |
| BEGIN_NVC0(push, NVC0_COMPUTE(GLOBAL_BASE), 1); | |
| PUSH_DATA (push, (0xc << 28) | (0 << 16) | 0); | |
| BEGIN_NVC0(push, NVC0_COMPUTE(GLOBAL_BASE), 1); | |
| PUSH_DATA (push, (0xc << 28) | (1 << 16) | 1); | |
| BEGIN_NVC0(push, SUBC_COMPUTE(0x02c4), 1); | |
| PUSH_DATA (push, 1); | |
| BEGIN_NVC0(push, SUBC_COMPUTE(0x0210), 1); | |
| PUSH_DATA (push, 0x33); | |
| BEGIN_NVC0(push, SUBC_COMPUTE(0x1698), 1); | |
| PUSH_DATA (push, 1); | |
| /* setup the address of the code segment */ | |
| BEGIN_NVC0(push, NVC0_COMPUTE(CODE_ADDRESS_HIGH), 2); | |
| PUSH_DATAh(push, code->offset); | |
| PUSH_DATA (push, code->offset); | |
| /* upload some data to the constant buffer */ | |
| BEGIN_NVC0(push, NVC0_COMPUTE(CB_SIZE), 3); | |
| PUSH_DATA (push, 256); | |
| PUSH_DATAh(push, parm->offset); | |
| PUSH_DATA (push, parm->offset); | |
| BEGIN_NVC0(push, NVC0_COMPUTE(CB_POS), 1); | |
| PUSH_DATA (push, 0); | |
| BEGIN_NIC0(push, NVC0_COMPUTE(CB_DATA(0)), 8); | |
| PUSH_DATA (push, heap->offset); | |
| PUSH_DATAh(push, heap->offset); | |
| PUSH_DATA (push, 0); | |
| PUSH_DATA (push, 0); | |
| PUSH_DATA (push, 0); | |
| PUSH_DATA (push, 0); | |
| PUSH_DATA (push, 0); | |
| PUSH_DATA (push, 0); | |
| /* bind constant buffer to c0[] and c1[] */ | |
| BEGIN_NVC0(push, NVC0_COMPUTE(CB_SIZE), 3); | |
| PUSH_DATA (push, 256); | |
| PUSH_DATAh(push, parm->offset); | |
| PUSH_DATA (push, parm->offset); | |
| BEGIN_NVC0(push, NVC0_COMPUTE(CB_BIND), 1); | |
| PUSH_DATA (push, (0 << 8) | 1); | |
| BEGIN_NVC0(push, SUBC_COMPUTE(0x1698), 1); | |
| PUSH_DATA (push, 0x1000); | |
| BEGIN_NVC0(push, NVC0_COMPUTE(CP_START_ID), 1); | |
| PUSH_DATA (push, 0); | |
| BEGIN_NVC0(push, NVC0_COMPUTE(THREADS_ALLOC), 2); | |
| PUSH_DATA (push, bdx * bdy * bdz); | |
| PUSH_DATA (push, 0); | |
| BEGIN_NVC0(push, NVC0_COMPUTE(CP_GPR_ALLOC), 1); | |
| PUSH_DATA (push, 24); | |
| BEGIN_NVC0(push, SUBC_COMPUTE(0x0780), 1); | |
| PUSH_DATA (push, 0); | |
| BEGIN_NVC0(push, SUBC_COMPUTE(0x036c), 1); | |
| PUSH_DATA (push, 0); | |
| BEGIN_NVC0(push, SUBC_COMPUTE(0x1698), 1); | |
| PUSH_DATA (push, 0x110); | |
| BEGIN_NVC0(push, SUBC_COMPUTE(0x029c), 1); | |
| PUSH_DATA (push, 0); | |
| BEGIN_NVC0(push, SUBC_COMPUTE(0x0a08), 1); | |
| PUSH_DATA (push, 0); | |
| BEGIN_NVC0(push, NVC0_COMPUTE(LAUNCH), 1); | |
| PUSH_DATA (push, 0); | |
| BEGIN_NVC0(push, SUBC_COMPUTE(0x0a04), 1); | |
| PUSH_DATA (push, 0); | |
| BEGIN_NVC0(push, SUBC_COMPUTE(0x0360), 1); | |
| PUSH_DATA (push, 1); | |
| BEGIN_NVC0(push, NVC0_COMPUTE(MEM_BARRIER), 1); | |
| PUSH_DATA (push, 0x1111); | |
| BEGIN_NVC0(push, NVC0_COMPUTE(MEM_BARRIER), 1); | |
| PUSH_DATA (push, 0x0004); | |
| BEGIN_NVC0(push, SUBC_COMPUTE(0x0110), 1); | |
| PUSH_DATA (push, 0); | |
| BEGIN_NVC0(push, NVC0_COMPUTE(QUERY_ADDRESS_HIGH), 4); | |
| PUSH_DATAh(push, qery->offset); | |
| PUSH_DATA (push, qery->offset); | |
| PUSH_DATA (push, ++sequence); | |
| PUSH_DATA (push, 0); | |
| PUSH_KICK (push); | |
| ret = nouveau_bo_map(qery, NOUVEAU_BO_RD, client); | |
| if (ret) { | |
| fprintf(stderr, "fence map failed: %s\n", strerror(-ret)); | |
| return ret; | |
| } | |
| while (((uint32_t *)qery->map)[0] != sequence) | |
| sched_yield(); | |
| ret = nouveau_bo_map(heap, NOUVEAU_BO_RD, client); | |
| if (ret) { | |
| fprintf(stderr, "heap map failed: %s\n", strerror(-ret)); | |
| return ret; | |
| } | |
| for (i = 0; i < 8; ++i) | |
| printf("HEAP[%i] = %08x\n", i * 4, ((uint32_t *)heap->map)[i]); | |
| return 0; | |
| } | |
| static int | |
| alloc_buffers() | |
| { | |
| int ret; | |
| ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 256, 1 << 20, NULL, | |
| &code); | |
| if (ret) { | |
| fprintf(stderr, "nouveau_bo_alloc(code): %s\n", strerror(-ret)); | |
| return ret; | |
| } | |
| ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 256, 1 << 16, NULL, | |
| &parm); | |
| if (ret) { | |
| fprintf(stderr, "nouveau_bo_alloc(parm): %s\n", strerror(-ret)); | |
| return ret; | |
| } | |
| ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM | NOUVEAU_BO_MAP, 256, 1 << 20, | |
| NULL, &heap); | |
| if (ret) { | |
| fprintf(stderr, "nouveau_bo_alloc(heap): %s\n", strerror(-ret)); | |
| return ret; | |
| } | |
| ret = nouveau_bo_map(heap, 0, NULL); | |
| if (ret) { | |
| fprintf(stderr, "heap map failed: %s\n", strerror(-ret)); | |
| return ret; | |
| } | |
| memset(heap->map, 0, 1 << 20); | |
| ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 256, 1 << 20, NULL, | |
| &temp); | |
| if (ret) { | |
| fprintf(stderr, "nouveau_bo_alloc(temp): %s\n", strerror(-ret)); | |
| return ret; | |
| } | |
| /* don't add the 'u', has to have 4 letters to look nice */ | |
| ret = nouveau_bo_new(dev, NOUVEAU_BO_GART | NOUVEAU_BO_MAP, 256, 1 << 12, | |
| NULL, &qery); | |
| if (ret) { | |
| fprintf(stderr, "nouveau_bo_alloc(qery): %s\n", strerror(-ret)); | |
| return ret; | |
| } | |
| ret = nouveau_bo_map(qery, 0, NULL); | |
| if (ret) { | |
| fprintf(stderr, "qery map failed: %s\n", strerror(-ret)); | |
| return ret; | |
| } | |
| ret = nouveau_bo_new(dev, NOUVEAU_BO_GART | NOUVEAU_BO_MAP, 256, 1 << 12, | |
| NULL, &misc); | |
| if (ret) { | |
| fprintf(stderr, "nouveau_bo_alloc(misc): %s\n", strerror(-ret)); | |
| return ret; | |
| } | |
| ret = nouveau_bo_map(misc, 0, NULL); | |
| if (ret) { | |
| fprintf(stderr, "misc map failed: %s\n", strerror(-ret)); | |
| return ret; | |
| } | |
| ret = nouveau_bufctx_new(client, 1, &bufctx); | |
| if (ret) { | |
| fprintf(stderr, "failed to create bufctx: %s\n", strerror(-ret)); | |
| return ret; | |
| } | |
| nouveau_bufctx_refn(bufctx, 0, code, NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR); | |
| nouveau_bufctx_refn(bufctx, 0, parm, NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR); | |
| nouveau_bufctx_refn(bufctx, 0, heap, NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR); | |
| nouveau_bufctx_refn(bufctx, 0, temp, NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR); | |
| nouveau_bufctx_refn(bufctx, 0, qery, NOUVEAU_BO_GART | NOUVEAU_BO_RDWR); | |
| return 0; | |
| } | |
| int init() | |
| { | |
| struct nvc0_fifo data = { }; | |
| int ret; | |
| data.engine = 1; /* PGRAPH */ | |
| ret = nouveau_device_open(NULL, &dev); | |
| if (ret) { | |
| fprintf(stderr, "device open failed: %s\n", strerror(-ret)); | |
| return ret; | |
| } | |
| ret = nouveau_object_new(&dev->object, 0, NOUVEAU_FIFO_CHANNEL_CLASS, | |
| &data, sizeof(data), &chan); | |
| if (ret) { | |
| fprintf(stderr, "channel alloc failed: %s\n", strerror(-ret)); | |
| return ret; | |
| } | |
| ret = nouveau_client_new(dev, &client); | |
| if (ret) { | |
| fprintf(stderr, "failed to allocate client: %s\n", strerror(-ret)); | |
| return ret; | |
| } | |
| ret = nouveau_pushbuf_new(client, chan, 4, 32 * 1024, 1, &push); | |
| if (ret) { | |
| fprintf(stderr, "failed to allocate pushbuf: %s\n", strerror(-ret)); | |
| return ret; | |
| } | |
| ret = nouveau_object_new(chan, 0xbeef90c0, NVC0_COMPUTE_CLASS, NULL, 0, | |
| &djka); | |
| if (ret) { | |
| fprintf(stderr, "failed to allocate compute object: %s\n", strerror(-ret)); | |
| return ret; | |
| } | |
| ret = nouveau_object_new(chan, 0xbeefa040, NVC0_M2MF_CLASS, NULL, 0, | |
| &m2mf); | |
| if (ret) { | |
| fprintf(stderr, "failed to allocate p2mf object: %s\n", strerror(-ret)); | |
| return ret; | |
| } | |
| return alloc_buffers(); | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment