Created
May 10, 2024 16:23
-
-
Save stgatilov/0bb58bf5296c3dfabd2eecd8dbf42237 to your computer and use it in GitHub Desktop.
numba.cuda: device buffer shared with OpenGL
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import ctypes | |
import numpy as np | |
import numba.cuda as cuda | |
import cuda.cudart as curt | |
# source: https://numba.discourse.group/t/cuda-opengl-interop/1898 | |
class OpenglBufferInCuda: | |
"""A mapping of OpenGL buffer into CUDA device memory.""" | |
def __init__(self, glbuf): | |
"""Establishes mapping for given GL buffer object.""" | |
self._glbuf = glbuf | |
err, self._cuglresource = curt.cudaGraphicsGLRegisterBuffer( | |
self._glbuf, | |
curt.cudaGraphicsRegisterFlags.cudaGraphicsRegisterFlagsNone | |
) | |
assert err == 0 | |
(err,) = curt.cudaGraphicsMapResources(1, self._cuglresource, 0) | |
assert err == 0 | |
(err, self._cuptr, self._cusize) = curt.cudaGraphicsResourceGetMappedPointer(self._cuglresource) | |
assert err == 0 | |
def shutdown(self): | |
"""Breaks mapping.""" | |
if self._cuglresource is not None: | |
(err,) = curt.cudaGraphicsUnmapResources(1, self._cuglresource, 0) | |
(err,) = curt.cudaGraphicsUnregisterResource(self._cuglresource) | |
self._cuglresource = None | |
def asarray(self, dtype = np.float32, shape = None, *, strides = None, order = "C"): | |
""" | |
Returns DeviceNDArray view on mapped buffer compatible with numba.cuda. | |
Unless shape is specified, 1D array of maximum size is returned. | |
""" | |
if shape is None: | |
shape = self._cusize // np.dtype(dtype).itemsize | |
shape, strides, dtype = cuda.api.prepare_shape_strides_dtype(shape, strides, dtype, order) | |
datasize = cuda.driver.memory_size_from_info(shape, strides, dtype.itemsize) | |
assert datasize <= self._cusize | |
ctx = cuda.current_context() | |
cptr = ctypes.c_uint64(self._cuptr) | |
mem = cuda.driver.MemoryPointer(ctx, cptr, datasize) | |
return cuda.cudadrv.devicearray.DeviceNDArray(shape, strides, dtype, gpu_data = mem) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import contextlib | |
import glfw | |
from OpenGL.GL import * | |
import numpy as np | |
def raii(exitstack, obj, method = 'shutdown'): | |
"""Ensures given method is called on given object when ExitStack scope ends.""" | |
exitstack.callback(getattr(obj, method)) | |
return obj | |
# source: https://gist.github.com/DGriffin91/c986d9bce5a0549d6e5c207261636e62 | |
@contextlib.contextmanager | |
def createGlfwWindow(window_name = "Python GLFW sample", width = 1280, height = 720, core = False, vsync = False): | |
assert glfw.init(), "Could not initialize GLFW" | |
glfw.window_hint(glfw.CONTEXT_VERSION_MAJOR, 3) | |
glfw.window_hint(glfw.CONTEXT_VERSION_MINOR, 3) | |
if core: | |
glfw.window_hint(glfw.OPENGL_PROFILE, glfw.OPENGL_CORE_PROFILE) | |
glfw.window_hint(glfw.OPENGL_FORWARD_COMPAT, 1) | |
else: | |
glfw.window_hint(glfw.OPENGL_PROFILE, glfw.OPENGL_COMPAT_PROFILE) | |
window = glfw.create_window(int(width), int(height), window_name, None, None) | |
assert window, "Could not create window and GL context" | |
glfw.make_context_current(window) | |
glfw.swap_interval(int(vsync)) | |
try: | |
yield window | |
finally: | |
glfw.terminate() | |
class OpenglScreenBuffer: | |
"""Represents OpenGL buffer object with RGB8 data that can be blit to default framebuffer.""" | |
def __init__(self, imgsize, *, channels = 3, dtype = np.uint8): | |
self.size = imgsize | |
self.channels = channels | |
self.dtype = dtype | |
# numba.cuda does not support images, so I'd better use buffers for interop | |
# since I'm lazy to write shaders, PBO is the only option | |
self.buffer = glGenBuffers(1) | |
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, self.buffer) | |
glBufferStorage(GL_PIXEL_UNPACK_BUFFER, self.size[0] * self.size[1] * channels * np.dtype(dtype).itemsize, None, 0) | |
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0) | |
# PBO cannot be bound to framebuffer, so we have to copy it to a texture | |
self._tex = glGenTextures(1) | |
glBindTexture(GL_TEXTURE_2D, self._tex) | |
formatmode = { | |
(3, np.uint8) : GL_RGB8, | |
(4, np.uint8) : GL_RGBA8, | |
(3, np.float32) : GL_RGB32F, | |
(4, np.float32) : GL_RGBA32F, | |
}[(self.channels, self.dtype)] | |
glTexStorage2D(GL_TEXTURE_2D, 1, formatmode, self.size[0], self.size[1]) | |
glBindTexture(GL_TEXTURE_2D, 0) | |
# texture is attached to framebuffer, and framebuffer can be blit with scaling | |
self._fbo = glGenFramebuffers(1) | |
glBindFramebuffer(GL_FRAMEBUFFER, self._fbo) | |
glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, self._tex, 0) | |
glBindFramebuffer(GL_FRAMEBUFFER, 0) | |
def blittoscreen(self, screenSize): | |
# copy PBO context to texture | |
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, self.buffer) | |
glBindTexture(GL_TEXTURE_2D, self._tex) | |
colormode = [GL_RED, GL_RG, GL_RGB, GL_RGBA][self.channels - 1] | |
typemode = {np.uint8 : GL_UNSIGNED_BYTE, np.float32 : GL_FLOAT}[self.dtype] | |
glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, self.size[0], self.size[1], colormode, typemode, None) | |
glBindTexture(GL_TEXTURE_2D, 0) | |
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0) | |
# and blit framebuffer with texture into screen/default framebuffer | |
glBindFramebuffer(GL_READ_FRAMEBUFFER, self._fbo) | |
glBlitFramebuffer(0, 0, self.size[0], self.size[1], 0, 0, screenSize[0], screenSize[1], GL_COLOR_BUFFER_BIT, GL_LINEAR) | |
glBindFramebuffer(GL_READ_FRAMEBUFFER, 0) | |
def shutdown(self): | |
if self.buffer is not None: | |
glDeleteFramebuffers(1, [self._fbo]) | |
glDeleteTextures(1, [self._tex]) | |
glDeleteBuffers(1, [self.buffer]) | |
self.buffer = None |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys, contextlib, random, math | |
import glm | |
import imgui | |
import glfw | |
from OpenGL.GL import * | |
from imgui.integrations.glfw import GlfwRenderer | |
from PIL import Image | |
import numpy as np | |
import numba.cuda as cuda | |
from numba.cuda.random import create_xoroshiro128p_states | |
from numba_cuda_opengl import OpenglBufferInCuda | |
from opengl_utils import createGlfwWindow, OpenglScreenBuffer, raii | |
Vec3f = np.dtype([ | |
('x', np.float32), | |
('y', np.float32), | |
('z', np.float32), | |
]) | |
Vec3b = np.dtype([ | |
('x', np.uint8), | |
('y', np.uint8), | |
('z', np.uint8), | |
]) | |
Point = np.dtype([ | |
('position', Vec3f), | |
('color', Vec3b), | |
('padding', np.uint8) | |
]) | |
Cylinder = np.dtype([ | |
('center', Vec3f), | |
('axisX', Vec3f), | |
('axisY', Vec3f), | |
('axisZ', Vec3f), | |
('radius', np.float32), | |
('height', np.float32), | |
('color', Vec3f), | |
]) | |
@cuda.jit(device = True) | |
def clamp(x, l, r): | |
return min(max(x, l), r) | |
@cuda.jit | |
def generatePointCloud_kernel(cylinders, rngStates, points): | |
globalIdx = cuda.grid(1) | |
if globalIdx >= points.size: | |
return | |
cylIdx = int(math.floor(globalIdx * cylinders.size / points.size)) | |
u = cuda.random.xoroshiro128p_uniform_float32(rngStates, globalIdx) * math.pi * 2 | |
v = 2.0 * cuda.random.xoroshiro128p_uniform_float32(rngStates, globalIdx) - 1.0 | |
lx = math.cos(u) | |
ly = math.sin(u) | |
ctr = cylinders[cylIdx].center | |
axisX = cylinders[cylIdx].axisX | |
axisY = cylinders[cylIdx].axisY | |
axisZ = cylinders[cylIdx].axisZ | |
radius = cylinders[cylIdx].radius | |
height = cylinders[cylIdx].height | |
col = cylinders[cylIdx].color | |
points[globalIdx].position.x = ctr.x + radius * (axisX.x * lx + axisY.x * ly) + height * (axisZ.x * v) | |
points[globalIdx].position.y = ctr.y + radius * (axisX.y * lx + axisY.y * ly) + height * (axisZ.y * v) | |
points[globalIdx].position.z = ctr.z + radius * (axisX.z * lx + axisY.z * ly) + height * (axisZ.z * v) | |
dr = (2 * cuda.random.xoroshiro128p_uniform_float32(rngStates, globalIdx) - 1) * 0.1 | |
dg = (2 * cuda.random.xoroshiro128p_uniform_float32(rngStates, globalIdx) - 1) * 0.1 | |
db = (2 * cuda.random.xoroshiro128p_uniform_float32(rngStates, globalIdx) - 1) * 0.1 | |
points[globalIdx].color.x = round(255.0 * clamp(col.x + dr, 0.0, 1.0)) | |
points[globalIdx].color.y = round(255.0 * clamp(col.y + dg, 0.0, 1.0)) | |
points[globalIdx].color.z = round(255.0 * clamp(col.z + db, 0.0, 1.0)) | |
def generatePointCloud(pointsBuffer): | |
cylinders = np.empty(10, dtype = Cylinder) | |
for c in range(cylinders.size): | |
center = glm.vec3() | |
center.x = random.uniform(-1.0, 1.0) | |
center.y = random.uniform(-1.0, 1.0) | |
center.z = random.uniform(-1.0, 1.0) | |
axisX = glm.vec3() | |
axisY = glm.vec3() | |
while True: | |
axisX.x = random.uniform(-1.0, 1.0) | |
axisX.y = random.uniform(-1.0, 1.0) | |
axisX.z = random.uniform(-1.0, 1.0) | |
if glm.length(axisX) > 1.0 and glm.length(axisX) < 0.5: | |
continue | |
axisX /= glm.length(axisX) | |
axisY.x = random.uniform(-1.0, 1.0) | |
axisY.y = random.uniform(-1.0, 1.0) | |
axisY.z = random.uniform(-1.0, 1.0) | |
if glm.length(axisY) > 1.0 and glm.length(axisY) < 0.5: | |
continue | |
axisY -= glm.dot(axisX, axisY) * axisX | |
if glm.length(axisY) < 0.3: | |
continue | |
axisY /= glm.length(axisY) | |
break | |
radius = random.uniform(0.1, 0.2) | |
height = random.uniform(0.5, 1.0) | |
color = glm.vec3() | |
color.x = random.uniform(0.3, 1.0) | |
color.y = random.uniform(0.3, 1.0) | |
color.z = random.uniform(0.3, 1.0) | |
def setVec(dst, src): | |
dst['x'] = src.x | |
dst['y'] = src.y | |
dst['z'] = src.z | |
setVec(cylinders[c]['center'], center) | |
setVec(cylinders[c]['axisX'], axisX) | |
setVec(cylinders[c]['axisY'], axisY) | |
setVec(cylinders[c]['axisZ'], glm.cross(axisX, axisY)) | |
cylinders[c]['radius'] = radius | |
cylinders[c]['height'] = height | |
setVec(cylinders[c]['color'], color) | |
numPoints = pointsBuffer.size | |
rngStates = cuda.random.create_xoroshiro128p_states(numPoints, random.getrandbits(64)) | |
generatePointCloud_kernel[math.ceil(numPoints / 256), 256](cylinders, rngStates, pointsBuffer) | |
cuda.synchronize() | |
@cuda.jit | |
def clearBuffers_kernel(depthBuffer, colorBuffer): | |
(x, y) = cuda.grid(2) | |
(ny, nx) = depthBuffer.shape | |
if x >= nx or y >= ny: | |
return | |
depthBuffer[y][x] = 1.0 | |
colorBuffer[y][x].x = 0 | |
colorBuffer[y][x].y = 0 | |
colorBuffer[y][x].z = 0 | |
@cuda.jit(device = True) | |
def transformPoint(worldPos, mvp, screenShape): | |
cposX = mvp[0][0] * worldPos.x + mvp[1][0] * worldPos.y + mvp[2][0] * worldPos.z + mvp[3][0] | |
cposY = mvp[0][1] * worldPos.x + mvp[1][1] * worldPos.y + mvp[2][1] * worldPos.z + mvp[3][1] | |
cposZ = mvp[0][2] * worldPos.x + mvp[1][2] * worldPos.y + mvp[2][2] * worldPos.z + mvp[3][2] | |
cposW = mvp[0][3] * worldPos.x + mvp[1][3] * worldPos.y + mvp[2][3] * worldPos.z + mvp[3][3] | |
if max(abs(cposX), abs(cposY), abs(cposZ)) >= cposW: | |
return -1, -1, -1 | |
invW = 1.0 / cposW | |
sx = (((cposX * invW) + 1.0) * 0.5) * screenShape[1] | |
sy = (((cposY * invW) + 1.0) * 0.5) * screenShape[0] | |
depth = (((cposZ * invW) + 1.0) * 0.5) | |
return np.uint32(math.floor(sx)), np.uint32(math.floor(sy)), np.float32(depth) | |
@cuda.jit | |
def generateDepthBuffer_kernel(depthBuffer, pointsBuffer, mvp): | |
idx = cuda.grid(1) | |
if idx >= pointsBuffer.size: | |
return | |
pos = pointsBuffer[idx].position | |
pres = transformPoint(pos, mvp, depthBuffer.shape) | |
if pres[2] < 0.0: | |
return | |
cuda.atomic.min(depthBuffer, (pres[1], pres[0]), pres[2]) | |
@cuda.jit | |
def generateColorBuffer_kernel(depthBuffer, pointsBuffer, colorBuffer, mvp): | |
idx = cuda.grid(1) | |
if idx >= pointsBuffer.size: | |
return | |
pos = pointsBuffer[idx].position | |
pres = transformPoint(pos, mvp, depthBuffer.shape) | |
if pres[2] < 0: | |
return | |
if pres[2] != depthBuffer[pres[1]][pres[0]]: | |
return | |
colorBuffer[pres[1]][pres[0]] = pointsBuffer[idx].color | |
def renderPointCloud(pointsBuffer, depthBuffer, colorBuffer, mvp): | |
numThreads = (16, 16) | |
numBlocks = (math.ceil(depthBuffer.shape[1] / 16), math.ceil(depthBuffer.shape[0] / 16)) | |
clearBuffers_kernel[numBlocks, numThreads](depthBuffer, colorBuffer) | |
cuda.synchronize() | |
numThreads = 256 | |
numBlocks = math.ceil(pointsBuffer.size / 256) | |
generateDepthBuffer_kernel[numBlocks, numThreads](depthBuffer, pointsBuffer, np.array(glm.transpose(mvp))) | |
cuda.synchronize() | |
numThreads = 256 | |
numBlocks = math.ceil(pointsBuffer.size / 256) | |
generateColorBuffer_kernel[numBlocks, numThreads](depthBuffer, pointsBuffer, colorBuffer, np.array(glm.transpose(mvp))) | |
cuda.synchronize() | |
class GuiApp: | |
@staticmethod | |
def run(): | |
with contextlib.ExitStack() as exitstack: | |
app = GuiApp(exitstack) | |
app.loop() | |
def __init__(self, exitstack, *, filename = None, numPoints = 10000000): | |
self.window = exitstack.enter_context(createGlfwWindow( | |
window_name = "Experimental point cloud viewer", | |
core = True | |
)) | |
self.windowSize = glfw.get_framebuffer_size(self.window) | |
self.projectionMatrix = glm.perspective(math.pi / 3, self.windowSize[0] / self.windowSize[1], 1.0e-2, 100.0) | |
imgui.create_context() | |
self.imguiImpl = raii(exitstack, GlfwRenderer(self.window)) | |
self.showImguiDemo = False | |
self.pitch = 0.0 | |
self.yaw = 0.0 | |
self.eye = glm.vec3(-3.0, 0.0, 0.0) | |
self.mouselook = False | |
self.pointCloudGl = glGenBuffers(1) | |
glBindBuffer(GL_ARRAY_BUFFER, self.pointCloudGl) | |
glBufferStorage(GL_ARRAY_BUFFER, numPoints * Point.itemsize, None, 0) | |
glBindBuffer(GL_ARRAY_BUFFER, 0) | |
self.pointCloudCuda = raii(exitstack, OpenglBufferInCuda(self.pointCloudGl)).asarray(dtype = Point) | |
generatePointCloud(self.pointCloudCuda) | |
windowShape = (self.windowSize[1], self.windowSize[0]) | |
self.depthBufferCuda = cuda.device_array(windowShape, dtype = np.float32) | |
self.screenBuffer = raii(exitstack, OpenglScreenBuffer(self.windowSize)) | |
self.screenBufferCuda = raii(exitstack, OpenglBufferInCuda(self.screenBuffer.buffer)).asarray(shape = windowShape, dtype = Vec3b) | |
def onKey(self, window, key, scancode, action, mods): | |
if action == glfw.PRESS: | |
if key == glfw.KEY_ESCAPE: | |
if self.mouselook: | |
self.mouselook = False | |
glfw.set_input_mode(self.window, glfw.RAW_MOUSE_MOTION, glfw.FALSE) | |
glfw.set_input_mode(self.window, glfw.CURSOR, glfw.CURSOR_NORMAL) | |
else: | |
self.mouselook = True | |
glfw.set_input_mode(self.window, glfw.CURSOR, glfw.CURSOR_DISABLED) | |
glfw.set_input_mode(self.window, glfw.RAW_MOUSE_MOTION, glfw.TRUE) | |
glfw.set_cursor_pos(self.window, 0, 0) | |
def onMouseMove(self, window, xpos, ypos): | |
if self.mouselook: | |
self.pitch -= ypos / self.windowSize[1] * 3 | |
self.yaw -= xpos / self.windowSize[1] * 3 | |
self.pitch = glm.clamp(self.pitch, -math.pi/2, math.pi/2) | |
self.yaw = math.fmod(self.yaw, math.pi*2) | |
glfw.set_cursor_pos(self.window, 0, 0) | |
def loop(self): | |
lastTime = glfw.get_time() | |
frameTime = 0.0 | |
glfw.set_key_callback(self.window, self.onKey) | |
glfw.set_cursor_pos_callback(self.window, self.onMouseMove) | |
while not glfw.window_should_close(self.window): | |
glfw.poll_events() | |
self.imguiImpl.process_inputs() | |
forward = glm.vec3( | |
math.cos(self.pitch) * math.cos(self.yaw), | |
math.cos(self.pitch) * math.sin(self.yaw), | |
math.sin(self.pitch) | |
) | |
up = glm.vec3( | |
math.cos(self.pitch + math.pi/2) * math.cos(self.yaw), | |
math.cos(self.pitch + math.pi/2) * math.sin(self.yaw), | |
math.sin(self.pitch + math.pi/2) | |
) | |
df = 0 | |
dr = 0 | |
if glfw.get_key(self.window, glfw.KEY_W) == glfw.PRESS: | |
df += 1 | |
if glfw.get_key(self.window, glfw.KEY_S) == glfw.PRESS: | |
df -= 1 | |
if glfw.get_key(self.window, glfw.KEY_D) == glfw.PRESS: | |
dr += 1 | |
if glfw.get_key(self.window, glfw.KEY_A) == glfw.PRESS: | |
dr -= 1 | |
self.eye += (forward * df + glm.cross(forward, up) * dr) * frameTime * 3.0 | |
self.viewMatrix = glm.lookAt(self.eye, self.eye + forward * (glm.length(self.eye) + 1), up) | |
mvp = self.projectionMatrix * self.viewMatrix | |
renderPointCloud(self.pointCloudCuda, self.depthBufferCuda, self.screenBufferCuda, mvp) | |
self.screenBuffer.blittoscreen(self.windowSize) | |
imgui.new_frame() | |
if imgui.begin("Settings", flags = imgui.WINDOW_ALWAYS_AUTO_RESIZE): | |
imgui.text("Frame time: %4.1f ms" % (frameTime * 1000.0)) | |
_, self.showImguiDemo = imgui.checkbox("Show ImGui demo", self.showImguiDemo) | |
if self.showImguiDemo: | |
imgui.show_test_window() | |
imgui.end() | |
imgui.render() | |
self.imguiImpl.render(imgui.get_draw_data()) | |
glfw.swap_buffers(self.window) | |
newTime = glfw.get_time() | |
frameTime = newTime - lastTime | |
lastTime = newTime | |
def main(argv): | |
GuiApp.run() | |
if __name__ == "__main__": | |
main(sys.argv[1:]) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
You can put all files to same directory, then run pcviewer.py under Python 3 after installing enough pip modules.
You'll see some cylinders comprised of points on the screen.
Press Escape, then use WASD + mouselook to fly around.
This demo provides two pieces which might be useful:
OpenglBufferInCuda
--- takes OpenGL buffer object and returns numba.cuda array which points (uses CUDA + OpenGL interop).OpenglScreenBuffer
--- maintains image buffer that can be blit to screen. Share this buffer to numba.cuda, and you can do any kind of realtime software rendering with numba.cuda!