3D world points from ARKit depth
import ARKit
import SceneKit
/// Number of depth samples taken along the depth map's X axis (256 / 2).
let horizontalPoints: Int = 128
/// Number of depth samples taken along the depth map's Y axis (192 / 2).
let verticalPoints: Int = 96
/// One marker node per depth sample, addressed as `v * horizontalPoints + h`.
var depthNodes: [SCNNode] = []
/// NOTE(review): presumably the common parent of all marker nodes — confirm against `setup()`.
var parentDebugNodes: SCNNode = SCNNode()
/// The AR view; implicitly unwrapped, so it must be assigned before first use.
var sceneView: ARSCNView!
// Somewhere during setup
/// Creates the pool of debug marker nodes and starts world tracking with smoothed scene depth.
///
/// One small box node is created per depth sample (`horizontalPoints * verticalPoints` in total)
/// and stored in `depthNodes`, which `session(_:didUpdate:)` indexes as `v * horizontalPoints + h`.
///
/// NOTE(review): the original listing was truncated (no colour value, no loop body after the node
/// was created, no closing braces). The colour, the node bookkeeping and the `session.run` call
/// below are reconstructions — confirm against the intended behaviour.
func setup() {
    let configuration = ARWorldTrackingConfiguration()
    // Only request scene depth where the device supports it; unsupported semantics are rejected by ARKit.
    if ARWorldTrackingConfiguration.supportsFrameSemantics(.smoothedSceneDepth) {
        configuration.frameSemantics = .smoothedSceneDepth
    }

    let sizeGeomPredictions: CGFloat = 0.005
    // A single geometry is shared by every node, so all markers have the same material.
    let geom = SCNBox(width: sizeGeomPredictions,
                      height: sizeGeomPredictions,
                      length: sizeGeomPredictions,
                      chamferRadius: 0)
    // NOTE(review): the original colour value was lost; green is a placeholder.
    geom.firstMaterial?.diffuse.contents = UIColor.green

    let nodeCount = horizontalPoints * verticalPoints
    depthNodes.reserveCapacity(depthNodes.count + nodeCount)
    for _ in 0..<nodeCount {
        let node = SCNNode(geometry: geom)
        depthNodes.append(node)
        parentDebugNodes.addChildNode(node)
    }

    sceneView.scene.rootNode.addChildNode(parentDebugNodes)
    sceneView.session.run(configuration)
}
/// `ARSessionDelegate` callback: moves every debug node onto the world-space point measured by
/// the frame's smoothed scene-depth map.
///
/// The depth map is sampled on a regular `horizontalPoints` x `verticalPoints` grid, one sample at
/// the centre of each grid cell. Each sample is un-projected with camera intrinsics rescaled to
/// depth-map resolution, then transformed into world space.
///
/// NOTE(review): three expressions in the original listing were truncated (the `guard` body, the
/// initial value of `cameraIntrinsics`, and the receiver of `.inverse`). They are restored here as
/// `return`, `frame.camera.intrinsics` and `frame.camera.viewMatrix(for: .landscapeRight)` — confirm.
func session(_ session: ARSession, didUpdate frame: ARFrame) {
    guard let smoothedDepth = frame.smoothedSceneDepth?.depthMap else {
        // No depth data on this frame; nothing to update.
        return
    }
    // The loop below indexes `depthNodes` directly, so bail out if the pool has not been built yet.
    guard depthNodes.count >= horizontalPoints * verticalPoints else {
        return
    }
    let capturedImage = frame.capturedImage

    let lockFlags = CVPixelBufferLockFlags.readOnly
    CVPixelBufferLockBaseAddress(smoothedDepth, lockFlags)
    defer {
        CVPixelBufferUnlockBaseAddress(smoothedDepth, lockFlags)
    }

    // The plane accessor is documented to return nil for non-planar buffers, so fall back to the
    // whole-buffer accessor instead of force-unwrapping.
    guard let baseAddress = CVPixelBufferGetBaseAddressOfPlane(smoothedDepth, 0)
            ?? CVPixelBufferGetBaseAddress(smoothedDepth) else {
        return
    }
    let depthByteBuffer = baseAddress.assumingMemoryBound(to: Float32.self)

    // The `.size` accessor simply reads the CVPixelBuffer's width and height in pixels.
    // Per the original notes both buffers share the same 0.75 aspect ratio
    // (captured image 1920 x 1440, depth map 256 x 192).
    let depthMapSize = smoothedDepth.size
    let capturedImageSize = capturedImage.size
    guard depthMapSize.x > 0, depthMapSize.y > 0 else {
        return
    }

    var cameraIntrinsics = frame.camera.intrinsics
    let depthResolution = simd_float2(x: Float(depthMapSize.x), y: Float(depthMapSize.y))
    let scaleRes = simd_float2(x: Float(capturedImageSize.x) / depthResolution.x,
                               y: Float(capturedImageSize.y) / depthResolution.y)
    // Make the camera intrinsics be with respect to the depth map: rescale focal lengths and
    // principal point from captured-image pixels to depth-map pixels.
    cameraIntrinsics[0][0] /= scaleRes.x
    cameraIntrinsics[1][1] /= scaleRes.y
    cameraIntrinsics[2][0] /= scaleRes.x
    cameraIntrinsics[2][1] /= scaleRes.y

    // This is crucial: you need to always use the view matrix for Landscape Right.
    // It does not change per sample, so invert it once per frame.
    let viewMatrixInverted = frame.camera.viewMatrix(for: .landscapeRight).inverse

    // This will be the long side, because of the rotation.
    let horizontalStep = Float(depthMapSize.x) / Float(horizontalPoints)
    let halfHorizontalStep = horizontalStep / 2
    // This will be the short side, because of the rotation.
    let verticalStep = Float(depthMapSize.y) / Float(verticalPoints)
    let halfVerticalStep = verticalStep / 2

    for h in 0..<horizontalPoints {
        for v in 0..<verticalPoints {
            // Centre of grid cell (h, v) in depth-map pixel coordinates.
            let x = Float(h) * horizontalStep + halfHorizontalStep
            let y = Float(v) * verticalStep + halfVerticalStep
            let depthMapPoint = simd_float2(x, y)

            // Sample depth
            let metricDepth = sampleDepthRaw(depthByteBuffer, size: depthMapSize, at: .init(depthMapPoint))

            let wp = worldPoint(depthMapPixelPoint: depthMapPoint,
                                depth: metricDepth,
                                cameraIntrinsics: cameraIntrinsics,
                                viewMatrixInverted: viewMatrixInverted)

            let node = depthNodes[v * horizontalPoints + h]
            node.simdWorldPosition = wp
        }
    }
}
/// Reads a single depth value from a row-major `Float32` buffer.
///
/// The buffer is treated as tightly packed: the row stride is `size.x` elements.
/// Coordinates outside the buffer are clamped to the nearest edge pixel instead of reading
/// out-of-bounds memory.
///
/// - Parameters:
///   - pointer: First element of the depth buffer.
///   - size: Buffer dimensions in pixels (`x` = width, `y` = height).
///   - at: Pixel coordinate to read (`x` = column, `y` = row).
/// - Returns: The depth stored at the (clamped) coordinate, or `0` when `size` describes an empty buffer.
func sampleDepthRaw(_ pointer: UnsafeMutablePointer<Float32>, size: SIMD2<Int>, at: SIMD2<Int>) -> Float {
    guard size.x > 0, size.y > 0 else { return 0 }
    let column = min(max(at.x, 0), size.x - 1)
    let row = min(max(at.y, 0), size.y - 1)
    return pointer[row * size.x + column]
}
// This also works. Adapted from:
/// Un-projects a depth-map pixel into world space using pre-inverted camera intrinsics.
///
/// - Parameters:
///   - depthMapPixelPoint: Pixel coordinate in the depth map.
///   - depth: Depth sampled at that pixel.
///   - cameraIntrinsicsInverted: Inverse of the intrinsics matrix matching the depth map's resolution.
///   - viewMatrixInverted: Inverse view matrix applied to the camera-space point.
/// - Returns: The transformed point after dividing by its homogeneous `w`.
func worldPoint(depthMapPixelPoint: SIMD2<Float>, depth: Float, cameraIntrinsicsInverted: simd_float3x3, viewMatrixInverted: simd_float4x4) -> SIMD3<Float> {
    // Homogeneous pixel pushed through the inverse intrinsics, then scaled by the negated depth.
    let homogeneousPixel = simd_float3(depthMapPixelPoint, 1)
    let unprojected = cameraIntrinsicsInverted * homogeneousPixel * -depth

    // Mirror the X axis before leaving camera space.
    let cameraSpace = simd_float4(-unprojected.x, unprojected.y, unprojected.z, 1)

    let transformed = viewMatrixInverted * cameraSpace
    let normalised = transformed / transformed.w
    return SIMD3<Float>(normalised.x, normalised.y, normalised.z)
}
// This one is adapted from:
/// Un-projects a depth-map pixel into world space using the intrinsics directly.
///
/// - Parameters:
///   - depthMapPixelPoint: Pixel coordinate in the depth map.
///   - depth: Depth sampled at that pixel.
///   - cameraIntrinsics: Intrinsics matrix; `[0][0]`/`[1][1]` are read as focal lengths and
///     `[2][0]`/`[2][1]` as the principal point.
///   - viewMatrixInverted: Inverse view matrix applied to the camera-space point.
/// - Returns: The `x`, `y`, `z` components of the transformed point.
func worldPoint(depthMapPixelPoint: SIMD2<Float>, depth: Float, cameraIntrinsics: simd_float3x3, viewMatrixInverted: simd_float4x4) -> SIMD3<Float> {
    let principalPoint = SIMD2<Float>(cameraIntrinsics[2][0], cameraIntrinsics[2][1])
    let focalLength = SIMD2<Float>(cameraIntrinsics[0][0], cameraIntrinsics[1][1])

    // Offset from the principal point, scaled by depth and divided by focal length, per axis.
    let planar = (depthMapPixelPoint - principalPoint) * depth / focalLength

    // Y is UP in camera space, vs it being DOWN in image space; depth is negated on Z.
    let cameraSpace = simd_float4(planar.x, -planar.y, -depth, 1)

    let transformed = viewMatrixInverted * cameraSpace
    return simd_float3(transformed.x, transformed.y, transformed.z)
}
extension CVPixelBuffer {
    /// Width and height of the buffer in pixels (`x` = width, `y` = height).
    ///
    /// For planar buffers this is the size of plane 0. The plane accessors are documented to
    /// return 0 for non-planar buffers, so those are measured with the whole-buffer accessors.
    var size: SIMD2<Int> {
        if CVPixelBufferIsPlanar(self) {
            return SIMD2<Int>(CVPixelBufferGetWidthOfPlane(self, 0),
                              CVPixelBufferGetHeightOfPlane(self, 0))
        }
        return SIMD2<Int>(CVPixelBufferGetWidth(self), CVPixelBufferGetHeight(self))
    }
}
fabio914 commented Feb 7, 2022

Hi 👋 I noticed a few issues when I was playing with your code.

This line should be replaced with:

         // Apply the inverted intrinsics to the homogeneous pixel (x, y, 1), then scale by the negated depth.
         let localPoint = cameraIntrinsicsInverted * simd_float3(depthMapPixelPoint, 1) * -depth

We're also missing an extension on CVPixelBuffer:

extension CVPixelBuffer {

    /// Width and height of the buffer in pixels (`x` = width, `y` = height).
    ///
    /// For planar buffers this is the size of plane 0. The plane accessors are documented to
    /// return 0 for non-planar buffers, so those are measured with the whole-buffer accessors.
    var size: SIMD2<Int> {
        if CVPixelBufferIsPlanar(self) {
            return SIMD2<Int>(CVPixelBufferGetWidthOfPlane(self, 0),
                              CVPixelBufferGetHeightOfPlane(self, 0))
        }
        return SIMD2<Int>(CVPixelBufferGetWidth(self), CVPixelBufferGetHeight(self))
    }
}

For anyone else trying this code, make sure to run this with this ARKit configuration:

        // Request smoothed scene depth; the delegate callback reads `frame.smoothedSceneDepth`.
        let configuration = ARWorldTrackingConfiguration()
        configuration.frameSemantics = .smoothedSceneDepth

@fabio914 thanks so much for the feedback and for trying out the code.
You're right, those parts are missing as I didn't intend this to be "ready-to-use".
I'll fix the typos though, and add your suggestions so that it's more complete.

fabio914 commented Feb 7, 2022

Btw @snowzurfer, I've managed to build a version with color.


My version is still not ideal but here's the updated code if you're interested:

   /// `ARSessionDelegate` callback: moves every debug node onto the world-space point measured by
   /// the smoothed depth map and tints it with the camera-image colour at the same location.
   ///
   /// The depth map is sampled on a regular `horizontalPoints` x `verticalPoints` grid, one sample
   /// at the centre of each grid cell. The same normalised position is used to read the luma
   /// (plane 0) and chroma (plane 1) planes of the captured image.
   ///
   /// NOTE(review): three expressions in the original listing were truncated (the `guard` body, the
   /// initial value of `cameraIntrinsics`, and the receiver of `.inverse`). They are restored here as
   /// `return`, `frame.camera.intrinsics` and `frame.camera.viewMatrix(for: .landscapeRight)` — confirm.
   func session(_ session: ARSession, didUpdate frame: ARFrame) {
       guard let smoothedDepth = frame.smoothedSceneDepth?.depthMap else {
           // No depth data on this frame; nothing to update.
           return
       }
       // The loop below indexes `depthNodes` directly, so bail out if the pool has not been built yet.
       guard depthNodes.count >= horizontalPoints * verticalPoints else {
           return
       }
       let capturedImage = frame.capturedImage

       let lockFlags = CVPixelBufferLockFlags.readOnly
       CVPixelBufferLockBaseAddress(smoothedDepth, lockFlags)
       defer {
           CVPixelBufferUnlockBaseAddress(smoothedDepth, lockFlags)
       }

       CVPixelBufferLockBaseAddress(capturedImage, lockFlags)
       defer {
           CVPixelBufferUnlockBaseAddress(capturedImage, lockFlags)
       }

       // The plane accessor is documented to return nil for non-planar buffers, so fall back to
       // the whole-buffer accessor for the depth map instead of force-unwrapping.
       guard let baseAddress = CVPixelBufferGetBaseAddressOfPlane(smoothedDepth, 0)
               ?? CVPixelBufferGetBaseAddress(smoothedDepth),
             let lumaBaseAddress = CVPixelBufferGetBaseAddressOfPlane(capturedImage, 0),
             let chromaBaseAddress = CVPixelBufferGetBaseAddressOfPlane(capturedImage, 1) else {
           return
       }
       let depthByteBuffer = baseAddress.assumingMemoryBound(to: Float32.self)
       let lumaByteBuffer = lumaBaseAddress.assumingMemoryBound(to: UInt8.self)
       // Each chroma element is read as one 16-bit value holding two 8-bit components.
       let chromaByteBuffer = chromaBaseAddress.assumingMemoryBound(to: UInt16.self)

       // `size(ofPlane:)` simply reads a plane's width and height in pixels.
       // Per the original notes both buffers share the same 0.75 aspect ratio
       // (captured image 1920 x 1440, depth map 256 x 192).
       let depthMapSize = smoothedDepth.size(ofPlane: 0)
       let capturedImageSize = capturedImage.size(ofPlane: 0)
       let lumaSize = capturedImageSize
       let chromaSize = capturedImage.size(ofPlane: 1)
       guard depthMapSize.x > 0, depthMapSize.y > 0 else {
           return
       }

       var cameraIntrinsics = frame.camera.intrinsics
       let depthResolution = simd_float2(x: Float(depthMapSize.x), y: Float(depthMapSize.y))
       let scaleRes = simd_float2(x: Float(capturedImageSize.x) / depthResolution.x,
                                  y: Float(capturedImageSize.y) / depthResolution.y)
       // Make the camera intrinsics be with respect to the depth map: rescale focal lengths and
       // principal point from captured-image pixels to depth-map pixels.
       cameraIntrinsics[0][0] /= scaleRes.x
       cameraIntrinsics[1][1] /= scaleRes.y
       cameraIntrinsics[2][0] /= scaleRes.x
       cameraIntrinsics[2][1] /= scaleRes.y

       // This is crucial: you need to always use the view matrix for Landscape Right.
       // It does not change per sample, so invert it once per frame.
       let viewMatrixInverted = frame.camera.viewMatrix(for: .landscapeRight).inverse

       // This will be the long side, because of the rotation.
       let horizontalStep = Float(depthMapSize.x) / Float(horizontalPoints)
       let halfHorizontalStep = horizontalStep / 2
       // This will be the short side, because of the rotation.
       let verticalStep = Float(depthMapSize.y) / Float(verticalPoints)
       let halfVerticalStep = verticalStep / 2

       // Factors that map a depth-map pixel coordinate onto the luma and chroma planes.
       let depthWidthToLumaWidth = Float(lumaSize.x) / Float(depthMapSize.x)
       let depthHeightToLumaHeight = Float(lumaSize.y) / Float(depthMapSize.y)
       let depthWidthToChromaWidth = Float(chromaSize.x) / Float(depthMapSize.x)
       let depthHeightToChromaHeight = Float(chromaSize.y) / Float(depthMapSize.y)

       for h in 0..<horizontalPoints {
           for v in 0..<verticalPoints {
               // Centre of grid cell (h, v) in depth-map pixel coordinates.
               let x = Float(h) * horizontalStep + halfHorizontalStep
               let y = Float(v) * verticalStep + halfVerticalStep
               let depthMapPoint = simd_float2(x, y)

               // Sample depth
               let metricDepth = sampleDepthRaw(depthByteBuffer, size: depthMapSize, at: .init(depthMapPoint))

               let wp = worldPoint(depthMapPixelPoint: depthMapPoint,
                                   depth: metricDepth,
                                   cameraIntrinsics: cameraIntrinsics,
                                   viewMatrixInverted: viewMatrixInverted)

               // Sample image
               let lumaPoint = simd_float2(x * depthWidthToLumaWidth, y * depthHeightToLumaHeight)
               let luma = sampleLuma(lumaByteBuffer, size: lumaSize, at: .init(lumaPoint))

               let chromaPoint = simd_float2(x * depthWidthToChromaWidth, y * depthHeightToChromaHeight)
               let chroma = sampleChroma(chromaByteBuffer, size: chromaSize, at: .init(chromaPoint))

               // High byte is used as Cr and low byte as Cb.
               // NOTE(review): assumes interleaved CbCr bytes read little-endian — confirm.
               let cr = UInt8(chroma >> 8)
               let cb = UInt8(chroma & 0x00FF)

               let node = depthNodes[v * horizontalPoints + h]
               node.simdWorldPosition = wp
               node.geometry?.materials.first?.diffuse.contents = UIColor(y: luma, cb: cb, cr: cr)
           }
       }
   }

where the setup() function is also a bit different (so that different nodes can have different materials):

/// Creates the pool of debug marker nodes, each with its own geometry so that
/// different nodes can have different materials.
///
/// One small box node is created per depth sample (`horizontalPoints * verticalPoints` in total)
/// and stored in `depthNodes`, which `session(_:didUpdate:)` indexes as `v * horizontalPoints + h`.
///
/// NOTE(review): the original listing was truncated (no colour value, nothing after the node was
/// created, no closing braces). The initial colour and the node bookkeeping below are
/// reconstructions — confirm against the intended behaviour.
func setup() {

    let sizeGeomPredictions: CGFloat = 0.005

    let nodeCount = horizontalPoints * verticalPoints
    depthNodes.reserveCapacity(depthNodes.count + nodeCount)

    for _ in 0 ..< nodeCount {
        // A fresh geometry per node: materials are not shared, so each node can be recoloured independently.
        let geom = SCNBox(width: sizeGeomPredictions,
                          height: sizeGeomPredictions,
                          length: sizeGeomPredictions,
                          chamferRadius: 0)
        // NOTE(review): the original colour value was lost; green is a placeholder.
        geom.firstMaterial?.diffuse.contents = UIColor.green

        let node = SCNNode(geometry: geom)
        depthNodes.append(node)
        parentDebugNodes.addChildNode(node)
    }

    sceneView.scene.rootNode.addChildNode(parentDebugNodes)
}

And these are the other auxiliary functions I wrote:

/// Reads a single 8-bit luma value from a row-major buffer.
///
/// The buffer is treated as tightly packed: the row stride is `size.x` elements.
/// Coordinates outside the buffer are clamped to the nearest edge pixel instead of reading
/// out-of-bounds memory.
///
/// - Parameters:
///   - pointer: First element of the luma plane.
///   - size: Plane dimensions in pixels (`x` = width, `y` = height).
///   - at: Pixel coordinate to read (`x` = column, `y` = row).
/// - Returns: The value at the (clamped) coordinate, or `0` when `size` describes an empty plane.
func sampleLuma(_ pointer: UnsafeMutablePointer<UInt8>, size: SIMD2<Int>, at: SIMD2<Int>) -> UInt8 {
    guard size.x > 0, size.y > 0 else { return 0 }
    let column = min(max(at.x, 0), size.x - 1)
    let row = min(max(at.y, 0), size.y - 1)
    return pointer[row * size.x + column]
}

/// Reads a single 16-bit chroma element from a row-major buffer.
///
/// The buffer is treated as tightly packed: the row stride is `size.x` elements.
/// Coordinates outside the buffer are clamped to the nearest edge element instead of reading
/// out-of-bounds memory.
///
/// - Parameters:
///   - pointer: First element of the chroma plane, viewed as 16-bit elements.
///   - size: Plane dimensions in elements (`x` = width, `y` = height).
///   - at: Element coordinate to read (`x` = column, `y` = row).
/// - Returns: The value at the (clamped) coordinate, or `0` when `size` describes an empty plane.
func sampleChroma(_ pointer: UnsafeMutablePointer<UInt16>, size: SIMD2<Int>, at: SIMD2<Int>) -> UInt16 {
    guard size.x > 0, size.y > 0 else { return 0 }
    let column = min(max(at.x, 0), size.x - 1)
    let row = min(max(at.y, 0), size.y - 1)
    return pointer[row * size.x + column]
}

and this extension on UIColor to convert from YCbCr to RGB:

extension UIColor {

    /// Red, green and blue weights used by the YCbCr conversion below.
    private static let encoding: (r: CGFloat, g: CGFloat, b: CGFloat) = (0.299, 0.587, 0.114)

    /// Creates a colour from 8-bit Y, Cb and Cr components.
    ///
    /// Each component is divided by 255; Cb and Cr are then centred by subtracting 0.5 before
    /// being combined with the `encoding` weights.
    ///
    /// - Parameters:
    ///   - y: Luma component.
    ///   - cb: Blue-difference chroma component.
    ///   - cr: Red-difference chroma component.
    ///   - alpha: Opacity of the resulting colour; defaults to `1.0`.
    convenience init(y: UInt8, cb: UInt8, cr: UInt8, alpha: CGFloat = 1.0) {
        let weights = UIColor.encoding

        let luma = (Double(y) / 255.0)
        let blueDifference = (Double(cb) / 255.0) - 0.5
        let redDifference = (Double(cr) / 255.0) - 0.5

        // Contribution of each chroma channel to each output channel.
        let redShift = (redDifference * ((1.0 - weights.r) / 0.5))
        let greenShiftFromBlue = (blueDifference * ((weights.b * (1.0 - weights.b)) / (0.5 * weights.g)))
        let greenShiftFromRed = (redDifference * ((weights.r * (1.0 - weights.r)) / (0.5 * weights.g)))
        let blueShift = (blueDifference * ((1.0 - weights.b) / 0.5))

        let red = luma + redShift
        let green = luma - greenShiftFromBlue - greenShiftFromRed
        let blue = luma + blueShift

        self.init(red: red, green: green, blue: blue, alpha: alpha)
    }
}

and a different extension on CVPixelBuffer:

extension CVPixelBuffer {

    /// Width and height, in pixels, of one plane of the buffer (`x` = width, `y` = height).
    ///
    /// The plane accessors are documented to return 0 for non-planar buffers, so a non-planar
    /// buffer is measured with the whole-buffer accessors and `plane` is ignored.
    ///
    /// - Parameter plane: Index of the plane to measure; defaults to `0`.
    /// - Returns: The plane's dimensions, or the whole buffer's dimensions when it is not planar.
    func size(ofPlane plane: Int = 0) -> SIMD2<Int> {
        guard CVPixelBufferIsPlanar(self) else {
            return SIMD2<Int>(CVPixelBufferGetWidth(self), CVPixelBufferGetHeight(self))
        }
        let width = CVPixelBufferGetWidthOfPlane(self, plane)
        let height = CVPixelBufferGetHeightOfPlane(self, plane)
        return SIMD2<Int>(width, height)
    }
}


I've uploaded my project to this repository.

It looks great, and thanks for posting the rest of your code!

