Last active
July 12, 2024 07:39
-
-
Save snowzurfer/1e90678d0d23d3295dda9a0cc93b2453 to your computer and use it in GitHub Desktop.
3D world points from ARKit depth
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import ARKit | |
import SceneKit | |
/// Number of depth samples taken along the depth map's x axis.
let horizontalPoints = 256 / 2
/// Number of depth samples taken along the depth map's y axis.
let verticalPoints = 192 / 2
/// One debug node per depth sample; `session(_:didUpdate:)` indexes it as `v * horizontalPoints + h`.
var depthNodes: [SCNNode] = []
/// Container node that every debug node is attached to.
var parentDebugNodes = SCNNode()
/// AR view whose session is run and whose scene hosts the debug nodes.
var sceneView: ARSCNView!
// Somewhere during setup
/// Starts world tracking (with smoothed scene depth when the device supports it) and
/// creates one small green box node per depth sample.
///
/// All nodes share a single `SCNBox` geometry and are appended to `depthNodes` in creation
/// order, so the array ends up holding `horizontalPoints * verticalPoints` entries.
///
/// NOTE(review): nothing here assigns the session delegate; `session(_:didUpdate:)` is only
/// called if that is done elsewhere — confirm against the surrounding class.
func setup() {
    let configuration = ARWorldTrackingConfiguration()
    // Only opt in to depth when ARKit reports support for it; requesting an unsupported
    // frame semantic is a runtime error. Without depth the session still runs and
    // `session(_:didUpdate:)` simply returns early on every frame.
    if ARWorldTrackingConfiguration.supportsFrameSemantics(.smoothedSceneDepth) {
        configuration.frameSemantics = .smoothedSceneDepth
    }
    sceneView.session.run(configuration)

    sceneView.scene.rootNode.addChildNode(parentDebugNodes)

    let sizeGeomPredictions = 0.005
    let geom = SCNBox(width: sizeGeomPredictions,
                      height: sizeGeomPredictions,
                      length: sizeGeomPredictions,
                      chamferRadius: 0)
    geom.firstMaterial?.diffuse.contents = UIColor.green

    let nodeCount = horizontalPoints * verticalPoints
    // The final size is known up front, so avoid repeated reallocation while appending.
    depthNodes.reserveCapacity(depthNodes.count + nodeCount)
    for _ in 0..<nodeCount {
        let node = SCNNode(geometry: geom)
        parentDebugNodes.addChildNode(node)
        depthNodes.append(node)
    }
}
/// Session update callback: moves every debug node to the world-space position of its
/// depth sample in the latest frame.
///
/// The depth map is sampled on a regular `horizontalPoints` x `verticalPoints` grid (at the
/// centre of each grid cell), each sample is unprojected with the camera intrinsics rescaled
/// to the depth map's resolution, and the result is written to `depthNodes[v * horizontalPoints + h]`.
///
/// - Parameters:
///   - session: The session that produced the frame (unused).
///   - frame: The frame whose smoothed scene depth is visualised. Frames without depth are ignored.
func session(_ session: ARSession, didUpdate frame: ARFrame) {
    guard let smoothedDepth = frame.smoothedSceneDepth?.depthMap else {
        return
    }
    let capturedImage = frame.capturedImage

    let lockFlags = CVPixelBufferLockFlags.readOnly
    CVPixelBufferLockBaseAddress(smoothedDepth, lockFlags)
    defer {
        CVPixelBufferUnlockBaseAddress(smoothedDepth, lockFlags)
    }

    // The plane accessor is tried first (as before); the whole-buffer accessor covers the
    // case where the depth map is reported as a non-planar buffer. Bail out instead of
    // crashing if neither yields an address.
    guard let baseAddress = CVPixelBufferGetBaseAddressOfPlane(smoothedDepth, 0)
        ?? CVPixelBufferGetBaseAddress(smoothedDepth) else {
        return
    }
    let depthByteBuffer = baseAddress.assumingMemoryBound(to: Float32.self)

    // The `.size` accessor simply reads the CVPixelBuffer's width and height in pixels.
    //
    // Depth map and captured image are expected to share the same aspect ratio,
    // e.g. depth map: 192 / 256 = 0.75
    let depthMapSize = smoothedDepth.size
    // e.g. captured image: 1440 / 1920 = 0.75
    let capturedImageSize = capturedImage.size
    // A zero-sized buffer would turn the scale factors below into inf/NaN.
    guard depthMapSize.x > 0, depthMapSize.y > 0 else {
        return
    }

    // Rows may be padded, so index with the row stride (in Float32 elements) rather than the
    // pixel width. `sampleDepthRaw` uses `size.x` as the row length.
    let depthBytesPerRow = CVPixelBufferIsPlanar(smoothedDepth)
        ? CVPixelBufferGetBytesPerRowOfPlane(smoothedDepth, 0)
        : CVPixelBufferGetBytesPerRow(smoothedDepth)
    let depthRowStride = max(depthBytesPerRow / MemoryLayout<Float32>.stride, depthMapSize.x)
    let depthSamplingSize = SIMD2<Int>(depthRowStride, depthMapSize.y)

    var cameraIntrinsics = frame.camera.intrinsics
    let depthResolution = simd_float2(x: Float(depthMapSize.x), y: Float(depthMapSize.y))
    let scaleRes = simd_float2(x: Float(capturedImageSize.x) / depthResolution.x,
                               y: Float(capturedImageSize.y) / depthResolution.y)
    // Make the camera intrinsics be with respect to Depth.
    cameraIntrinsics[0][0] /= scaleRes.x
    cameraIntrinsics[1][1] /= scaleRes.y
    cameraIntrinsics[2][0] /= scaleRes.x
    cameraIntrinsics[2][1] /= scaleRes.y

    // This is crucial: you need to always use the view matrix for Landscape Right.
    // It is the same for every sample of the frame, so invert it once instead of per sample.
    let viewMatrixInverted = frame.camera.viewMatrix(for: .landscapeRight).inverse

    // This will be the long size, because of the rotation
    let horizontalStep = Float(depthMapSize.x) / Float(horizontalPoints)
    let halfHorizontalStep = horizontalStep / 2
    // This will be the short size, because of the rotation
    let verticalStep = Float(depthMapSize.y) / Float(verticalPoints)
    let halfVerticalStep = verticalStep / 2

    for h in 0..<horizontalPoints {
        for v in 0..<verticalPoints {
            // Centre of grid cell (h, v), in depth-map pixel coordinates.
            let x = Float(h) * horizontalStep + halfHorizontalStep
            let y = Float(v) * verticalStep + halfVerticalStep
            let depthMapPoint = simd_float2(x, y)

            // Sample depth
            let metricDepth = sampleDepthRaw(depthByteBuffer, size: depthSamplingSize, at: .init(depthMapPoint))
            let wp = worldPoint(depthMapPixelPoint: depthMapPoint,
                                depth: metricDepth,
                                cameraIntrinsics: cameraIntrinsics,
                                viewMatrixInverted: viewMatrixInverted)

            let node = depthNodes[v * horizontalPoints + h]
            node.simdWorldPosition = wp
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// Reads one depth value from a row-major `Float32` buffer.
///
/// No bounds checking is performed: the caller must guarantee that `at` lies inside the buffer.
///
/// - Parameters:
///   - pointer: Start of the buffer.
///   - size: Buffer dimensions; only `size.x` is used, as the number of elements per row.
///   - at: Column (`x`) and row (`y`) of the element to read.
/// - Returns: The element stored at row `at.y`, column `at.x`.
func sampleDepthRaw(_ pointer: UnsafeMutablePointer<Float32>, size: SIMD2<Int>, at: SIMD2<Int>) -> Float {
    let baseAddressIndex = at.y * size.x + at.x
    return pointer[baseAddressIndex]
}
// This also works. Adapted from:
// https://developer.apple.com/forums/thread/676368
/// Unprojects a depth-map pixel to a world-space point using the inverted camera intrinsics.
///
/// - Parameters:
///   - depthMapPixelPoint: Pixel coordinates in the depth map.
///   - depth: Depth sampled at that pixel.
///   - cameraIntrinsicsInverted: Inverse of the camera intrinsics matrix.
///   - viewMatrixInverted: Inverse of the camera view matrix.
/// - Returns: The transformed point after dividing by its homogeneous `w` component.
func worldPoint(depthMapPixelPoint: SIMD2<Float>, depth: Float, cameraIntrinsicsInverted: simd_float3x3, viewMatrixInverted: simd_float4x4) -> SIMD3<Float> {
    // Back-project the pixel and scale it by the negated depth.
    let localPoint = cameraIntrinsicsInverted * simd_float3(depthMapPixelPoint, 1) * -depth
    // The scaling above negated all three components; flip x back.
    let localPointSwappedX = simd_float3(-localPoint.x, localPoint.y, localPoint.z)
    let homogeneousPoint = viewMatrixInverted * simd_float4(localPointSwappedX, 1)
    let normalizedPoint = homogeneousPoint / homogeneousPoint.w
    return SIMD3<Float>(normalizedPoint.x, normalizedPoint.y, normalizedPoint.z)
}
// This one is adapted from:
// http://nicolas.burrus.name/index.php/Research/KinectCalibration
/// Unprojects a depth-map pixel to a world-space point using the camera intrinsics directly.
///
/// - Parameters:
///   - depthMapPixelPoint: Pixel coordinates in the depth map.
///   - depth: Depth sampled at that pixel.
///   - cameraIntrinsics: Intrinsics matrix; `[0][0]` and `[1][1]` are used as the focal lengths,
///     `[2][0]` and `[2][1]` as the principal point. They must be expressed at the resolution
///     of the depth map.
///   - viewMatrixInverted: Inverse of the camera view matrix.
/// - Returns: The x, y and z components of the transformed point (no division by `w`).
func worldPoint(depthMapPixelPoint: SIMD2<Float>, depth: Float, cameraIntrinsics: simd_float3x3, viewMatrixInverted: simd_float4x4) -> SIMD3<Float> {
    let focalLength = SIMD2<Float>(cameraIntrinsics[0][0], cameraIntrinsics[1][1])
    let principalPoint = SIMD2<Float>(cameraIntrinsics[2][0], cameraIntrinsics[2][1])

    let xrw = (depthMapPixelPoint.x - principalPoint.x) * depth / focalLength.x
    let yrw = (depthMapPixelPoint.y - principalPoint.y) * depth / focalLength.y

    // Y is UP in camera space, vs it being DOWN in image space.
    let localPoint = simd_float3(xrw, -yrw, -depth)
    let transformedPoint = viewMatrixInverted * simd_float4(localPoint, 1)
    return simd_float3(transformedPoint.x, transformedPoint.y, transformedPoint.z)
}
extension CVPixelBuffer {
    /// Width (`x`) and height (`y`) of the buffer in pixels.
    ///
    /// For planar buffers this is the size of plane 0. Core Video documents the per-plane
    /// accessors as returning 0 for non-planar buffers, so those are measured with the
    /// whole-buffer accessors instead.
    var size: SIMD2<Int> {
        guard CVPixelBufferIsPlanar(self) else {
            return .init(x: CVPixelBufferGetWidth(self), y: CVPixelBufferGetHeight(self))
        }
        let width = CVPixelBufferGetWidthOfPlane(self, 0)
        let height = CVPixelBufferGetHeightOfPlane(self, 0)
        return .init(x: width, y: height)
    }
}
Btw @snowzurfer, I've managed to build a version with color.
My version is still not ideal but here's the updated code if you're interested:
/// Session update callback: moves every debug node to the world-space position of its
/// depth sample and colours it with the camera pixel at the corresponding location.
///
/// The depth map is sampled on a regular `horizontalPoints` x `verticalPoints` grid (at the
/// centre of each grid cell). Each sample is unprojected with the camera intrinsics rescaled
/// to the depth map's resolution; the colour is read from the luma plane (plane 0) and the
/// interleaved chroma plane (plane 1) of the captured image.
///
/// - Parameters:
///   - session: The session that produced the frame (unused).
///   - frame: The frame to visualise. Frames without smoothed scene depth are ignored.
func session(_ session: ARSession, didUpdate frame: ARFrame) {
    guard let smoothedDepth = frame.smoothedSceneDepth?.depthMap else {
        return
    }
    let capturedImage = frame.capturedImage

    let lockFlags = CVPixelBufferLockFlags.readOnly
    CVPixelBufferLockBaseAddress(smoothedDepth, lockFlags)
    defer {
        CVPixelBufferUnlockBaseAddress(smoothedDepth, lockFlags)
    }
    CVPixelBufferLockBaseAddress(capturedImage, lockFlags)
    defer {
        CVPixelBufferUnlockBaseAddress(capturedImage, lockFlags)
    }

    // Bail out instead of crashing when a base address is unavailable. For the depth map the
    // plane accessor is tried first (as before), then the whole-buffer accessor, which covers
    // the case where the depth map is reported as a non-planar buffer.
    guard
        let depthBaseAddress = CVPixelBufferGetBaseAddressOfPlane(smoothedDepth, 0)
            ?? CVPixelBufferGetBaseAddress(smoothedDepth),
        let lumaBaseAddress = CVPixelBufferGetBaseAddressOfPlane(capturedImage, 0),
        let chromaBaseAddress = CVPixelBufferGetBaseAddressOfPlane(capturedImage, 1)
    else {
        return
    }
    let depthByteBuffer = depthBaseAddress.assumingMemoryBound(to: Float32.self)
    let lumaByteBuffer = lumaBaseAddress.assumingMemoryBound(to: UInt8.self)
    // Each UInt16 holds one interleaved chroma pair.
    let chromaByteBuffer = chromaBaseAddress.assumingMemoryBound(to: UInt16.self)

    // `size(ofPlane:)` simply reads the plane's width and height in pixels.
    //
    // Depth map and captured image are expected to share the same aspect ratio,
    // e.g. depth map: 192 / 256 = 0.75
    let depthMapSize = smoothedDepth.size(ofPlane: 0)
    // e.g. captured image: 1440 / 1920 = 0.75
    let capturedImageSize = capturedImage.size(ofPlane: 0)
    let lumaSize = capturedImageSize
    let chromaSize = capturedImage.size(ofPlane: 1)
    // A zero-sized depth map would turn the scale factors below into inf/NaN.
    guard depthMapSize.x > 0, depthMapSize.y > 0 else {
        return
    }

    // Rows may be padded, so index with each plane's row stride (in elements) rather than its
    // pixel width. The samplers use `size.x` as the row length.
    let depthBytesPerRow = CVPixelBufferIsPlanar(smoothedDepth)
        ? CVPixelBufferGetBytesPerRowOfPlane(smoothedDepth, 0)
        : CVPixelBufferGetBytesPerRow(smoothedDepth)
    let depthSamplingSize = SIMD2<Int>(
        max(depthBytesPerRow / MemoryLayout<Float32>.stride, depthMapSize.x),
        depthMapSize.y)
    let lumaSamplingSize = SIMD2<Int>(
        max(CVPixelBufferGetBytesPerRowOfPlane(capturedImage, 0) / MemoryLayout<UInt8>.stride, lumaSize.x),
        lumaSize.y)
    let chromaSamplingSize = SIMD2<Int>(
        max(CVPixelBufferGetBytesPerRowOfPlane(capturedImage, 1) / MemoryLayout<UInt16>.stride, chromaSize.x),
        chromaSize.y)

    var cameraIntrinsics = frame.camera.intrinsics
    let depthResolution = simd_float2(x: Float(depthMapSize.x), y: Float(depthMapSize.y))
    let scaleRes = simd_float2(x: Float(capturedImageSize.x) / depthResolution.x,
                               y: Float(capturedImageSize.y) / depthResolution.y)
    // Make the camera intrinsics be with respect to Depth.
    cameraIntrinsics[0][0] /= scaleRes.x
    cameraIntrinsics[1][1] /= scaleRes.y
    cameraIntrinsics[2][0] /= scaleRes.x
    cameraIntrinsics[2][1] /= scaleRes.y

    // This is crucial: you need to always use the view matrix for Landscape Right.
    // It is the same for every sample of the frame, so invert it once instead of per sample.
    let viewMatrixInverted = frame.camera.viewMatrix(for: .landscapeRight).inverse

    // This will be the long size, because of the rotation
    let horizontalStep = Float(depthMapSize.x) / Float(horizontalPoints)
    let halfHorizontalStep = horizontalStep / 2
    // This will be the short size, because of the rotation
    let verticalStep = Float(depthMapSize.y) / Float(verticalPoints)
    let halfVerticalStep = verticalStep / 2

    // Factors that map depth-map pixel coordinates onto each image plane.
    let depthWidthToLumaWidth = Float(lumaSize.x) / Float(depthMapSize.x)
    let depthHeightToLumaHeight = Float(lumaSize.y) / Float(depthMapSize.y)
    let depthWidthToChromaWidth = Float(chromaSize.x) / Float(depthMapSize.x)
    let depthHeightToChromaHeight = Float(chromaSize.y) / Float(depthMapSize.y)

    for h in 0..<horizontalPoints {
        for v in 0..<verticalPoints {
            // Centre of grid cell (h, v), in depth-map pixel coordinates.
            let x = Float(h) * horizontalStep + halfHorizontalStep
            let y = Float(v) * verticalStep + halfVerticalStep
            let depthMapPoint = simd_float2(x, y)

            // Sample depth
            let metricDepth = sampleDepthRaw(depthByteBuffer, size: depthSamplingSize, at: .init(depthMapPoint))
            let wp = worldPoint(depthMapPixelPoint: depthMapPoint,
                                depth: metricDepth,
                                cameraIntrinsics: cameraIntrinsics,
                                viewMatrixInverted: viewMatrixInverted)

            // Sample Image
            let lumaPoint = simd_float2(x * depthWidthToLumaWidth, y * depthHeightToLumaHeight)
            let luma = sampleLuma(lumaByteBuffer, size: lumaSamplingSize, at: .init(lumaPoint))

            let chromaPoint = simd_float2(x * depthWidthToChromaWidth, y * depthHeightToChromaHeight)
            let chroma = sampleChroma(chromaByteBuffer, size: chromaSamplingSize, at: .init(chromaPoint))

            // Split the 16-bit pair: the high byte is used as Cr, the low byte as Cb.
            let cr = UInt8(truncatingIfNeeded: chroma >> 8)
            let cb = UInt8(truncatingIfNeeded: chroma)

            let node = depthNodes[v * horizontalPoints + h]
            node.simdWorldPosition = wp
            node.geometry?.materials.first?.diffuse.contents = UIColor(y: luma, cb: cb, cr: cr)
        }
    }
}
where the setup() function is also a bit different (so that different nodes can have different materials):
/// Attaches the debug-node container to the scene and fills it with one green box node per
/// depth sample.
///
/// Every node gets its own `SCNBox` (and therefore its own material), so nodes can later be
/// coloured independently of each other.
func setup() {
    scene.rootNode.addChildNode(parentDebugNodes)

    let boxEdge = 0.005
    let sampleCount = horizontalPoints * verticalPoints
    (0 ..< sampleCount).forEach { _ in
        let box = SCNBox(width: boxEdge, height: boxEdge, length: boxEdge, chamferRadius: 0)
        box.firstMaterial?.diffuse.contents = UIColor.green

        let marker = SCNNode(geometry: box)
        parentDebugNodes.addChildNode(marker)
        depthNodes.append(marker)
    }
}
And these are the other auxiliary functions I wrote:
/// Reads one byte from a row-major `UInt8` buffer.
///
/// No bounds checking is performed: the caller must guarantee that `at` lies inside the buffer.
///
/// - Parameters:
///   - pointer: Start of the buffer.
///   - size: Buffer dimensions; only `size.x` is used, as the number of elements per row.
///   - at: Column (`x`) and row (`y`) of the element to read.
/// - Returns: The element stored at row `at.y`, column `at.x`.
func sampleLuma(_ pointer: UnsafeMutablePointer<UInt8>, size: SIMD2<Int>, at: SIMD2<Int>) -> UInt8 {
    let rowStart = at.y * size.x
    return pointer.advanced(by: rowStart + at.x).pointee
}
/// Reads one 16-bit element from a row-major `UInt16` buffer.
///
/// No bounds checking is performed: the caller must guarantee that `at` lies inside the buffer.
///
/// - Parameters:
///   - pointer: Start of the buffer.
///   - size: Buffer dimensions; only `size.x` is used, as the number of elements per row.
///   - at: Column (`x`) and row (`y`) of the element to read.
/// - Returns: The element stored at row `at.y`, column `at.x`.
func sampleChroma(_ pointer: UnsafeMutablePointer<UInt16>, size: SIMD2<Int>, at: SIMD2<Int>) -> UInt16 {
    let rowStart = at.y * size.x
    return pointer.advanced(by: rowStart + at.x).pointee
}
and this extension on UIColor to convert from YCbCr to RGB:
extension UIColor {
    /// Luma weights (Kr, Kg, Kb) used by the YCbCr to RGB conversion below.
    /// These values match the ITU-R BT.601 coefficients.
    private static let encoding: (r: CGFloat, g: CGFloat, b: CGFloat) = (0.299, 0.587, 0.114)

    /// Creates a colour from 8-bit Y, Cb and Cr components.
    ///
    /// Y is mapped to 0...1 and the chroma components to -0.5...0.5 before conversion.
    /// The resulting RGB components are clamped to 0...1, because some YCbCr triples fall
    /// outside the RGB gamut (for example maximum Y combined with maximum Cr) and would
    /// otherwise be handed to `UIColor` as out-of-range values.
    ///
    /// - Parameters:
    ///   - y: Luma, 0...255.
    ///   - cb: Blue-difference chroma, 0...255.
    ///   - cr: Red-difference chroma, 0...255.
    ///   - alpha: Opacity of the colour; defaults to fully opaque.
    convenience init(y: UInt8, cb: UInt8, cr: UInt8, alpha: CGFloat = 1.0) {
        let k = UIColor.encoding

        let luma = CGFloat(y) / 255.0
        let blueDifference = CGFloat(cb) / 255.0 - 0.5
        let redDifference = CGFloat(cr) / 255.0 - 0.5

        let kr = redDifference * ((1.0 - k.r) / 0.5)
        let kgb = blueDifference * ((k.b * (1.0 - k.b)) / (0.5 * k.g))
        let kgr = redDifference * ((k.r * (1.0 - k.r)) / (0.5 * k.g))
        let kb = blueDifference * ((1.0 - k.b) / 0.5)

        let r = luma + kr
        let g = luma - kgb - kgr
        let b = luma + kb

        self.init(red: min(max(r, 0.0), 1.0),
                  green: min(max(g, 0.0), 1.0),
                  blue: min(max(b, 0.0), 1.0),
                  alpha: alpha)
    }
}
and a different extension on CVPixelBuffer:
extension CVPixelBuffer {
    /// Width (`x`) and height (`y`), in pixels, of one plane of the buffer.
    ///
    /// Core Video documents the per-plane accessors as returning 0 for non-planar buffers,
    /// so asking a non-planar buffer for plane 0 yields the size of the whole buffer instead.
    ///
    /// - Parameter plane: Index of the plane to measure; defaults to 0.
    /// - Returns: The plane's dimensions.
    func size(ofPlane plane: Int = 0) -> SIMD2<Int> {
        if plane == 0 && !CVPixelBufferIsPlanar(self) {
            return .init(x: CVPixelBufferGetWidth(self), y: CVPixelBufferGetHeight(self))
        }
        let width = CVPixelBufferGetWidthOfPlane(self, plane)
        let height = CVPixelBufferGetHeightOfPlane(self, plane)
        return .init(x: width, y: height)
    }
}
EDIT
I've uploaded my project to this repository.
It looks great, and thanks for posting the rest of your code!
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@fabio914 thanks so much for the feedback and for trying out the code.
You're right, those parts are missing as I didn't intend this to be "ready-to-use".
I'll fix the typos though, and add your suggestions so that it's more complete.