ParallelDomain-4D
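A standalone sanity-check for ParallelDomain-4D: for one scene, it projects each Velodyne LiDAR sweep into the "yaw-0" camera using the stored calibration and per-frame poses, then prints the mean discrepancy between the projected LiDAR depths and the rendered depth map.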
import argparse
from pathlib import Path
import json

from pyquaternion import Quaternion
import numpy as np
import torch
import torch.nn.functional as F
from imageio.v3 import imread

def to_4x4(m):
    """Embed a 3x3 matrix into the top-left block of a 4x4 identity."""
    m_ = torch.eye(4, dtype=m.dtype)
    m_[:3, :3] = m
    return m_

def invert_se3(T):
    """Invert a rigid transform in closed form: [R | t]^-1 = [R^T | -R^T t]."""
    R_ = T[:3, :3].transpose(0, 1)
    t = T[:3, 3]
    t_ = -torch.einsum("ij,j->i", R_, t)
    T_ = torch.eye(4, dtype=T.dtype)
    T_[:3, :3] = R_
    T_[:3, 3] = t_
    return T_

def transform_pts(T, pts):
    """
    Apply a (homogeneous) transformation to a batch of points.

    Args:
        T (torch.Tensor): transformation matrix of shape (d, d)
        pts (torch.Tensor): input points of shape (n, c); padded with ones
            to homogeneous coordinates when c == d - 1
    """
    if pts.shape[-1] == (T.shape[-1] - 1):
        pts = F.pad(pts, (0, 1), value=1)
    pts = torch.einsum("...ji,...ni->...nj", T, pts)
    return pts[..., :3]

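# Usage note (illustrative names): when T is 4x4 and pts is (n, 3), the points
# are padded with a homogeneous 1 before the product, so a camera pose can be
# applied directly, e.g. pts_cam = transform_pts(T_CW, pts_world).
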
def inside_image(pts2d, image_size):
    """Boolean mask of 2D points that fall within an (H, W) image."""
    H, W = image_size
    px, py = pts2d.unbind(-1)
    return (
        (0 <= px) & (px < W) &
        (0 <= py) & (py < H)
    )

def json_load(filename):
    with open(filename, "r") as f:
        data = json.load(f)
    return data

def get_pardom_camera_matrices_torch(calibration):
    '''
    Adapted from convert_pcl_pardom.py.
    :return view_names: List of str with view names corresponding to camera matrix ordering.
    :return all_intrinsics: (V, 3, 3) tensor of float32.
    :return all_extrinsics: (V, 4, 4) tensor of float32.
    '''
    # NOTE: Camera parameters do not vary over time in this dataset.
    view_names = []
    all_intrinsics = dict()  # Maps view_name to (3, 3) tensor of float.
    all_extrinsics = dict()  # Maps view_name to (4, 4) tensor of float.
    for (view_name, intrinsics_dict, extrinsics_dict) in zip(
            calibration['names'], calibration['intrinsics'], calibration['extrinsics']):
        if 'velodyne' in view_name.lower():
            continue
        # NOTE: Unlike Kubric, the intrinsics matrix is already unnormalized (pixel coordinates).
        intrinsics_matrix = get_pardom_intrinsics_matrix(intrinsics_dict)  # (3, 3) tensor of float.
        extrinsics_matrix = get_pardom_extrinsics_matrix(extrinsics_dict)  # (4, 4) tensor of float.
        all_intrinsics[view_name] = intrinsics_matrix
        all_extrinsics[view_name] = extrinsics_matrix
        view_names.append(view_name)
    view_names = sorted(view_names)  # (V) list of str.
    all_intrinsics = torch.stack([all_intrinsics[view_name] for view_name in view_names], dim=0)
    # (V, 3, 3) tensor of float.
    all_extrinsics = torch.stack([all_extrinsics[view_name] for view_name in view_names], dim=0)
    # (V, 4, 4) tensor of float.
    # For reference: view_names =
    # ['camera0', 'camera1', 'camera10', 'camera11', 'camera12', 'camera13', 'camera14', 'camera15',
    #  'camera2', 'camera3', 'camera4', 'camera5', 'camera6', 'camera7', 'camera8', 'camera9',
    #  'yaw-0', 'yaw-60', 'yaw-neg-60']
    return (view_names, all_intrinsics, all_extrinsics)

def get_pardom_intrinsics_matrix(intrinsics_dict):
    '''
    Adapted from convert_pcl_pardom.py.
    '''
    intrinsics_matrix = torch.tensor(
        [[intrinsics_dict['fx'], 0.0, intrinsics_dict['cx']],
         [0.0, intrinsics_dict['fy'], intrinsics_dict['cy']],
         [0.0, 0.0, 1.0]], dtype=torch.float32)
    return intrinsics_matrix

def get_pardom_extrinsics_matrix(extrinsics_dict):
    '''
    Adapted from convert_pcl_pardom.py.
    Builds a 4x4 rigid transform from a quaternion + translation dict.
    '''
    rot_q = extrinsics_dict['rotation']
    trans = extrinsics_dict['translation']
    rot_m = Quaternion(rot_q['qw'], rot_q['qx'], rot_q['qy'], rot_q['qz']).rotation_matrix
    extrinsics_matrix = torch.eye(4, dtype=torch.float32)
    extrinsics_matrix[0:3, 0:3] = torch.tensor(rot_m)
    extrinsics_matrix[0:3, 3] = torch.tensor([trans['x'], trans['y'], trans['z']])
    return extrinsics_matrix

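# Quick self-check (hypothetical values): an identity quaternion with zero
# translation should yield the 4x4 identity.
#   assert torch.allclose(
#       get_pardom_extrinsics_matrix(
#           {"rotation": {"qw": 1.0, "qx": 0.0, "qy": 0.0, "qz": 0.0},
#            "translation": {"x": 0.0, "y": 0.0, "z": 0.0}}),
#       torch.eye(4))
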
def find_first_element(seq_data, name):
    """Return the first frame in seq_data belonging to the sensor called `name`."""
    for d in seq_data:
        if d["id"]["name"] == name:
            return d
    return None


def extract_sequence(seq_data, name):
    """Collect all frames in seq_data belonging to the sensor called `name`."""
    frames = []
    for d in seq_data:
        if d["id"]["name"] == name:
            frames.append(d)
    return frames

def handle_scene(opts, scene_id):
    data_dir = Path(opts.p4d_dir) / "data"
    camera_id = "yaw-0"
    scene_dir = data_dir / scene_id
    calib_file = next(scene_dir.joinpath("calibration").glob("*.json"))
    calibration = json_load(calib_file)
    view_names, all_intrinsics, all_extrinsics = (
        get_pardom_camera_matrices_torch(calibration)
    )
    camera_index = view_names.index(camera_id)
    K = all_intrinsics[camera_index]
    metadata_file = next(scene_dir.glob("scene_*.json"))
    metadata = json_load(metadata_file)
    seq_data = metadata["data"]
    camera_frames = extract_sequence(seq_data, camera_id)
    pc_frames = extract_sequence(seq_data, "VelodyneVLS128_Center")
    img_anno = camera_frames[0]["datum"]["image"]
    H, W = img_anno["height"], img_anno["width"]
    for k, f in enumerate(camera_frames):
        # Camera-to-world transform for this frame.
        T = get_pardom_extrinsics_matrix(f["datum"]["image"]["pose"])
        image_file = f["datum"]["image"]["filename"]
        img = imread(scene_dir / image_file)
        # Load depth (key "6" selects the depth annotation here).
        anno = f["datum"]["image"]["annotations"]
        depth_file = anno["6"]
        depth_np = np.load(scene_dir / depth_file)["data"]
        depth = torch.from_numpy(depth_np)
        # Load the LiDAR frame recorded at the same timestep.
        pc_frame = pc_frames[k]
        # LiDAR sensor-to-world transform.
        T_WC_lidar = get_pardom_extrinsics_matrix(pc_frame["datum"]["point_cloud"]["pose"])
        lidar_file = pc_frame["datum"]["point_cloud"]["filename"]
        lidar = np.load(scene_dir / lidar_file)["data"]
        XYZ_lidar = np.stack([lidar[c] for c in ["X", "Y", "Z"]], axis=-1)
        # Make sure LiDAR and depth match.
        verify_frame(depth, T, K, T_WC_lidar, XYZ_lidar)

def verify_frame(depth, T_WC_cam, K, T_WC_lidar, XYZ):
    image_size = tuple(depth.shape)
    # Get the yaw-0 world-to-camera transform.
    T_CW_cam = invert_se3(T_WC_cam)
    # Full chain: LiDAR -> world -> camera -> pixel coordinates.
    T_proj = to_4x4(K) @ T_CW_cam @ T_WC_lidar
    XYZ = torch.from_numpy(XYZ)
    uv, z = persp_project(transform_pts(T_proj, XYZ))
    # Alternative pixel-center rounding conventions:
    # uv = torch.round(uv + 0.5) - 0.5
    # ji = torch.round(uv - 0.5).to(torch.int64)
    ji = uv.round().to(torch.int64)
    # Keep points that project inside the image and lie in front of the camera.
    mask = inside_image(ji, image_size) & (z[:, 0] > 0)
    ji = ji[mask, :]
    uv = uv[mask, :]
    z = z[mask, 0]
    jj, ii = ji.unbind(-1)
    z_depth = depth[ii, jj]
    # Discard far-range returns (e.g. sky / beyond the depth clip).
    valid_depth = z_depth < 250
    uv = uv[valid_depth, :]
    z = z[valid_depth]
    z_depth = z_depth[valid_depth]
    diff = (z - z_depth).abs().mean()
    print("mean discrepancy", diff.item())

def persp_project(xyz):
    """Perspective-divide: split camera-space points into (u, v) and depth z."""
    z = xyz[:, 2:]
    uv = xyz[:, :2] / z
    return uv, z

def get_opts():
    parser = argparse.ArgumentParser()
    parser.add_argument("-s", "--split",
                        type=str,
                        default="train",
                        choices=["train", "val", "test"])
    parser.add_argument("--p4d-dir",
                        default="/path/to/ParallelDomain-4D")
    return parser.parse_args()

def main():
    opts = get_opts()
    scene_id = "scene_002130"  # pedestrians
    handle_scene(opts, scene_id)


if __name__ == "__main__":
    main()
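
# Example invocation (script filename and dataset path are placeholders):
#   python pd4d_check.py --p4d-dir /data/ParallelDomain-4D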