- Clone this gist somewhere using its URL, for example `git clone (the_url_of_this_gist) some_folder`, then move into that folder.
- Install [uv](https://docs.astral.sh/uv) following their docs.
- Run `uv sync` to recreate the same virtual environment.
- Run the following on a login node to download CIFAR10 into `$SCRATCH/data/cifar10`:

  ```bash
  mkdir -p $SCRATCH/data/cifar10
  uv run python -c 'import pathlib, os, torchvision.datasets; torchvision.datasets.CIFAR10(pathlib.Path(os.environ["SCRATCH"]) / "data/cifar10", download=True)'
  ```

- Launch the job with `sbatch job.sh`.
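After submitting, you can check on the job like this (assuming Slurm's default output file name, since `job.sh` doesn't set `#SBATCH --output`):

```bash
# List your queued / running jobs:
squeue -u $USER

# Follow the job's output once it starts. Without an #SBATCH --output line,
# Slurm writes to slurm-<jobid>.out in the submission directory.
tail -f slurm-<jobid>.out
```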
`job.sh`:

```bash
#!/bin/bash
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=4
#SBATCH --gpus-per-task=1
#SBATCH --cpus-per-task=12
#SBATCH --mem=0
#SBATCH --time=00:05:00

# Echo time and hostname into log
echo "Date: $(date)"
echo "Hostname: $(hostname)"

module --quiet purge
# This example uses [UV](https://docs.astral.sh/uv) to manage package dependencies.
# Link the dataset archive into $SLURM_TMPDIR (only on the first worker of each node)
srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 bash -c \
    'mkdir -p $SLURM_TMPDIR/data && ln -s $SCRATCH/data/cifar10/cifar-10-python.tar.gz $SLURM_TMPDIR/data/cifar-10-python.tar.gz'

# Get a unique port for this job based on the job ID
export MASTER_PORT=$(expr 10000 + $(echo -n $SLURM_JOBID | tail -c 4))
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export NCCL_DEBUG=INFO

# Execute Python script in each task (one per GPU)
srun uv run --offline --frozen python main.py "$@"
```
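For debugging, it can be quicker to run the same script from an interactive allocation instead of `sbatch`; a rough sketch, assuming the cluster allows interactive multi-node allocations:

```bash
# Request the same resources interactively (adjust the time limit as needed):
salloc --nodes=2 --ntasks-per-node=4 --gpus-per-task=1 --cpus-per-task=12 --mem=0 --time=01:00:00

# Inside the allocation the #SBATCH lines are ignored, and the srun calls in
# job.sh run against the interactive allocation:
bash job.sh
```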
"""Multi-GPU Training example.""" | |
import argparse | |
import logging | |
import os | |
from datetime import timedelta | |
from pathlib import Path | |
import sys | |
import rich.logging | |
import torch | |
import torch.distributed | |
from torch import Tensor, nn | |
from torch.distributed import ReduceOp | |
from torch.nn import functional as F | |
from torch.utils.data import DataLoader, random_split | |
from torch.utils.data.distributed import DistributedSampler | |
from torchvision import transforms | |
from torchvision.datasets import CIFAR10 | |
from torchvision.models import resnet18 | |
from tqdm import tqdm | |
def main():
    # Use an argument parser so we can pass hyperparameters from the command line.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--learning-rate", type=float, default=5e-4)
    parser.add_argument("--weight-decay", type=float, default=1e-4)
    parser.add_argument("--batch-size", type=int, default=128)
    args = parser.parse_args()

    epochs: int = args.epochs
    learning_rate: float = args.learning_rate
    weight_decay: float = args.weight_decay
    # NOTE: This is the "local" batch size, per-GPU.
    batch_size: int = args.batch_size

    # Check that the GPU is available
    assert torch.cuda.is_available() and torch.cuda.device_count() > 0

    rank, world_size, local_rank = setup()
    is_master = rank == 0
    is_local_master = local_rank == 0

    # Each task should see exactly one GPU, since we always use 1 gpu per task with `srun`.
    n_gpus_per_task = torch.cuda.device_count()
    assert n_gpus_per_task == 1, (
        "DDP isn't meant to be used with multiple GPUs per task."
    )

    # Setup logging (optional, but much better than using print statements)
    logging.basicConfig(
        level=logging.INFO,
        format=f"[{rank}/{world_size}] %(name)s - %(message)s ",
        handlers=[
            rich.logging.RichHandler(markup=True)
        ],  # Very pretty, uses the `rich` package.
    )
    logger = logging.getLogger(__name__)
    logger.info(
        f"World size: {world_size}, global rank: {rank}, local rank: {local_rank}, local device count: {n_gpus_per_task}"
    )

    # NOTE: We always see the GPU as `cuda:0` in each task because of `srun`, which sets the
    # CUDA_VISIBLE_DEVICES env var.
    # Otherwise, if you use `torch.distributed.launch`, accelerate, or similar, use `cuda:{local_rank}`.
    device = torch.device("cuda", 0)

    # Create a model and move it to the GPU.
    model = resnet18(num_classes=10)
    model = model.to(device)

    # Wrap the model with DistributedDataParallel
    # (See https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel)
    model = nn.parallel.DistributedDataParallel(
        model, device_ids=[local_rank], output_device=local_rank
    )

    optimizer = torch.optim.AdamW(
        model.parameters(), lr=learning_rate, weight_decay=weight_decay
    )
    # Setup CIFAR10
    num_workers = get_num_workers()
    dataset_path = Path(os.environ.get("SLURM_TMPDIR", ".")) / "data"
    train_dataset, valid_dataset, test_dataset = make_datasets(
        str(dataset_path), is_master=is_local_master
    )

    # Restricts data loading to a subset of the dataset exclusive to the current process
    train_sampler = DistributedSampler(dataset=train_dataset, shuffle=True)
    valid_sampler = DistributedSampler(dataset=valid_dataset, shuffle=False)
    test_sampler = DistributedSampler(dataset=test_dataset, shuffle=False)

    # NOTE: Here `batch_size` is still the "local" (per-gpu) batch size.
    # This way, the effective batch size scales directly with number of GPUs, no need to specify it
    # in advance. You might want to adjust the learning rate and other hyper-parameters though.
    if is_master:
        logger.info(f"Effective batch size: {batch_size * world_size}")
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,  # shuffling is now done in the sampler, not the dataloader.
        sampler=train_sampler,
    )
    valid_dataloader = DataLoader(
        valid_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
        sampler=valid_sampler,
    )
    test_dataloader = DataLoader(  # NOTE: Not used in this example.
        test_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
        sampler=test_sampler,
    )
    # Checkout the "checkpointing and preemption" example for more info!
    logger.debug("Starting training from scratch.")

    for epoch in range(epochs):
        logger.debug(f"Starting epoch {epoch}/{epochs}")

        # NOTE: Here we need to call `set_epoch` so the ordering is able to change at each epoch.
        train_sampler.set_epoch(epoch)

        # Set the model in training mode (important for e.g. BatchNorm and Dropout layers)
        model.train()

        # NOTE: using a progress bar from tqdm because it's nicer than using `print`.
        progress_bar = tqdm(
            total=len(train_dataloader),
            desc=f"Train epoch {epoch}",
            disable=not (is_master and sys.stdout.isatty()),
        )

        # Training loop
        for batch in train_dataloader:
            # Move the batch to the GPU before we pass it to the model
            batch = tuple(item.to(device) for item in batch)
            x, y = batch

            # Forward pass
            logits: Tensor = model(x)
            local_loss = F.cross_entropy(logits, y)

            optimizer.zero_grad()
            local_loss.backward()
            # NOTE: nn.DistributedDataParallel automatically averages the gradients across devices.
            optimizer.step()

            # Calculate some metrics:
            # local metrics
            local_n_correct_predictions = logits.detach().argmax(-1).eq(y).sum()
            local_n_samples = logits.shape[0]
            local_accuracy = local_n_correct_predictions / local_n_samples

            # "global" metrics: calculated with the results from all workers
            # NOTE: Creating new tensors to hold the "global" values, but this isn't required.
            n_correct_predictions = local_n_correct_predictions.clone()
            # Reduce the local metrics across all workers, sending the result to rank 0.
            torch.distributed.reduce(n_correct_predictions, dst=0, op=ReduceOp.SUM)
            # Actual (global) batch size for this step.
            n_samples = torch.as_tensor(local_n_samples, device=device)
            torch.distributed.reduce(n_samples, dst=0, op=ReduceOp.SUM)
            # Will store the average loss across all workers.
            loss = local_loss.clone()
            torch.distributed.reduce(loss, dst=0, op=ReduceOp.SUM)
            loss.div_(world_size)  # Report the average loss across all workers.

            accuracy = n_correct_predictions / n_samples

            logger.debug(f"(local) Accuracy: {local_accuracy:.2%}")
            logger.debug(f"(local) Loss: {local_loss.item()}")
            # NOTE: This would log the same values in all workers. Only logging on master:
            if is_master:
                logger.debug(f"Accuracy: {accuracy.item():.2%}")
                logger.debug(f"Average Loss: {loss.item()}")

            # Advance the progress bar one step and update the progress bar text.
            progress_bar.update(1)
            progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item())
        progress_bar.close()

        val_loss, val_accuracy = validation_loop(model, valid_dataloader, device)
        # NOTE: This would log the same values in all workers. Only logging on master:
        if is_master:
            logger.info(
                f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}"
            )

    print("Done!")
    torch.distributed.destroy_process_group()
@torch.no_grad()
def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device):
    model.eval()

    total_loss = torch.as_tensor(0.0, device=device)
    n_samples = torch.as_tensor(0, device=device)
    correct_predictions = torch.as_tensor(0, device=device)

    for batch in dataloader:
        batch = tuple(item.to(device) for item in batch)
        x, y = batch

        logits: Tensor = model(x)
        loss = F.cross_entropy(logits, y)

        batch_n_samples = x.shape[0]
        batch_correct_predictions = logits.argmax(-1).eq(y).sum()

        total_loss += loss
        n_samples += batch_n_samples
        correct_predictions += batch_correct_predictions

    # Sum up the metrics we gathered on each worker before returning the overall val metrics.
    torch.distributed.all_reduce(total_loss, op=torch.distributed.ReduceOp.SUM)
    torch.distributed.all_reduce(correct_predictions, op=torch.distributed.ReduceOp.SUM)
    torch.distributed.all_reduce(n_samples, op=torch.distributed.ReduceOp.SUM)

    accuracy = correct_predictions / n_samples
    return total_loss, accuracy
def setup():
    assert torch.distributed.is_available()
    print("PyTorch Distributed available.")
    print(" Backends:")
    print(f" Gloo: {torch.distributed.is_gloo_available()}")
    print(f" NCCL: {torch.distributed.is_nccl_available()}")
    print(f" MPI: {torch.distributed.is_mpi_available()}")

    # NOTE: the env:// init method uses FileLocks, which sometimes causes deadlocks due to the
    # distributed filesystem configuration on the Mila cluster.
    # For multi-node jobs, use the TCP init method instead.
    master_addr = os.environ["MASTER_ADDR"]
    master_port = os.environ["MASTER_PORT"]

    # Default timeout is 30 minutes. Reducing the timeout here, so the job fails quicker if there's
    # a communication problem between nodes.
    timeout = timedelta(seconds=60)

    # DDP Job is being run via `srun` on a slurm cluster.
    rank = int(os.environ["SLURM_PROCID"])
    local_rank = int(os.environ["SLURM_LOCALID"])
    world_size = int(os.environ["SLURM_NTASKS"])

    # SLURM vars -> torch.distributed vars, in case they are needed.
    # NOTE: Setting these values isn't exactly necessary, but some code might assume it's
    # being run via torchrun or torch.distributed.launch, so setting these can be a good idea.
    os.environ["RANK"] = str(rank)
    os.environ["LOCAL_RANK"] = str(local_rank)
    os.environ["WORLD_SIZE"] = str(world_size)

    torch.distributed.init_process_group(
        backend="nccl",
        init_method=f"tcp://{master_addr}:{master_port}",
        timeout=timeout,
        world_size=world_size,
        rank=rank,
    )
    return rank, world_size, local_rank
def make_datasets(
    dataset_path: str,
    is_master: bool,
    val_split: float = 0.1,
    val_split_seed: int = 42,
):
    """Returns the training, validation, and test splits for CIFAR10.

    NOTE: We don't use image transforms here for simplicity.
    Having different transformations for train and validation would complicate things a bit.
    Later examples will show how to do the train/val/test split properly when using transforms.

    NOTE: Only the master process (rank-0) downloads the dataset if necessary.
    """
    # - Master: Download (if necessary) THEN Barrier
    # - others: Barrier THEN *NO* Download
    if not is_master:
        # Wait for the master process to finish downloading (reach the barrier below)
        torch.distributed.barrier()

    train_dataset = CIFAR10(
        root=dataset_path,
        transform=transforms.ToTensor(),
        download=is_master,
        train=True,
    )
    test_dataset = CIFAR10(
        root=dataset_path,
        transform=transforms.ToTensor(),
        download=is_master,
        train=False,
    )

    if is_master:
        # Join the workers waiting in the barrier above. They can now load the datasets from disk.
        torch.distributed.barrier()

    # Split the training dataset into a training and validation set.
    n_samples = len(train_dataset)
    n_valid = int(val_split * n_samples)
    n_train = n_samples - n_valid

    train_dataset, valid_dataset = random_split(
        train_dataset, (n_train, n_valid), torch.Generator().manual_seed(val_split_seed)
    )
    return train_dataset, valid_dataset, test_dataset
def get_num_workers() -> int:
    """Gets the optimal number of DataLoader workers to use in the current job."""
    if "SLURM_CPUS_PER_TASK" in os.environ:
        return int(os.environ["SLURM_CPUS_PER_TASK"])
    if hasattr(os, "sched_getaffinity"):
        return len(os.sched_getaffinity(0))
    return torch.multiprocessing.cpu_count()
if __name__ == "__main__":
    main()
```
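Since `job.sh` forwards its extra command-line arguments to `main.py` via `"$@"`, the hyperparameters defined in the argument parser above can be overridden at submission time, for example:

```bash
# Extra arguments to sbatch's job script are passed through to main.py by job.sh.
sbatch job.sh --epochs 3 --batch-size 256 --learning-rate 1e-3
```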
`pyproject.toml`:

```toml
[project]
name = "torch-distributed-debug"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
    "rich>=14.0.0",
    "torch>=2.7.0",
    "torchvision>=0.22.0",
    "tqdm>=4.67.1",
]
```
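Note that `job.sh` uses `uv run --offline --frozen`, so the environment must already be resolved and installed on a node with network access (that's what the `uv sync` step above does). If you change the dependencies, a sketch of the related uv commands:

```bash
# Re-resolve the lockfile after editing pyproject.toml (needs network access):
uv lock

# Install exactly what the lockfile specifies, without updating it:
uv sync --frozen
```

The raw output of one run of this job follows.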
Date: Wed Apr 23 01:06:08 PM EDT 2025 | |
Hostname: tg10907.tamia.ecpia.ca | |
======== GPU REPORT ======== | |
======== GPU REPORT ======== | |
==============NVSMI LOG============== | |
Timestamp : Wed Apr 23 13:06:09 2025 | |
Driver Version : 570.124.06 | |
CUDA Version : 12.8 | |
Attached GPUs : 4 | |
GPU 00000000:4E:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes : None | |
GPU 00000000:5F:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes : None | |
GPU 00000000:CB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes : None | |
GPU 00000000:DB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes : None | |
==============NVSMI LOG============== | |
Timestamp : Wed Apr 23 13:06:09 2025 | |
Driver Version : 570.124.06 | |
CUDA Version : 12.8 | |
Attached GPUs : 4 | |
GPU 00000000:4E:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes : None | |
GPU 00000000:5F:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes : None | |
GPU 00000000:CB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes : None | |
GPU 00000000:DB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes : None | |
Wed Apr 23 13:06:09 2025 | |
+-----------------------------------------------------------------------------------------+ | |
| NVIDIA-SMI 570.124.06 Driver Version: 570.124.06 CUDA Version: 12.8 | | |
|-----------------------------------------+------------------------+----------------------+ | |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | |
| | | MIG M. | | |
|=========================================+========================+======================| | |
| 0 NVIDIA H100 80GB HBM3 On | 00000000:4E:00.0 Off | 0 | | |
| N/A 34C P0 69W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
Wed Apr 23 13:06:09 2025 | |
+-----------------------------------------------------------------------------------------+ | |
| NVIDIA-SMI 570.124.06 Driver Version: 570.124.06 CUDA Version: 12.8 | | |
|-----------------------------------------+------------------------+----------------------+ | |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | |
| | | MIG M. | | |
|=========================================+========================+======================| | |
| 1 NVIDIA H100 80GB HBM3 On | 00000000:5F:00.0 Off | 0 | | |
| N/A 33C P0 68W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 0 NVIDIA H100 80GB HBM3 On | 00000000:4E:00.0 Off | 0 | | |
| N/A 34C P0 69W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 2 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | |
| N/A 34C P0 68W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 1 NVIDIA H100 80GB HBM3 On | 00000000:5F:00.0 Off | 0 | | |
| N/A 33C P0 71W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 3 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | |
| N/A 34C P0 70W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
+-----------------------------------------------------------------------------------------+ | |
| Processes: | | |
| GPU GI CI PID Type Process name GPU Memory | | |
| ID ID Usage | | |
|=========================================================================================| | |
| No running processes found | | |
+-----------------------------------------------------------------------------------------+ | |
| 2 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | |
| N/A 34C P0 72W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 3 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | |
| N/A 34C P0 69W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
+-----------------------------------------------------------------------------------------+ | |
| Processes: | | |
| GPU GI CI PID Type Process name GPU Memory | | |
| ID ID Usage | | |
|=========================================================================================| | |
| No running processes found | | |
+-----------------------------------------------------------------------------------------+ | |
PyTorch Distributed available. | |
Backends: | |
Gloo: True | |
NCCL: True | |
MPI: False | |
[04/23/25 13:06:20] INFO [5/8] __main__ - World size: 8, global main.py:60 | |
rank: 5, local rank: 1, local device | |
count: 1 | |
PyTorch Distributed available. | |
Backends: | |
Gloo: True | |
NCCL: True | |
MPI: False | |
[04/23/25 13:06:20] INFO [7/8] __main__ - World size: 8, global main.py:60 | |
rank: 7, local rank: 3, local device | |
count: 1 | |
PyTorch Distributed available. | |
Backends: | |
Gloo: True | |
NCCL: True | |
MPI: False | |
[04/23/25 13:06:20] INFO [3/8] __main__ - World size: 8, global main.py:60 | |
rank: 3, local rank: 3, local device | |
count: 1 | |
PyTorch Distributed available. | |
Backends: | |
Gloo: True | |
NCCL: True | |
MPI: False | |
[04/23/25 13:06:20] INFO [1/8] __main__ - World size: 8, global main.py:60 | |
rank: 1, local rank: 1, local device | |
count: 1 | |
PyTorch Distributed available. | |
Backends: | |
Gloo: True | |
NCCL: True | |
MPI: False | |
[04/23/25 13:06:21] INFO [6/8] __main__ - World size: 8, global main.py:60 | |
rank: 6, local rank: 2, local device | |
count: 1 | |
PyTorch Distributed available. | |
Backends: | |
Gloo: True | |
NCCL: True | |
MPI: False | |
[04/23/25 13:06:21] INFO [4/8] __main__ - World size: 8, global main.py:60 | |
rank: 4, local rank: 0, local device | |
count: 1 | |
PyTorch Distributed available. | |
Backends: | |
Gloo: True | |
NCCL: True | |
MPI: False | |
[04/23/25 13:06:21] INFO [2/8] __main__ - World size: 8, global main.py:60 | |
rank: 2, local rank: 2, local device | |
count: 1 | |
PyTorch Distributed available. | |
Backends: | |
Gloo: True | |
NCCL: True | |
MPI: False | |
[04/23/25 13:06:21] INFO [0/8] __main__ - World size: 8, global main.py:60 | |
rank: 0, local rank: 0, local device | |
count: 1 | |
libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav34.so': libvmw_pvrdma-rdmav34.so: cannot open shared object file: No such file or directory | |
libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav34.so': libvmw_pvrdma-rdmav34.so: cannot open shared object file: No such file or directory | |
libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav34.so': libvmw_pvrdma-rdmav34.so: cannot open shared object file: No such file or directory | |
libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav34.so': libvmw_pvrdma-rdmav34.so: cannot open shared object file: No such file or directory | |
libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav34.so': libvmw_pvrdma-rdmav34.so: cannot open shared object file: No such file or directory | |
libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav34.so': libvmw_pvrdma-rdmav34.so: cannot open shared object file: No such file or directory | |
libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav34.so': libvmw_pvrdma-rdmav34.so: cannot open shared object file: No such file or directory | |
libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav34.so': libvmw_pvrdma-rdmav34.so: cannot open shared object file: No such file or directory | |
tg10907:995238:995238 [0] NCCL INFO Bootstrap: Using ibp77s0:10.225.0.26<0> | |
tg10907:995238:995238 [0] NCCL INFO cudaDriverVersion 12080 | |
tg10907:995238:995238 [0] NCCL INFO NCCL version 2.26.2+cuda12.2 | |
tg10907:995238:995238 [0] NCCL INFO Comm config Blocking set to 1 | |
tg10907:995238:995314 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. Using internal net plugin. | |
tg10907:995238:995314 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp77s0:10.225.0.26<0> | |
tg10907:995238:995314 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. | |
tg10907:995238:995314 [0] NCCL INFO Using network IB | |
tg10907:995238:995314 [0] NCCL INFO ncclCommInitRankConfig comm 0x12a20e80 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 4e000 commId 0x2ce70ba701de3f9c - Init START | |
tg10907:995238:995314 [0] NCCL INFO RAS client listening socket at 127.0.0.1<28028> | |
tg10907:995238:995314 [0] NCCL INFO Bootstrap timings total 1.074140 (create 0.000026, send 0.000093, recv 0.000564, ring 1.032268, delay 0.000001) | |
tg10907:995238:995314 [0] NCCL INFO Setting affinity for GPU 0 to 555555 | |
tg10907:995238:995314 [0] NCCL INFO NVLS multicast support is not available on dev 0 | |
tg10907:995238:995314 [0] NCCL INFO comm 0x12a20e80 rank 0 nRanks 8 nNodes 2 localRanks 4 localRank 0 MNNVL 0 | |
tg10907:995238:995314 [0] NCCL INFO Channel 00/08 : 0 3 1 2 4 7 5 6 | |
tg10907:995238:995314 [0] NCCL INFO Channel 01/08 : 0 3 5 6 4 7 1 2 | |
tg10907:995238:995314 [0] NCCL INFO Channel 02/08 : 0 6 7 5 4 2 3 1 | |
tg10907:995238:995314 [0] NCCL INFO Channel 03/08 : 0 1 7 6 4 5 3 2 | |
tg10907:995238:995314 [0] NCCL INFO Channel 04/08 : 0 3 1 2 4 7 5 6 | |
tg10907:995238:995314 [0] NCCL INFO Channel 05/08 : 0 3 5 6 4 7 1 2 | |
tg10907:995238:995314 [0] NCCL INFO Channel 06/08 : 0 6 7 5 4 2 3 1 | |
tg10907:995238:995314 [0] NCCL INFO Channel 07/08 : 0 1 7 6 4 5 3 2 | |
tg10907:995238:995314 [0] NCCL INFO Trees [0] 1/4/-1->0->-1 [1] -1/-1/-1->0->3 [2] -1/-1/-1->0->1 [3] 2/-1/-1->0->3 [4] 1/-1/-1->0->4 [5] -1/-1/-1->0->3 [6] -1/-1/-1->0->1 [7] 2/-1/-1->0->3 | |
tg10907:995238:995314 [0] NCCL INFO P2P Chunksize set to 131072 | |
tg10907:995238:995314 [0] NCCL INFO Check P2P Type intraNodeP2pSupport 1 directMode 0 | |
tg10907:995238:995346 [0] NCCL INFO [Proxy Service] Device 0 CPU core 6 | |
tg10907:995238:995347 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 8 | |
tg10907:995238:995314 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
tg10907:995238:995314 [0] NCCL INFO 8 coll channels, 8 collnet channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer | |
tg10907:995238:995314 [0] NCCL INFO CC Off, workFifoBytes 1048576 | |
tg10907:995238:995314 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. | |
tg10907:995238:995314 [0] NCCL INFO ncclCommInitRankConfig comm 0x12a20e80 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 4e000 commId 0x2ce70ba701de3f9c - Init COMPLETE | |
tg10907:995238:995314 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 8 total 1.58 (kernels 0.28, alloc 0.08, bootstrap 1.07, allgathers 0.06, topo 0.02, graphs 0.06, connections 0.01, rest 0.00) | |
tg10907:995238:995350 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[0] via P2P/CUMEM | |
tg10907:995238:995350 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[0] via P2P/CUMEM | |
tg10907:995238:995350 [0] NCCL INFO Channel 00/0 : 6[0] -> 0[0] [receive] via NET/IB/4 | |
tg10907:995238:995354 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 10 | |
tg10907:995238:995350 [0] NCCL INFO Channel 04/0 : 6[0] -> 0[0] [receive] via NET/IB/4 | |
tg10907:995238:995350 [0] NCCL INFO Channel 00/0 : 0[0] -> 3[0] via P2P/CUMEM | |
tg10907:995238:995350 [0] NCCL INFO Channel 01/0 : 0[0] -> 3[0] via P2P/CUMEM | |
tg10907:995238:995350 [0] NCCL INFO Channel 04/0 : 0[0] -> 3[0] via P2P/CUMEM | |
tg10907:995238:995350 [0] NCCL INFO Channel 05/0 : 0[0] -> 3[0] via P2P/CUMEM | |
tg10907:995238:995350 [0] NCCL INFO Channel 02/0 : 0[0] -> 6[0] [send] via NET/IB/5 | |
tg10907:995238:995350 [0] NCCL INFO Channel 06/0 : 0[0] -> 6[0] [send] via NET/IB/5 | |
tg10908:875805:875805 [0] NCCL INFO cudaDriverVersion 12080 | |
tg10908:875805:875805 [0] NCCL INFO Bootstrap: Using ibp77s0:10.225.0.27<0> | |
tg10908:875805:875805 [0] NCCL INFO NCCL version 2.26.2+cuda12.2 | |
tg10908:875805:875805 [0] NCCL INFO Comm config Blocking set to 1 | |
tg10908:875805:875884 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. Using internal net plugin. | |
tg10908:875805:875884 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp77s0:10.225.0.27<0> | |
tg10908:875805:875884 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. | |
tg10908:875805:875884 [0] NCCL INFO Using network IB | |
tg10908:875805:875884 [0] NCCL INFO ncclCommInitRankConfig comm 0xa39eac0 rank 4 nranks 8 cudaDev 0 nvmlDev 0 busId 4e000 commId 0x2ce70ba701de3f9c - Init START | |
tg10908:875805:875884 [0] NCCL INFO RAS client listening socket at 127.0.0.1<28028> | |
tg10908:875805:875884 [0] NCCL INFO Bootstrap timings total 1.033781 (create 0.000065, send 1.018227, recv 0.000403, ring 0.000452, delay 0.000001) | |
tg10908:875805:875884 [0] NCCL INFO Setting affinity for GPU 0 to 555555 | |
tg10908:875805:875884 [0] NCCL INFO NVLS multicast support is not available on dev 0 | |
tg10908:875805:875884 [0] NCCL INFO comm 0xa39eac0 rank 4 nRanks 8 nNodes 2 localRanks 4 localRank 0 MNNVL 0 | |
tg10908:875805:875884 [0] NCCL INFO Trees [0] 5/-1/-1->4->0 [1] -1/-1/-1->4->7 [2] -1/-1/-1->4->5 [3] 6/-1/-1->4->7 [4] 5/0/-1->4->-1 [5] -1/-1/-1->4->7 [6] -1/-1/-1->4->5 [7] 6/-1/-1->4->7 | |
tg10908:875805:875884 [0] NCCL INFO P2P Chunksize set to 131072 | |
tg10908:875805:875908 [0] NCCL INFO [Proxy Service] Device 0 CPU core 4 | |
tg10908:875805:875912 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 6 | |
tg10908:875805:875884 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
tg10908:875805:875884 [0] NCCL INFO 8 coll channels, 8 collnet channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer | |
tg10908:875805:875884 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. | |
tg10908:875805:875884 [0] NCCL INFO ncclCommInitRankConfig comm 0xa39eac0 rank 4 nranks 8 cudaDev 0 nvmlDev 0 busId 4e000 commId 0x2ce70ba701de3f9c - Init COMPLETE | |
tg10908:875805:875884 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 4 nranks 8 total 1.56 (kernels 0.30, alloc 0.08, bootstrap 1.03, allgathers 0.00, topo 0.04, graphs 0.10, connections 0.01, rest 0.00) | |
tg10908:875805:875917 [0] NCCL INFO Channel 03/0 : 4[0] -> 5[0] via P2P/CUMEM | |
tg10908:875805:875917 [0] NCCL INFO Channel 07/0 : 4[0] -> 5[0] via P2P/CUMEM | |
tg10908:875805:875917 [0] NCCL INFO Channel 00/0 : 2[0] -> 4[0] [receive] via NET/IB/4 | |
tg10908:875805:875917 [0] NCCL INFO Channel 04/0 : 2[0] -> 4[0] [receive] via NET/IB/4 | |
tg10908:875805:875922 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 10 | |
tg10908:875805:875917 [0] NCCL INFO Channel 00/0 : 4[0] -> 7[0] via P2P/CUMEM | |
tg10908:875805:875917 [0] NCCL INFO Channel 01/0 : 4[0] -> 7[0] via P2P/CUMEM | |
tg10908:875805:875917 [0] NCCL INFO Channel 04/0 : 4[0] -> 7[0] via P2P/CUMEM | |
tg10908:875805:875917 [0] NCCL INFO Channel 05/0 : 4[0] -> 7[0] via P2P/CUMEM | |
tg10908:875805:875917 [0] NCCL INFO Channel 02/0 : 4[0] -> 2[0] [send] via NET/IB/5 | |
tg10908:875805:875917 [0] NCCL INFO Channel 06/0 : 4[0] -> 2[0] [send] via NET/IB/5 | |
[2025-04-23 13:06:23] tg10908:875805:875917 [0] transport/p2p.cc:274 NCCL WARN Cuda failure 101 'invalid device ordinal' | |
tg10908:875805:875917 [0] NCCL INFO transport/p2p.cc:352 -> 1 | |
tg10908:875805:875917 [0] NCCL INFO transport/p2p.cc:489 -> 1 | |
tg10908:875805:875917 [0] NCCL INFO transport.cc:197 -> 1 | |
tg10908:875805:875917 [0] NCCL INFO transport/generic.cc:19 -> 1 | |
tg10908:875805:875917 [0] NCCL INFO group.cc:148 -> 1 | |
tg10908:875805:875917 [0] NCCL INFO group.cc:75 -> 1 [Async thread] | |
tg10908:875805:875805 [0] NCCL INFO group.cc:460 -> 1 | |
tg10908:875805:875805 [0] NCCL INFO group.cc:581 -> 1 | |
tg10908:875805:875805 [0] NCCL INFO enqueue.cc:2299 -> 1 | |
tg10908:875807:875807 [0] NCCL INFO cudaDriverVersion 12080 | |
tg10908:875807:875807 [0] NCCL INFO Bootstrap: Using ibp77s0:10.225.0.27<0> | |
tg10908:875807:875807 [0] NCCL INFO NCCL version 2.26.2+cuda12.2 | |
tg10908:875807:875807 [0] NCCL INFO Comm config Blocking set to 1 | |
tg10908:875807:875883 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. Using internal net plugin. | |
tg10908:875807:875883 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp77s0:10.225.0.27<0> | |
tg10908:875807:875883 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. | |
tg10908:875807:875883 [0] NCCL INFO Using network IB | |
tg10908:875807:875883 [0] NCCL INFO ncclCommInitRankConfig comm 0x12941f80 rank 5 nranks 8 cudaDev 0 nvmlDev 0 busId cb000 commId 0x2ce70ba701de3f9c - Init START | |
tg10908:875807:875883 [0] NCCL INFO RAS client listening socket at 127.0.0.1<28028> | |
tg10908:875807:875883 [0] NCCL INFO Bootstrap timings total 1.033800 (create 0.000027, send 0.000538, recv 0.000957, ring 0.000472, delay 0.000000) | |
tg10908:875807:875883 [0] NCCL INFO Setting affinity for GPU 0 to aaaaaa | |
tg10908:875807:875883 [0] NCCL INFO NVLS multicast support is not available on dev 0 | |
tg10908:875807:875883 [0] NCCL INFO comm 0x12941f80 rank 5 nRanks 8 nNodes 2 localRanks 4 localRank 1 MNNVL 0 | |
tg10908:875807:875883 [0] NCCL INFO Trees [0] 7/-1/-1->5->4 [1] 6/-1/-1->5->1 [2] 4/-1/-1->5->7 [3] -1/-1/-1->5->6 [4] 7/-1/-1->5->4 [5] 6/1/-1->5->-1 [6] 4/-1/-1->5->7 [7] -1/-1/-1->5->6 | |
tg10908:875807:875883 [0] NCCL INFO P2P Chunksize set to 131072 | |
tg10908:875807:875914 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 15 | |
tg10908:875807:875910 [0] NCCL INFO [Proxy Service] Device 0 CPU core 13 | |
tg10908:875807:875883 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
tg10908:875807:875883 [0] NCCL INFO 8 coll channels, 8 collnet channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer | |
tg10908:875807:875883 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. | |
tg10908:875807:875883 [0] NCCL INFO ncclCommInitRankConfig comm 0x12941f80 rank 5 nranks 8 cudaDev 0 nvmlDev 0 busId cb000 commId 0x2ce70ba701de3f9c - Init COMPLETE | |
tg10908:875807:875883 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 5 nranks 8 total 1.56 (kernels 0.30, alloc 0.08, bootstrap 1.03, allgathers 0.04, topo 0.03, graphs 0.06, connections 0.01, rest 0.00) | |
tg10908:875807:875918 [0] NCCL INFO Channel 00/0 : 5[0] -> 6[0] via P2P/CUMEM | |
tg10908:875807:875918 [0] NCCL INFO Channel 01/0 : 5[0] -> 6[0] via P2P/CUMEM | |
tg10908:875807:875918 [0] NCCL INFO Channel 04/0 : 5[0] -> 6[0] via P2P/CUMEM | |
tg10908:875807:875918 [0] NCCL INFO Channel 05/0 : 5[0] -> 6[0] via P2P/CUMEM | |
tg10908:875807:875918 [0] NCCL INFO Channel 01/0 : 3[0] -> 5[0] [receive] via NET/IB/6 | |
tg10908:875807:875923 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 17 | |
tg10908:875807:875918 [0] NCCL INFO Channel 05/0 : 3[0] -> 5[0] [receive] via NET/IB/6 | |
tg10908:875807:875918 [0] NCCL INFO Channel 03/0 : 5[0] -> 3[0] [send] via NET/IB/7 | |
tg10908:875807:875918 [0] NCCL INFO Channel 07/0 : 5[0] -> 3[0] [send] via NET/IB/7 | |
tg10908:875807:875918 [0] NCCL INFO Channel 02/0 : 5[0] -> 4[0] via P2P/CUMEM | |
tg10908:875807:875918 [0] NCCL INFO Channel 06/0 : 5[0] -> 4[0] via P2P/CUMEM | |
[2025-04-23 13:06:23] tg10908:875807:875918 [0] transport/p2p.cc:274 NCCL WARN Cuda failure 101 'invalid device ordinal' | |
tg10908:875807:875918 [0] NCCL INFO transport/p2p.cc:352 -> 1 | |
tg10908:875807:875918 [0] NCCL INFO transport/p2p.cc:489 -> 1 | |
tg10908:875807:875918 [0] NCCL INFO transport.cc:197 -> 1 | |
tg10908:875807:875918 [0] NCCL INFO transport/generic.cc:19 -> 1 | |
tg10908:875807:875918 [0] NCCL INFO group.cc:148 -> 1 | |
tg10908:875807:875918 [0] NCCL INFO group.cc:75 -> 1 [Async thread] | |
tg10908:875807:875807 [0] NCCL INFO group.cc:460 -> 1 | |
tg10908:875807:875807 [0] NCCL INFO group.cc:581 -> 1 | |
tg10908:875807:875807 [0] NCCL INFO enqueue.cc:2299 -> 1 | |
tg10908:875804:875804 [0] NCCL INFO cudaDriverVersion 12080 | |
tg10908:875804:875804 [0] NCCL INFO Bootstrap: Using ibp77s0:10.225.0.27<0> | |
tg10908:875804:875804 [0] NCCL INFO NCCL version 2.26.2+cuda12.2 | |
tg10908:875804:875804 [0] NCCL INFO Comm config Blocking set to 1 | |
tg10908:875804:875882 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. Using internal net plugin. | |
tg10908:875804:875882 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp77s0:10.225.0.27<0> | |
tg10908:875804:875882 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. | |
tg10908:875804:875882 [0] NCCL INFO Using network IB | |
tg10908:875804:875882 [0] NCCL INFO ncclCommInitRankConfig comm 0x1ff86a80 rank 7 nranks 8 cudaDev 0 nvmlDev 0 busId db000 commId 0x2ce70ba701de3f9c - Init START | |
tg10908:875804:875882 [0] NCCL INFO RAS client listening socket at 127.0.0.1<28028> | |
tg10908:875804:875882 [0] NCCL INFO Bootstrap timings total 1.033817 (create 0.000034, send 0.000473, recv 0.000650, ring 1.023471, delay 0.000001) | |
tg10908:875804:875882 [0] NCCL INFO Setting affinity for GPU 0 to aaaa,aa000000 | |
tg10908:875804:875882 [0] NCCL INFO NVLS multicast support is not available on dev 0 | |
tg10908:875804:875882 [0] NCCL INFO comm 0x1ff86a80 rank 7 nRanks 8 nNodes 2 localRanks 4 localRank 3 MNNVL 0 | |
tg10908:875804:875882 [0] NCCL INFO Trees [0] 6/-1/-1->7->5 [1] 4/-1/-1->7->6 [2] 5/-1/-1->7->6 [3] 4/-1/-1->7->3 [4] 6/-1/-1->7->5 [5] 4/-1/-1->7->6 [6] 5/-1/-1->7->6 [7] 4/3/-1->7->-1 | |
tg10908:875804:875882 [0] NCCL INFO P2P Chunksize set to 131072 | |
tg10908:875804:875915 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 31 | |
tg10908:875804:875911 [0] NCCL INFO [Proxy Service] Device 0 CPU core 29 | |
tg10908:875804:875882 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
tg10908:875804:875882 [0] NCCL INFO 8 coll channels, 8 collnet channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer | |
tg10908:875804:875882 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. | |
tg10908:875804:875882 [0] NCCL INFO ncclCommInitRankConfig comm 0x1ff86a80 rank 7 nranks 8 cudaDev 0 nvmlDev 0 busId db000 commId 0x2ce70ba701de3f9c - Init COMPLETE | |
tg10908:875804:875882 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 7 nranks 8 total 1.56 (kernels 0.30, alloc 0.08, bootstrap 1.03, allgathers 0.05, topo 0.03, graphs 0.06, connections 0.01, rest 0.00) | |
tg10908:875804:875916 [0] NCCL INFO Channel 01/0 : 7[0] -> 1[0] [send] via NET/IB/6 | |
tg10908:875804:875916 [0] NCCL INFO Channel 05/0 : 7[0] -> 1[0] [send] via NET/IB/6 | |
tg10908:875804:875920 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 35 | |
tg10908:875804:875916 [0] NCCL INFO Channel 03/0 : 1[0] -> 7[0] [receive] via NET/IB/7 | |
[2025-04-23 13:06:23] tg10907:995238:995350 [0] transport/p2p.cc:274 NCCL WARN Cuda failure 101 'invalid device ordinal' | |
tg10907:995238:995350 [0] NCCL INFO transport/p2p.cc:352 -> 1 | |
tg10907:995238:995350 [0] NCCL INFO transport/p2p.cc:489 -> 1 | |
tg10907:995238:995350 [0] NCCL INFO transport.cc:197 -> 1 | |
tg10907:995238:995350 [0] NCCL INFO transport/generic.cc:19 -> 1 | |
tg10907:995238:995350 [0] NCCL INFO group.cc:148 -> 1 | |
tg10907:995238:995350 [0] NCCL INFO group.cc:75 -> 1 [Async thread] | |
tg10907:995238:995238 [0] NCCL INFO group.cc:460 -> 1 | |
tg10907:995238:995238 [0] NCCL INFO group.cc:581 -> 1 | |
tg10907:995238:995238 [0] NCCL INFO enqueue.cc:2299 -> 1 | |
tg10908:875804:875916 [0] NCCL INFO Channel 07/0 : 1[0] -> 7[0] [receive] via NET/IB/7 | |
tg10908:875804:875916 [0] NCCL INFO Channel 00/0 : 7[0] -> 5[0] via P2P/CUMEM | |
tg10908:875804:875916 [0] NCCL INFO Channel 02/0 : 7[0] -> 5[0] via P2P/CUMEM | |
tg10908:875804:875916 [0] NCCL INFO Channel 04/0 : 7[0] -> 5[0] via P2P/CUMEM | |
tg10908:875804:875916 [0] NCCL INFO Channel 06/0 : 7[0] -> 5[0] via P2P/CUMEM | |
tg10908:875804:875916 [0] NCCL INFO Channel 03/0 : 7[0] -> 6[0] via P2P/CUMEM | |
tg10908:875804:875916 [0] NCCL INFO Channel 07/0 : 7[0] -> 6[0] via P2P/CUMEM | |
[2025-04-23 13:06:23] tg10908:875804:875916 [0] transport/p2p.cc:274 NCCL WARN Cuda failure 101 'invalid device ordinal' | |
tg10908:875804:875916 [0] NCCL INFO transport/p2p.cc:352 -> 1 | |
tg10908:875804:875916 [0] NCCL INFO transport/p2p.cc:537 -> 1 | |
tg10908:875804:875916 [0] NCCL INFO transport.cc:216 -> 1 | |
tg10908:875804:875916 [0] NCCL INFO transport/generic.cc:19 -> 1 | |
tg10908:875804:875916 [0] NCCL INFO group.cc:148 -> 1 | |
tg10907:995236:995236 [0] NCCL INFO cudaDriverVersion 12080 | |
tg10907:995236:995236 [0] NCCL INFO Bootstrap: Using ibp77s0:10.225.0.26<0> | |
tg10907:995236:995236 [0] NCCL INFO NCCL version 2.26.2+cuda12.2 | |
tg10907:995236:995236 [0] NCCL INFO Comm config Blocking set to 1 | |
tg10907:995236:995316 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. Using internal net plugin. | |
tg10907:995236:995316 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp77s0:10.225.0.26<0> | |
tg10907:995236:995316 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. | |
tg10907:995236:995316 [0] NCCL INFO Using network IB | |
tg10907:995236:995316 [0] NCCL INFO ncclCommInitRankConfig comm 0x258ed380 rank 1 nranks 8 cudaDev 0 nvmlDev 0 busId cb000 commId 0x2ce70ba701de3f9c - Init START | |
tg10907:995236:995316 [0] NCCL INFO RAS client listening socket at 127.0.0.1<28028> | |
tg10908:875804:875916 [0] NCCL INFO group.cc:75 -> 1 [Async thread] | |
tg10908:875804:875804 [0] NCCL INFO group.cc:460 -> 1 | |
tg10908:875804:875804 [0] NCCL INFO group.cc:581 -> 1 | |
tg10908:875804:875804 [0] NCCL INFO enqueue.cc:2299 -> 1 | |
tg10907:995236:995316 [0] NCCL INFO Bootstrap timings total 1.074027 (create 0.000032, send 0.000105, recv 0.000399, ring 1.063659, delay 0.000001) | |
tg10907:995236:995316 [0] NCCL INFO Setting affinity for GPU 0 to aaaaaa | |
tg10907:995236:995316 [0] NCCL INFO NVLS multicast support is not available on dev 0 | |
tg10907:995236:995316 [0] NCCL INFO comm 0x258ed380 rank 1 nRanks 8 nNodes 2 localRanks 4 localRank 1 MNNVL 0 | |
tg10907:995236:995316 [0] NCCL INFO Trees [0] 3/-1/-1->1->0 [1] 2/5/-1->1->-1 [2] 0/-1/-1->1->3 [3] -1/-1/-1->1->2 [4] 3/-1/-1->1->0 [5] 2/-1/-1->1->5 [6] 0/-1/-1->1->3 [7] -1/-1/-1->1->2 | |
tg10907:995236:995316 [0] NCCL INFO P2P Chunksize set to 131072 | |
tg10907:995236:995341 [0] NCCL INFO [Proxy Service] Device 0 CPU core 5 | |
tg10907:995236:995345 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 7 | |
tg10907:995236:995316 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
tg10908:875806:875806 [0] NCCL INFO cudaDriverVersion 12080 | |
tg10908:875806:875806 [0] NCCL INFO Bootstrap: Using ibp77s0:10.225.0.27<0> | |
tg10908:875806:875806 [0] NCCL INFO NCCL version 2.26.2+cuda12.2 | |
tg10908:875806:875806 [0] NCCL INFO Comm config Blocking set to 1 | |
tg10908:875806:875885 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. Using internal net plugin. | |
tg10908:875806:875885 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp77s0:10.225.0.27<0> | |
tg10908:875806:875885 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. | |
tg10908:875806:875885 [0] NCCL INFO Using network IB | |
tg10908:875806:875885 [0] NCCL INFO ncclCommInitRankConfig comm 0x2d22f240 rank 6 nranks 8 cudaDev 0 nvmlDev 0 busId 5f000 commId 0x2ce70ba701de3f9c - Init START | |
tg10908:875806:875885 [0] NCCL INFO RAS client listening socket at 127.0.0.1<28028> | |
tg10907:995236:995316 [0] NCCL INFO 8 coll channels, 8 collnet channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer | |
tg10907:995236:995316 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. | |
tg10907:995236:995316 [0] NCCL INFO ncclCommInitRankConfig comm 0x258ed380 rank 1 nranks 8 cudaDev 0 nvmlDev 0 busId cb000 commId 0x2ce70ba701de3f9c - Init COMPLETE | |
tg10907:995236:995316 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 8 total 1.57 (kernels 0.27, alloc 0.08, bootstrap 1.07, allgathers 0.06, topo 0.02, graphs 0.06, connections 0.01, rest 0.00) | |
tg10907:995236:995349 [0] NCCL INFO Channel 00/0 : 1[0] -> 2[0] via P2P/CUMEM | |
tg10907:995236:995349 [0] NCCL INFO Channel 01/0 : 1[0] -> 2[0] via P2P/CUMEM | |
tg10907:995236:995349 [0] NCCL INFO Channel 04/0 : 1[0] -> 2[0] via P2P/CUMEM | |
tg10907:995236:995349 [0] NCCL INFO Channel 05/0 : 1[0] -> 2[0] via P2P/CUMEM | |
tg10907:995236:995349 [0] NCCL INFO Channel 01/0 : 7[0] -> 1[0] [receive] via NET/IB/6 | |
tg10908:875806:875885 [0] NCCL INFO Bootstrap timings total 1.033766 (create 0.000027, send 0.000474, recv 0.000752, ring 1.022451, delay 0.000000) | |
tg10908:875806:875885 [0] NCCL INFO Setting affinity for GPU 0 to 5555,55000000 | |
tg10908:875806:875885 [0] NCCL INFO NVLS multicast support is not available on dev 0 | |
tg10908:875806:875885 [0] NCCL INFO comm 0x2d22f240 rank 6 nRanks 8 nNodes 2 localRanks 4 localRank 2 MNNVL 0 | |
tg10908:875806:875885 [0] NCCL INFO Trees [0] -1/-1/-1->6->7 [1] 7/-1/-1->6->5 [2] 7/-1/-1->6->2 [3] 5/-1/-1->6->4 [4] -1/-1/-1->6->7 [5] 7/-1/-1->6->5 [6] 7/2/-1->6->-1 [7] 5/-1/-1->6->4 | |
tg10908:875806:875885 [0] NCCL INFO P2P Chunksize set to 131072 | |
tg10908:875806:875913 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 32 | |
tg10908:875806:875909 [0] NCCL INFO [Proxy Service] Device 0 CPU core 30 | |
tg10908:875806:875885 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
[rank4]: Traceback (most recent call last): | |
[rank4]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 329, in <module> | |
[rank4]: main() | |
[rank4]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 75, in main | |
[rank4]: model = nn.parallel.DistributedDataParallel( | |
[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank4]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 835, in __init__ | |
[rank4]: _verify_param_shape_across_processes(self.process_group, parameters) | |
[rank4]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/distributed/utils.py", line 282, in _verify_param_shape_across_processes | |
[rank4]: return dist._verify_params_across_processes(process_group, tensors, logger) | |
[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
tg10907:995236:995349 [0] NCCL INFO Channel 05/0 : 7[0] -> 1[0] [receive] via NET/IB/6 | |
tg10907:995236:995352 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 11 | |
tg10907:995236:995349 [0] NCCL INFO Channel 03/0 : 1[0] -> 7[0] [send] via NET/IB/7 | |
tg10907:995236:995349 [0] NCCL INFO Channel 07/0 : 1[0] -> 7[0] [send] via NET/IB/7 | |
tg10907:995236:995349 [0] NCCL INFO Channel 02/0 : 1[0] -> 0[0] via P2P/CUMEM | |
tg10907:995236:995349 [0] NCCL INFO Channel 06/0 : 1[0] -> 0[0] via P2P/CUMEM | |
[2025-04-23 13:06:23] tg10907:995236:995349 [0] transport/p2p.cc:274 NCCL WARN Cuda failure 101 'invalid device ordinal' | |
tg10907:995236:995349 [0] NCCL INFO transport/p2p.cc:352 -> 1 | |
tg10907:995236:995349 [0] NCCL INFO transport/p2p.cc:489 -> 1 | |
tg10907:995236:995349 [0] NCCL INFO transport.cc:197 -> 1 | |
tg10907:995236:995349 [0] NCCL INFO transport/generic.cc:19 -> 1 | |
tg10907:995236:995349 [0] NCCL INFO group.cc:148 -> 1 | |
tg10907:995236:995349 [0] NCCL INFO group.cc:75 -> 1 [Async thread] | |
[rank4]: torch.distributed.DistBackendError: NCCL error in: /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:3353, unhandled cuda error (run with NCCL_DEBUG=INFO for details), NCCL version 2.26.2 | |
[rank4]: ncclUnhandledCudaError: Call to CUDA function failed. | |
[rank4]: Last error: | |
[rank4]: Cuda failure 101 'invalid device ordinal' | |
tg10908:875806:875885 [0] NCCL INFO 8 coll channels, 8 collnet channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer | |
tg10908:875806:875885 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. | |
tg10908:875806:875885 [0] NCCL INFO ncclCommInitRankConfig comm 0x2d22f240 rank 6 nranks 8 cudaDev 0 nvmlDev 0 busId 5f000 commId 0x2ce70ba701de3f9c - Init COMPLETE | |
tg10908:875806:875885 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 6 nranks 8 total 1.56 (kernels 0.30, alloc 0.08, bootstrap 1.03, allgathers 0.05, topo 0.03, graphs 0.06, connections 0.01, rest 0.00) | |
tg10908:875806:875919 [0] NCCL INFO Channel 02/0 : 6[0] -> 7[0] via P2P/CUMEM | |
tg10908:875806:875919 [0] NCCL INFO Channel 06/0 : 6[0] -> 7[0] via P2P/CUMEM | |
tg10908:875806:875919 [0] NCCL INFO Channel 00/0 : 6[0] -> 0[0] [send] via NET/IB/4 | |
tg10908:875806:875919 [0] NCCL INFO Channel 04/0 : 6[0] -> 0[0] [send] via NET/IB/4 | |
tg10908:875806:875921 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 36 | |
[rank5]: Traceback (most recent call last): | |
[rank5]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 329, in <module> | |
[rank5]: main() | |
[rank5]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 75, in main | |
[rank5]: model = nn.parallel.DistributedDataParallel( | |
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank5]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 835, in __init__ | |
[rank5]: _verify_param_shape_across_processes(self.process_group, parameters) | |
[rank5]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/distributed/utils.py", line 282, in _verify_param_shape_across_processes | |
[rank5]: return dist._verify_params_across_processes(process_group, tensors, logger) | |
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
tg10907:995236:995236 [0] NCCL INFO group.cc:460 -> 1 | |
tg10907:995236:995236 [0] NCCL INFO group.cc:581 -> 1 | |
tg10907:995236:995236 [0] NCCL INFO enqueue.cc:2299 -> 1 | |
[rank5]: torch.distributed.DistBackendError: NCCL error in: /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:3353, unhandled cuda error (run with NCCL_DEBUG=INFO for details), NCCL version 2.26.2 | |
[rank5]: ncclUnhandledCudaError: Call to CUDA function failed. | |
[rank5]: Last error: | |
[rank5]: Cuda failure 101 'invalid device ordinal' | |
tg10908:875806:875919 [0] NCCL INFO Channel 02/0 : 0[0] -> 6[0] [receive] via NET/IB/5 | |
tg10908:875806:875919 [0] NCCL INFO Channel 06/0 : 0[0] -> 6[0] [receive] via NET/IB/5 | |
tg10908:875806:875919 [0] NCCL INFO Channel 01/0 : 6[0] -> 4[0] via P2P/CUMEM | |
tg10908:875806:875919 [0] NCCL INFO Channel 03/0 : 6[0] -> 4[0] via P2P/CUMEM | |
tg10908:875806:875919 [0] NCCL INFO Channel 05/0 : 6[0] -> 4[0] via P2P/CUMEM | |
tg10908:875806:875919 [0] NCCL INFO Channel 07/0 : 6[0] -> 4[0] via P2P/CUMEM | |
[2025-04-23 13:06:23] tg10908:875806:875919 [0] transport/p2p.cc:274 NCCL WARN Cuda failure 101 'invalid device ordinal' | |
tg10908:875806:875919 [0] NCCL INFO transport/p2p.cc:352 -> 1 | |
tg10908:875806:875919 [0] NCCL INFO transport/p2p.cc:537 -> 1 | |
tg10908:875806:875919 [0] NCCL INFO transport.cc:216 -> 1 | |
tg10908:875806:875919 [0] NCCL INFO transport/generic.cc:19 -> 1 | |
tg10908:875806:875919 [0] NCCL INFO group.cc:148 -> 1 | |
tg10908:875806:875919 [0] NCCL INFO group.cc:75 -> 1 [Async thread] | |
[rank7]: Traceback (most recent call last): | |
[rank7]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 329, in <module> | |
[rank7]: main() | |
[rank7]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 75, in main | |
[rank7]: model = nn.parallel.DistributedDataParallel( | |
[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank7]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 835, in __init__ | |
[rank7]: _verify_param_shape_across_processes(self.process_group, parameters) | |
[rank7]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/distributed/utils.py", line 282, in _verify_param_shape_across_processes | |
[rank7]: return dist._verify_params_across_processes(process_group, tensors, logger) | |
[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
tg10907:995237:995237 [0] NCCL INFO cudaDriverVersion 12080 | |
tg10907:995237:995237 [0] NCCL INFO Bootstrap: Using ibp77s0:10.225.0.26<0> | |
tg10907:995237:995237 [0] NCCL INFO NCCL version 2.26.2+cuda12.2 | |
tg10907:995237:995237 [0] NCCL INFO Comm config Blocking set to 1 | |
tg10907:995237:995315 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. Using internal net plugin. | |
tg10907:995237:995315 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp77s0:10.225.0.26<0> | |
tg10907:995237:995315 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. | |
tg10907:995237:995315 [0] NCCL INFO Using network IB | |
tg10907:995237:995315 [0] NCCL INFO ncclCommInitRankConfig comm 0x2341dd40 rank 2 nranks 8 cudaDev 0 nvmlDev 0 busId 5f000 commId 0x2ce70ba701de3f9c - Init START | |
tg10907:995237:995315 [0] NCCL INFO RAS client listening socket at 127.0.0.1<28028> | |
[rank7]: torch.distributed.DistBackendError: NCCL error in: /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:3353, unhandled cuda error (run with NCCL_DEBUG=INFO for details), NCCL version 2.26.2 | |
[rank7]: ncclUnhandledCudaError: Call to CUDA function failed. | |
[rank7]: Last error: | |
[rank7]: Cuda failure 101 'invalid device ordinal' | |
tg10908:875806:875806 [0] NCCL INFO group.cc:460 -> 1 | |
tg10908:875806:875806 [0] NCCL INFO group.cc:581 -> 1 | |
tg10908:875806:875806 [0] NCCL INFO enqueue.cc:2299 -> 1 | |
[rank6]: Traceback (most recent call last): | |
[rank6]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 329, in <module> | |
[rank6]: main() | |
[rank6]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 75, in main | |
[rank6]: model = nn.parallel.DistributedDataParallel( | |
[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank6]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 835, in __init__ | |
[rank6]: _verify_param_shape_across_processes(self.process_group, parameters) | |
[rank6]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/distributed/utils.py", line 282, in _verify_param_shape_across_processes | |
[rank6]: return dist._verify_params_across_processes(process_group, tensors, logger) | |
[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
tg10907:995237:995315 [0] NCCL INFO Bootstrap timings total 1.074027 (create 0.000024, send 0.000083, recv 0.000637, ring 1.063644, delay 0.000000) | |
tg10907:995237:995315 [0] NCCL INFO Setting affinity for GPU 0 to 5555,55000000 | |
tg10907:995237:995315 [0] NCCL INFO NVLS multicast support is not available on dev 0 | |
tg10907:995237:995315 [0] NCCL INFO comm 0x2341dd40 rank 2 nRanks 8 nNodes 2 localRanks 4 localRank 2 MNNVL 0 | |
tg10907:995237:995315 [0] NCCL INFO Trees [0] -1/-1/-1->2->3 [1] 3/-1/-1->2->1 [2] 3/6/-1->2->-1 [3] 1/-1/-1->2->0 [4] -1/-1/-1->2->3 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->6 [7] 1/-1/-1->2->0 | |
tg10907:995237:995315 [0] NCCL INFO P2P Chunksize set to 131072 | |
tg10907:995237:995340 [0] NCCL INFO [Proxy Service] Device 0 CPU core 36 | |
tg10907:995237:995343 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 38 | |
tg10907:995237:995315 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
[rank6]: torch.distributed.DistBackendError: NCCL error in: /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:3353, unhandled cuda error (run with NCCL_DEBUG=INFO for details), NCCL version 2.26.2 | |
[rank6]: ncclUnhandledCudaError: Call to CUDA function failed. | |
[rank6]: Last error: | |
[rank6]: Cuda failure 101 'invalid device ordinal' | |
tg10907:995237:995315 [0] NCCL INFO 8 coll channels, 8 collnet channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer | |
tg10907:995237:995315 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. | |
tg10907:995237:995315 [0] NCCL INFO ncclCommInitRankConfig comm 0x2341dd40 rank 2 nranks 8 cudaDev 0 nvmlDev 0 busId 5f000 commId 0x2ce70ba701de3f9c - Init COMPLETE | |
tg10907:995237:995315 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 2 nranks 8 total 1.58 (kernels 0.27, alloc 0.08, bootstrap 1.07, allgathers 0.06, topo 0.02, graphs 0.06, connections 0.01, rest 0.00) | |
tg10907:995237:995348 [0] NCCL INFO Channel 02/0 : 2[0] -> 3[0] via P2P/CUMEM | |
tg10907:995237:995348 [0] NCCL INFO Channel 06/0 : 2[0] -> 3[0] via P2P/CUMEM | |
tg10907:995237:995348 [0] NCCL INFO Channel 00/0 : 2[0] -> 4[0] [send] via NET/IB/4 | |
tg10907:995237:995348 [0] NCCL INFO Channel 04/0 : 2[0] -> 4[0] [send] via NET/IB/4 | |
tg10907:995237:995355 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 42 | |
[rank1]: Traceback (most recent call last): | |
[rank1]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 329, in <module> | |
[rank1]: main() | |
[rank1]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 75, in main | |
[rank1]: model = nn.parallel.DistributedDataParallel( | |
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank1]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 835, in __init__ | |
[rank1]: _verify_param_shape_across_processes(self.process_group, parameters) | |
[rank1]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/distributed/utils.py", line 282, in _verify_param_shape_across_processes | |
[rank1]: return dist._verify_params_across_processes(process_group, tensors, logger) | |
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
tg10907:995237:995348 [0] NCCL INFO Channel 02/0 : 4[0] -> 2[0] [receive] via NET/IB/5 | |
tg10907:995237:995348 [0] NCCL INFO Channel 06/0 : 4[0] -> 2[0] [receive] via NET/IB/5 | |
tg10907:995237:995348 [0] NCCL INFO Channel 01/0 : 2[0] -> 0[0] via P2P/CUMEM | |
tg10907:995237:995348 [0] NCCL INFO Channel 03/0 : 2[0] -> 0[0] via P2P/CUMEM | |
tg10907:995237:995348 [0] NCCL INFO Channel 05/0 : 2[0] -> 0[0] via P2P/CUMEM | |
tg10907:995237:995348 [0] NCCL INFO Channel 07/0 : 2[0] -> 0[0] via P2P/CUMEM | |
[2025-04-23 13:06:23] tg10907:995237:995348 [0] transport/p2p.cc:274 NCCL WARN Cuda failure 101 'invalid device ordinal' | |
tg10907:995237:995348 [0] NCCL INFO transport/p2p.cc:352 -> 1 | |
tg10907:995237:995348 [0] NCCL INFO transport/p2p.cc:537 -> 1 | |
tg10907:995237:995348 [0] NCCL INFO transport.cc:216 -> 1 | |
tg10907:995237:995348 [0] NCCL INFO transport/generic.cc:19 -> 1 | |
tg10907:995237:995348 [0] NCCL INFO group.cc:148 -> 1 | |
tg10907:995237:995348 [0] NCCL INFO group.cc:75 -> 1 [Async thread] | |
[rank1]: torch.distributed.DistBackendError: NCCL error in: /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:3353, unhandled cuda error (run with NCCL_DEBUG=INFO for details), NCCL version 2.26.2 | |
[rank1]: ncclUnhandledCudaError: Call to CUDA function failed. | |
[rank1]: Last error: | |
[rank1]: Cuda failure 101 'invalid device ordinal' | |
tg10907:995237:995237 [0] NCCL INFO group.cc:460 -> 1 | |
tg10907:995237:995237 [0] NCCL INFO group.cc:581 -> 1 | |
tg10907:995237:995237 [0] NCCL INFO enqueue.cc:2299 -> 1 | |
[rank2]: Traceback (most recent call last): | |
[rank2]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 329, in <module> | |
[rank2]: main() | |
[rank2]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 75, in main | |
[rank2]: model = nn.parallel.DistributedDataParallel( | |
[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank2]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 835, in __init__ | |
[rank2]: _verify_param_shape_across_processes(self.process_group, parameters) | |
[rank2]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/distributed/utils.py", line 282, in _verify_param_shape_across_processes | |
[rank2]: return dist._verify_params_across_processes(process_group, tensors, logger) | |
[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
tg10907:995235:995235 [0] NCCL INFO cudaDriverVersion 12080 | |
tg10907:995235:995235 [0] NCCL INFO Bootstrap: Using ibp77s0:10.225.0.26<0> | |
tg10907:995235:995235 [0] NCCL INFO NCCL version 2.26.2+cuda12.2 | |
tg10907:995235:995235 [0] NCCL INFO Comm config Blocking set to 1 | |
tg10907:995235:995317 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. Using internal net plugin. | |
tg10907:995235:995317 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp77s0:10.225.0.26<0> | |
tg10907:995235:995317 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. | |
tg10907:995235:995317 [0] NCCL INFO Using network IB | |
tg10907:995235:995317 [0] NCCL INFO ncclCommInitRankConfig comm 0x19679cc0 rank 3 nranks 8 cudaDev 0 nvmlDev 0 busId db000 commId 0x2ce70ba701de3f9c - Init START | |
tg10907:995235:995317 [0] NCCL INFO RAS client listening socket at 127.0.0.1<28028> | |
[rank2]: torch.distributed.DistBackendError: NCCL error in: /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:3353, unhandled cuda error (run with NCCL_DEBUG=INFO for details), NCCL version 2.26.2 | |
[rank2]: ncclUnhandledCudaError: Call to CUDA function failed. | |
[rank2]: Last error: | |
[rank2]: Cuda failure 101 'invalid device ordinal' | |
tg10907:995235:995317 [0] NCCL INFO Bootstrap timings total 1.073976 (create 0.000027, send 0.000088, recv 1.058721, ring 0.007501, delay 0.000000) | |
tg10907:995235:995317 [0] NCCL INFO Setting affinity for GPU 0 to aaaa,aa000000 | |
tg10907:995235:995317 [0] NCCL INFO NVLS multicast support is not available on dev 0 | |
tg10907:995235:995317 [0] NCCL INFO comm 0x19679cc0 rank 3 nRanks 8 nNodes 2 localRanks 4 localRank 3 MNNVL 0 | |
tg10907:995235:995317 [0] NCCL INFO Trees [0] 2/-1/-1->3->1 [1] 0/-1/-1->3->2 [2] 1/-1/-1->3->2 [3] 0/7/-1->3->-1 [4] 2/-1/-1->3->1 [5] 0/-1/-1->3->2 [6] 1/-1/-1->3->2 [7] 0/-1/-1->3->7 | |
tg10907:995235:995317 [0] NCCL INFO P2P Chunksize set to 131072 | |
tg10907:995235:995344 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 41 | |
tg10907:995235:995342 [0] NCCL INFO [Proxy Service] Device 0 CPU core 37 | |
tg10907:995235:995317 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
[rank3]: Traceback (most recent call last): | |
[rank3]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 329, in <module> | |
[rank3]: main() | |
[rank3]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 75, in main | |
[rank3]: model = nn.parallel.DistributedDataParallel( | |
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank3]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 835, in __init__ | |
[rank3]: _verify_param_shape_across_processes(self.process_group, parameters) | |
[rank3]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/distributed/utils.py", line 282, in _verify_param_shape_across_processes | |
[rank3]: return dist._verify_params_across_processes(process_group, tensors, logger) | |
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
tg10907:995235:995317 [0] NCCL INFO 8 coll channels, 8 collnet channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer | |
tg10907:995235:995317 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. | |
tg10907:995235:995317 [0] NCCL INFO ncclCommInitRankConfig comm 0x19679cc0 rank 3 nranks 8 cudaDev 0 nvmlDev 0 busId db000 commId 0x2ce70ba701de3f9c - Init COMPLETE | |
tg10907:995235:995317 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 3 nranks 8 total 1.57 (kernels 0.27, alloc 0.08, bootstrap 1.07, allgathers 0.06, topo 0.02, graphs 0.06, connections 0.01, rest 0.00) | |
tg10907:995235:995351 [0] NCCL INFO Channel 01/0 : 3[0] -> 5[0] [send] via NET/IB/6 | |
tg10907:995235:995351 [0] NCCL INFO Channel 05/0 : 3[0] -> 5[0] [send] via NET/IB/6 | |
tg10907:995235:995353 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 45 | |
tg10907:995235:995351 [0] NCCL INFO Channel 03/0 : 5[0] -> 3[0] [receive] via NET/IB/7 | |
[rank3]: torch.distributed.DistBackendError: NCCL error in: /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:3353, unhandled cuda error (run with NCCL_DEBUG=INFO for details), NCCL version 2.26.2 | |
[rank3]: ncclUnhandledCudaError: Call to CUDA function failed. | |
[rank3]: Last error: | |
[rank3]: Cuda failure 101 'invalid device ordinal' | |
tg10907:995235:995351 [0] NCCL INFO Channel 07/0 : 5[0] -> 3[0] [receive] via NET/IB/7 | |
tg10907:995235:995351 [0] NCCL INFO Channel 00/0 : 3[0] -> 1[0] via P2P/CUMEM | |
tg10907:995235:995351 [0] NCCL INFO Channel 02/0 : 3[0] -> 1[0] via P2P/CUMEM | |
tg10907:995235:995351 [0] NCCL INFO Channel 04/0 : 3[0] -> 1[0] via P2P/CUMEM | |
tg10907:995235:995351 [0] NCCL INFO Channel 06/0 : 3[0] -> 1[0] via P2P/CUMEM | |
tg10907:995235:995351 [0] NCCL INFO Channel 03/0 : 3[0] -> 2[0] via P2P/CUMEM | |
tg10907:995235:995351 [0] NCCL INFO Channel 07/0 : 3[0] -> 2[0] via P2P/CUMEM | |
[2025-04-23 13:06:23] tg10907:995235:995351 [0] transport/p2p.cc:274 NCCL WARN Cuda failure 101 'invalid device ordinal' | |
tg10907:995235:995351 [0] NCCL INFO transport/p2p.cc:352 -> 1 | |
tg10907:995235:995351 [0] NCCL INFO transport/p2p.cc:537 -> 1 | |
tg10907:995235:995351 [0] NCCL INFO transport.cc:216 -> 1 | |
tg10907:995235:995351 [0] NCCL INFO transport/generic.cc:19 -> 1 | |
tg10907:995235:995351 [0] NCCL INFO group.cc:148 -> 1 | |
[rank0]: Traceback (most recent call last): | |
[rank0]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 329, in <module> | |
[rank0]: main() | |
[rank0]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 75, in main | |
[rank0]: model = nn.parallel.DistributedDataParallel( | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 835, in __init__ | |
[rank0]: _verify_param_shape_across_processes(self.process_group, parameters) | |
[rank0]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/distributed/utils.py", line 282, in _verify_param_shape_across_processes | |
[rank0]: return dist._verify_params_across_processes(process_group, tensors, logger) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
tg10907:995235:995351 [0] NCCL INFO group.cc:75 -> 1 [Async thread] | |
tg10907:995235:995235 [0] NCCL INFO group.cc:460 -> 1 | |
tg10907:995235:995235 [0] NCCL INFO group.cc:581 -> 1 | |
tg10907:995235:995235 [0] NCCL INFO enqueue.cc:2299 -> 1 | |
[rank0]: torch.distributed.DistBackendError: NCCL error in: /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:3353, unhandled cuda error (run with NCCL_DEBUG=INFO for details), NCCL version 2.26.2 | |
[rank0]: ncclUnhandledCudaError: Call to CUDA function failed. | |
[rank0]: Last error: | |
[rank0]: Cuda failure 101 'invalid device ordinal' | |
[rank1]:[W423 13:06:23.342925903 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) | |
[rank4]:[W423 13:06:23.343823987 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) | |
tg10908:875807:875910 [0] NCCL INFO misc/socket.cc:881 -> 3 | |
[rank5]:[W423 13:06:23.357692461 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) | |
tg10907:995237:995340 [0] NCCL INFO misc/socket.cc:881 -> 3 | |
[rank2]:[W423 13:06:23.364046822 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) | |
[rank7]:[W423 13:06:23.363516833 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) | |
tg10908:875806:875909 [0] NCCL INFO misc/socket.cc:881 -> 3 | |
tg10908:875806:875909 [0] NCCL INFO misc/socket.cc:881 -> 3 | |
[rank6]:[W423 13:06:23.385873250 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) | |
[rank0]:[W423 13:06:23.389691207 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) | |
[rank3]:[W423 13:06:23.399161934 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) | |
tg10908:875806:875931 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10908:875806:875931 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10908:875806:875931 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10908:875806:875931 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10908:875806:875931 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10908:875806:875931 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10908:875806:875909 [0] NCCL INFO misc/socket.cc:881 -> 3 | |
tg10908:875806:875931 [0] NCCL INFO comm 0x2d22f240 rank 6 nranks 8 cudaDev 0 busId 5f000 - Abort COMPLETE | |
tg10908:875807:875910 [0] NCCL INFO misc/socket.cc:881 -> 3 | |
tg10908:875807:875927 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10908:875807:875927 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10908:875807:875927 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10908:875807:875927 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10908:875807:875927 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10908:875807:875927 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10908:875807:875910 [0] NCCL INFO misc/socket.cc:881 -> 3 | |
tg10908:875807:875927 [0] NCCL INFO comm 0x12941f80 rank 5 nranks 8 cudaDev 0 busId cb000 - Abort COMPLETE | |
tg10907:995237:995359 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10907:995237:995359 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10907:995237:995359 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10907:995237:995359 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10907:995237:995359 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10907:995237:995359 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10907:995237:995340 [0] NCCL INFO misc/socket.cc:881 -> 3 | |
tg10907:995237:995359 [0] NCCL INFO comm 0x2341dd40 rank 2 nranks 8 cudaDev 0 busId 5f000 - Abort COMPLETE | |
tg10907:995236:995357 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10907:995236:995357 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10907:995236:995357 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10907:995236:995341 [0] NCCL INFO misc/socket.cc:881 -> 3 | |
tg10907:995236:995357 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10907:995236:995357 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10907:995236:995357 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10907:995236:995341 [0] NCCL INFO misc/socket.cc:881 -> 3 | |
tg10907:995236:995357 [0] NCCL INFO comm 0x258ed380 rank 1 nranks 8 cudaDev 0 busId cb000 - Abort COMPLETE | |
tg10908:875805:875925 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10908:875805:875925 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10908:875805:875925 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10908:875805:875908 [0] NCCL INFO misc/socket.cc:881 -> 3 | |
tg10908:875805:875925 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10908:875805:875925 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10908:875805:875925 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10908:875805:875925 [0] NCCL INFO comm 0xa39eac0 rank 4 nranks 8 cudaDev 0 busId 4e000 - Abort COMPLETE | |
tg10908:875804:875929 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10908:875804:875929 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10908:875804:875929 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10908:875804:875929 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10908:875804:875929 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10908:875804:875929 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10908:875804:875911 [0] NCCL INFO misc/socket.cc:881 -> 3 | |
tg10908:875804:875929 [0] NCCL INFO comm 0x1ff86a80 rank 7 nranks 8 cudaDev 0 busId db000 - Abort COMPLETE | |
tg10907:995238:995361 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10907:995238:995361 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10907:995238:995361 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10907:995238:995346 [0] NCCL INFO misc/socket.cc:881 -> 3 | |
tg10907:995238:995361 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10907:995238:995361 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10907:995238:995361 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10907:995238:995361 [0] NCCL INFO comm 0x12a20e80 rank 0 nranks 8 cudaDev 0 busId 4e000 - Abort COMPLETE | |
tg10907:995235:995363 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10907:995235:995363 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10907:995235:995363 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10907:995235:995363 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10907:995235:995363 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10907:995235:995363 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10907:995235:995342 [0] NCCL INFO misc/socket.cc:881 -> 3 | |
tg10907:995235:995363 [0] NCCL INFO comm 0x19679cc0 rank 3 nranks 8 cudaDev 0 busId db000 - Abort COMPLETE | |
======== GPU REPORT ======== | |
======== GPU REPORT ======== | |
==============NVSMI LOG============== | |
Timestamp : Wed Apr 23 13:06:24 2025 | |
Driver Version : 570.124.06 | |
CUDA Version : 12.8 | |
==============NVSMI LOG============== | |
Timestamp : Wed Apr 23 13:06:24 2025 | |
Driver Version : 570.124.06 | |
CUDA Version : 12.8 | |
Attached GPUs : 4 | |
GPU 00000000:4E:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875805 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6051 ms | |
Is Running : 0 | |
GPU 00000000:5F:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875806 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5857 ms | |
Is Running : 0 | |
GPU 00000000:CB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875807 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5761 ms | |
Is Running : 0 | |
GPU 00000000:DB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875804 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 0 ms | |
Is Running : 1 | |
Attached GPUs : 4 | |
GPU 00000000:4E:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995238 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5928 ms | |
Is Running : 0 | |
GPU 00000000:5F:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995237 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6022 ms | |
Is Running : 0 | |
GPU 00000000:CB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995236 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5734 ms | |
Is Running : 0 | |
GPU 00000000:DB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995235 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 0 ms | |
Is Running : 1 | |
Wed Apr 23 13:06:24 2025 | |
+-----------------------------------------------------------------------------------------+ | |
| NVIDIA-SMI 570.124.06 Driver Version: 570.124.06 CUDA Version: 12.8 | | |
|-----------------------------------------+------------------------+----------------------+ | |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | |
| | | MIG M. | | |
|=========================================+========================+======================| | |
| 0 NVIDIA H100 80GB HBM3 On | 00000000:4E:00.0 Off | 0 | | |
| N/A 35C P0 118W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 1 NVIDIA H100 80GB HBM3 On | 00000000:5F:00.0 Off | 0 | | |
| N/A 34C P0 116W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
Wed Apr 23 13:06:25 2025 | |
+-----------------------------------------------------------------------------------------+ | |
| NVIDIA-SMI 570.124.06 Driver Version: 570.124.06 CUDA Version: 12.8 | | |
|-----------------------------------------+------------------------+----------------------+ | |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | |
| | | MIG M. | | |
|=========================================+========================+======================| | |
| 2 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | |
| N/A 35C P0 120W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 0 NVIDIA H100 80GB HBM3 On | 00000000:4E:00.0 Off | 0 | | |
| N/A 35C P0 118W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 3 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | |
| N/A 35C P0 120W / 700W | 1MiB / 81559MiB | 1% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
+-----------------------------------------------------------------------------------------+ | |
| Processes: | | |
| GPU GI CI PID Type Process name GPU Memory | | |
| ID ID Usage | | |
|=========================================================================================| | |
| No running processes found | | |
+-----------------------------------------------------------------------------------------+ | |
| 1 NVIDIA H100 80GB HBM3 On | 00000000:5F:00.0 Off | 0 | | |
| N/A 35C P0 123W / 700W | 1MiB / 81559MiB | 1% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 2 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | |
| N/A 35C P0 122W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 3 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | |
| N/A 35C P0 118W / 700W | 1MiB / 81559MiB | 1% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
+-----------------------------------------------------------------------------------------+ | |
| Processes: | | |
| GPU GI CI PID Type Process name GPU Memory | | |
| ID ID Usage | | |
|=========================================================================================| | |
| No running processes found | | |
+-----------------------------------------------------------------------------------------+ | |
======== GPU REPORT ======== | |
======== GPU REPORT ======== | |
==============NVSMI LOG============== | |
Timestamp : Wed Apr 23 13:06:25 2025 | |
Driver Version : 570.124.06 | |
CUDA Version : 12.8 | |
Attached GPUs : 4 | |
GPU 00000000:4E:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875805 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6051 ms | |
Is Running : 0 | |
GPU 00000000:5F:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875806 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5857 ms | |
Is Running : 0 | |
GPU 00000000:CB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875807 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5761 ms | |
Is Running : 0 | |
GPU 00000000:DB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875804 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6158 ms | |
Is Running : 0 | |
==============NVSMI LOG============== | |
Timestamp : Wed Apr 23 13:06:25 2025 | |
Driver Version : 570.124.06 | |
CUDA Version : 12.8 | |
Attached GPUs : 4 | |
GPU 00000000:4E:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995238 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5928 ms | |
Is Running : 0 | |
GPU 00000000:5F:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995237 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6022 ms | |
Is Running : 0 | |
GPU 00000000:CB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995236 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5734 ms | |
Is Running : 0 | |
GPU 00000000:DB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995235 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6147 ms | |
Is Running : 0 | |
Wed Apr 23 13:06:25 2025 | |
+-----------------------------------------------------------------------------------------+ | |
| NVIDIA-SMI 570.124.06 Driver Version: 570.124.06 CUDA Version: 12.8 | | |
|-----------------------------------------+------------------------+----------------------+ | |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | |
| | | MIG M. | | |
|=========================================+========================+======================| | |
| 0 NVIDIA H100 80GB HBM3 On | 00000000:4E:00.0 Off | 0 | | |
| N/A 35C P0 120W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 1 NVIDIA H100 80GB HBM3 On | 00000000:5F:00.0 Off | 0 | | |
| N/A 34C P0 118W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
Wed Apr 23 13:06:25 2025 | |
+-----------------------------------------------------------------------------------------+ | |
| NVIDIA-SMI 570.124.06 Driver Version: 570.124.06 CUDA Version: 12.8 | | |
|-----------------------------------------+------------------------+----------------------+ | |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | |
| | | MIG M. | | |
|=========================================+========================+======================| | |
| 2 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | |
| N/A 35C P0 121W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 3 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | |
| N/A 35C P0 122W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
+-----------------------------------------------------------------------------------------+ | |
| Processes: | | |
| GPU GI CI PID Type Process name GPU Memory | | |
| ID ID Usage | | |
|=========================================================================================| | |
| No running processes found | | |
+-----------------------------------------------------------------------------------------+ | |
| 0 NVIDIA H100 80GB HBM3 On | 00000000:4E:00.0 Off | 0 | | |
| N/A 35C P0 119W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 1 NVIDIA H100 80GB HBM3 On | 00000000:5F:00.0 Off | 0 | | |
| N/A 35C P0 125W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 2 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | |
| N/A 35C P0 124W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 3 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | |
| N/A 35C P0 119W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
+-----------------------------------------------------------------------------------------+ | |
| Processes: | | |
| GPU GI CI PID Type Process name GPU Memory | | |
| ID ID Usage | | |
|=========================================================================================| | |
| No running processes found | | |
+-----------------------------------------------------------------------------------------+ | |
======== GPU REPORT ======== | |
======== GPU REPORT ======== | |
==============NVSMI LOG============== | |
Timestamp : Wed Apr 23 13:06:26 2025 | |
Driver Version : 570.124.06 | |
CUDA Version : 12.8 | |
Attached GPUs : 4 | |
GPU 00000000:4E:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875805 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6051 ms | |
Is Running : 0 | |
GPU 00000000:5F:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875806 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5857 ms | |
Is Running : 0 | |
GPU 00000000:CB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875807 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5761 ms | |
Is Running : 0 | |
GPU 00000000:DB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875804 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6158 ms | |
Is Running : 0 | |
==============NVSMI LOG============== | |
Timestamp : Wed Apr 23 13:06:26 2025 | |
Driver Version : 570.124.06 | |
CUDA Version : 12.8 | |
Attached GPUs : 4 | |
GPU 00000000:4E:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995238 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5928 ms | |
Is Running : 0 | |
GPU 00000000:5F:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995237 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6022 ms | |
Is Running : 0 | |
GPU 00000000:CB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995236 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5734 ms | |
Is Running : 0 | |
GPU 00000000:DB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995235 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6147 ms | |
Is Running : 0 | |
Wed Apr 23 13:06:26 2025 | |
+-----------------------------------------------------------------------------------------+ | |
| NVIDIA-SMI 570.124.06 Driver Version: 570.124.06 CUDA Version: 12.8 | | |
|-----------------------------------------+------------------------+----------------------+ | |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | |
| | | MIG M. | | |
|=========================================+========================+======================| | |
| 0 NVIDIA H100 80GB HBM3 On | 00000000:4E:00.0 Off | 0 | | |
| N/A 34C P0 97W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 1 NVIDIA H100 80GB HBM3 On | 00000000:5F:00.0 Off | 0 | | |
| N/A 34C P0 118W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 2 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | |
| N/A 34C P0 100W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 3 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | |
| N/A 35C P0 122W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
+-----------------------------------------------------------------------------------------+ | |
| Processes: | | |
| GPU GI CI PID Type Process name GPU Memory | | |
| ID ID Usage | | |
|=========================================================================================| | |
| No running processes found | | |
+-----------------------------------------------------------------------------------------+ | |
Wed Apr 23 13:06:26 2025 | |
+-----------------------------------------------------------------------------------------+ | |
| NVIDIA-SMI 570.124.06 Driver Version: 570.124.06 CUDA Version: 12.8 | | |
|-----------------------------------------+------------------------+----------------------+ | |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | |
| | | MIG M. | | |
|=========================================+========================+======================| | |
| 0 NVIDIA H100 80GB HBM3 On | 00000000:4E:00.0 Off | 0 | | |
| N/A 34C P0 114W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 1 NVIDIA H100 80GB HBM3 On | 00000000:5F:00.0 Off | 0 | | |
| N/A 33C P0 89W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 2 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | |
| N/A 34C P0 121W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 3 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | |
| N/A 34C P0 102W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
+-----------------------------------------------------------------------------------------+ | |
| Processes: | | |
| GPU GI CI PID Type Process name GPU Memory | | |
| ID ID Usage | | |
|=========================================================================================| | |
| No running processes found | | |
+-----------------------------------------------------------------------------------------+ | |
======== GPU REPORT ======== | |
======== GPU REPORT ======== | |
==============NVSMI LOG============== | |
Timestamp : Wed Apr 23 13:06:27 2025 | |
Driver Version : 570.124.06 | |
CUDA Version : 12.8 | |
Attached GPUs : 4 | |
GPU 00000000:4E:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875805 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6051 ms | |
Is Running : 0 | |
GPU 00000000:5F:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875806 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5857 ms | |
Is Running : 0 | |
GPU 00000000:CB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875807 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5761 ms | |
Is Running : 0 | |
GPU 00000000:DB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875804 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6158 ms | |
Is Running : 0 | |
==============NVSMI LOG============== | |
Timestamp : Wed Apr 23 13:06:27 2025 | |
Driver Version : 570.124.06 | |
CUDA Version : 12.8 | |
Attached GPUs : 4 | |
GPU 00000000:4E:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995238 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5928 ms | |
Is Running : 0 | |
GPU 00000000:5F:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995237 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6022 ms | |
Is Running : 0 | |
GPU 00000000:CB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995236 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5734 ms | |
Is Running : 0 | |
GPU 00000000:DB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995235 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6147 ms | |
Is Running : 0 | |
Wed Apr 23 13:06:27 2025 | |
+-----------------------------------------------------------------------------------------+ | |
| NVIDIA-SMI 570.124.06 Driver Version: 570.124.06 CUDA Version: 12.8 | | |
|-----------------------------------------+------------------------+----------------------+ | |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | |
| | | MIG M. | | |
|=========================================+========================+======================| | |
| 0 NVIDIA H100 80GB HBM3 On | 00000000:4E:00.0 Off | 0 | | |
| N/A 34C P0 69W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 1 NVIDIA H100 80GB HBM3 On | 00000000:5F:00.0 Off | 0 | | |
| N/A 33C P0 80W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 2 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | |
| N/A 34C P0 69W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 3 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | |
| N/A 34C P0 99W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
+-----------------------------------------------------------------------------------------+ | |
| Processes: | | |
| GPU GI CI PID Type Process name GPU Memory | | |
| ID ID Usage | | |
|=========================================================================================| | |
| No running processes found | | |
+-----------------------------------------------------------------------------------------+ | |
Wed Apr 23 13:06:27 2025 | |
+-----------------------------------------------------------------------------------------+ | |
| NVIDIA-SMI 570.124.06 Driver Version: 570.124.06 CUDA Version: 12.8 | | |
|-----------------------------------------+------------------------+----------------------+ | |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | |
| | | MIG M. | | |
|=========================================+========================+======================| | |
| 0 NVIDIA H100 80GB HBM3 On | 00000000:4E:00.0 Off | 0 | | |
| N/A 34C P0 69W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 1 NVIDIA H100 80GB HBM3 On | 00000000:5F:00.0 Off | 0 | | |
| N/A 33C P0 71W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 2 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | |
| N/A 34C P0 74W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 3 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | |
| N/A 34C P0 69W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
+-----------------------------------------------------------------------------------------+ | |
| Processes: | | |
| GPU GI CI PID Type Process name GPU Memory | | |
| ID ID Usage | | |
|=========================================================================================| | |
| No running processes found | | |
+-----------------------------------------------------------------------------------------+ | |
srun: error: tg10908: tasks 4-7: Exited with exit code 1 | |
srun: Terminating StepId=7014.1 | |
slurmstepd: error: *** STEP 7014.1 ON tg10907 CANCELLED AT 2025-04-23T17:06:27 *** | |
slurmstepd: error: --task-epilog failed status=15 | |
srun: error: tg10907: tasks 0-3: Exited with exit code 1 | |
======== GPU REPORT ======== | |
==============NVSMI LOG============== | |
Timestamp : Wed Apr 23 13:06:28 2025 | |
Driver Version : 570.124.06 | |
CUDA Version : 12.8 | |
Attached GPUs : 4 | |
GPU 00000000:4E:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995238 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5928 ms | |
Is Running : 0 | |
GPU 00000000:5F:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995237 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6022 ms | |
Is Running : 0 | |
GPU 00000000:CB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995236 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5734 ms | |
Is Running : 0 | |
GPU 00000000:DB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995235 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6147 ms | |
Is Running : 0 | |
Wed Apr 23 13:06:29 2025 | |
+-----------------------------------------------------------------------------------------+ | |
| NVIDIA-SMI 570.124.06 Driver Version: 570.124.06 CUDA Version: 12.8 | | |
|-----------------------------------------+------------------------+----------------------+ | |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | |
| | | MIG M. | | |
|=========================================+========================+======================| | |
| 0 NVIDIA H100 80GB HBM3 On | 00000000:4E:00.0 Off | 0 | | |
| N/A 34C P0 69W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 1 NVIDIA H100 80GB HBM3 On | 00000000:5F:00.0 Off | 0 | | |
| N/A 33C P0 71W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 2 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | |
| N/A 34C P0 72W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 3 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | |
| N/A 34C P0 69W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
+-----------------------------------------------------------------------------------------+ | |
| Processes: | | |
| GPU GI CI PID Type Process name GPU Memory | | |
| ID ID Usage | | |
|=========================================================================================| | |
| No running processes found | | |
+-----------------------------------------------------------------------------------------+ |
version = 1 | |
revision = 1 | |
requires-python = ">=3.12" | |
[[package]] | |
name = "colorama" | |
version = "0.4.6" | |
source = { registry = "https://pypi.org/simple" } | |
sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 }, | |
] | |
[[package]] | |
name = "filelock" | |
version = "3.18.0" | |
source = { registry = "https://pypi.org/simple" } | |
sdist = { url = "https://files.pythonhosted.org/packages/0a/10/c23352565a6544bdc5353e0b15fc1c563352101f30e24bf500207a54df9a/filelock-3.18.0.tar.gz", hash = "sha256:adbc88eabb99d2fec8c9c1b229b171f18afa655400173ddc653d5d01501fb9f2", size = 18075 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/4d/36/2a115987e2d8c300a974597416d9de88f2444426de9571f4b59b2cca3acc/filelock-3.18.0-py3-none-any.whl", hash = "sha256:c401f4f8377c4464e6db25fff06205fd89bdd83b65eb0488ed1b160f780e21de", size = 16215 }, | |
] | |
[[package]] | |
name = "fsspec" | |
version = "2025.3.2" | |
source = { registry = "https://pypi.org/simple" } | |
sdist = { url = "https://files.pythonhosted.org/packages/45/d8/8425e6ba5fcec61a1d16e41b1b71d2bf9344f1fe48012c2b48b9620feae5/fsspec-2025.3.2.tar.gz", hash = "sha256:e52c77ef398680bbd6a98c0e628fbc469491282981209907bbc8aea76a04fdc6", size = 299281 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/44/4b/e0cfc1a6f17e990f3e64b7d941ddc4acdc7b19d6edd51abf495f32b1a9e4/fsspec-2025.3.2-py3-none-any.whl", hash = "sha256:2daf8dc3d1dfa65b6aa37748d112773a7a08416f6c70d96b264c96476ecaf711", size = 194435 }, | |
] | |
[[package]] | |
name = "jinja2" | |
version = "3.1.6" | |
source = { registry = "https://pypi.org/simple" } | |
dependencies = [ | |
{ name = "markupsafe" }, | |
] | |
sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899 }, | |
] | |
[[package]] | |
name = "markdown-it-py" | |
version = "3.0.0" | |
source = { registry = "https://pypi.org/simple" } | |
dependencies = [ | |
{ name = "mdurl" }, | |
] | |
sdist = { url = "https://files.pythonhosted.org/packages/38/71/3b932df36c1a044d397a1f92d1cf91ee0a503d91e470cbd670aa66b07ed0/markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb", size = 74596 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/42/d7/1ec15b46af6af88f19b8e5ffea08fa375d433c998b8a7639e76935c14f1f/markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1", size = 87528 }, | |
] | |
[[package]] | |
name = "markupsafe" | |
version = "3.0.2" | |
source = { registry = "https://pypi.org/simple" } | |
sdist = { url = "https://files.pythonhosted.org/packages/b2/97/5d42485e71dfc078108a86d6de8fa46db44a1a9295e89c5d6d4a06e23a62/markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0", size = 20537 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/22/09/d1f21434c97fc42f09d290cbb6350d44eb12f09cc62c9476effdb33a18aa/MarkupSafe-3.0.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:9778bd8ab0a994ebf6f84c2b949e65736d5575320a17ae8984a77fab08db94cf", size = 14274 }, | |
{ url = "https://files.pythonhosted.org/packages/6b/b0/18f76bba336fa5aecf79d45dcd6c806c280ec44538b3c13671d49099fdd0/MarkupSafe-3.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:846ade7b71e3536c4e56b386c2a47adf5741d2d8b94ec9dc3e92e5e1ee1e2225", size = 12348 }, | |
{ url = "https://files.pythonhosted.org/packages/e0/25/dd5c0f6ac1311e9b40f4af06c78efde0f3b5cbf02502f8ef9501294c425b/MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c99d261bd2d5f6b59325c92c73df481e05e57f19837bdca8413b9eac4bd8028", size = 24149 }, | |
{ url = "https://files.pythonhosted.org/packages/f3/f0/89e7aadfb3749d0f52234a0c8c7867877876e0a20b60e2188e9850794c17/MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e17c96c14e19278594aa4841ec148115f9c7615a47382ecb6b82bd8fea3ab0c8", size = 23118 }, | |
{ url = "https://files.pythonhosted.org/packages/d5/da/f2eeb64c723f5e3777bc081da884b414671982008c47dcc1873d81f625b6/MarkupSafe-3.0.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:88416bd1e65dcea10bc7569faacb2c20ce071dd1f87539ca2ab364bf6231393c", size = 22993 }, | |
{ url = "https://files.pythonhosted.org/packages/da/0e/1f32af846df486dce7c227fe0f2398dc7e2e51d4a370508281f3c1c5cddc/MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2181e67807fc2fa785d0592dc2d6206c019b9502410671cc905d132a92866557", size = 24178 }, | |
{ url = "https://files.pythonhosted.org/packages/c4/f6/bb3ca0532de8086cbff5f06d137064c8410d10779c4c127e0e47d17c0b71/MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:52305740fe773d09cffb16f8ed0427942901f00adedac82ec8b67752f58a1b22", size = 23319 }, | |
{ url = "https://files.pythonhosted.org/packages/a2/82/8be4c96ffee03c5b4a034e60a31294daf481e12c7c43ab8e34a1453ee48b/MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ad10d3ded218f1039f11a75f8091880239651b52e9bb592ca27de44eed242a48", size = 23352 }, | |
{ url = "https://files.pythonhosted.org/packages/51/ae/97827349d3fcffee7e184bdf7f41cd6b88d9919c80f0263ba7acd1bbcb18/MarkupSafe-3.0.2-cp312-cp312-win32.whl", hash = "sha256:0f4ca02bea9a23221c0182836703cbf8930c5e9454bacce27e767509fa286a30", size = 15097 }, | |
{ url = "https://files.pythonhosted.org/packages/c1/80/a61f99dc3a936413c3ee4e1eecac96c0da5ed07ad56fd975f1a9da5bc630/MarkupSafe-3.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:8e06879fc22a25ca47312fbe7c8264eb0b662f6db27cb2d3bbbc74b1df4b9b87", size = 15601 }, | |
{ url = "https://files.pythonhosted.org/packages/83/0e/67eb10a7ecc77a0c2bbe2b0235765b98d164d81600746914bebada795e97/MarkupSafe-3.0.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ba9527cdd4c926ed0760bc301f6728ef34d841f405abf9d4f959c478421e4efd", size = 14274 }, | |
{ url = "https://files.pythonhosted.org/packages/2b/6d/9409f3684d3335375d04e5f05744dfe7e9f120062c9857df4ab490a1031a/MarkupSafe-3.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f8b3d067f2e40fe93e1ccdd6b2e1d16c43140e76f02fb1319a05cf2b79d99430", size = 12352 }, | |
{ url = "https://files.pythonhosted.org/packages/d2/f5/6eadfcd3885ea85fe2a7c128315cc1bb7241e1987443d78c8fe712d03091/MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:569511d3b58c8791ab4c2e1285575265991e6d8f8700c7be0e88f86cb0672094", size = 24122 }, | |
{ url = "https://files.pythonhosted.org/packages/0c/91/96cf928db8236f1bfab6ce15ad070dfdd02ed88261c2afafd4b43575e9e9/MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15ab75ef81add55874e7ab7055e9c397312385bd9ced94920f2802310c930396", size = 23085 }, | |
{ url = "https://files.pythonhosted.org/packages/c2/cf/c9d56af24d56ea04daae7ac0940232d31d5a8354f2b457c6d856b2057d69/MarkupSafe-3.0.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3818cb119498c0678015754eba762e0d61e5b52d34c8b13d770f0719f7b1d79", size = 22978 }, | |
{ url = "https://files.pythonhosted.org/packages/2a/9f/8619835cd6a711d6272d62abb78c033bda638fdc54c4e7f4272cf1c0962b/MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cdb82a876c47801bb54a690c5ae105a46b392ac6099881cdfb9f6e95e4014c6a", size = 24208 }, | |
{ url = "https://files.pythonhosted.org/packages/f9/bf/176950a1792b2cd2102b8ffeb5133e1ed984547b75db47c25a67d3359f77/MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:cabc348d87e913db6ab4aa100f01b08f481097838bdddf7c7a84b7575b7309ca", size = 23357 }, | |
{ url = "https://files.pythonhosted.org/packages/ce/4f/9a02c1d335caabe5c4efb90e1b6e8ee944aa245c1aaaab8e8a618987d816/MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:444dcda765c8a838eaae23112db52f1efaf750daddb2d9ca300bcae1039adc5c", size = 23344 }, | |
{ url = "https://files.pythonhosted.org/packages/ee/55/c271b57db36f748f0e04a759ace9f8f759ccf22b4960c270c78a394f58be/MarkupSafe-3.0.2-cp313-cp313-win32.whl", hash = "sha256:bcf3e58998965654fdaff38e58584d8937aa3096ab5354d493c77d1fdd66d7a1", size = 15101 }, | |
{ url = "https://files.pythonhosted.org/packages/29/88/07df22d2dd4df40aba9f3e402e6dc1b8ee86297dddbad4872bd5e7b0094f/MarkupSafe-3.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:e6a2a455bd412959b57a172ce6328d2dd1f01cb2135efda2e4576e8a23fa3b0f", size = 15603 }, | |
{ url = "https://files.pythonhosted.org/packages/62/6a/8b89d24db2d32d433dffcd6a8779159da109842434f1dd2f6e71f32f738c/MarkupSafe-3.0.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:b5a6b3ada725cea8a5e634536b1b01c30bcdcd7f9c6fff4151548d5bf6b3a36c", size = 14510 }, | |
{ url = "https://files.pythonhosted.org/packages/7a/06/a10f955f70a2e5a9bf78d11a161029d278eeacbd35ef806c3fd17b13060d/MarkupSafe-3.0.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a904af0a6162c73e3edcb969eeeb53a63ceeb5d8cf642fade7d39e7963a22ddb", size = 12486 }, | |
{ url = "https://files.pythonhosted.org/packages/34/cf/65d4a571869a1a9078198ca28f39fba5fbb910f952f9dbc5220afff9f5e6/MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4aa4e5faecf353ed117801a068ebab7b7e09ffb6e1d5e412dc852e0da018126c", size = 25480 }, | |
{ url = "https://files.pythonhosted.org/packages/0c/e3/90e9651924c430b885468b56b3d597cabf6d72be4b24a0acd1fa0e12af67/MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0ef13eaeee5b615fb07c9a7dadb38eac06a0608b41570d8ade51c56539e509d", size = 23914 }, | |
{ url = "https://files.pythonhosted.org/packages/66/8c/6c7cf61f95d63bb866db39085150df1f2a5bd3335298f14a66b48e92659c/MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d16a81a06776313e817c951135cf7340a3e91e8c1ff2fac444cfd75fffa04afe", size = 23796 }, | |
{ url = "https://files.pythonhosted.org/packages/bb/35/cbe9238ec3f47ac9a7c8b3df7a808e7cb50fe149dc7039f5f454b3fba218/MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6381026f158fdb7c72a168278597a5e3a5222e83ea18f543112b2662a9b699c5", size = 25473 }, | |
{ url = "https://files.pythonhosted.org/packages/e6/32/7621a4382488aa283cc05e8984a9c219abad3bca087be9ec77e89939ded9/MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:3d79d162e7be8f996986c064d1c7c817f6df3a77fe3d6859f6f9e7be4b8c213a", size = 24114 }, | |
{ url = "https://files.pythonhosted.org/packages/0d/80/0985960e4b89922cb5a0bac0ed39c5b96cbc1a536a99f30e8c220a996ed9/MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:131a3c7689c85f5ad20f9f6fb1b866f402c445b220c19fe4308c0b147ccd2ad9", size = 24098 }, | |
{ url = "https://files.pythonhosted.org/packages/82/78/fedb03c7d5380df2427038ec8d973587e90561b2d90cd472ce9254cf348b/MarkupSafe-3.0.2-cp313-cp313t-win32.whl", hash = "sha256:ba8062ed2cf21c07a9e295d5b8a2a5ce678b913b45fdf68c32d95d6c1291e0b6", size = 15208 }, | |
{ url = "https://files.pythonhosted.org/packages/4f/65/6079a46068dfceaeabb5dcad6d674f5f5c61a6fa5673746f42a9f4c233b3/MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f", size = 15739 }, | |
] | |
[[package]] | |
name = "mdurl" | |
version = "0.1.2" | |
source = { registry = "https://pypi.org/simple" } | |
sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979 }, | |
] | |
[[package]] | |
name = "mpmath" | |
version = "1.3.0" | |
source = { registry = "https://pypi.org/simple" } | |
sdist = { url = "https://files.pythonhosted.org/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", size = 508106 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198 }, | |
] | |
[[package]] | |
name = "networkx" | |
version = "3.4.2" | |
source = { registry = "https://pypi.org/simple" } | |
sdist = { url = "https://files.pythonhosted.org/packages/fd/1d/06475e1cd5264c0b870ea2cc6fdb3e37177c1e565c43f56ff17a10e3937f/networkx-3.4.2.tar.gz", hash = "sha256:307c3669428c5362aab27c8a1260aa8f47c4e91d3891f48be0141738d8d053e1", size = 2151368 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/b9/54/dd730b32ea14ea797530a4479b2ed46a6fb250f682a9cfb997e968bf0261/networkx-3.4.2-py3-none-any.whl", hash = "sha256:df5d4365b724cf81b8c6a7312509d0c22386097011ad1abe274afd5e9d3bbc5f", size = 1723263 }, | |
] | |
[[package]] | |
name = "numpy" | |
version = "2.2.5" | |
source = { registry = "https://pypi.org/simple" } | |
sdist = { url = "https://files.pythonhosted.org/packages/dc/b2/ce4b867d8cd9c0ee84938ae1e6a6f7926ebf928c9090d036fc3c6a04f946/numpy-2.2.5.tar.gz", hash = "sha256:a9c0d994680cd991b1cb772e8b297340085466a6fe964bc9d4e80f5e2f43c291", size = 20273920 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/e2/f7/1fd4ff108cd9d7ef929b8882692e23665dc9c23feecafbb9c6b80f4ec583/numpy-2.2.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ee461a4eaab4f165b68780a6a1af95fb23a29932be7569b9fab666c407969051", size = 20948633 }, | |
{ url = "https://files.pythonhosted.org/packages/12/03/d443c278348371b20d830af155ff2079acad6a9e60279fac2b41dbbb73d8/numpy-2.2.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ec31367fd6a255dc8de4772bd1658c3e926d8e860a0b6e922b615e532d320ddc", size = 14176123 }, | |
{ url = "https://files.pythonhosted.org/packages/2b/0b/5ca264641d0e7b14393313304da48b225d15d471250376f3fbdb1a2be603/numpy-2.2.5-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:47834cde750d3c9f4e52c6ca28a7361859fcaf52695c7dc3cc1a720b8922683e", size = 5163817 }, | |
{ url = "https://files.pythonhosted.org/packages/04/b3/d522672b9e3d28e26e1613de7675b441bbd1eaca75db95680635dd158c67/numpy-2.2.5-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:2c1a1c6ccce4022383583a6ded7bbcda22fc635eb4eb1e0a053336425ed36dfa", size = 6698066 }, | |
{ url = "https://files.pythonhosted.org/packages/a0/93/0f7a75c1ff02d4b76df35079676b3b2719fcdfb39abdf44c8b33f43ef37d/numpy-2.2.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9d75f338f5f79ee23548b03d801d28a505198297534f62416391857ea0479571", size = 14087277 }, | |
{ url = "https://files.pythonhosted.org/packages/b0/d9/7c338b923c53d431bc837b5b787052fef9ae68a56fe91e325aac0d48226e/numpy-2.2.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a801fef99668f309b88640e28d261991bfad9617c27beda4a3aec4f217ea073", size = 16135742 }, | |
{ url = "https://files.pythonhosted.org/packages/2d/10/4dec9184a5d74ba9867c6f7d1e9f2e0fb5fe96ff2bf50bb6f342d64f2003/numpy-2.2.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:abe38cd8381245a7f49967a6010e77dbf3680bd3627c0fe4362dd693b404c7f8", size = 15581825 }, | |
{ url = "https://files.pythonhosted.org/packages/80/1f/2b6fcd636e848053f5b57712a7d1880b1565eec35a637fdfd0a30d5e738d/numpy-2.2.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5a0ac90e46fdb5649ab6369d1ab6104bfe5854ab19b645bf5cda0127a13034ae", size = 17899600 }, | |
{ url = "https://files.pythonhosted.org/packages/ec/87/36801f4dc2623d76a0a3835975524a84bd2b18fe0f8835d45c8eae2f9ff2/numpy-2.2.5-cp312-cp312-win32.whl", hash = "sha256:0cd48122a6b7eab8f06404805b1bd5856200e3ed6f8a1b9a194f9d9054631beb", size = 6312626 }, | |
{ url = "https://files.pythonhosted.org/packages/8b/09/4ffb4d6cfe7ca6707336187951992bd8a8b9142cf345d87ab858d2d7636a/numpy-2.2.5-cp312-cp312-win_amd64.whl", hash = "sha256:ced69262a8278547e63409b2653b372bf4baff0870c57efa76c5703fd6543282", size = 12645715 }, | |
{ url = "https://files.pythonhosted.org/packages/e2/a0/0aa7f0f4509a2e07bd7a509042967c2fab635690d4f48c6c7b3afd4f448c/numpy-2.2.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:059b51b658f4414fff78c6d7b1b4e18283ab5fa56d270ff212d5ba0c561846f4", size = 20935102 }, | |
{ url = "https://files.pythonhosted.org/packages/7e/e4/a6a9f4537542912ec513185396fce52cdd45bdcf3e9d921ab02a93ca5aa9/numpy-2.2.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:47f9ed103af0bc63182609044b0490747e03bd20a67e391192dde119bf43d52f", size = 14191709 }, | |
{ url = "https://files.pythonhosted.org/packages/be/65/72f3186b6050bbfe9c43cb81f9df59ae63603491d36179cf7a7c8d216758/numpy-2.2.5-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:261a1ef047751bb02f29dfe337230b5882b54521ca121fc7f62668133cb119c9", size = 5149173 }, | |
{ url = "https://files.pythonhosted.org/packages/e5/e9/83e7a9432378dde5802651307ae5e9ea07bb72b416728202218cd4da2801/numpy-2.2.5-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:4520caa3807c1ceb005d125a75e715567806fed67e315cea619d5ec6e75a4191", size = 6684502 }, | |
{ url = "https://files.pythonhosted.org/packages/ea/27/b80da6c762394c8ee516b74c1f686fcd16c8f23b14de57ba0cad7349d1d2/numpy-2.2.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3d14b17b9be5f9c9301f43d2e2a4886a33b53f4e6fdf9ca2f4cc60aeeee76372", size = 14084417 }, | |
{ url = "https://files.pythonhosted.org/packages/aa/fc/ebfd32c3e124e6a1043e19c0ab0769818aa69050ce5589b63d05ff185526/numpy-2.2.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ba321813a00e508d5421104464510cc962a6f791aa2fca1c97b1e65027da80d", size = 16133807 }, | |
{ url = "https://files.pythonhosted.org/packages/bf/9b/4cc171a0acbe4666f7775cfd21d4eb6bb1d36d3a0431f48a73e9212d2278/numpy-2.2.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4cbdef3ddf777423060c6f81b5694bad2dc9675f110c4b2a60dc0181543fac7", size = 15575611 }, | |
{ url = "https://files.pythonhosted.org/packages/a3/45/40f4135341850df48f8edcf949cf47b523c404b712774f8855a64c96ef29/numpy-2.2.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:54088a5a147ab71a8e7fdfd8c3601972751ded0739c6b696ad9cb0343e21ab73", size = 17895747 }, | |
{ url = "https://files.pythonhosted.org/packages/f8/4c/b32a17a46f0ffbde8cc82df6d3daeaf4f552e346df143e1b188a701a8f09/numpy-2.2.5-cp313-cp313-win32.whl", hash = "sha256:c8b82a55ef86a2d8e81b63da85e55f5537d2157165be1cb2ce7cfa57b6aef38b", size = 6309594 }, | |
{ url = "https://files.pythonhosted.org/packages/13/ae/72e6276feb9ef06787365b05915bfdb057d01fceb4a43cb80978e518d79b/numpy-2.2.5-cp313-cp313-win_amd64.whl", hash = "sha256:d8882a829fd779f0f43998e931c466802a77ca1ee0fe25a3abe50278616b1471", size = 12638356 }, | |
{ url = "https://files.pythonhosted.org/packages/79/56/be8b85a9f2adb688e7ded6324e20149a03541d2b3297c3ffc1a73f46dedb/numpy-2.2.5-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:e8b025c351b9f0e8b5436cf28a07fa4ac0204d67b38f01433ac7f9b870fa38c6", size = 20963778 }, | |
{ url = "https://files.pythonhosted.org/packages/ff/77/19c5e62d55bff507a18c3cdff82e94fe174957bad25860a991cac719d3ab/numpy-2.2.5-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:8dfa94b6a4374e7851bbb6f35e6ded2120b752b063e6acdd3157e4d2bb922eba", size = 14207279 }, | |
{ url = "https://files.pythonhosted.org/packages/75/22/aa11f22dc11ff4ffe4e849d9b63bbe8d4ac6d5fae85ddaa67dfe43be3e76/numpy-2.2.5-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:97c8425d4e26437e65e1d189d22dff4a079b747ff9c2788057bfb8114ce1e133", size = 5199247 }, | |
{ url = "https://files.pythonhosted.org/packages/4f/6c/12d5e760fc62c08eded0394f62039f5a9857f758312bf01632a81d841459/numpy-2.2.5-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:352d330048c055ea6db701130abc48a21bec690a8d38f8284e00fab256dc1376", size = 6711087 }, | |
{ url = "https://files.pythonhosted.org/packages/ef/94/ece8280cf4218b2bee5cec9567629e61e51b4be501e5c6840ceb593db945/numpy-2.2.5-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b4c0773b6ada798f51f0f8e30c054d32304ccc6e9c5d93d46cb26f3d385ab19", size = 14059964 }, | |
{ url = "https://files.pythonhosted.org/packages/39/41/c5377dac0514aaeec69115830a39d905b1882819c8e65d97fc60e177e19e/numpy-2.2.5-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:55f09e00d4dccd76b179c0f18a44f041e5332fd0e022886ba1c0bbf3ea4a18d0", size = 16121214 }, | |
{ url = "https://files.pythonhosted.org/packages/db/54/3b9f89a943257bc8e187145c6bc0eb8e3d615655f7b14e9b490b053e8149/numpy-2.2.5-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:02f226baeefa68f7d579e213d0f3493496397d8f1cff5e2b222af274c86a552a", size = 15575788 }, | |
{ url = "https://files.pythonhosted.org/packages/b1/c4/2e407e85df35b29f79945751b8f8e671057a13a376497d7fb2151ba0d290/numpy-2.2.5-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c26843fd58f65da9491165072da2cccc372530681de481ef670dcc8e27cfb066", size = 17893672 }, | |
{ url = "https://files.pythonhosted.org/packages/29/7e/d0b44e129d038dba453f00d0e29ebd6eaf2f06055d72b95b9947998aca14/numpy-2.2.5-cp313-cp313t-win32.whl", hash = "sha256:1a161c2c79ab30fe4501d5a2bbfe8b162490757cf90b7f05be8b80bc02f7bb8e", size = 6377102 }, | |
{ url = "https://files.pythonhosted.org/packages/63/be/b85e4aa4bf42c6502851b971f1c326d583fcc68227385f92089cf50a7b45/numpy-2.2.5-cp313-cp313t-win_amd64.whl", hash = "sha256:d403c84991b5ad291d3809bace5e85f4bbf44a04bdc9a88ed2bb1807b3360bb8", size = 12750096 }, | |
] | |
[[package]] | |
name = "nvidia-cublas-cu12" | |
version = "12.6.4.1" | |
source = { registry = "https://pypi.org/simple" } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/af/eb/ff4b8c503fa1f1796679dce648854d58751982426e4e4b37d6fce49d259c/nvidia_cublas_cu12-12.6.4.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:08ed2686e9875d01b58e3cb379c6896df8e76c75e0d4a7f7dace3d7b6d9ef8eb", size = 393138322 }, | |
] | |
[[package]] | |
name = "nvidia-cuda-cupti-cu12" | |
version = "12.6.80" | |
source = { registry = "https://pypi.org/simple" } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/49/60/7b6497946d74bcf1de852a21824d63baad12cd417db4195fc1bfe59db953/nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6768bad6cab4f19e8292125e5f1ac8aa7d1718704012a0e3272a6f61c4bce132", size = 8917980 }, | |
{ url = "https://files.pythonhosted.org/packages/a5/24/120ee57b218d9952c379d1e026c4479c9ece9997a4fb46303611ee48f038/nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a3eff6cdfcc6a4c35db968a06fcadb061cbc7d6dde548609a941ff8701b98b73", size = 8917972 }, | |
] | |
[[package]] | |
name = "nvidia-cuda-nvrtc-cu12" | |
version = "12.6.77" | |
source = { registry = "https://pypi.org/simple" } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/75/2e/46030320b5a80661e88039f59060d1790298b4718944a65a7f2aeda3d9e9/nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:35b0cc6ee3a9636d5409133e79273ce1f3fd087abb0532d2d2e8fff1fe9efc53", size = 23650380 }, | |
] | |
[[package]] | |
name = "nvidia-cuda-runtime-cu12" | |
version = "12.6.77" | |
source = { registry = "https://pypi.org/simple" } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/e1/23/e717c5ac26d26cf39a27fbc076240fad2e3b817e5889d671b67f4f9f49c5/nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ba3b56a4f896141e25e19ab287cd71e52a6a0f4b29d0d31609f60e3b4d5219b7", size = 897690 }, | |
{ url = "https://files.pythonhosted.org/packages/f0/62/65c05e161eeddbafeca24dc461f47de550d9fa8a7e04eb213e32b55cfd99/nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a84d15d5e1da416dd4774cb42edf5e954a3e60cc945698dc1d5be02321c44dc8", size = 897678 }, | |
] | |
[[package]] | |
name = "nvidia-cudnn-cu12" | |
version = "9.5.1.17" | |
source = { registry = "https://pypi.org/simple" } | |
dependencies = [ | |
{ name = "nvidia-cublas-cu12" }, | |
] | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/2a/78/4535c9c7f859a64781e43c969a3a7e84c54634e319a996d43ef32ce46f83/nvidia_cudnn_cu12-9.5.1.17-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:30ac3869f6db17d170e0e556dd6cc5eee02647abc31ca856634d5a40f82c15b2", size = 570988386 }, | |
] | |
[[package]] | |
name = "nvidia-cufft-cu12" | |
version = "11.3.0.4" | |
source = { registry = "https://pypi.org/simple" } | |
dependencies = [ | |
{ name = "nvidia-nvjitlink-cu12" }, | |
] | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/8f/16/73727675941ab8e6ffd86ca3a4b7b47065edcca7a997920b831f8147c99d/nvidia_cufft_cu12-11.3.0.4-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ccba62eb9cef5559abd5e0d54ceed2d9934030f51163df018532142a8ec533e5", size = 200221632 }, | |
{ url = "https://files.pythonhosted.org/packages/60/de/99ec247a07ea40c969d904fc14f3a356b3e2a704121675b75c366b694ee1/nvidia_cufft_cu12-11.3.0.4-py3-none-manylinux2014_x86_64.whl", hash = "sha256:768160ac89f6f7b459bee747e8d175dbf53619cfe74b2a5636264163138013ca", size = 200221622 }, | |
] | |
[[package]] | |
name = "nvidia-cufile-cu12" | |
version = "1.11.1.6" | |
source = { registry = "https://pypi.org/simple" } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/b2/66/cc9876340ac68ae71b15c743ddb13f8b30d5244af344ec8322b449e35426/nvidia_cufile_cu12-1.11.1.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cc23469d1c7e52ce6c1d55253273d32c565dd22068647f3aa59b3c6b005bf159", size = 1142103 }, | |
] | |
[[package]] | |
name = "nvidia-curand-cu12" | |
version = "10.3.7.77" | |
source = { registry = "https://pypi.org/simple" } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/73/1b/44a01c4e70933637c93e6e1a8063d1e998b50213a6b65ac5a9169c47e98e/nvidia_curand_cu12-10.3.7.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a42cd1344297f70b9e39a1e4f467a4e1c10f1da54ff7a85c12197f6c652c8bdf", size = 56279010 }, | |
{ url = "https://files.pythonhosted.org/packages/4a/aa/2c7ff0b5ee02eaef890c0ce7d4f74bc30901871c5e45dee1ae6d0083cd80/nvidia_curand_cu12-10.3.7.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:99f1a32f1ac2bd134897fc7a203f779303261268a65762a623bf30cc9fe79117", size = 56279000 }, | |
] | |
[[package]] | |
name = "nvidia-cusolver-cu12" | |
version = "11.7.1.2" | |
source = { registry = "https://pypi.org/simple" } | |
dependencies = [ | |
{ name = "nvidia-cublas-cu12" }, | |
{ name = "nvidia-cusparse-cu12" }, | |
{ name = "nvidia-nvjitlink-cu12" }, | |
] | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/f0/6e/c2cf12c9ff8b872e92b4a5740701e51ff17689c4d726fca91875b07f655d/nvidia_cusolver_cu12-11.7.1.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e9e49843a7707e42022babb9bcfa33c29857a93b88020c4e4434656a655b698c", size = 158229790 }, | |
{ url = "https://files.pythonhosted.org/packages/9f/81/baba53585da791d043c10084cf9553e074548408e04ae884cfe9193bd484/nvidia_cusolver_cu12-11.7.1.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:6cf28f17f64107a0c4d7802be5ff5537b2130bfc112f25d5a30df227058ca0e6", size = 158229780 }, | |
] | |
[[package]] | |
name = "nvidia-cusparse-cu12" | |
version = "12.5.4.2" | |
source = { registry = "https://pypi.org/simple" } | |
dependencies = [ | |
{ name = "nvidia-nvjitlink-cu12" }, | |
] | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/06/1e/b8b7c2f4099a37b96af5c9bb158632ea9e5d9d27d7391d7eb8fc45236674/nvidia_cusparse_cu12-12.5.4.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7556d9eca156e18184b94947ade0fba5bb47d69cec46bf8660fd2c71a4b48b73", size = 216561367 }, | |
{ url = "https://files.pythonhosted.org/packages/43/ac/64c4316ba163e8217a99680c7605f779accffc6a4bcd0c778c12948d3707/nvidia_cusparse_cu12-12.5.4.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:23749a6571191a215cb74d1cdbff4a86e7b19f1200c071b3fcf844a5bea23a2f", size = 216561357 }, | |
] | |
[[package]] | |
name = "nvidia-cusparselt-cu12" | |
version = "0.6.3" | |
source = { registry = "https://pypi.org/simple" } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/3b/9a/72ef35b399b0e183bc2e8f6f558036922d453c4d8237dab26c666a04244b/nvidia_cusparselt_cu12-0.6.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:e5c8a26c36445dd2e6812f1177978a24e2d37cacce7e090f297a688d1ec44f46", size = 156785796 }, | |
] | |
[[package]] | |
name = "nvidia-nccl-cu12" | |
version = "2.26.2" | |
source = { registry = "https://pypi.org/simple" } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/67/ca/f42388aed0fddd64ade7493dbba36e1f534d4e6fdbdd355c6a90030ae028/nvidia_nccl_cu12-2.26.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:694cf3879a206553cc9d7dbda76b13efaf610fdb70a50cba303de1b0d1530ac6", size = 201319755 }, | |
] | |
[[package]] | |
name = "nvidia-nvjitlink-cu12" | |
version = "12.6.85" | |
source = { registry = "https://pypi.org/simple" } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/9d/d7/c5383e47c7e9bf1c99d5bd2a8c935af2b6d705ad831a7ec5c97db4d82f4f/nvidia_nvjitlink_cu12-12.6.85-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:eedc36df9e88b682efe4309aa16b5b4e78c2407eac59e8c10a6a47535164369a", size = 19744971 }, | |
] | |
[[package]] | |
name = "nvidia-nvtx-cu12" | |
version = "12.6.77" | |
source = { registry = "https://pypi.org/simple" } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/56/9a/fff8376f8e3d084cd1530e1ef7b879bb7d6d265620c95c1b322725c694f4/nvidia_nvtx_cu12-12.6.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b90bed3df379fa79afbd21be8e04a0314336b8ae16768b58f2d34cb1d04cd7d2", size = 89276 }, | |
{ url = "https://files.pythonhosted.org/packages/9e/4e/0d0c945463719429b7bd21dece907ad0bde437a2ff12b9b12fee94722ab0/nvidia_nvtx_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:6574241a3ec5fdc9334353ab8c479fe75841dbe8f4532a8fc97ce63503330ba1", size = 89265 }, | |
] | |
[[package]] | |
name = "pillow" | |
version = "11.2.1" | |
source = { registry = "https://pypi.org/simple" } | |
sdist = { url = "https://files.pythonhosted.org/packages/af/cb/bb5c01fcd2a69335b86c22142b2bccfc3464087efb7fd382eee5ffc7fdf7/pillow-11.2.1.tar.gz", hash = "sha256:a64dd61998416367b7ef979b73d3a85853ba9bec4c2925f74e588879a58716b6", size = 47026707 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/c7/40/052610b15a1b8961f52537cc8326ca6a881408bc2bdad0d852edeb6ed33b/pillow-11.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:78afba22027b4accef10dbd5eed84425930ba41b3ea0a86fa8d20baaf19d807f", size = 3190185 }, | |
{ url = "https://files.pythonhosted.org/packages/e5/7e/b86dbd35a5f938632093dc40d1682874c33dcfe832558fc80ca56bfcb774/pillow-11.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:78092232a4ab376a35d68c4e6d5e00dfd73454bd12b230420025fbe178ee3b0b", size = 3030306 }, | |
{ url = "https://files.pythonhosted.org/packages/a4/5c/467a161f9ed53e5eab51a42923c33051bf8d1a2af4626ac04f5166e58e0c/pillow-11.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25a5f306095c6780c52e6bbb6109624b95c5b18e40aab1c3041da3e9e0cd3e2d", size = 4416121 }, | |
{ url = "https://files.pythonhosted.org/packages/62/73/972b7742e38ae0e2ac76ab137ca6005dcf877480da0d9d61d93b613065b4/pillow-11.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c7b29dbd4281923a2bfe562acb734cee96bbb129e96e6972d315ed9f232bef4", size = 4501707 }, | |
{ url = "https://files.pythonhosted.org/packages/e4/3a/427e4cb0b9e177efbc1a84798ed20498c4f233abde003c06d2650a6d60cb/pillow-11.2.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:3e645b020f3209a0181a418bffe7b4a93171eef6c4ef6cc20980b30bebf17b7d", size = 4522921 }, | |
{ url = "https://files.pythonhosted.org/packages/fe/7c/d8b1330458e4d2f3f45d9508796d7caf0c0d3764c00c823d10f6f1a3b76d/pillow-11.2.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b2dbea1012ccb784a65349f57bbc93730b96e85b42e9bf7b01ef40443db720b4", size = 4612523 }, | |
{ url = "https://files.pythonhosted.org/packages/b3/2f/65738384e0b1acf451de5a573d8153fe84103772d139e1e0bdf1596be2ea/pillow-11.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:da3104c57bbd72948d75f6a9389e6727d2ab6333c3617f0a89d72d4940aa0443", size = 4587836 }, | |
{ url = "https://files.pythonhosted.org/packages/6a/c5/e795c9f2ddf3debb2dedd0df889f2fe4b053308bb59a3cc02a0cd144d641/pillow-11.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:598174aef4589af795f66f9caab87ba4ff860ce08cd5bb447c6fc553ffee603c", size = 4669390 }, | |
{ url = "https://files.pythonhosted.org/packages/96/ae/ca0099a3995976a9fce2f423166f7bff9b12244afdc7520f6ed38911539a/pillow-11.2.1-cp312-cp312-win32.whl", hash = "sha256:1d535df14716e7f8776b9e7fee118576d65572b4aad3ed639be9e4fa88a1cad3", size = 2332309 }, | |
{ url = "https://files.pythonhosted.org/packages/7c/18/24bff2ad716257fc03da964c5e8f05d9790a779a8895d6566e493ccf0189/pillow-11.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:14e33b28bf17c7a38eede290f77db7c664e4eb01f7869e37fa98a5aa95978941", size = 2676768 }, | |
{ url = "https://files.pythonhosted.org/packages/da/bb/e8d656c9543276517ee40184aaa39dcb41e683bca121022f9323ae11b39d/pillow-11.2.1-cp312-cp312-win_arm64.whl", hash = "sha256:21e1470ac9e5739ff880c211fc3af01e3ae505859392bf65458c224d0bf283eb", size = 2415087 }, | |
{ url = "https://files.pythonhosted.org/packages/36/9c/447528ee3776e7ab8897fe33697a7ff3f0475bb490c5ac1456a03dc57956/pillow-11.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:fdec757fea0b793056419bca3e9932eb2b0ceec90ef4813ea4c1e072c389eb28", size = 3190098 }, | |
{ url = "https://files.pythonhosted.org/packages/b5/09/29d5cd052f7566a63e5b506fac9c60526e9ecc553825551333e1e18a4858/pillow-11.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:b0e130705d568e2f43a17bcbe74d90958e8a16263868a12c3e0d9c8162690830", size = 3030166 }, | |
{ url = "https://files.pythonhosted.org/packages/71/5d/446ee132ad35e7600652133f9c2840b4799bbd8e4adba881284860da0a36/pillow-11.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7bdb5e09068332578214cadd9c05e3d64d99e0e87591be22a324bdbc18925be0", size = 4408674 }, | |
{ url = "https://files.pythonhosted.org/packages/69/5f/cbe509c0ddf91cc3a03bbacf40e5c2339c4912d16458fcb797bb47bcb269/pillow-11.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d189ba1bebfbc0c0e529159631ec72bb9e9bc041f01ec6d3233d6d82eb823bc1", size = 4496005 }, | |
{ url = "https://files.pythonhosted.org/packages/f9/b3/dd4338d8fb8a5f312021f2977fb8198a1184893f9b00b02b75d565c33b51/pillow-11.2.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:191955c55d8a712fab8934a42bfefbf99dd0b5875078240943f913bb66d46d9f", size = 4518707 }, | |
{ url = "https://files.pythonhosted.org/packages/13/eb/2552ecebc0b887f539111c2cd241f538b8ff5891b8903dfe672e997529be/pillow-11.2.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:ad275964d52e2243430472fc5d2c2334b4fc3ff9c16cb0a19254e25efa03a155", size = 4610008 }, | |
{ url = "https://files.pythonhosted.org/packages/72/d1/924ce51bea494cb6e7959522d69d7b1c7e74f6821d84c63c3dc430cbbf3b/pillow-11.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:750f96efe0597382660d8b53e90dd1dd44568a8edb51cb7f9d5d918b80d4de14", size = 4585420 }, | |
{ url = "https://files.pythonhosted.org/packages/43/ab/8f81312d255d713b99ca37479a4cb4b0f48195e530cdc1611990eb8fd04b/pillow-11.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fe15238d3798788d00716637b3d4e7bb6bde18b26e5d08335a96e88564a36b6b", size = 4667655 }, | |
{ url = "https://files.pythonhosted.org/packages/94/86/8f2e9d2dc3d308dfd137a07fe1cc478df0a23d42a6c4093b087e738e4827/pillow-11.2.1-cp313-cp313-win32.whl", hash = "sha256:3fe735ced9a607fee4f481423a9c36701a39719252a9bb251679635f99d0f7d2", size = 2332329 }, | |
{ url = "https://files.pythonhosted.org/packages/6d/ec/1179083b8d6067a613e4d595359b5fdea65d0a3b7ad623fee906e1b3c4d2/pillow-11.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:74ee3d7ecb3f3c05459ba95eed5efa28d6092d751ce9bf20e3e253a4e497e691", size = 2676388 }, | |
{ url = "https://files.pythonhosted.org/packages/23/f1/2fc1e1e294de897df39fa8622d829b8828ddad938b0eaea256d65b84dd72/pillow-11.2.1-cp313-cp313-win_arm64.whl", hash = "sha256:5119225c622403afb4b44bad4c1ca6c1f98eed79db8d3bc6e4e160fc6339d66c", size = 2414950 }, | |
{ url = "https://files.pythonhosted.org/packages/c4/3e/c328c48b3f0ead7bab765a84b4977acb29f101d10e4ef57a5e3400447c03/pillow-11.2.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:8ce2e8411c7aaef53e6bb29fe98f28cd4fbd9a1d9be2eeea434331aac0536b22", size = 3192759 }, | |
{ url = "https://files.pythonhosted.org/packages/18/0e/1c68532d833fc8b9f404d3a642991441d9058eccd5606eab31617f29b6d4/pillow-11.2.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:9ee66787e095127116d91dea2143db65c7bb1e232f617aa5957c0d9d2a3f23a7", size = 3033284 }, | |
{ url = "https://files.pythonhosted.org/packages/b7/cb/6faf3fb1e7705fd2db74e070f3bf6f88693601b0ed8e81049a8266de4754/pillow-11.2.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9622e3b6c1d8b551b6e6f21873bdcc55762b4b2126633014cea1803368a9aa16", size = 4445826 }, | |
{ url = "https://files.pythonhosted.org/packages/07/94/8be03d50b70ca47fb434a358919d6a8d6580f282bbb7af7e4aa40103461d/pillow-11.2.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:63b5dff3a68f371ea06025a1a6966c9a1e1ee452fc8020c2cd0ea41b83e9037b", size = 4527329 }, | |
{ url = "https://files.pythonhosted.org/packages/fd/a4/bfe78777076dc405e3bd2080bc32da5ab3945b5a25dc5d8acaa9de64a162/pillow-11.2.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:31df6e2d3d8fc99f993fd253e97fae451a8db2e7207acf97859732273e108406", size = 4549049 }, | |
{ url = "https://files.pythonhosted.org/packages/65/4d/eaf9068dc687c24979e977ce5677e253624bd8b616b286f543f0c1b91662/pillow-11.2.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:062b7a42d672c45a70fa1f8b43d1d38ff76b63421cbbe7f88146b39e8a558d91", size = 4635408 }, | |
{ url = "https://files.pythonhosted.org/packages/1d/26/0fd443365d9c63bc79feb219f97d935cd4b93af28353cba78d8e77b61719/pillow-11.2.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4eb92eca2711ef8be42fd3f67533765d9fd043b8c80db204f16c8ea62ee1a751", size = 4614863 }, | |
{ url = "https://files.pythonhosted.org/packages/49/65/dca4d2506be482c2c6641cacdba5c602bc76d8ceb618fd37de855653a419/pillow-11.2.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f91ebf30830a48c825590aede79376cb40f110b387c17ee9bd59932c961044f9", size = 4692938 }, | |
{ url = "https://files.pythonhosted.org/packages/b3/92/1ca0c3f09233bd7decf8f7105a1c4e3162fb9142128c74adad0fb361b7eb/pillow-11.2.1-cp313-cp313t-win32.whl", hash = "sha256:e0b55f27f584ed623221cfe995c912c61606be8513bfa0e07d2c674b4516d9dd", size = 2335774 }, | |
{ url = "https://files.pythonhosted.org/packages/a5/ac/77525347cb43b83ae905ffe257bbe2cc6fd23acb9796639a1f56aa59d191/pillow-11.2.1-cp313-cp313t-win_amd64.whl", hash = "sha256:36d6b82164c39ce5482f649b437382c0fb2395eabc1e2b1702a6deb8ad647d6e", size = 2681895 }, | |
{ url = "https://files.pythonhosted.org/packages/67/32/32dc030cfa91ca0fc52baebbba2e009bb001122a1daa8b6a79ad830b38d3/pillow-11.2.1-cp313-cp313t-win_arm64.whl", hash = "sha256:225c832a13326e34f212d2072982bb1adb210e0cc0b153e688743018c94a2681", size = 2417234 }, | |
] | |
[[package]] | |
name = "pygments" | |
version = "2.19.1" | |
source = { registry = "https://pypi.org/simple" } | |
sdist = { url = "https://files.pythonhosted.org/packages/7c/2d/c3338d48ea6cc0feb8446d8e6937e1408088a72a39937982cc6111d17f84/pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f", size = 4968581 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c", size = 1225293 }, | |
] | |
[[package]] | |
name = "rich" | |
version = "14.0.0" | |
source = { registry = "https://pypi.org/simple" } | |
dependencies = [ | |
{ name = "markdown-it-py" }, | |
{ name = "pygments" }, | |
] | |
sdist = { url = "https://files.pythonhosted.org/packages/a1/53/830aa4c3066a8ab0ae9a9955976fb770fe9c6102117c8ec4ab3ea62d89e8/rich-14.0.0.tar.gz", hash = "sha256:82f1bc23a6a21ebca4ae0c45af9bdbc492ed20231dcb63f297d6d1021a9d5725", size = 224078 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/0d/9b/63f4c7ebc259242c89b3acafdb37b41d1185c07ff0011164674e9076b491/rich-14.0.0-py3-none-any.whl", hash = "sha256:1c9491e1951aac09caffd42f448ee3d04e58923ffe14993f6e83068dc395d7e0", size = 243229 }, | |
] | |
[[package]] | |
name = "setuptools" | |
version = "79.0.0" | |
source = { registry = "https://pypi.org/simple" } | |
sdist = { url = "https://files.pythonhosted.org/packages/7d/19/fecb7e2825616270f34512b3394cdcf6f45a79b5b6d94fdbd86a509e67b5/setuptools-79.0.0.tar.gz", hash = "sha256:9828422e7541213b0aacb6e10bbf9dd8febeaa45a48570e09b6d100e063fc9f9", size = 1367685 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/cc/ea/d53f2f8897c46a36df085964d07761ea4c2d1f2cf92019693b6742b7aabb/setuptools-79.0.0-py3-none-any.whl", hash = "sha256:b9ab3a104bedb292323f53797b00864e10e434a3ab3906813a7169e4745b912a", size = 1256065 }, | |
] | |
[[package]] | |
name = "sympy" | |
version = "1.13.3" | |
source = { registry = "https://pypi.org/simple" } | |
dependencies = [ | |
{ name = "mpmath" }, | |
] | |
sdist = { url = "https://files.pythonhosted.org/packages/11/8a/5a7fd6284fa8caac23a26c9ddf9c30485a48169344b4bd3b0f02fef1890f/sympy-1.13.3.tar.gz", hash = "sha256:b27fd2c6530e0ab39e275fc9b683895367e51d5da91baa8d3d64db2565fec4d9", size = 7533196 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/99/ff/c87e0622b1dadea79d2fb0b25ade9ed98954c9033722eb707053d310d4f3/sympy-1.13.3-py3-none-any.whl", hash = "sha256:54612cf55a62755ee71824ce692986f23c88ffa77207b30c1368eda4a7060f73", size = 6189483 }, | |
] | |
[[package]] | |
name = "torch" | |
version = "2.7.0" | |
source = { registry = "https://pypi.org/simple" } | |
dependencies = [ | |
{ name = "filelock" }, | |
{ name = "fsspec" }, | |
{ name = "jinja2" }, | |
{ name = "networkx" }, | |
{ name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, | |
{ name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, | |
{ name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, | |
{ name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, | |
{ name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, | |
{ name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, | |
{ name = "nvidia-cufile-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, | |
{ name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, | |
{ name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, | |
{ name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, | |
{ name = "nvidia-cusparselt-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, | |
{ name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, | |
{ name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, | |
{ name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, | |
{ name = "setuptools" }, | |
{ name = "sympy" }, | |
{ name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, | |
{ name = "typing-extensions" }, | |
] | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/aa/5e/ac759f4c0ab7c01feffa777bd68b43d2ac61560a9770eeac074b450f81d4/torch-2.7.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:36a6368c7ace41ad1c0f69f18056020b6a5ca47bedaca9a2f3b578f5a104c26c", size = 99013250 }, | |
{ url = "https://files.pythonhosted.org/packages/9c/58/2d245b6f1ef61cf11dfc4aceeaacbb40fea706ccebac3f863890c720ab73/torch-2.7.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:15aab3e31c16feb12ae0a88dba3434a458874636f360c567caa6a91f6bfba481", size = 865042157 }, | |
{ url = "https://files.pythonhosted.org/packages/44/80/b353c024e6b624cd9ce1d66dcb9d24e0294680f95b369f19280e241a0159/torch-2.7.0-cp312-cp312-win_amd64.whl", hash = "sha256:f56d4b2510934e072bab3ab8987e00e60e1262fb238176168f5e0c43a1320c6d", size = 212482262 }, | |
{ url = "https://files.pythonhosted.org/packages/ee/8d/b2939e5254be932db1a34b2bd099070c509e8887e0c5a90c498a917e4032/torch-2.7.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:30b7688a87239a7de83f269333651d8e582afffce6f591fff08c046f7787296e", size = 68574294 }, | |
{ url = "https://files.pythonhosted.org/packages/14/24/720ea9a66c29151b315ea6ba6f404650834af57a26b2a04af23ec246b2d5/torch-2.7.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:868ccdc11798535b5727509480cd1d86d74220cfdc42842c4617338c1109a205", size = 99015553 }, | |
{ url = "https://files.pythonhosted.org/packages/4b/27/285a8cf12bd7cd71f9f211a968516b07dcffed3ef0be585c6e823675ab91/torch-2.7.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:9b52347118116cf3dff2ab5a3c3dd97c719eb924ac658ca2a7335652076df708", size = 865046389 }, | |
{ url = "https://files.pythonhosted.org/packages/74/c8/2ab2b6eadc45554af8768ae99668c5a8a8552e2012c7238ded7e9e4395e1/torch-2.7.0-cp313-cp313-win_amd64.whl", hash = "sha256:434cf3b378340efc87c758f250e884f34460624c0523fe5c9b518d205c91dd1b", size = 212490304 }, | |
{ url = "https://files.pythonhosted.org/packages/28/fd/74ba6fde80e2b9eef4237fe668ffae302c76f0e4221759949a632ca13afa/torch-2.7.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:edad98dddd82220465b106506bb91ee5ce32bd075cddbcf2b443dfaa2cbd83bf", size = 68856166 }, | |
{ url = "https://files.pythonhosted.org/packages/cb/b4/8df3f9fe6bdf59e56a0e538592c308d18638eb5f5dc4b08d02abb173c9f0/torch-2.7.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:2a885fc25afefb6e6eb18a7d1e8bfa01cc153e92271d980a49243b250d5ab6d9", size = 99091348 }, | |
{ url = "https://files.pythonhosted.org/packages/9d/f5/0bd30e9da04c3036614aa1b935a9f7e505a9e4f1f731b15e165faf8a4c74/torch-2.7.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:176300ff5bc11a5f5b0784e40bde9e10a35c4ae9609beed96b4aeb46a27f5fae", size = 865104023 }, | |
{ url = "https://files.pythonhosted.org/packages/d1/b7/2235d0c3012c596df1c8d39a3f4afc1ee1b6e318d469eda4c8bb68566448/torch-2.7.0-cp313-cp313t-win_amd64.whl", hash = "sha256:d0ca446a93f474985d81dc866fcc8dccefb9460a29a456f79d99c29a78a66993", size = 212750916 }, | |
{ url = "https://files.pythonhosted.org/packages/90/48/7e6477cf40d48cc0a61fa0d41ee9582b9a316b12772fcac17bc1a40178e7/torch-2.7.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:27f5007bdf45f7bb7af7f11d1828d5c2487e030690afb3d89a651fd7036a390e", size = 68575074 }, | |
] | |
[[package]] | |
name = "torch-distributed-debug" | |
version = "0.1.0" | |
source = { virtual = "." } | |
dependencies = [ | |
{ name = "rich" }, | |
{ name = "torch" }, | |
{ name = "torchvision" }, | |
{ name = "tqdm" }, | |
] | |
[package.metadata] | |
requires-dist = [ | |
{ name = "rich", specifier = ">=14.0.0" }, | |
{ name = "torch", specifier = ">=2.7.0" }, | |
{ name = "torchvision", specifier = ">=0.22.0" }, | |
{ name = "tqdm", specifier = ">=4.67.1" }, | |
] | |
[[package]] | |
name = "torchvision" | |
version = "0.22.0" | |
source = { registry = "https://pypi.org/simple" } | |
dependencies = [ | |
{ name = "numpy" }, | |
{ name = "pillow" }, | |
{ name = "torch" }, | |
] | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/cb/ea/887d1d61cf4431a46280972de665f350af1898ce5006cd046326e5d0a2f2/torchvision-0.22.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:31c3165418fe21c3d81fe3459e51077c2f948801b8933ed18169f54652796a0f", size = 1947826 }, | |
{ url = "https://files.pythonhosted.org/packages/72/ef/21f8b6122e13ae045b8e49658029c695fd774cd21083b3fa5c3f9c5d3e35/torchvision-0.22.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:8f116bc82e0c076e70ba7776e611ed392b9666aa443662e687808b08993d26af", size = 2514571 }, | |
{ url = "https://files.pythonhosted.org/packages/7c/48/5f7617f6c60d135f86277c53f9d5682dfa4e66f4697f505f1530e8b69fb1/torchvision-0.22.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:ce4dc334ebd508de2c534817c9388e928bc2500cf981906ae8d6e2ca3bf4727a", size = 7446522 }, | |
{ url = "https://files.pythonhosted.org/packages/99/94/a015e93955f5d3a68689cc7c385a3cfcd2d62b84655d18b61f32fb04eb67/torchvision-0.22.0-cp312-cp312-win_amd64.whl", hash = "sha256:24b8c9255c209ca419cc7174906da2791c8b557b75c23496663ec7d73b55bebf", size = 1716664 }, | |
{ url = "https://files.pythonhosted.org/packages/e1/2a/9b34685599dcb341d12fc2730055155623db7a619d2415a8d31f17050952/torchvision-0.22.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ece17995857dd328485c9c027c0b20ffc52db232e30c84ff6c95ab77201112c5", size = 1947823 }, | |
{ url = "https://files.pythonhosted.org/packages/77/77/88f64879483d66daf84f1d1c4d5c31ebb08e640411139042a258d5f7dbfe/torchvision-0.22.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:471c6dd75bb984c6ebe4f60322894a290bf3d4b195e769d80754f3689cd7f238", size = 2471592 }, | |
{ url = "https://files.pythonhosted.org/packages/f7/82/2f813eaae7c1fae1f9d9e7829578f5a91f39ef48d6c1c588a8900533dd3d/torchvision-0.22.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:2b839ac0610a38f56bef115ee5b9eaca5f9c2da3c3569a68cc62dbcc179c157f", size = 7446333 }, | |
{ url = "https://files.pythonhosted.org/packages/58/19/ca7a4f8907a56351dfe6ae0a708f4e6b3569b5c61d282e3e7f61cf42a4ce/torchvision-0.22.0-cp313-cp313-win_amd64.whl", hash = "sha256:4ada1c08b2f761443cd65b7c7b4aec9e2fc28f75b0d4e1b1ebc9d3953ebccc4d", size = 1716693 }, | |
{ url = "https://files.pythonhosted.org/packages/6f/a7/f43e9c8d13118b4ffbaebea664c9338ab20fa115a908125afd2238ff16e7/torchvision-0.22.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:cdc96daa4658b47ce9384154c86ed1e70cba9d972a19f5de6e33f8f94a626790", size = 2137621 }, | |
{ url = "https://files.pythonhosted.org/packages/6a/9a/2b59f5758ba7e3f23bc84e16947493bbce97392ec6d18efba7bdf0a3b10e/torchvision-0.22.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:753d3c84eeadd5979a33b3b73a25ecd0aa4af44d6b45ed2c70d44f5e0ac68312", size = 2476555 }, | |
{ url = "https://files.pythonhosted.org/packages/7d/40/a7bc2ab9b1e56d10a7fd9ae83191bb425fa308caa23d148f1c568006e02c/torchvision-0.22.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:b30e3ed29e4a61f7499bca50f57d8ebd23dfc52b14608efa17a534a55ee59a03", size = 7617924 }, | |
{ url = "https://files.pythonhosted.org/packages/c1/7b/30d423bdb2546250d719d7821aaf9058cc093d165565b245b159c788a9dd/torchvision-0.22.0-cp313-cp313t-win_amd64.whl", hash = "sha256:e5d680162694fac4c8a374954e261ddfb4eb0ce103287b0f693e4e9c579ef957", size = 1638621 }, | |
] | |
[[package]] | |
name = "tqdm" | |
version = "4.67.1" | |
source = { registry = "https://pypi.org/simple" } | |
dependencies = [ | |
{ name = "colorama", marker = "sys_platform == 'win32'" }, | |
] | |
sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540 }, | |
] | |
[[package]] | |
name = "triton" | |
version = "3.3.0" | |
source = { registry = "https://pypi.org/simple" } | |
dependencies = [ | |
{ name = "setuptools" }, | |
] | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/11/53/ce18470914ab6cfbec9384ee565d23c4d1c55f0548160b1c7b33000b11fd/triton-3.3.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b68c778f6c4218403a6bd01be7484f6dc9e20fe2083d22dd8aef33e3b87a10a3", size = 156504509 }, | |
{ url = "https://files.pythonhosted.org/packages/7d/74/4bf2702b65e93accaa20397b74da46fb7a0356452c1bb94dbabaf0582930/triton-3.3.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:47bc87ad66fa4ef17968299acacecaab71ce40a238890acc6ad197c3abe2b8f1", size = 156516468 }, | |
{ url = "https://files.pythonhosted.org/packages/0a/93/f28a696fa750b9b608baa236f8225dd3290e5aff27433b06143adc025961/triton-3.3.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ce4700fc14032af1e049005ae94ba908e71cd6c2df682239aed08e49bc71b742", size = 156580729 }, | |
] | |
[[package]] | |
name = "typing-extensions" | |
version = "4.13.2" | |
source = { registry = "https://pypi.org/simple" } | |
sdist = { url = "https://files.pythonhosted.org/packages/f6/37/23083fcd6e35492953e8d2aaaa68b860eb422b34627b13f2ce3eb6106061/typing_extensions-4.13.2.tar.gz", hash = "sha256:e6c81219bd689f51865d9e372991c540bda33a0379d5573cddb9a3a23f7caaef", size = 106967 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/8b/54/b1ae86c0973cc6f0210b53d508ca3641fb6d0c56823f288d108bc7ab3cc8/typing_extensions-4.13.2-py3-none-any.whl", hash = "sha256:a439e7c04b49fec3e5d3e2beaa21755cadbbdc391694e28ccdd36ca4a1408f8c", size = 45806 }, | |
] |