- Clone this gist somewhere using its URL, for example `git clone (the_url_of_this_gist) some_folder`, then move into that folder.
- Install [uv](https://docs.astral.sh/uv) following their docs.
- Run `uv sync` to recreate the same virtual environment.
- Run the following on a login node to download CIFAR10 into `$SCRATCH/data/cifar10`:

  ```bash
  mkdir -p $SCRATCH/data/cifar10
  uv run python -c 'import pathlib, os, torchvision.datasets; torchvision.datasets.CIFAR10(pathlib.Path(os.environ["SCRATCH"]) / "data/cifar10", download=True)'
  ```

- Launch the job with `sbatch job.sh`.
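After submitting, you can check on the job like this (assuming Slurm's default output file name, since `job.sh` doesn't set `#SBATCH --output`):

```bash
# List your queued / running jobs:
squeue -u $USER

# Follow the job's output once it starts. Without an #SBATCH --output line,
# Slurm writes to slurm-<jobid>.out in the submission directory.
tail -f slurm-<jobid>.out
```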
`job.sh`:

```bash
#!/bin/bash
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=4
#SBATCH --gpus-per-task=1
#SBATCH --cpus-per-task=12
#SBATCH --mem=0
#SBATCH --time=00:05:00

# Echo time and hostname into log
echo "Date: $(date)"
echo "Hostname: $(hostname)"

module --quiet purge
# This example uses [UV](https://docs.astral.sh/uv) to manage package dependencies.
# Link the dataset archive into $SLURM_TMPDIR (only on the first worker of each node)
srun --ntasks=$SLURM_JOB_NUM_NODES --ntasks-per-node=1 bash -c \
    'mkdir -p $SLURM_TMPDIR/data && ln -s $SCRATCH/data/cifar10/cifar-10-python.tar.gz $SLURM_TMPDIR/data/cifar-10-python.tar.gz'

# Get a unique port for this job based on the job ID
export MASTER_PORT=$(expr 10000 + $(echo -n $SLURM_JOBID | tail -c 4))
export MASTER_ADDR=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export NCCL_DEBUG=INFO

# Execute Python script in each task (one per GPU)
srun uv run --offline --frozen python main.py "$@"
```
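For debugging, it can be quicker to run the same script from an interactive allocation instead of `sbatch`; a rough sketch, assuming the cluster allows interactive multi-node allocations:

```bash
# Request the same resources interactively (adjust the time limit as needed):
salloc --nodes=2 --ntasks-per-node=4 --gpus-per-task=1 --cpus-per-task=12 --mem=0 --time=01:00:00

# Inside the allocation the #SBATCH lines are ignored, and the srun calls in
# job.sh run against the interactive allocation:
bash job.sh
```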
"""Multi-GPU Training example.""" | |
import argparse | |
import logging | |
import os | |
from datetime import timedelta | |
from pathlib import Path | |
import sys | |
import rich.logging | |
import torch | |
import torch.distributed | |
from torch import Tensor, nn | |
from torch.distributed import ReduceOp | |
from torch.nn import functional as F | |
from torch.utils.data import DataLoader, random_split | |
from torch.utils.data.distributed import DistributedSampler | |
from torchvision import transforms | |
from torchvision.datasets import CIFAR10 | |
from torchvision.models import resnet18 | |
from tqdm import tqdm | |
def main():
    # Use an argument parser so we can pass hyperparameters from the command line.
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--learning-rate", type=float, default=5e-4)
    parser.add_argument("--weight-decay", type=float, default=1e-4)
    parser.add_argument("--batch-size", type=int, default=128)
    args = parser.parse_args()

    epochs: int = args.epochs
    learning_rate: float = args.learning_rate
    weight_decay: float = args.weight_decay
    # NOTE: This is the "local" batch size, per-GPU.
    batch_size: int = args.batch_size

    # Check that the GPU is available
    assert torch.cuda.is_available() and torch.cuda.device_count() > 0

    rank, world_size, local_rank = setup()
    is_master = rank == 0
    is_local_master = local_rank == 0

    # Each task should see exactly one GPU, since we always use 1 gpu per task with `srun`.
    n_gpus_per_task = torch.cuda.device_count()
    assert n_gpus_per_task == 1, (
        "DDP isn't meant to be used with multiple GPUs per task."
    )

    # Setup logging (optional, but much better than using print statements)
    logging.basicConfig(
        level=logging.INFO,
        format=f"[{rank}/{world_size}] %(name)s - %(message)s ",
        handlers=[
            rich.logging.RichHandler(markup=True)
        ],  # Very pretty, uses the `rich` package.
    )
    logger = logging.getLogger(__name__)
    logger.info(
        f"World size: {world_size}, global rank: {rank}, local rank: {local_rank}, local device count: {n_gpus_per_task}"
    )

    # NOTE: We always see the GPU as `cuda:0` in each task because of `srun`, which sets the
    # CUDA_VISIBLE_DEVICES env var.
    # Otherwise, if you use `torch.distributed.launch`, accelerate, or similar, use `cuda:{local_rank}`.
    device = torch.device("cuda", 0)

    # Create a model and move it to the GPU.
    model = resnet18(num_classes=10)
    model = model.to(device)

    # Wrap the model with DistributedDataParallel
    # (See https://pytorch.org/docs/stable/nn.html#torch.nn.parallel.DistributedDataParallel)
    model = nn.parallel.DistributedDataParallel(
        model, device_ids=[local_rank], output_device=local_rank
    )

    optimizer = torch.optim.AdamW(
        model.parameters(), lr=learning_rate, weight_decay=weight_decay
    )
    # Setup CIFAR10
    num_workers = get_num_workers()
    dataset_path = Path(os.environ.get("SLURM_TMPDIR", ".")) / "data"
    train_dataset, valid_dataset, test_dataset = make_datasets(
        str(dataset_path), is_master=is_local_master
    )

    # Restricts data loading to a subset of the dataset exclusive to the current process
    train_sampler = DistributedSampler(dataset=train_dataset, shuffle=True)
    valid_sampler = DistributedSampler(dataset=valid_dataset, shuffle=False)
    test_sampler = DistributedSampler(dataset=test_dataset, shuffle=False)

    # NOTE: Here `batch_size` is still the "local" (per-gpu) batch size.
    # This way, the effective batch size scales directly with number of GPUs, no need to specify it
    # in advance. You might want to adjust the learning rate and other hyper-parameters though.
    if is_master:
        logger.info(f"Effective batch size: {batch_size * world_size}")
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,  # shuffling is now done in the sampler, not the dataloader.
        sampler=train_sampler,
    )
    valid_dataloader = DataLoader(
        valid_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
        sampler=valid_sampler,
    )
    test_dataloader = DataLoader(  # NOTE: Not used in this example.
        test_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
        sampler=test_sampler,
    )
    # Checkout the "checkpointing and preemption" example for more info!
    logger.debug("Starting training from scratch.")

    for epoch in range(epochs):
        logger.debug(f"Starting epoch {epoch}/{epochs}")

        # NOTE: Here we need to call `set_epoch` so the ordering is able to change at each epoch.
        train_sampler.set_epoch(epoch)

        # Set the model in training mode (important for e.g. BatchNorm and Dropout layers)
        model.train()

        # NOTE: using a progress bar from tqdm because it's nicer than using `print`.
        progress_bar = tqdm(
            total=len(train_dataloader),
            desc=f"Train epoch {epoch}",
            disable=not (is_master and sys.stdout.isatty()),
        )

        # Training loop
        for batch in train_dataloader:
            # Move the batch to the GPU before we pass it to the model
            batch = tuple(item.to(device) for item in batch)
            x, y = batch

            # Forward pass
            logits: Tensor = model(x)
            local_loss = F.cross_entropy(logits, y)

            optimizer.zero_grad()
            local_loss.backward()
            # NOTE: nn.DistributedDataParallel automatically averages the gradients across devices.
            optimizer.step()

            # Calculate some metrics:
            # local metrics
            local_n_correct_predictions = logits.detach().argmax(-1).eq(y).sum()
            local_n_samples = logits.shape[0]
            local_accuracy = local_n_correct_predictions / local_n_samples

            # "global" metrics: calculated with the results from all workers
            # NOTE: Creating new tensors to hold the "global" values, but this isn't required.
            n_correct_predictions = local_n_correct_predictions.clone()
            # Reduce the local metrics across all workers, sending the result to rank 0.
            torch.distributed.reduce(n_correct_predictions, dst=0, op=ReduceOp.SUM)
            # Actual (global) batch size for this step.
            n_samples = torch.as_tensor(local_n_samples, device=device)
            torch.distributed.reduce(n_samples, dst=0, op=ReduceOp.SUM)
            # Will store the average loss across all workers.
            loss = local_loss.clone()
            torch.distributed.reduce(loss, dst=0, op=ReduceOp.SUM)
            loss.div_(world_size)  # Report the average loss across all workers.

            accuracy = n_correct_predictions / n_samples

            logger.debug(f"(local) Accuracy: {local_accuracy:.2%}")
            logger.debug(f"(local) Loss: {local_loss.item()}")
            # NOTE: This would log the same values in all workers. Only logging on master:
            if is_master:
                logger.debug(f"Accuracy: {accuracy.item():.2%}")
                logger.debug(f"Average Loss: {loss.item()}")

            # Advance the progress bar one step and update the progress bar text.
            progress_bar.update(1)
            progress_bar.set_postfix(loss=loss.item(), accuracy=accuracy.item())
        progress_bar.close()

        val_loss, val_accuracy = validation_loop(model, valid_dataloader, device)
        # NOTE: This would log the same values in all workers. Only logging on master:
        if is_master:
            logger.info(
                f"Epoch {epoch}: Val loss: {val_loss:.3f} accuracy: {val_accuracy:.2%}"
            )

    print("Done!")
    torch.distributed.destroy_process_group()
@torch.no_grad()
def validation_loop(model: nn.Module, dataloader: DataLoader, device: torch.device):
    model.eval()

    total_loss = torch.as_tensor(0.0, device=device)
    n_samples = torch.as_tensor(0, device=device)
    correct_predictions = torch.as_tensor(0, device=device)

    for batch in dataloader:
        batch = tuple(item.to(device) for item in batch)
        x, y = batch

        logits: Tensor = model(x)
        loss = F.cross_entropy(logits, y)

        batch_n_samples = x.shape[0]
        batch_correct_predictions = logits.argmax(-1).eq(y).sum()

        total_loss += loss
        n_samples += batch_n_samples
        correct_predictions += batch_correct_predictions

    # Sum up the metrics we gathered on each worker before returning the overall val metrics.
    torch.distributed.all_reduce(total_loss, op=torch.distributed.ReduceOp.SUM)
    torch.distributed.all_reduce(correct_predictions, op=torch.distributed.ReduceOp.SUM)
    torch.distributed.all_reduce(n_samples, op=torch.distributed.ReduceOp.SUM)

    accuracy = correct_predictions / n_samples
    return total_loss, accuracy
def setup():
    assert torch.distributed.is_available()
    print("PyTorch Distributed available.")
    print(" Backends:")
    print(f" Gloo: {torch.distributed.is_gloo_available()}")
    print(f" NCCL: {torch.distributed.is_nccl_available()}")
    print(f" MPI: {torch.distributed.is_mpi_available()}")

    # NOTE: the env:// init method uses FileLocks, which sometimes causes deadlocks due to the
    # distributed filesystem configuration on the Mila cluster.
    # For multi-node jobs, use the TCP init method instead.
    master_addr = os.environ["MASTER_ADDR"]
    master_port = os.environ["MASTER_PORT"]

    # Default timeout is 30 minutes. Reducing the timeout here, so the job fails quicker if there's
    # a communication problem between nodes.
    timeout = timedelta(seconds=60)

    # DDP Job is being run via `srun` on a slurm cluster.
    rank = int(os.environ["SLURM_PROCID"])
    local_rank = int(os.environ["SLURM_LOCALID"])
    world_size = int(os.environ["SLURM_NTASKS"])

    # SLURM vars -> torch.distributed vars, in case they are needed.
    # NOTE: Setting these values isn't exactly necessary, but some code might assume it's
    # being run via torchrun or torch.distributed.launch, so setting these can be a good idea.
    os.environ["RANK"] = str(rank)
    os.environ["LOCAL_RANK"] = str(local_rank)
    os.environ["WORLD_SIZE"] = str(world_size)

    torch.distributed.init_process_group(
        backend="nccl",
        init_method=f"tcp://{master_addr}:{master_port}",
        timeout=timeout,
        world_size=world_size,
        rank=rank,
    )
    return rank, world_size, local_rank
def make_datasets(
    dataset_path: str,
    is_master: bool,
    val_split: float = 0.1,
    val_split_seed: int = 42,
):
    """Returns the training, validation, and test splits for CIFAR10.

    NOTE: We don't use image transforms here for simplicity.
    Having different transformations for train and validation would complicate things a bit.
    Later examples will show how to do the train/val/test split properly when using transforms.

    NOTE: Only the master process (rank-0) downloads the dataset if necessary.
    """
    # - Master: Download (if necessary) THEN Barrier
    # - others: Barrier THEN *NO* Download
    if not is_master:
        # Wait for the master process to finish downloading (reach the barrier below)
        torch.distributed.barrier()

    train_dataset = CIFAR10(
        root=dataset_path,
        transform=transforms.ToTensor(),
        download=is_master,
        train=True,
    )
    test_dataset = CIFAR10(
        root=dataset_path,
        transform=transforms.ToTensor(),
        download=is_master,
        train=False,
    )

    if is_master:
        # Join the workers waiting in the barrier above. They can now load the datasets from disk.
        torch.distributed.barrier()

    # Split the training dataset into a training and validation set.
    n_samples = len(train_dataset)
    n_valid = int(val_split * n_samples)
    n_train = n_samples - n_valid

    train_dataset, valid_dataset = random_split(
        train_dataset, (n_train, n_valid), torch.Generator().manual_seed(val_split_seed)
    )
    return train_dataset, valid_dataset, test_dataset
def get_num_workers() -> int:
    """Gets the optimal number of DataLoader workers to use in the current job."""
    if "SLURM_CPUS_PER_TASK" in os.environ:
        return int(os.environ["SLURM_CPUS_PER_TASK"])
    if hasattr(os, "sched_getaffinity"):
        return len(os.sched_getaffinity(0))
    return torch.multiprocessing.cpu_count()
if __name__ == "__main__":
    main()
```
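Since `job.sh` forwards its extra command-line arguments to `main.py` via `"$@"`, the hyperparameters defined in the argument parser above can be overridden at submission time, for example:

```bash
# Extra arguments to sbatch's job script are passed through to main.py by job.sh.
sbatch job.sh --epochs 3 --batch-size 256 --learning-rate 1e-3
```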
`pyproject.toml`:

```toml
[project]
name = "torch-distributed-debug"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
    "rich>=14.0.0",
    "torch>=2.7.0",
    "torchvision>=0.22.0",
    "tqdm>=4.67.1",
]
```
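Note that `job.sh` uses `uv run --offline --frozen`, so the environment must already be resolved and installed on a node with network access (that's what the `uv sync` step above does). If you change the dependencies, a sketch of the related uv commands:

```bash
# Re-resolve the lockfile after editing pyproject.toml (needs network access):
uv lock

# Install exactly what the lockfile specifies, without updating it:
uv sync --frozen
```

The raw output of one run of this job follows.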
Date: Wed Apr 23 01:06:08 PM EDT 2025 | |
Hostname: tg10907.tamia.ecpia.ca | |
======== GPU REPORT ======== | |
======== GPU REPORT ======== | |
==============NVSMI LOG============== | |
Timestamp : Wed Apr 23 13:06:09 2025 | |
Driver Version : 570.124.06 | |
CUDA Version : 12.8 | |
Attached GPUs : 4 | |
GPU 00000000:4E:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes : None | |
GPU 00000000:5F:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes : None | |
GPU 00000000:CB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes : None | |
GPU 00000000:DB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes : None | |
==============NVSMI LOG============== | |
Timestamp : Wed Apr 23 13:06:09 2025 | |
Driver Version : 570.124.06 | |
CUDA Version : 12.8 | |
Attached GPUs : 4 | |
GPU 00000000:4E:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes : None | |
GPU 00000000:5F:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes : None | |
GPU 00000000:CB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes : None | |
GPU 00000000:DB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes : None | |
Wed Apr 23 13:06:09 2025 | |
+-----------------------------------------------------------------------------------------+ | |
| NVIDIA-SMI 570.124.06 Driver Version: 570.124.06 CUDA Version: 12.8 | | |
|-----------------------------------------+------------------------+----------------------+ | |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | |
| | | MIG M. | | |
|=========================================+========================+======================| | |
| 0 NVIDIA H100 80GB HBM3 On | 00000000:4E:00.0 Off | 0 | | |
| N/A 34C P0 69W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
Wed Apr 23 13:06:09 2025 | |
+-----------------------------------------------------------------------------------------+ | |
| NVIDIA-SMI 570.124.06 Driver Version: 570.124.06 CUDA Version: 12.8 | | |
|-----------------------------------------+------------------------+----------------------+ | |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | |
| | | MIG M. | | |
|=========================================+========================+======================| | |
| 1 NVIDIA H100 80GB HBM3 On | 00000000:5F:00.0 Off | 0 | | |
| N/A 33C P0 68W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 0 NVIDIA H100 80GB HBM3 On | 00000000:4E:00.0 Off | 0 | | |
| N/A 34C P0 69W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 2 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | |
| N/A 34C P0 68W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 1 NVIDIA H100 80GB HBM3 On | 00000000:5F:00.0 Off | 0 | | |
| N/A 33C P0 71W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 3 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | |
| N/A 34C P0 70W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
+-----------------------------------------------------------------------------------------+ | |
| Processes: | | |
| GPU GI CI PID Type Process name GPU Memory | | |
| ID ID Usage | | |
|=========================================================================================| | |
| No running processes found | | |
+-----------------------------------------------------------------------------------------+ | |
| 2 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | |
| N/A 34C P0 72W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 3 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | |
| N/A 34C P0 69W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
+-----------------------------------------------------------------------------------------+ | |
| Processes: | | |
| GPU GI CI PID Type Process name GPU Memory | | |
| ID ID Usage | | |
|=========================================================================================| | |
| No running processes found | | |
+-----------------------------------------------------------------------------------------+ | |
PyTorch Distributed available. | |
Backends: | |
Gloo: True | |
NCCL: True | |
MPI: False | |
[04/23/25 13:06:20] INFO [5/8] __main__ - World size: 8, global main.py:60 | |
rank: 5, local rank: 1, local device | |
count: 1 | |
PyTorch Distributed available. | |
Backends: | |
Gloo: True | |
NCCL: True | |
MPI: False | |
[04/23/25 13:06:20] INFO [7/8] __main__ - World size: 8, global main.py:60 | |
rank: 7, local rank: 3, local device | |
count: 1 | |
PyTorch Distributed available. | |
Backends: | |
Gloo: True | |
NCCL: True | |
MPI: False | |
[04/23/25 13:06:20] INFO [3/8] __main__ - World size: 8, global main.py:60 | |
rank: 3, local rank: 3, local device | |
count: 1 | |
PyTorch Distributed available. | |
Backends: | |
Gloo: True | |
NCCL: True | |
MPI: False | |
[04/23/25 13:06:20] INFO [1/8] __main__ - World size: 8, global main.py:60 | |
rank: 1, local rank: 1, local device | |
count: 1 | |
PyTorch Distributed available. | |
Backends: | |
Gloo: True | |
NCCL: True | |
MPI: False | |
[04/23/25 13:06:21] INFO [6/8] __main__ - World size: 8, global main.py:60 | |
rank: 6, local rank: 2, local device | |
count: 1 | |
PyTorch Distributed available. | |
Backends: | |
Gloo: True | |
NCCL: True | |
MPI: False | |
[04/23/25 13:06:21] INFO [4/8] __main__ - World size: 8, global main.py:60 | |
rank: 4, local rank: 0, local device | |
count: 1 | |
PyTorch Distributed available. | |
Backends: | |
Gloo: True | |
NCCL: True | |
MPI: False | |
[04/23/25 13:06:21] INFO [2/8] __main__ - World size: 8, global main.py:60 | |
rank: 2, local rank: 2, local device | |
count: 1 | |
PyTorch Distributed available. | |
Backends: | |
Gloo: True | |
NCCL: True | |
MPI: False | |
[04/23/25 13:06:21] INFO [0/8] __main__ - World size: 8, global main.py:60 | |
rank: 0, local rank: 0, local device | |
count: 1 | |
libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav34.so': libvmw_pvrdma-rdmav34.so: cannot open shared object file: No such file or directory | |
libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav34.so': libvmw_pvrdma-rdmav34.so: cannot open shared object file: No such file or directory | |
libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav34.so': libvmw_pvrdma-rdmav34.so: cannot open shared object file: No such file or directory | |
libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav34.so': libvmw_pvrdma-rdmav34.so: cannot open shared object file: No such file or directory | |
libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav34.so': libvmw_pvrdma-rdmav34.so: cannot open shared object file: No such file or directory | |
libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav34.so': libvmw_pvrdma-rdmav34.so: cannot open shared object file: No such file or directory | |
libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav34.so': libvmw_pvrdma-rdmav34.so: cannot open shared object file: No such file or directory | |
libibverbs: Warning: couldn't load driver 'libvmw_pvrdma-rdmav34.so': libvmw_pvrdma-rdmav34.so: cannot open shared object file: No such file or directory | |
tg10907:995238:995238 [0] NCCL INFO Bootstrap: Using ibp77s0:10.225.0.26<0> | |
tg10907:995238:995238 [0] NCCL INFO cudaDriverVersion 12080 | |
tg10907:995238:995238 [0] NCCL INFO NCCL version 2.26.2+cuda12.2 | |
tg10907:995238:995238 [0] NCCL INFO Comm config Blocking set to 1 | |
tg10907:995238:995314 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. Using internal net plugin. | |
tg10907:995238:995314 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp77s0:10.225.0.26<0> | |
tg10907:995238:995314 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. | |
tg10907:995238:995314 [0] NCCL INFO Using network IB | |
tg10907:995238:995314 [0] NCCL INFO ncclCommInitRankConfig comm 0x12a20e80 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 4e000 commId 0x2ce70ba701de3f9c - Init START | |
tg10907:995238:995314 [0] NCCL INFO RAS client listening socket at 127.0.0.1<28028> | |
tg10907:995238:995314 [0] NCCL INFO Bootstrap timings total 1.074140 (create 0.000026, send 0.000093, recv 0.000564, ring 1.032268, delay 0.000001) | |
tg10907:995238:995314 [0] NCCL INFO Setting affinity for GPU 0 to 555555 | |
tg10907:995238:995314 [0] NCCL INFO NVLS multicast support is not available on dev 0 | |
tg10907:995238:995314 [0] NCCL INFO comm 0x12a20e80 rank 0 nRanks 8 nNodes 2 localRanks 4 localRank 0 MNNVL 0 | |
tg10907:995238:995314 [0] NCCL INFO Channel 00/08 : 0 3 1 2 4 7 5 6 | |
tg10907:995238:995314 [0] NCCL INFO Channel 01/08 : 0 3 5 6 4 7 1 2 | |
tg10907:995238:995314 [0] NCCL INFO Channel 02/08 : 0 6 7 5 4 2 3 1 | |
tg10907:995238:995314 [0] NCCL INFO Channel 03/08 : 0 1 7 6 4 5 3 2 | |
tg10907:995238:995314 [0] NCCL INFO Channel 04/08 : 0 3 1 2 4 7 5 6 | |
tg10907:995238:995314 [0] NCCL INFO Channel 05/08 : 0 3 5 6 4 7 1 2 | |
tg10907:995238:995314 [0] NCCL INFO Channel 06/08 : 0 6 7 5 4 2 3 1 | |
tg10907:995238:995314 [0] NCCL INFO Channel 07/08 : 0 1 7 6 4 5 3 2 | |
tg10907:995238:995314 [0] NCCL INFO Trees [0] 1/4/-1->0->-1 [1] -1/-1/-1->0->3 [2] -1/-1/-1->0->1 [3] 2/-1/-1->0->3 [4] 1/-1/-1->0->4 [5] -1/-1/-1->0->3 [6] -1/-1/-1->0->1 [7] 2/-1/-1->0->3 | |
tg10907:995238:995314 [0] NCCL INFO P2P Chunksize set to 131072 | |
tg10907:995238:995314 [0] NCCL INFO Check P2P Type intraNodeP2pSupport 1 directMode 0 | |
tg10907:995238:995346 [0] NCCL INFO [Proxy Service] Device 0 CPU core 6 | |
tg10907:995238:995347 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 8 | |
tg10907:995238:995314 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
tg10907:995238:995314 [0] NCCL INFO 8 coll channels, 8 collnet channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer | |
tg10907:995238:995314 [0] NCCL INFO CC Off, workFifoBytes 1048576 | |
tg10907:995238:995314 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. | |
tg10907:995238:995314 [0] NCCL INFO ncclCommInitRankConfig comm 0x12a20e80 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 4e000 commId 0x2ce70ba701de3f9c - Init COMPLETE | |
tg10907:995238:995314 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 0 nranks 8 total 1.58 (kernels 0.28, alloc 0.08, bootstrap 1.07, allgathers 0.06, topo 0.02, graphs 0.06, connections 0.01, rest 0.00) | |
tg10907:995238:995350 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[0] via P2P/CUMEM | |
tg10907:995238:995350 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[0] via P2P/CUMEM | |
tg10907:995238:995350 [0] NCCL INFO Channel 00/0 : 6[0] -> 0[0] [receive] via NET/IB/4 | |
tg10907:995238:995354 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 10 | |
tg10907:995238:995350 [0] NCCL INFO Channel 04/0 : 6[0] -> 0[0] [receive] via NET/IB/4 | |
tg10907:995238:995350 [0] NCCL INFO Channel 00/0 : 0[0] -> 3[0] via P2P/CUMEM | |
tg10907:995238:995350 [0] NCCL INFO Channel 01/0 : 0[0] -> 3[0] via P2P/CUMEM | |
tg10907:995238:995350 [0] NCCL INFO Channel 04/0 : 0[0] -> 3[0] via P2P/CUMEM | |
tg10907:995238:995350 [0] NCCL INFO Channel 05/0 : 0[0] -> 3[0] via P2P/CUMEM | |
tg10907:995238:995350 [0] NCCL INFO Channel 02/0 : 0[0] -> 6[0] [send] via NET/IB/5 | |
tg10907:995238:995350 [0] NCCL INFO Channel 06/0 : 0[0] -> 6[0] [send] via NET/IB/5 | |
tg10908:875805:875805 [0] NCCL INFO cudaDriverVersion 12080 | |
tg10908:875805:875805 [0] NCCL INFO Bootstrap: Using ibp77s0:10.225.0.27<0> | |
tg10908:875805:875805 [0] NCCL INFO NCCL version 2.26.2+cuda12.2 | |
tg10908:875805:875805 [0] NCCL INFO Comm config Blocking set to 1 | |
tg10908:875805:875884 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. Using internal net plugin. | |
tg10908:875805:875884 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp77s0:10.225.0.27<0> | |
tg10908:875805:875884 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. | |
tg10908:875805:875884 [0] NCCL INFO Using network IB | |
tg10908:875805:875884 [0] NCCL INFO ncclCommInitRankConfig comm 0xa39eac0 rank 4 nranks 8 cudaDev 0 nvmlDev 0 busId 4e000 commId 0x2ce70ba701de3f9c - Init START | |
tg10908:875805:875884 [0] NCCL INFO RAS client listening socket at 127.0.0.1<28028> | |
tg10908:875805:875884 [0] NCCL INFO Bootstrap timings total 1.033781 (create 0.000065, send 1.018227, recv 0.000403, ring 0.000452, delay 0.000001) | |
tg10908:875805:875884 [0] NCCL INFO Setting affinity for GPU 0 to 555555 | |
tg10908:875805:875884 [0] NCCL INFO NVLS multicast support is not available on dev 0 | |
tg10908:875805:875884 [0] NCCL INFO comm 0xa39eac0 rank 4 nRanks 8 nNodes 2 localRanks 4 localRank 0 MNNVL 0 | |
tg10908:875805:875884 [0] NCCL INFO Trees [0] 5/-1/-1->4->0 [1] -1/-1/-1->4->7 [2] -1/-1/-1->4->5 [3] 6/-1/-1->4->7 [4] 5/0/-1->4->-1 [5] -1/-1/-1->4->7 [6] -1/-1/-1->4->5 [7] 6/-1/-1->4->7 | |
tg10908:875805:875884 [0] NCCL INFO P2P Chunksize set to 131072 | |
tg10908:875805:875908 [0] NCCL INFO [Proxy Service] Device 0 CPU core 4 | |
tg10908:875805:875912 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 6 | |
tg10908:875805:875884 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
tg10908:875805:875884 [0] NCCL INFO 8 coll channels, 8 collnet channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer | |
tg10908:875805:875884 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. | |
tg10908:875805:875884 [0] NCCL INFO ncclCommInitRankConfig comm 0xa39eac0 rank 4 nranks 8 cudaDev 0 nvmlDev 0 busId 4e000 commId 0x2ce70ba701de3f9c - Init COMPLETE | |
tg10908:875805:875884 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 4 nranks 8 total 1.56 (kernels 0.30, alloc 0.08, bootstrap 1.03, allgathers 0.00, topo 0.04, graphs 0.10, connections 0.01, rest 0.00) | |
tg10908:875805:875917 [0] NCCL INFO Channel 03/0 : 4[0] -> 5[0] via P2P/CUMEM | |
tg10908:875805:875917 [0] NCCL INFO Channel 07/0 : 4[0] -> 5[0] via P2P/CUMEM | |
tg10908:875805:875917 [0] NCCL INFO Channel 00/0 : 2[0] -> 4[0] [receive] via NET/IB/4 | |
tg10908:875805:875917 [0] NCCL INFO Channel 04/0 : 2[0] -> 4[0] [receive] via NET/IB/4 | |
tg10908:875805:875922 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 10 | |
tg10908:875805:875917 [0] NCCL INFO Channel 00/0 : 4[0] -> 7[0] via P2P/CUMEM | |
tg10908:875805:875917 [0] NCCL INFO Channel 01/0 : 4[0] -> 7[0] via P2P/CUMEM | |
tg10908:875805:875917 [0] NCCL INFO Channel 04/0 : 4[0] -> 7[0] via P2P/CUMEM | |
tg10908:875805:875917 [0] NCCL INFO Channel 05/0 : 4[0] -> 7[0] via P2P/CUMEM | |
tg10908:875805:875917 [0] NCCL INFO Channel 02/0 : 4[0] -> 2[0] [send] via NET/IB/5 | |
tg10908:875805:875917 [0] NCCL INFO Channel 06/0 : 4[0] -> 2[0] [send] via NET/IB/5 | |
[2025-04-23 13:06:23] tg10908:875805:875917 [0] transport/p2p.cc:274 NCCL WARN Cuda failure 101 'invalid device ordinal' | |
tg10908:875805:875917 [0] NCCL INFO transport/p2p.cc:352 -> 1 | |
tg10908:875805:875917 [0] NCCL INFO transport/p2p.cc:489 -> 1 | |
tg10908:875805:875917 [0] NCCL INFO transport.cc:197 -> 1 | |
tg10908:875805:875917 [0] NCCL INFO transport/generic.cc:19 -> 1 | |
tg10908:875805:875917 [0] NCCL INFO group.cc:148 -> 1 | |
tg10908:875805:875917 [0] NCCL INFO group.cc:75 -> 1 [Async thread] | |
tg10908:875805:875805 [0] NCCL INFO group.cc:460 -> 1 | |
tg10908:875805:875805 [0] NCCL INFO group.cc:581 -> 1 | |
tg10908:875805:875805 [0] NCCL INFO enqueue.cc:2299 -> 1 | |
tg10908:875807:875807 [0] NCCL INFO cudaDriverVersion 12080 | |
tg10908:875807:875807 [0] NCCL INFO Bootstrap: Using ibp77s0:10.225.0.27<0> | |
tg10908:875807:875807 [0] NCCL INFO NCCL version 2.26.2+cuda12.2 | |
tg10908:875807:875807 [0] NCCL INFO Comm config Blocking set to 1 | |
tg10908:875807:875883 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. Using internal net plugin. | |
tg10908:875807:875883 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp77s0:10.225.0.27<0> | |
tg10908:875807:875883 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. | |
tg10908:875807:875883 [0] NCCL INFO Using network IB | |
tg10908:875807:875883 [0] NCCL INFO ncclCommInitRankConfig comm 0x12941f80 rank 5 nranks 8 cudaDev 0 nvmlDev 0 busId cb000 commId 0x2ce70ba701de3f9c - Init START | |
tg10908:875807:875883 [0] NCCL INFO RAS client listening socket at 127.0.0.1<28028> | |
tg10908:875807:875883 [0] NCCL INFO Bootstrap timings total 1.033800 (create 0.000027, send 0.000538, recv 0.000957, ring 0.000472, delay 0.000000) | |
tg10908:875807:875883 [0] NCCL INFO Setting affinity for GPU 0 to aaaaaa | |
tg10908:875807:875883 [0] NCCL INFO NVLS multicast support is not available on dev 0 | |
tg10908:875807:875883 [0] NCCL INFO comm 0x12941f80 rank 5 nRanks 8 nNodes 2 localRanks 4 localRank 1 MNNVL 0 | |
tg10908:875807:875883 [0] NCCL INFO Trees [0] 7/-1/-1->5->4 [1] 6/-1/-1->5->1 [2] 4/-1/-1->5->7 [3] -1/-1/-1->5->6 [4] 7/-1/-1->5->4 [5] 6/1/-1->5->-1 [6] 4/-1/-1->5->7 [7] -1/-1/-1->5->6 | |
tg10908:875807:875883 [0] NCCL INFO P2P Chunksize set to 131072 | |
tg10908:875807:875914 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 15 | |
tg10908:875807:875910 [0] NCCL INFO [Proxy Service] Device 0 CPU core 13 | |
tg10908:875807:875883 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
tg10908:875807:875883 [0] NCCL INFO 8 coll channels, 8 collnet channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer | |
tg10908:875807:875883 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. | |
tg10908:875807:875883 [0] NCCL INFO ncclCommInitRankConfig comm 0x12941f80 rank 5 nranks 8 cudaDev 0 nvmlDev 0 busId cb000 commId 0x2ce70ba701de3f9c - Init COMPLETE | |
tg10908:875807:875883 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 5 nranks 8 total 1.56 (kernels 0.30, alloc 0.08, bootstrap 1.03, allgathers 0.04, topo 0.03, graphs 0.06, connections 0.01, rest 0.00) | |
tg10908:875807:875918 [0] NCCL INFO Channel 00/0 : 5[0] -> 6[0] via P2P/CUMEM | |
tg10908:875807:875918 [0] NCCL INFO Channel 01/0 : 5[0] -> 6[0] via P2P/CUMEM | |
tg10908:875807:875918 [0] NCCL INFO Channel 04/0 : 5[0] -> 6[0] via P2P/CUMEM | |
tg10908:875807:875918 [0] NCCL INFO Channel 05/0 : 5[0] -> 6[0] via P2P/CUMEM | |
tg10908:875807:875918 [0] NCCL INFO Channel 01/0 : 3[0] -> 5[0] [receive] via NET/IB/6 | |
tg10908:875807:875923 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 17 | |
tg10908:875807:875918 [0] NCCL INFO Channel 05/0 : 3[0] -> 5[0] [receive] via NET/IB/6 | |
tg10908:875807:875918 [0] NCCL INFO Channel 03/0 : 5[0] -> 3[0] [send] via NET/IB/7 | |
tg10908:875807:875918 [0] NCCL INFO Channel 07/0 : 5[0] -> 3[0] [send] via NET/IB/7 | |
tg10908:875807:875918 [0] NCCL INFO Channel 02/0 : 5[0] -> 4[0] via P2P/CUMEM | |
tg10908:875807:875918 [0] NCCL INFO Channel 06/0 : 5[0] -> 4[0] via P2P/CUMEM | |
[2025-04-23 13:06:23] tg10908:875807:875918 [0] transport/p2p.cc:274 NCCL WARN Cuda failure 101 'invalid device ordinal' | |
tg10908:875807:875918 [0] NCCL INFO transport/p2p.cc:352 -> 1 | |
tg10908:875807:875918 [0] NCCL INFO transport/p2p.cc:489 -> 1 | |
tg10908:875807:875918 [0] NCCL INFO transport.cc:197 -> 1 | |
tg10908:875807:875918 [0] NCCL INFO transport/generic.cc:19 -> 1 | |
tg10908:875807:875918 [0] NCCL INFO group.cc:148 -> 1 | |
tg10908:875807:875918 [0] NCCL INFO group.cc:75 -> 1 [Async thread] | |
tg10908:875807:875807 [0] NCCL INFO group.cc:460 -> 1 | |
tg10908:875807:875807 [0] NCCL INFO group.cc:581 -> 1 | |
tg10908:875807:875807 [0] NCCL INFO enqueue.cc:2299 -> 1 | |
tg10908:875804:875804 [0] NCCL INFO cudaDriverVersion 12080 | |
tg10908:875804:875804 [0] NCCL INFO Bootstrap: Using ibp77s0:10.225.0.27<0> | |
tg10908:875804:875804 [0] NCCL INFO NCCL version 2.26.2+cuda12.2 | |
tg10908:875804:875804 [0] NCCL INFO Comm config Blocking set to 1 | |
tg10908:875804:875882 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. Using internal net plugin. | |
tg10908:875804:875882 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp77s0:10.225.0.27<0> | |
tg10908:875804:875882 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. | |
tg10908:875804:875882 [0] NCCL INFO Using network IB | |
tg10908:875804:875882 [0] NCCL INFO ncclCommInitRankConfig comm 0x1ff86a80 rank 7 nranks 8 cudaDev 0 nvmlDev 0 busId db000 commId 0x2ce70ba701de3f9c - Init START | |
tg10908:875804:875882 [0] NCCL INFO RAS client listening socket at 127.0.0.1<28028> | |
tg10908:875804:875882 [0] NCCL INFO Bootstrap timings total 1.033817 (create 0.000034, send 0.000473, recv 0.000650, ring 1.023471, delay 0.000001) | |
tg10908:875804:875882 [0] NCCL INFO Setting affinity for GPU 0 to aaaa,aa000000 | |
tg10908:875804:875882 [0] NCCL INFO NVLS multicast support is not available on dev 0 | |
tg10908:875804:875882 [0] NCCL INFO comm 0x1ff86a80 rank 7 nRanks 8 nNodes 2 localRanks 4 localRank 3 MNNVL 0 | |
tg10908:875804:875882 [0] NCCL INFO Trees [0] 6/-1/-1->7->5 [1] 4/-1/-1->7->6 [2] 5/-1/-1->7->6 [3] 4/-1/-1->7->3 [4] 6/-1/-1->7->5 [5] 4/-1/-1->7->6 [6] 5/-1/-1->7->6 [7] 4/3/-1->7->-1 | |
tg10908:875804:875882 [0] NCCL INFO P2P Chunksize set to 131072 | |
tg10908:875804:875915 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 31 | |
tg10908:875804:875911 [0] NCCL INFO [Proxy Service] Device 0 CPU core 29 | |
tg10908:875804:875882 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
tg10908:875804:875882 [0] NCCL INFO 8 coll channels, 8 collnet channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer | |
tg10908:875804:875882 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. | |
tg10908:875804:875882 [0] NCCL INFO ncclCommInitRankConfig comm 0x1ff86a80 rank 7 nranks 8 cudaDev 0 nvmlDev 0 busId db000 commId 0x2ce70ba701de3f9c - Init COMPLETE | |
tg10908:875804:875882 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 7 nranks 8 total 1.56 (kernels 0.30, alloc 0.08, bootstrap 1.03, allgathers 0.05, topo 0.03, graphs 0.06, connections 0.01, rest 0.00) | |
tg10908:875804:875916 [0] NCCL INFO Channel 01/0 : 7[0] -> 1[0] [send] via NET/IB/6 | |
tg10908:875804:875916 [0] NCCL INFO Channel 05/0 : 7[0] -> 1[0] [send] via NET/IB/6 | |
tg10908:875804:875920 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 35 | |
tg10908:875804:875916 [0] NCCL INFO Channel 03/0 : 1[0] -> 7[0] [receive] via NET/IB/7 | |
[2025-04-23 13:06:23] tg10907:995238:995350 [0] transport/p2p.cc:274 NCCL WARN Cuda failure 101 'invalid device ordinal' | |
tg10907:995238:995350 [0] NCCL INFO transport/p2p.cc:352 -> 1 | |
tg10907:995238:995350 [0] NCCL INFO transport/p2p.cc:489 -> 1 | |
tg10907:995238:995350 [0] NCCL INFO transport.cc:197 -> 1 | |
tg10907:995238:995350 [0] NCCL INFO transport/generic.cc:19 -> 1 | |
tg10907:995238:995350 [0] NCCL INFO group.cc:148 -> 1 | |
tg10907:995238:995350 [0] NCCL INFO group.cc:75 -> 1 [Async thread] | |
tg10907:995238:995238 [0] NCCL INFO group.cc:460 -> 1 | |
tg10907:995238:995238 [0] NCCL INFO group.cc:581 -> 1 | |
tg10907:995238:995238 [0] NCCL INFO enqueue.cc:2299 -> 1 | |
tg10908:875804:875916 [0] NCCL INFO Channel 07/0 : 1[0] -> 7[0] [receive] via NET/IB/7 | |
tg10908:875804:875916 [0] NCCL INFO Channel 00/0 : 7[0] -> 5[0] via P2P/CUMEM | |
tg10908:875804:875916 [0] NCCL INFO Channel 02/0 : 7[0] -> 5[0] via P2P/CUMEM | |
tg10908:875804:875916 [0] NCCL INFO Channel 04/0 : 7[0] -> 5[0] via P2P/CUMEM | |
tg10908:875804:875916 [0] NCCL INFO Channel 06/0 : 7[0] -> 5[0] via P2P/CUMEM | |
tg10908:875804:875916 [0] NCCL INFO Channel 03/0 : 7[0] -> 6[0] via P2P/CUMEM | |
tg10908:875804:875916 [0] NCCL INFO Channel 07/0 : 7[0] -> 6[0] via P2P/CUMEM | |
[2025-04-23 13:06:23] tg10908:875804:875916 [0] transport/p2p.cc:274 NCCL WARN Cuda failure 101 'invalid device ordinal' | |
tg10908:875804:875916 [0] NCCL INFO transport/p2p.cc:352 -> 1 | |
tg10908:875804:875916 [0] NCCL INFO transport/p2p.cc:537 -> 1 | |
tg10908:875804:875916 [0] NCCL INFO transport.cc:216 -> 1 | |
tg10908:875804:875916 [0] NCCL INFO transport/generic.cc:19 -> 1 | |
tg10908:875804:875916 [0] NCCL INFO group.cc:148 -> 1 | |
tg10907:995236:995236 [0] NCCL INFO cudaDriverVersion 12080 | |
tg10907:995236:995236 [0] NCCL INFO Bootstrap: Using ibp77s0:10.225.0.26<0> | |
tg10907:995236:995236 [0] NCCL INFO NCCL version 2.26.2+cuda12.2 | |
tg10907:995236:995236 [0] NCCL INFO Comm config Blocking set to 1 | |
tg10907:995236:995316 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. Using internal net plugin. | |
tg10907:995236:995316 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp77s0:10.225.0.26<0> | |
tg10907:995236:995316 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. | |
tg10907:995236:995316 [0] NCCL INFO Using network IB | |
tg10907:995236:995316 [0] NCCL INFO ncclCommInitRankConfig comm 0x258ed380 rank 1 nranks 8 cudaDev 0 nvmlDev 0 busId cb000 commId 0x2ce70ba701de3f9c - Init START | |
tg10907:995236:995316 [0] NCCL INFO RAS client listening socket at 127.0.0.1<28028> | |
tg10908:875804:875916 [0] NCCL INFO group.cc:75 -> 1 [Async thread] | |
tg10908:875804:875804 [0] NCCL INFO group.cc:460 -> 1 | |
tg10908:875804:875804 [0] NCCL INFO group.cc:581 -> 1 | |
tg10908:875804:875804 [0] NCCL INFO enqueue.cc:2299 -> 1 | |
tg10907:995236:995316 [0] NCCL INFO Bootstrap timings total 1.074027 (create 0.000032, send 0.000105, recv 0.000399, ring 1.063659, delay 0.000001) | |
tg10907:995236:995316 [0] NCCL INFO Setting affinity for GPU 0 to aaaaaa | |
tg10907:995236:995316 [0] NCCL INFO NVLS multicast support is not available on dev 0 | |
tg10907:995236:995316 [0] NCCL INFO comm 0x258ed380 rank 1 nRanks 8 nNodes 2 localRanks 4 localRank 1 MNNVL 0 | |
tg10907:995236:995316 [0] NCCL INFO Trees [0] 3/-1/-1->1->0 [1] 2/5/-1->1->-1 [2] 0/-1/-1->1->3 [3] -1/-1/-1->1->2 [4] 3/-1/-1->1->0 [5] 2/-1/-1->1->5 [6] 0/-1/-1->1->3 [7] -1/-1/-1->1->2 | |
tg10907:995236:995316 [0] NCCL INFO P2P Chunksize set to 131072 | |
tg10907:995236:995341 [0] NCCL INFO [Proxy Service] Device 0 CPU core 5 | |
tg10907:995236:995345 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 7 | |
tg10907:995236:995316 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
tg10908:875806:875806 [0] NCCL INFO cudaDriverVersion 12080 | |
tg10908:875806:875806 [0] NCCL INFO Bootstrap: Using ibp77s0:10.225.0.27<0> | |
tg10908:875806:875806 [0] NCCL INFO NCCL version 2.26.2+cuda12.2 | |
tg10908:875806:875806 [0] NCCL INFO Comm config Blocking set to 1 | |
tg10908:875806:875885 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. Using internal net plugin. | |
tg10908:875806:875885 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp77s0:10.225.0.27<0> | |
tg10908:875806:875885 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. | |
tg10908:875806:875885 [0] NCCL INFO Using network IB | |
tg10908:875806:875885 [0] NCCL INFO ncclCommInitRankConfig comm 0x2d22f240 rank 6 nranks 8 cudaDev 0 nvmlDev 0 busId 5f000 commId 0x2ce70ba701de3f9c - Init START | |
tg10908:875806:875885 [0] NCCL INFO RAS client listening socket at 127.0.0.1<28028> | |
tg10907:995236:995316 [0] NCCL INFO 8 coll channels, 8 collnet channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer | |
tg10907:995236:995316 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. | |
tg10907:995236:995316 [0] NCCL INFO ncclCommInitRankConfig comm 0x258ed380 rank 1 nranks 8 cudaDev 0 nvmlDev 0 busId cb000 commId 0x2ce70ba701de3f9c - Init COMPLETE | |
tg10907:995236:995316 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 1 nranks 8 total 1.57 (kernels 0.27, alloc 0.08, bootstrap 1.07, allgathers 0.06, topo 0.02, graphs 0.06, connections 0.01, rest 0.00) | |
tg10907:995236:995349 [0] NCCL INFO Channel 00/0 : 1[0] -> 2[0] via P2P/CUMEM | |
tg10907:995236:995349 [0] NCCL INFO Channel 01/0 : 1[0] -> 2[0] via P2P/CUMEM | |
tg10907:995236:995349 [0] NCCL INFO Channel 04/0 : 1[0] -> 2[0] via P2P/CUMEM | |
tg10907:995236:995349 [0] NCCL INFO Channel 05/0 : 1[0] -> 2[0] via P2P/CUMEM | |
tg10907:995236:995349 [0] NCCL INFO Channel 01/0 : 7[0] -> 1[0] [receive] via NET/IB/6 | |
tg10908:875806:875885 [0] NCCL INFO Bootstrap timings total 1.033766 (create 0.000027, send 0.000474, recv 0.000752, ring 1.022451, delay 0.000000) | |
tg10908:875806:875885 [0] NCCL INFO Setting affinity for GPU 0 to 5555,55000000 | |
tg10908:875806:875885 [0] NCCL INFO NVLS multicast support is not available on dev 0 | |
tg10908:875806:875885 [0] NCCL INFO comm 0x2d22f240 rank 6 nRanks 8 nNodes 2 localRanks 4 localRank 2 MNNVL 0 | |
tg10908:875806:875885 [0] NCCL INFO Trees [0] -1/-1/-1->6->7 [1] 7/-1/-1->6->5 [2] 7/-1/-1->6->2 [3] 5/-1/-1->6->4 [4] -1/-1/-1->6->7 [5] 7/-1/-1->6->5 [6] 7/2/-1->6->-1 [7] 5/-1/-1->6->4 | |
tg10908:875806:875885 [0] NCCL INFO P2P Chunksize set to 131072 | |
tg10908:875806:875913 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 32 | |
tg10908:875806:875909 [0] NCCL INFO [Proxy Service] Device 0 CPU core 30 | |
tg10908:875806:875885 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
[rank4]: Traceback (most recent call last): | |
[rank4]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 329, in <module> | |
[rank4]: main() | |
[rank4]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 75, in main | |
[rank4]: model = nn.parallel.DistributedDataParallel( | |
[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank4]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 835, in __init__ | |
[rank4]: _verify_param_shape_across_processes(self.process_group, parameters) | |
[rank4]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/distributed/utils.py", line 282, in _verify_param_shape_across_processes | |
[rank4]: return dist._verify_params_across_processes(process_group, tensors, logger) | |
[rank4]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
tg10907:995236:995349 [0] NCCL INFO Channel 05/0 : 7[0] -> 1[0] [receive] via NET/IB/6 | |
tg10907:995236:995352 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 11 | |
tg10907:995236:995349 [0] NCCL INFO Channel 03/0 : 1[0] -> 7[0] [send] via NET/IB/7 | |
tg10907:995236:995349 [0] NCCL INFO Channel 07/0 : 1[0] -> 7[0] [send] via NET/IB/7 | |
tg10907:995236:995349 [0] NCCL INFO Channel 02/0 : 1[0] -> 0[0] via P2P/CUMEM | |
tg10907:995236:995349 [0] NCCL INFO Channel 06/0 : 1[0] -> 0[0] via P2P/CUMEM | |
[2025-04-23 13:06:23] tg10907:995236:995349 [0] transport/p2p.cc:274 NCCL WARN Cuda failure 101 'invalid device ordinal' | |
tg10907:995236:995349 [0] NCCL INFO transport/p2p.cc:352 -> 1 | |
tg10907:995236:995349 [0] NCCL INFO transport/p2p.cc:489 -> 1 | |
tg10907:995236:995349 [0] NCCL INFO transport.cc:197 -> 1 | |
tg10907:995236:995349 [0] NCCL INFO transport/generic.cc:19 -> 1 | |
tg10907:995236:995349 [0] NCCL INFO group.cc:148 -> 1 | |
tg10907:995236:995349 [0] NCCL INFO group.cc:75 -> 1 [Async thread] | |
[rank4]: torch.distributed.DistBackendError: NCCL error in: /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:3353, unhandled cuda error (run with NCCL_DEBUG=INFO for details), NCCL version 2.26.2 | |
[rank4]: ncclUnhandledCudaError: Call to CUDA function failed. | |
[rank4]: Last error: | |
[rank4]: Cuda failure 101 'invalid device ordinal' | |
tg10908:875806:875885 [0] NCCL INFO 8 coll channels, 8 collnet channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer | |
tg10908:875806:875885 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. | |
tg10908:875806:875885 [0] NCCL INFO ncclCommInitRankConfig comm 0x2d22f240 rank 6 nranks 8 cudaDev 0 nvmlDev 0 busId 5f000 commId 0x2ce70ba701de3f9c - Init COMPLETE | |
tg10908:875806:875885 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 6 nranks 8 total 1.56 (kernels 0.30, alloc 0.08, bootstrap 1.03, allgathers 0.05, topo 0.03, graphs 0.06, connections 0.01, rest 0.00) | |
tg10908:875806:875919 [0] NCCL INFO Channel 02/0 : 6[0] -> 7[0] via P2P/CUMEM | |
tg10908:875806:875919 [0] NCCL INFO Channel 06/0 : 6[0] -> 7[0] via P2P/CUMEM | |
tg10908:875806:875919 [0] NCCL INFO Channel 00/0 : 6[0] -> 0[0] [send] via NET/IB/4 | |
tg10908:875806:875919 [0] NCCL INFO Channel 04/0 : 6[0] -> 0[0] [send] via NET/IB/4 | |
tg10908:875806:875921 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 36 | |
[rank5]: Traceback (most recent call last): | |
[rank5]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 329, in <module> | |
[rank5]: main() | |
[rank5]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 75, in main | |
[rank5]: model = nn.parallel.DistributedDataParallel( | |
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank5]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 835, in __init__ | |
[rank5]: _verify_param_shape_across_processes(self.process_group, parameters) | |
[rank5]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/distributed/utils.py", line 282, in _verify_param_shape_across_processes | |
[rank5]: return dist._verify_params_across_processes(process_group, tensors, logger) | |
[rank5]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
tg10907:995236:995236 [0] NCCL INFO group.cc:460 -> 1 | |
tg10907:995236:995236 [0] NCCL INFO group.cc:581 -> 1 | |
tg10907:995236:995236 [0] NCCL INFO enqueue.cc:2299 -> 1 | |
[rank5]: torch.distributed.DistBackendError: NCCL error in: /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:3353, unhandled cuda error (run with NCCL_DEBUG=INFO for details), NCCL version 2.26.2 | |
[rank5]: ncclUnhandledCudaError: Call to CUDA function failed. | |
[rank5]: Last error: | |
[rank5]: Cuda failure 101 'invalid device ordinal' | |
tg10908:875806:875919 [0] NCCL INFO Channel 02/0 : 0[0] -> 6[0] [receive] via NET/IB/5 | |
tg10908:875806:875919 [0] NCCL INFO Channel 06/0 : 0[0] -> 6[0] [receive] via NET/IB/5 | |
tg10908:875806:875919 [0] NCCL INFO Channel 01/0 : 6[0] -> 4[0] via P2P/CUMEM | |
tg10908:875806:875919 [0] NCCL INFO Channel 03/0 : 6[0] -> 4[0] via P2P/CUMEM | |
tg10908:875806:875919 [0] NCCL INFO Channel 05/0 : 6[0] -> 4[0] via P2P/CUMEM | |
tg10908:875806:875919 [0] NCCL INFO Channel 07/0 : 6[0] -> 4[0] via P2P/CUMEM | |
[2025-04-23 13:06:23] tg10908:875806:875919 [0] transport/p2p.cc:274 NCCL WARN Cuda failure 101 'invalid device ordinal' | |
tg10908:875806:875919 [0] NCCL INFO transport/p2p.cc:352 -> 1 | |
tg10908:875806:875919 [0] NCCL INFO transport/p2p.cc:537 -> 1 | |
tg10908:875806:875919 [0] NCCL INFO transport.cc:216 -> 1 | |
tg10908:875806:875919 [0] NCCL INFO transport/generic.cc:19 -> 1 | |
tg10908:875806:875919 [0] NCCL INFO group.cc:148 -> 1 | |
tg10908:875806:875919 [0] NCCL INFO group.cc:75 -> 1 [Async thread] | |
[rank7]: Traceback (most recent call last): | |
[rank7]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 329, in <module> | |
[rank7]: main() | |
[rank7]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 75, in main | |
[rank7]: model = nn.parallel.DistributedDataParallel( | |
[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank7]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 835, in __init__ | |
[rank7]: _verify_param_shape_across_processes(self.process_group, parameters) | |
[rank7]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/distributed/utils.py", line 282, in _verify_param_shape_across_processes | |
[rank7]: return dist._verify_params_across_processes(process_group, tensors, logger) | |
[rank7]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
tg10907:995237:995237 [0] NCCL INFO cudaDriverVersion 12080 | |
tg10907:995237:995237 [0] NCCL INFO Bootstrap: Using ibp77s0:10.225.0.26<0> | |
tg10907:995237:995237 [0] NCCL INFO NCCL version 2.26.2+cuda12.2 | |
tg10907:995237:995237 [0] NCCL INFO Comm config Blocking set to 1 | |
tg10907:995237:995315 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. Using internal net plugin. | |
tg10907:995237:995315 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp77s0:10.225.0.26<0> | |
tg10907:995237:995315 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. | |
tg10907:995237:995315 [0] NCCL INFO Using network IB | |
tg10907:995237:995315 [0] NCCL INFO ncclCommInitRankConfig comm 0x2341dd40 rank 2 nranks 8 cudaDev 0 nvmlDev 0 busId 5f000 commId 0x2ce70ba701de3f9c - Init START | |
tg10907:995237:995315 [0] NCCL INFO RAS client listening socket at 127.0.0.1<28028> | |
[rank7]: torch.distributed.DistBackendError: NCCL error in: /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:3353, unhandled cuda error (run with NCCL_DEBUG=INFO for details), NCCL version 2.26.2 | |
[rank7]: ncclUnhandledCudaError: Call to CUDA function failed. | |
[rank7]: Last error: | |
[rank7]: Cuda failure 101 'invalid device ordinal' | |
tg10908:875806:875806 [0] NCCL INFO group.cc:460 -> 1 | |
tg10908:875806:875806 [0] NCCL INFO group.cc:581 -> 1 | |
tg10908:875806:875806 [0] NCCL INFO enqueue.cc:2299 -> 1 | |
[rank6]: Traceback (most recent call last): | |
[rank6]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 329, in <module> | |
[rank6]: main() | |
[rank6]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 75, in main | |
[rank6]: model = nn.parallel.DistributedDataParallel( | |
[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank6]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 835, in __init__ | |
[rank6]: _verify_param_shape_across_processes(self.process_group, parameters) | |
[rank6]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/distributed/utils.py", line 282, in _verify_param_shape_across_processes | |
[rank6]: return dist._verify_params_across_processes(process_group, tensors, logger) | |
[rank6]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
tg10907:995237:995315 [0] NCCL INFO Bootstrap timings total 1.074027 (create 0.000024, send 0.000083, recv 0.000637, ring 1.063644, delay 0.000000) | |
tg10907:995237:995315 [0] NCCL INFO Setting affinity for GPU 0 to 5555,55000000 | |
tg10907:995237:995315 [0] NCCL INFO NVLS multicast support is not available on dev 0 | |
tg10907:995237:995315 [0] NCCL INFO comm 0x2341dd40 rank 2 nRanks 8 nNodes 2 localRanks 4 localRank 2 MNNVL 0 | |
tg10907:995237:995315 [0] NCCL INFO Trees [0] -1/-1/-1->2->3 [1] 3/-1/-1->2->1 [2] 3/6/-1->2->-1 [3] 1/-1/-1->2->0 [4] -1/-1/-1->2->3 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->6 [7] 1/-1/-1->2->0 | |
tg10907:995237:995315 [0] NCCL INFO P2P Chunksize set to 131072 | |
tg10907:995237:995340 [0] NCCL INFO [Proxy Service] Device 0 CPU core 36 | |
tg10907:995237:995343 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 38 | |
tg10907:995237:995315 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
[rank6]: torch.distributed.DistBackendError: NCCL error in: /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:3353, unhandled cuda error (run with NCCL_DEBUG=INFO for details), NCCL version 2.26.2 | |
[rank6]: ncclUnhandledCudaError: Call to CUDA function failed. | |
[rank6]: Last error: | |
[rank6]: Cuda failure 101 'invalid device ordinal' | |
tg10907:995237:995315 [0] NCCL INFO 8 coll channels, 8 collnet channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer | |
tg10907:995237:995315 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. | |
tg10907:995237:995315 [0] NCCL INFO ncclCommInitRankConfig comm 0x2341dd40 rank 2 nranks 8 cudaDev 0 nvmlDev 0 busId 5f000 commId 0x2ce70ba701de3f9c - Init COMPLETE | |
tg10907:995237:995315 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 2 nranks 8 total 1.58 (kernels 0.27, alloc 0.08, bootstrap 1.07, allgathers 0.06, topo 0.02, graphs 0.06, connections 0.01, rest 0.00) | |
tg10907:995237:995348 [0] NCCL INFO Channel 02/0 : 2[0] -> 3[0] via P2P/CUMEM | |
tg10907:995237:995348 [0] NCCL INFO Channel 06/0 : 2[0] -> 3[0] via P2P/CUMEM | |
tg10907:995237:995348 [0] NCCL INFO Channel 00/0 : 2[0] -> 4[0] [send] via NET/IB/4 | |
tg10907:995237:995348 [0] NCCL INFO Channel 04/0 : 2[0] -> 4[0] [send] via NET/IB/4 | |
tg10907:995237:995355 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 42 | |
[rank1]: Traceback (most recent call last): | |
[rank1]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 329, in <module> | |
[rank1]: main() | |
[rank1]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 75, in main | |
[rank1]: model = nn.parallel.DistributedDataParallel( | |
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank1]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 835, in __init__ | |
[rank1]: _verify_param_shape_across_processes(self.process_group, parameters) | |
[rank1]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/distributed/utils.py", line 282, in _verify_param_shape_across_processes | |
[rank1]: return dist._verify_params_across_processes(process_group, tensors, logger) | |
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
tg10907:995237:995348 [0] NCCL INFO Channel 02/0 : 4[0] -> 2[0] [receive] via NET/IB/5 | |
tg10907:995237:995348 [0] NCCL INFO Channel 06/0 : 4[0] -> 2[0] [receive] via NET/IB/5 | |
tg10907:995237:995348 [0] NCCL INFO Channel 01/0 : 2[0] -> 0[0] via P2P/CUMEM | |
tg10907:995237:995348 [0] NCCL INFO Channel 03/0 : 2[0] -> 0[0] via P2P/CUMEM | |
tg10907:995237:995348 [0] NCCL INFO Channel 05/0 : 2[0] -> 0[0] via P2P/CUMEM | |
tg10907:995237:995348 [0] NCCL INFO Channel 07/0 : 2[0] -> 0[0] via P2P/CUMEM | |
[2025-04-23 13:06:23] tg10907:995237:995348 [0] transport/p2p.cc:274 NCCL WARN Cuda failure 101 'invalid device ordinal' | |
tg10907:995237:995348 [0] NCCL INFO transport/p2p.cc:352 -> 1 | |
tg10907:995237:995348 [0] NCCL INFO transport/p2p.cc:537 -> 1 | |
tg10907:995237:995348 [0] NCCL INFO transport.cc:216 -> 1 | |
tg10907:995237:995348 [0] NCCL INFO transport/generic.cc:19 -> 1 | |
tg10907:995237:995348 [0] NCCL INFO group.cc:148 -> 1 | |
tg10907:995237:995348 [0] NCCL INFO group.cc:75 -> 1 [Async thread] | |
[rank1]: torch.distributed.DistBackendError: NCCL error in: /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:3353, unhandled cuda error (run with NCCL_DEBUG=INFO for details), NCCL version 2.26.2 | |
[rank1]: ncclUnhandledCudaError: Call to CUDA function failed. | |
[rank1]: Last error: | |
[rank1]: Cuda failure 101 'invalid device ordinal' | |
tg10907:995237:995237 [0] NCCL INFO group.cc:460 -> 1 | |
tg10907:995237:995237 [0] NCCL INFO group.cc:581 -> 1 | |
tg10907:995237:995237 [0] NCCL INFO enqueue.cc:2299 -> 1 | |
[rank2]: Traceback (most recent call last): | |
[rank2]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 329, in <module> | |
[rank2]: main() | |
[rank2]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 75, in main | |
[rank2]: model = nn.parallel.DistributedDataParallel( | |
[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank2]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 835, in __init__ | |
[rank2]: _verify_param_shape_across_processes(self.process_group, parameters) | |
[rank2]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/distributed/utils.py", line 282, in _verify_param_shape_across_processes | |
[rank2]: return dist._verify_params_across_processes(process_group, tensors, logger) | |
[rank2]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
tg10907:995235:995235 [0] NCCL INFO cudaDriverVersion 12080 | |
tg10907:995235:995235 [0] NCCL INFO Bootstrap: Using ibp77s0:10.225.0.26<0> | |
tg10907:995235:995235 [0] NCCL INFO NCCL version 2.26.2+cuda12.2 | |
tg10907:995235:995235 [0] NCCL INFO Comm config Blocking set to 1 | |
tg10907:995235:995317 [0] NCCL INFO NET/Plugin: Could not find: libnccl-net.so. Using internal net plugin. | |
tg10907:995235:995317 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [RO]; OOB ibp77s0:10.225.0.26<0> | |
tg10907:995235:995317 [0] NCCL INFO PROFILER/Plugin: Could not find: libnccl-profiler.so. | |
tg10907:995235:995317 [0] NCCL INFO Using network IB | |
tg10907:995235:995317 [0] NCCL INFO ncclCommInitRankConfig comm 0x19679cc0 rank 3 nranks 8 cudaDev 0 nvmlDev 0 busId db000 commId 0x2ce70ba701de3f9c - Init START | |
tg10907:995235:995317 [0] NCCL INFO RAS client listening socket at 127.0.0.1<28028> | |
[rank2]: torch.distributed.DistBackendError: NCCL error in: /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:3353, unhandled cuda error (run with NCCL_DEBUG=INFO for details), NCCL version 2.26.2 | |
[rank2]: ncclUnhandledCudaError: Call to CUDA function failed. | |
[rank2]: Last error: | |
[rank2]: Cuda failure 101 'invalid device ordinal' | |
tg10907:995235:995317 [0] NCCL INFO Bootstrap timings total 1.073976 (create 0.000027, send 0.000088, recv 1.058721, ring 0.007501, delay 0.000000) | |
tg10907:995235:995317 [0] NCCL INFO Setting affinity for GPU 0 to aaaa,aa000000 | |
tg10907:995235:995317 [0] NCCL INFO NVLS multicast support is not available on dev 0 | |
tg10907:995235:995317 [0] NCCL INFO comm 0x19679cc0 rank 3 nRanks 8 nNodes 2 localRanks 4 localRank 3 MNNVL 0 | |
tg10907:995235:995317 [0] NCCL INFO Trees [0] 2/-1/-1->3->1 [1] 0/-1/-1->3->2 [2] 1/-1/-1->3->2 [3] 0/7/-1->3->-1 [4] 2/-1/-1->3->1 [5] 0/-1/-1->3->2 [6] 1/-1/-1->3->2 [7] 0/-1/-1->3->7 | |
tg10907:995235:995317 [0] NCCL INFO P2P Chunksize set to 131072 | |
tg10907:995235:995344 [0] NCCL INFO [Proxy Service UDS] Device 0 CPU core 41 | |
tg10907:995235:995342 [0] NCCL INFO [Proxy Service] Device 0 CPU core 37 | |
tg10907:995235:995317 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
[rank3]: Traceback (most recent call last): | |
[rank3]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 329, in <module> | |
[rank3]: main() | |
[rank3]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 75, in main | |
[rank3]: model = nn.parallel.DistributedDataParallel( | |
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank3]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 835, in __init__ | |
[rank3]: _verify_param_shape_across_processes(self.process_group, parameters) | |
[rank3]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/distributed/utils.py", line 282, in _verify_param_shape_across_processes | |
[rank3]: return dist._verify_params_across_processes(process_group, tensors, logger) | |
[rank3]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
tg10907:995235:995317 [0] NCCL INFO 8 coll channels, 8 collnet channels, 0 nvls channels, 8 p2p channels, 2 p2p channels per peer | |
tg10907:995235:995317 [0] NCCL INFO TUNER/Plugin: Could not find: libnccl-tuner.so. Using internal tuner plugin. | |
tg10907:995235:995317 [0] NCCL INFO ncclCommInitRankConfig comm 0x19679cc0 rank 3 nranks 8 cudaDev 0 nvmlDev 0 busId db000 commId 0x2ce70ba701de3f9c - Init COMPLETE | |
tg10907:995235:995317 [0] NCCL INFO Init timings - ncclCommInitRankConfig: rank 3 nranks 8 total 1.57 (kernels 0.27, alloc 0.08, bootstrap 1.07, allgathers 0.06, topo 0.02, graphs 0.06, connections 0.01, rest 0.00) | |
tg10907:995235:995351 [0] NCCL INFO Channel 01/0 : 3[0] -> 5[0] [send] via NET/IB/6 | |
tg10907:995235:995351 [0] NCCL INFO Channel 05/0 : 3[0] -> 5[0] [send] via NET/IB/6 | |
tg10907:995235:995353 [0] NCCL INFO [Proxy Progress] Device 0 CPU core 45 | |
tg10907:995235:995351 [0] NCCL INFO Channel 03/0 : 5[0] -> 3[0] [receive] via NET/IB/7 | |
[rank3]: torch.distributed.DistBackendError: NCCL error in: /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:3353, unhandled cuda error (run with NCCL_DEBUG=INFO for details), NCCL version 2.26.2 | |
[rank3]: ncclUnhandledCudaError: Call to CUDA function failed. | |
[rank3]: Last error: | |
[rank3]: Cuda failure 101 'invalid device ordinal' | |
tg10907:995235:995351 [0] NCCL INFO Channel 07/0 : 5[0] -> 3[0] [receive] via NET/IB/7 | |
tg10907:995235:995351 [0] NCCL INFO Channel 00/0 : 3[0] -> 1[0] via P2P/CUMEM | |
tg10907:995235:995351 [0] NCCL INFO Channel 02/0 : 3[0] -> 1[0] via P2P/CUMEM | |
tg10907:995235:995351 [0] NCCL INFO Channel 04/0 : 3[0] -> 1[0] via P2P/CUMEM | |
tg10907:995235:995351 [0] NCCL INFO Channel 06/0 : 3[0] -> 1[0] via P2P/CUMEM | |
tg10907:995235:995351 [0] NCCL INFO Channel 03/0 : 3[0] -> 2[0] via P2P/CUMEM | |
tg10907:995235:995351 [0] NCCL INFO Channel 07/0 : 3[0] -> 2[0] via P2P/CUMEM | |
[2025-04-23 13:06:23] tg10907:995235:995351 [0] transport/p2p.cc:274 NCCL WARN Cuda failure 101 'invalid device ordinal' | |
tg10907:995235:995351 [0] NCCL INFO transport/p2p.cc:352 -> 1 | |
tg10907:995235:995351 [0] NCCL INFO transport/p2p.cc:537 -> 1 | |
tg10907:995235:995351 [0] NCCL INFO transport.cc:216 -> 1 | |
tg10907:995235:995351 [0] NCCL INFO transport/generic.cc:19 -> 1 | |
tg10907:995235:995351 [0] NCCL INFO group.cc:148 -> 1 | |
[rank0]: Traceback (most recent call last): | |
[rank0]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 329, in <module> | |
[rank0]: main() | |
[rank0]: File "/scratch/n/normandf/torch_distributed_debug/main.py", line 75, in main | |
[rank0]: model = nn.parallel.DistributedDataParallel( | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/nn/parallel/distributed.py", line 835, in __init__ | |
[rank0]: _verify_param_shape_across_processes(self.process_group, parameters) | |
[rank0]: File "/scratch/n/normandf/torch_distributed_debug/.venv/lib/python3.12/site-packages/torch/distributed/utils.py", line 282, in _verify_param_shape_across_processes | |
[rank0]: return dist._verify_params_across_processes(process_group, tensors, logger) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
tg10907:995235:995351 [0] NCCL INFO group.cc:75 -> 1 [Async thread] | |
tg10907:995235:995235 [0] NCCL INFO group.cc:460 -> 1 | |
tg10907:995235:995235 [0] NCCL INFO group.cc:581 -> 1 | |
tg10907:995235:995235 [0] NCCL INFO enqueue.cc:2299 -> 1 | |
[rank0]: torch.distributed.DistBackendError: NCCL error in: /pytorch/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:3353, unhandled cuda error (run with NCCL_DEBUG=INFO for details), NCCL version 2.26.2 | |
[rank0]: ncclUnhandledCudaError: Call to CUDA function failed. | |
[rank0]: Last error: | |
[rank0]: Cuda failure 101 'invalid device ordinal' | |
[rank1]:[W423 13:06:23.342925903 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) | |
[rank4]:[W423 13:06:23.343823987 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) | |
tg10908:875807:875910 [0] NCCL INFO misc/socket.cc:881 -> 3 | |
[rank5]:[W423 13:06:23.357692461 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) | |
tg10907:995237:995340 [0] NCCL INFO misc/socket.cc:881 -> 3 | |
[rank2]:[W423 13:06:23.364046822 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) | |
[rank7]:[W423 13:06:23.363516833 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) | |
tg10908:875806:875909 [0] NCCL INFO misc/socket.cc:881 -> 3 | |
tg10908:875806:875909 [0] NCCL INFO misc/socket.cc:881 -> 3 | |
[rank6]:[W423 13:06:23.385873250 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) | |
[rank0]:[W423 13:06:23.389691207 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) | |
[rank3]:[W423 13:06:23.399161934 ProcessGroupNCCL.cpp:1476] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) | |
tg10908:875806:875931 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10908:875806:875931 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10908:875806:875931 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10908:875806:875931 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10908:875806:875931 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10908:875806:875931 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10908:875806:875909 [0] NCCL INFO misc/socket.cc:881 -> 3 | |
tg10908:875806:875931 [0] NCCL INFO comm 0x2d22f240 rank 6 nranks 8 cudaDev 0 busId 5f000 - Abort COMPLETE | |
tg10908:875807:875910 [0] NCCL INFO misc/socket.cc:881 -> 3 | |
tg10908:875807:875927 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10908:875807:875927 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10908:875807:875927 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10908:875807:875927 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10908:875807:875927 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10908:875807:875927 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10908:875807:875910 [0] NCCL INFO misc/socket.cc:881 -> 3 | |
tg10908:875807:875927 [0] NCCL INFO comm 0x12941f80 rank 5 nranks 8 cudaDev 0 busId cb000 - Abort COMPLETE | |
tg10907:995237:995359 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10907:995237:995359 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10907:995237:995359 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10907:995237:995359 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10907:995237:995359 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10907:995237:995359 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10907:995237:995340 [0] NCCL INFO misc/socket.cc:881 -> 3 | |
tg10907:995237:995359 [0] NCCL INFO comm 0x2341dd40 rank 2 nranks 8 cudaDev 0 busId 5f000 - Abort COMPLETE | |
tg10907:995236:995357 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10907:995236:995357 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10907:995236:995357 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10907:995236:995341 [0] NCCL INFO misc/socket.cc:881 -> 3 | |
tg10907:995236:995357 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10907:995236:995357 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10907:995236:995357 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10907:995236:995341 [0] NCCL INFO misc/socket.cc:881 -> 3 | |
tg10907:995236:995357 [0] NCCL INFO comm 0x258ed380 rank 1 nranks 8 cudaDev 0 busId cb000 - Abort COMPLETE | |
tg10908:875805:875925 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10908:875805:875925 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10908:875805:875925 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10908:875805:875908 [0] NCCL INFO misc/socket.cc:881 -> 3 | |
tg10908:875805:875925 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10908:875805:875925 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10908:875805:875925 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10908:875805:875925 [0] NCCL INFO comm 0xa39eac0 rank 4 nranks 8 cudaDev 0 busId 4e000 - Abort COMPLETE | |
tg10908:875804:875929 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10908:875804:875929 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10908:875804:875929 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10908:875804:875929 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10908:875804:875929 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10908:875804:875929 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10908:875804:875911 [0] NCCL INFO misc/socket.cc:881 -> 3 | |
tg10908:875804:875929 [0] NCCL INFO comm 0x1ff86a80 rank 7 nranks 8 cudaDev 0 busId db000 - Abort COMPLETE | |
tg10907:995238:995361 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10907:995238:995361 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10907:995238:995361 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10907:995238:995346 [0] NCCL INFO misc/socket.cc:881 -> 3 | |
tg10907:995238:995361 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10907:995238:995361 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10907:995238:995361 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10907:995238:995361 [0] NCCL INFO comm 0x12a20e80 rank 0 nranks 8 cudaDev 0 busId 4e000 - Abort COMPLETE | |
tg10907:995235:995363 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10907:995235:995363 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10907:995235:995363 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10907:995235:995363 [0] NCCL INFO misc/socket.cc:64 -> 3 | |
tg10907:995235:995363 [0] NCCL INFO misc/socket.cc:80 -> 3 | |
tg10907:995235:995363 [0] NCCL INFO misc/socket.cc:829 -> 3 | |
tg10907:995235:995342 [0] NCCL INFO misc/socket.cc:881 -> 3 | |
tg10907:995235:995363 [0] NCCL INFO comm 0x19679cc0 rank 3 nranks 8 cudaDev 0 busId db000 - Abort COMPLETE | |
======== GPU REPORT ======== | |
======== GPU REPORT ======== | |
==============NVSMI LOG============== | |
Timestamp : Wed Apr 23 13:06:24 2025 | |
Driver Version : 570.124.06 | |
CUDA Version : 12.8 | |
==============NVSMI LOG============== | |
Timestamp : Wed Apr 23 13:06:24 2025 | |
Driver Version : 570.124.06 | |
CUDA Version : 12.8 | |
Attached GPUs : 4 | |
GPU 00000000:4E:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875805 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6051 ms | |
Is Running : 0 | |
GPU 00000000:5F:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875806 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5857 ms | |
Is Running : 0 | |
GPU 00000000:CB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875807 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5761 ms | |
Is Running : 0 | |
GPU 00000000:DB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875804 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 0 ms | |
Is Running : 1 | |
Attached GPUs : 4 | |
GPU 00000000:4E:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995238 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5928 ms | |
Is Running : 0 | |
GPU 00000000:5F:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995237 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6022 ms | |
Is Running : 0 | |
GPU 00000000:CB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995236 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5734 ms | |
Is Running : 0 | |
GPU 00000000:DB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995235 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 0 ms | |
Is Running : 1 | |
Wed Apr 23 13:06:24 2025 | |
+-----------------------------------------------------------------------------------------+ | |
| NVIDIA-SMI 570.124.06 Driver Version: 570.124.06 CUDA Version: 12.8 | | |
|-----------------------------------------+------------------------+----------------------+ | |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | |
| | | MIG M. | | |
|=========================================+========================+======================| | |
| 0 NVIDIA H100 80GB HBM3 On | 00000000:4E:00.0 Off | 0 | | |
| N/A 35C P0 118W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 1 NVIDIA H100 80GB HBM3 On | 00000000:5F:00.0 Off | 0 | | |
| N/A 34C P0 116W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
Wed Apr 23 13:06:25 2025 | |
+-----------------------------------------------------------------------------------------+ | |
| NVIDIA-SMI 570.124.06 Driver Version: 570.124.06 CUDA Version: 12.8 | | |
|-----------------------------------------+------------------------+----------------------+ | |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | |
| | | MIG M. | | |
|=========================================+========================+======================| | |
| 2 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | |
| N/A 35C P0 120W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 0 NVIDIA H100 80GB HBM3 On | 00000000:4E:00.0 Off | 0 | | |
| N/A 35C P0 118W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 3 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | |
| N/A 35C P0 120W / 700W | 1MiB / 81559MiB | 1% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
+-----------------------------------------------------------------------------------------+ | |
| Processes: | | |
| GPU GI CI PID Type Process name GPU Memory | | |
| ID ID Usage | | |
|=========================================================================================| | |
| No running processes found | | |
+-----------------------------------------------------------------------------------------+ | |
| 1 NVIDIA H100 80GB HBM3 On | 00000000:5F:00.0 Off | 0 | | |
| N/A 35C P0 123W / 700W | 1MiB / 81559MiB | 1% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 2 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | |
| N/A 35C P0 122W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 3 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | |
| N/A 35C P0 118W / 700W | 1MiB / 81559MiB | 1% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
+-----------------------------------------------------------------------------------------+ | |
| Processes: | | |
| GPU GI CI PID Type Process name GPU Memory | | |
| ID ID Usage | | |
|=========================================================================================| | |
| No running processes found | | |
+-----------------------------------------------------------------------------------------+ | |
======== GPU REPORT ======== | |
======== GPU REPORT ======== | |
==============NVSMI LOG============== | |
Timestamp : Wed Apr 23 13:06:25 2025 | |
Driver Version : 570.124.06 | |
CUDA Version : 12.8 | |
Attached GPUs : 4 | |
GPU 00000000:4E:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875805 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6051 ms | |
Is Running : 0 | |
GPU 00000000:5F:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875806 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5857 ms | |
Is Running : 0 | |
GPU 00000000:CB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875807 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5761 ms | |
Is Running : 0 | |
GPU 00000000:DB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875804 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6158 ms | |
Is Running : 0 | |
==============NVSMI LOG============== | |
Timestamp : Wed Apr 23 13:06:25 2025 | |
Driver Version : 570.124.06 | |
CUDA Version : 12.8 | |
Attached GPUs : 4 | |
GPU 00000000:4E:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995238 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5928 ms | |
Is Running : 0 | |
GPU 00000000:5F:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995237 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6022 ms | |
Is Running : 0 | |
GPU 00000000:CB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995236 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5734 ms | |
Is Running : 0 | |
GPU 00000000:DB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995235 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6147 ms | |
Is Running : 0 | |
Wed Apr 23 13:06:25 2025 | |
+-----------------------------------------------------------------------------------------+ | |
| NVIDIA-SMI 570.124.06 Driver Version: 570.124.06 CUDA Version: 12.8 | | |
|-----------------------------------------+------------------------+----------------------+ | |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | |
| | | MIG M. | | |
|=========================================+========================+======================| | |
| 0 NVIDIA H100 80GB HBM3 On | 00000000:4E:00.0 Off | 0 | | |
| N/A 35C P0 120W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 1 NVIDIA H100 80GB HBM3 On | 00000000:5F:00.0 Off | 0 | | |
| N/A 34C P0 118W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
Wed Apr 23 13:06:25 2025 | |
+-----------------------------------------------------------------------------------------+ | |
| NVIDIA-SMI 570.124.06 Driver Version: 570.124.06 CUDA Version: 12.8 | | |
|-----------------------------------------+------------------------+----------------------+ | |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | |
| | | MIG M. | | |
|=========================================+========================+======================| | |
| 2 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | |
| N/A 35C P0 121W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 3 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | |
| N/A 35C P0 122W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
+-----------------------------------------------------------------------------------------+ | |
| Processes: | | |
| GPU GI CI PID Type Process name GPU Memory | | |
| ID ID Usage | | |
|=========================================================================================| | |
| No running processes found | | |
+-----------------------------------------------------------------------------------------+ | |
| 0 NVIDIA H100 80GB HBM3 On | 00000000:4E:00.0 Off | 0 | | |
| N/A 35C P0 119W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 1 NVIDIA H100 80GB HBM3 On | 00000000:5F:00.0 Off | 0 | | |
| N/A 35C P0 125W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 2 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | |
| N/A 35C P0 124W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 3 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | |
| N/A 35C P0 119W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
+-----------------------------------------------------------------------------------------+ | |
| Processes: | | |
| GPU GI CI PID Type Process name GPU Memory | | |
| ID ID Usage | | |
|=========================================================================================| | |
| No running processes found | | |
+-----------------------------------------------------------------------------------------+ | |
======== GPU REPORT ======== | |
======== GPU REPORT ======== | |
==============NVSMI LOG============== | |
Timestamp : Wed Apr 23 13:06:26 2025 | |
Driver Version : 570.124.06 | |
CUDA Version : 12.8 | |
Attached GPUs : 4 | |
GPU 00000000:4E:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875805 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6051 ms | |
Is Running : 0 | |
GPU 00000000:5F:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875806 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5857 ms | |
Is Running : 0 | |
GPU 00000000:CB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875807 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5761 ms | |
Is Running : 0 | |
GPU 00000000:DB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875804 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6158 ms | |
Is Running : 0 | |
==============NVSMI LOG============== | |
Timestamp : Wed Apr 23 13:06:26 2025 | |
Driver Version : 570.124.06 | |
CUDA Version : 12.8 | |
Attached GPUs : 4 | |
GPU 00000000:4E:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995238 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5928 ms | |
Is Running : 0 | |
GPU 00000000:5F:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995237 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6022 ms | |
Is Running : 0 | |
GPU 00000000:CB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995236 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5734 ms | |
Is Running : 0 | |
GPU 00000000:DB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995235 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6147 ms | |
Is Running : 0 | |
Wed Apr 23 13:06:26 2025 | |
+-----------------------------------------------------------------------------------------+ | |
| NVIDIA-SMI 570.124.06 Driver Version: 570.124.06 CUDA Version: 12.8 | | |
|-----------------------------------------+------------------------+----------------------+ | |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | |
| | | MIG M. | | |
|=========================================+========================+======================| | |
| 0 NVIDIA H100 80GB HBM3 On | 00000000:4E:00.0 Off | 0 | | |
| N/A 34C P0 97W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 1 NVIDIA H100 80GB HBM3 On | 00000000:5F:00.0 Off | 0 | | |
| N/A 34C P0 118W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 2 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | |
| N/A 34C P0 100W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 3 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | |
| N/A 35C P0 122W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
+-----------------------------------------------------------------------------------------+ | |
| Processes: | | |
| GPU GI CI PID Type Process name GPU Memory | | |
| ID ID Usage | | |
|=========================================================================================| | |
| No running processes found | | |
+-----------------------------------------------------------------------------------------+ | |
Wed Apr 23 13:06:26 2025 | |
+-----------------------------------------------------------------------------------------+ | |
| NVIDIA-SMI 570.124.06 Driver Version: 570.124.06 CUDA Version: 12.8 | | |
|-----------------------------------------+------------------------+----------------------+ | |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | |
| | | MIG M. | | |
|=========================================+========================+======================| | |
| 0 NVIDIA H100 80GB HBM3 On | 00000000:4E:00.0 Off | 0 | | |
| N/A 34C P0 114W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 1 NVIDIA H100 80GB HBM3 On | 00000000:5F:00.0 Off | 0 | | |
| N/A 33C P0 89W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 2 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | |
| N/A 34C P0 121W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 3 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | |
| N/A 34C P0 102W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
+-----------------------------------------------------------------------------------------+ | |
| Processes: | | |
| GPU GI CI PID Type Process name GPU Memory | | |
| ID ID Usage | | |
|=========================================================================================| | |
| No running processes found | | |
+-----------------------------------------------------------------------------------------+ | |
======== GPU REPORT ======== | |
======== GPU REPORT ======== | |
==============NVSMI LOG============== | |
Timestamp : Wed Apr 23 13:06:27 2025 | |
Driver Version : 570.124.06 | |
CUDA Version : 12.8 | |
Attached GPUs : 4 | |
GPU 00000000:4E:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875805 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6051 ms | |
Is Running : 0 | |
GPU 00000000:5F:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875806 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5857 ms | |
Is Running : 0 | |
GPU 00000000:CB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875807 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5761 ms | |
Is Running : 0 | |
GPU 00000000:DB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 875804 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6158 ms | |
Is Running : 0 | |
==============NVSMI LOG============== | |
Timestamp : Wed Apr 23 13:06:27 2025 | |
Driver Version : 570.124.06 | |
CUDA Version : 12.8 | |
Attached GPUs : 4 | |
GPU 00000000:4E:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995238 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5928 ms | |
Is Running : 0 | |
GPU 00000000:5F:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995237 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6022 ms | |
Is Running : 0 | |
GPU 00000000:CB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995236 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5734 ms | |
Is Running : 0 | |
GPU 00000000:DB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995235 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6147 ms | |
Is Running : 0 | |
Wed Apr 23 13:06:27 2025 | |
+-----------------------------------------------------------------------------------------+ | |
| NVIDIA-SMI 570.124.06 Driver Version: 570.124.06 CUDA Version: 12.8 | | |
|-----------------------------------------+------------------------+----------------------+ | |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | |
| | | MIG M. | | |
|=========================================+========================+======================| | |
| 0 NVIDIA H100 80GB HBM3 On | 00000000:4E:00.0 Off | 0 | | |
| N/A 34C P0 69W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 1 NVIDIA H100 80GB HBM3 On | 00000000:5F:00.0 Off | 0 | | |
| N/A 33C P0 80W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 2 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | |
| N/A 34C P0 69W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 3 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | |
| N/A 34C P0 99W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
+-----------------------------------------------------------------------------------------+ | |
| Processes: | | |
| GPU GI CI PID Type Process name GPU Memory | | |
| ID ID Usage | | |
|=========================================================================================| | |
| No running processes found | | |
+-----------------------------------------------------------------------------------------+ | |
Wed Apr 23 13:06:27 2025 | |
+-----------------------------------------------------------------------------------------+ | |
| NVIDIA-SMI 570.124.06 Driver Version: 570.124.06 CUDA Version: 12.8 | | |
|-----------------------------------------+------------------------+----------------------+ | |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | |
| | | MIG M. | | |
|=========================================+========================+======================| | |
| 0 NVIDIA H100 80GB HBM3 On | 00000000:4E:00.0 Off | 0 | | |
| N/A 34C P0 69W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 1 NVIDIA H100 80GB HBM3 On | 00000000:5F:00.0 Off | 0 | | |
| N/A 33C P0 71W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 2 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | |
| N/A 34C P0 74W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 3 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | |
| N/A 34C P0 69W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
+-----------------------------------------------------------------------------------------+ | |
| Processes: | | |
| GPU GI CI PID Type Process name GPU Memory | | |
| ID ID Usage | | |
|=========================================================================================| | |
| No running processes found | | |
+-----------------------------------------------------------------------------------------+ | |
srun: error: tg10908: tasks 4-7: Exited with exit code 1 | |
srun: Terminating StepId=7014.1 | |
slurmstepd: error: *** STEP 7014.1 ON tg10907 CANCELLED AT 2025-04-23T17:06:27 *** | |
slurmstepd: error: --task-epilog failed status=15 | |
srun: error: tg10907: tasks 0-3: Exited with exit code 1 | |
======== GPU REPORT ======== | |
==============NVSMI LOG============== | |
Timestamp : Wed Apr 23 13:06:28 2025 | |
Driver Version : 570.124.06 | |
CUDA Version : 12.8 | |
Attached GPUs : 4 | |
GPU 00000000:4E:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995238 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5928 ms | |
Is Running : 0 | |
GPU 00000000:5F:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995237 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6022 ms | |
Is Running : 0 | |
GPU 00000000:CB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995236 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 5734 ms | |
Is Running : 0 | |
GPU 00000000:DB:00.0 | |
Accounting Mode : Enabled | |
Accounting Mode Buffer Size : 4000 | |
Accounted Processes | |
Process ID : 995235 | |
GPU Utilization : 0 % | |
Memory Utilization : 0 % | |
Max memory usage : 972 MiB | |
Time : 6147 ms | |
Is Running : 0 | |
Wed Apr 23 13:06:29 2025 | |
+-----------------------------------------------------------------------------------------+ | |
| NVIDIA-SMI 570.124.06 Driver Version: 570.124.06 CUDA Version: 12.8 | | |
|-----------------------------------------+------------------------+----------------------+ | |
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC | | |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. | | |
| | | MIG M. | | |
|=========================================+========================+======================| | |
| 0 NVIDIA H100 80GB HBM3 On | 00000000:4E:00.0 Off | 0 | | |
| N/A 34C P0 69W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 1 NVIDIA H100 80GB HBM3 On | 00000000:5F:00.0 Off | 0 | | |
| N/A 33C P0 71W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 2 NVIDIA H100 80GB HBM3 On | 00000000:CB:00.0 Off | 0 | | |
| N/A 34C P0 72W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
| 3 NVIDIA H100 80GB HBM3 On | 00000000:DB:00.0 Off | 0 | | |
| N/A 34C P0 69W / 700W | 1MiB / 81559MiB | 0% Default | | |
| | | Disabled | | |
+-----------------------------------------+------------------------+----------------------+ | |
+-----------------------------------------------------------------------------------------+ | |
| Processes: | | |
| GPU GI CI PID Type Process name GPU Memory | | |
| ID ID Usage | | |
|=========================================================================================| | |
| No running processes found | | |
+-----------------------------------------------------------------------------------------+ |
version = 1 | |
revision = 1 | |
requires-python = ">=3.12" | |
[[package]] | |
name = "colorama" | |
version = "0.4.6" | |
source = { registry = "https://pypi.org/simple" } | |
sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 }, | |
] | |
[[package]] | |
name = "filelock" | |
version = "3.18.0" | |
source = { registry = "https://pypi.org/simple" } | |
sdist = { url = "https://files.pythonhosted.org/packages/0a/10/c23352565a6544bdc5353e0b15fc1c563352101f30e24bf500207a54df9a/filelock-3.18.0.tar.gz", hash = "sha256:adbc88eabb99d2fec8c9c1b229b171f18afa655400173ddc653d5d01501fb9f2", size = 18075 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/4d/36/2a115987e2d8c300a974597416d9de88f2444426de9571f4b59b2cca3acc/filelock-3.18.0-py3-none-any.whl", hash = "sha256:c401f4f8377c4464e6db25fff06205fd89bdd83b65eb0488ed1b160f780e21de", size = 16215 }, | |
] | |
[[package]] | |
name = "fsspec" | |
version = "2025.3.2" | |
source = { registry = "https://pypi.org/simple" } | |
sdist = { url = "https://files.pythonhosted.org/packages/45/d8/8425e6ba5fcec61a1d16e41b1b71d2bf9344f1fe48012c2b48b9620feae5/fsspec-2025.3.2.tar.gz", hash = "sha256:e52c77ef398680bbd6a98c0e628fbc469491282981209907bbc8aea76a04fdc6", size = 299281 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/44/4b/e0cfc1a6f17e990f3e64b7d941ddc4acdc7b19d6edd51abf495f32b1a9e4/fsspec-2025.3.2-py3-none-any.whl", hash = "sha256:2daf8dc3d1dfa65b6aa37748d112773a7a08416f6c70d96b264c96476ecaf711", size = 194435 }, | |
] | |
[[package]] | |
name = "jinja2" | |
version = "3.1.6" | |
source = { registry = "https://pypi.org/simple" } | |
dependencies = [ | |
{ name = "markupsafe" }, | |
] | |
sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899 }, | |
] | |
[[package]] | |
name = "markdown-it-py" | |
version = "3.0.0" | |
source = { registry = "https://pypi.org/simple" } | |
dependencies = [ | |
{ name = "mdurl" }, | |
] | |
sdist = { url = "https://files.pythonhosted.org/packages/38/71/3b932df36c1a044d397a1f92d1cf91ee0a503d91e470cbd670aa66b07ed0/markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb", size = 74596 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/42/d7/1ec15b46af6af88f19b8e5ffea08fa375d433c998b8a7639e76935c14f1f/markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1", size = 87528 }, | |
] | |
[[package]] | |
name = "markupsafe" | |
version = "3.0.2" | |
source = { registry = "https://pypi.org/simple" } | |
sdist = { url = "https://files.pythonhosted.org/packages/b2/97/5d42485e71dfc078108a86d6de8fa46db44a1a9295e89c5d6d4a06e23a62/markupsafe-3.0.2.tar.gz", hash = "sha256:ee55d3edf80167e48ea11a923c7386f4669df67d7994554387f84e7d8b0a2bf0", size = 20537 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/22/09/d1f21434c97fc42f09d290cbb6350d44eb12f09cc62c9476effdb33a18aa/MarkupSafe-3.0.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:9778bd8ab0a994ebf6f84c2b949e65736d5575320a17ae8984a77fab08db94cf", size = 14274 }, | |
{ url = "https://files.pythonhosted.org/packages/6b/b0/18f76bba336fa5aecf79d45dcd6c806c280ec44538b3c13671d49099fdd0/MarkupSafe-3.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:846ade7b71e3536c4e56b386c2a47adf5741d2d8b94ec9dc3e92e5e1ee1e2225", size = 12348 }, | |
{ url = "https://files.pythonhosted.org/packages/e0/25/dd5c0f6ac1311e9b40f4af06c78efde0f3b5cbf02502f8ef9501294c425b/MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1c99d261bd2d5f6b59325c92c73df481e05e57f19837bdca8413b9eac4bd8028", size = 24149 }, | |
{ url = "https://files.pythonhosted.org/packages/f3/f0/89e7aadfb3749d0f52234a0c8c7867877876e0a20b60e2188e9850794c17/MarkupSafe-3.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e17c96c14e19278594aa4841ec148115f9c7615a47382ecb6b82bd8fea3ab0c8", size = 23118 }, | |
{ url = "https://files.pythonhosted.org/packages/d5/da/f2eeb64c723f5e3777bc081da884b414671982008c47dcc1873d81f625b6/MarkupSafe-3.0.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:88416bd1e65dcea10bc7569faacb2c20ce071dd1f87539ca2ab364bf6231393c", size = 22993 }, | |
{ url = "https://files.pythonhosted.org/packages/da/0e/1f32af846df486dce7c227fe0f2398dc7e2e51d4a370508281f3c1c5cddc/MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2181e67807fc2fa785d0592dc2d6206c019b9502410671cc905d132a92866557", size = 24178 }, | |
{ url = "https://files.pythonhosted.org/packages/c4/f6/bb3ca0532de8086cbff5f06d137064c8410d10779c4c127e0e47d17c0b71/MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:52305740fe773d09cffb16f8ed0427942901f00adedac82ec8b67752f58a1b22", size = 23319 }, | |
{ url = "https://files.pythonhosted.org/packages/a2/82/8be4c96ffee03c5b4a034e60a31294daf481e12c7c43ab8e34a1453ee48b/MarkupSafe-3.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ad10d3ded218f1039f11a75f8091880239651b52e9bb592ca27de44eed242a48", size = 23352 }, | |
{ url = "https://files.pythonhosted.org/packages/51/ae/97827349d3fcffee7e184bdf7f41cd6b88d9919c80f0263ba7acd1bbcb18/MarkupSafe-3.0.2-cp312-cp312-win32.whl", hash = "sha256:0f4ca02bea9a23221c0182836703cbf8930c5e9454bacce27e767509fa286a30", size = 15097 }, | |
{ url = "https://files.pythonhosted.org/packages/c1/80/a61f99dc3a936413c3ee4e1eecac96c0da5ed07ad56fd975f1a9da5bc630/MarkupSafe-3.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:8e06879fc22a25ca47312fbe7c8264eb0b662f6db27cb2d3bbbc74b1df4b9b87", size = 15601 }, | |
{ url = "https://files.pythonhosted.org/packages/83/0e/67eb10a7ecc77a0c2bbe2b0235765b98d164d81600746914bebada795e97/MarkupSafe-3.0.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:ba9527cdd4c926ed0760bc301f6728ef34d841f405abf9d4f959c478421e4efd", size = 14274 }, | |
{ url = "https://files.pythonhosted.org/packages/2b/6d/9409f3684d3335375d04e5f05744dfe7e9f120062c9857df4ab490a1031a/MarkupSafe-3.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f8b3d067f2e40fe93e1ccdd6b2e1d16c43140e76f02fb1319a05cf2b79d99430", size = 12352 }, | |
{ url = "https://files.pythonhosted.org/packages/d2/f5/6eadfcd3885ea85fe2a7c128315cc1bb7241e1987443d78c8fe712d03091/MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:569511d3b58c8791ab4c2e1285575265991e6d8f8700c7be0e88f86cb0672094", size = 24122 }, | |
{ url = "https://files.pythonhosted.org/packages/0c/91/96cf928db8236f1bfab6ce15ad070dfdd02ed88261c2afafd4b43575e9e9/MarkupSafe-3.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:15ab75ef81add55874e7ab7055e9c397312385bd9ced94920f2802310c930396", size = 23085 }, | |
{ url = "https://files.pythonhosted.org/packages/c2/cf/c9d56af24d56ea04daae7ac0940232d31d5a8354f2b457c6d856b2057d69/MarkupSafe-3.0.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3818cb119498c0678015754eba762e0d61e5b52d34c8b13d770f0719f7b1d79", size = 22978 }, | |
{ url = "https://files.pythonhosted.org/packages/2a/9f/8619835cd6a711d6272d62abb78c033bda638fdc54c4e7f4272cf1c0962b/MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:cdb82a876c47801bb54a690c5ae105a46b392ac6099881cdfb9f6e95e4014c6a", size = 24208 }, | |
{ url = "https://files.pythonhosted.org/packages/f9/bf/176950a1792b2cd2102b8ffeb5133e1ed984547b75db47c25a67d3359f77/MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:cabc348d87e913db6ab4aa100f01b08f481097838bdddf7c7a84b7575b7309ca", size = 23357 }, | |
{ url = "https://files.pythonhosted.org/packages/ce/4f/9a02c1d335caabe5c4efb90e1b6e8ee944aa245c1aaaab8e8a618987d816/MarkupSafe-3.0.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:444dcda765c8a838eaae23112db52f1efaf750daddb2d9ca300bcae1039adc5c", size = 23344 }, | |
{ url = "https://files.pythonhosted.org/packages/ee/55/c271b57db36f748f0e04a759ace9f8f759ccf22b4960c270c78a394f58be/MarkupSafe-3.0.2-cp313-cp313-win32.whl", hash = "sha256:bcf3e58998965654fdaff38e58584d8937aa3096ab5354d493c77d1fdd66d7a1", size = 15101 }, | |
{ url = "https://files.pythonhosted.org/packages/29/88/07df22d2dd4df40aba9f3e402e6dc1b8ee86297dddbad4872bd5e7b0094f/MarkupSafe-3.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:e6a2a455bd412959b57a172ce6328d2dd1f01cb2135efda2e4576e8a23fa3b0f", size = 15603 }, | |
{ url = "https://files.pythonhosted.org/packages/62/6a/8b89d24db2d32d433dffcd6a8779159da109842434f1dd2f6e71f32f738c/MarkupSafe-3.0.2-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:b5a6b3ada725cea8a5e634536b1b01c30bcdcd7f9c6fff4151548d5bf6b3a36c", size = 14510 }, | |
{ url = "https://files.pythonhosted.org/packages/7a/06/a10f955f70a2e5a9bf78d11a161029d278eeacbd35ef806c3fd17b13060d/MarkupSafe-3.0.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a904af0a6162c73e3edcb969eeeb53a63ceeb5d8cf642fade7d39e7963a22ddb", size = 12486 }, | |
{ url = "https://files.pythonhosted.org/packages/34/cf/65d4a571869a1a9078198ca28f39fba5fbb910f952f9dbc5220afff9f5e6/MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4aa4e5faecf353ed117801a068ebab7b7e09ffb6e1d5e412dc852e0da018126c", size = 25480 }, | |
{ url = "https://files.pythonhosted.org/packages/0c/e3/90e9651924c430b885468b56b3d597cabf6d72be4b24a0acd1fa0e12af67/MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0ef13eaeee5b615fb07c9a7dadb38eac06a0608b41570d8ade51c56539e509d", size = 23914 }, | |
{ url = "https://files.pythonhosted.org/packages/66/8c/6c7cf61f95d63bb866db39085150df1f2a5bd3335298f14a66b48e92659c/MarkupSafe-3.0.2-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d16a81a06776313e817c951135cf7340a3e91e8c1ff2fac444cfd75fffa04afe", size = 23796 }, | |
{ url = "https://files.pythonhosted.org/packages/bb/35/cbe9238ec3f47ac9a7c8b3df7a808e7cb50fe149dc7039f5f454b3fba218/MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:6381026f158fdb7c72a168278597a5e3a5222e83ea18f543112b2662a9b699c5", size = 25473 }, | |
{ url = "https://files.pythonhosted.org/packages/e6/32/7621a4382488aa283cc05e8984a9c219abad3bca087be9ec77e89939ded9/MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:3d79d162e7be8f996986c064d1c7c817f6df3a77fe3d6859f6f9e7be4b8c213a", size = 24114 }, | |
{ url = "https://files.pythonhosted.org/packages/0d/80/0985960e4b89922cb5a0bac0ed39c5b96cbc1a536a99f30e8c220a996ed9/MarkupSafe-3.0.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:131a3c7689c85f5ad20f9f6fb1b866f402c445b220c19fe4308c0b147ccd2ad9", size = 24098 }, | |
{ url = "https://files.pythonhosted.org/packages/82/78/fedb03c7d5380df2427038ec8d973587e90561b2d90cd472ce9254cf348b/MarkupSafe-3.0.2-cp313-cp313t-win32.whl", hash = "sha256:ba8062ed2cf21c07a9e295d5b8a2a5ce678b913b45fdf68c32d95d6c1291e0b6", size = 15208 }, | |
{ url = "https://files.pythonhosted.org/packages/4f/65/6079a46068dfceaeabb5dcad6d674f5f5c61a6fa5673746f42a9f4c233b3/MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f", size = 15739 }, | |
] | |
[[package]] | |
name = "mdurl" | |
version = "0.1.2" | |
source = { registry = "https://pypi.org/simple" } | |
sdist = { url = "https://files.pythonhosted.org/packages/d6/54/cfe61301667036ec958cb99bd3efefba235e65cdeb9c84d24a8293ba1d90/mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba", size = 8729 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/b3/38/89ba8ad64ae25be8de66a6d463314cf1eb366222074cfda9ee839c56a4b4/mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8", size = 9979 }, | |
] | |
[[package]] | |
name = "mpmath" | |
version = "1.3.0" | |
source = { registry = "https://pypi.org/simple" } | |
sdist = { url = "https://files.pythonhosted.org/packages/e0/47/dd32fa426cc72114383ac549964eecb20ecfd886d1e5ccf5340b55b02f57/mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f", size = 508106 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/43/e3/7d92a15f894aa0c9c4b49b8ee9ac9850d6e63b03c9c32c0367a13ae62209/mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c", size = 536198 }, | |
] | |
[[package]] | |
name = "networkx" | |
version = "3.4.2" | |
source = { registry = "https://pypi.org/simple" } | |
sdist = { url = "https://files.pythonhosted.org/packages/fd/1d/06475e1cd5264c0b870ea2cc6fdb3e37177c1e565c43f56ff17a10e3937f/networkx-3.4.2.tar.gz", hash = "sha256:307c3669428c5362aab27c8a1260aa8f47c4e91d3891f48be0141738d8d053e1", size = 2151368 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/b9/54/dd730b32ea14ea797530a4479b2ed46a6fb250f682a9cfb997e968bf0261/networkx-3.4.2-py3-none-any.whl", hash = "sha256:df5d4365b724cf81b8c6a7312509d0c22386097011ad1abe274afd5e9d3bbc5f", size = 1723263 }, | |
] | |
[[package]] | |
name = "numpy" | |
version = "2.2.5" | |
source = { registry = "https://pypi.org/simple" } | |
sdist = { url = "https://files.pythonhosted.org/packages/dc/b2/ce4b867d8cd9c0ee84938ae1e6a6f7926ebf928c9090d036fc3c6a04f946/numpy-2.2.5.tar.gz", hash = "sha256:a9c0d994680cd991b1cb772e8b297340085466a6fe964bc9d4e80f5e2f43c291", size = 20273920 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/e2/f7/1fd4ff108cd9d7ef929b8882692e23665dc9c23feecafbb9c6b80f4ec583/numpy-2.2.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ee461a4eaab4f165b68780a6a1af95fb23a29932be7569b9fab666c407969051", size = 20948633 }, | |
{ url = "https://files.pythonhosted.org/packages/12/03/d443c278348371b20d830af155ff2079acad6a9e60279fac2b41dbbb73d8/numpy-2.2.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ec31367fd6a255dc8de4772bd1658c3e926d8e860a0b6e922b615e532d320ddc", size = 14176123 }, | |
{ url = "https://files.pythonhosted.org/packages/2b/0b/5ca264641d0e7b14393313304da48b225d15d471250376f3fbdb1a2be603/numpy-2.2.5-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:47834cde750d3c9f4e52c6ca28a7361859fcaf52695c7dc3cc1a720b8922683e", size = 5163817 }, | |
{ url = "https://files.pythonhosted.org/packages/04/b3/d522672b9e3d28e26e1613de7675b441bbd1eaca75db95680635dd158c67/numpy-2.2.5-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:2c1a1c6ccce4022383583a6ded7bbcda22fc635eb4eb1e0a053336425ed36dfa", size = 6698066 }, | |
{ url = "https://files.pythonhosted.org/packages/a0/93/0f7a75c1ff02d4b76df35079676b3b2719fcdfb39abdf44c8b33f43ef37d/numpy-2.2.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9d75f338f5f79ee23548b03d801d28a505198297534f62416391857ea0479571", size = 14087277 }, | |
{ url = "https://files.pythonhosted.org/packages/b0/d9/7c338b923c53d431bc837b5b787052fef9ae68a56fe91e325aac0d48226e/numpy-2.2.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a801fef99668f309b88640e28d261991bfad9617c27beda4a3aec4f217ea073", size = 16135742 }, | |
{ url = "https://files.pythonhosted.org/packages/2d/10/4dec9184a5d74ba9867c6f7d1e9f2e0fb5fe96ff2bf50bb6f342d64f2003/numpy-2.2.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:abe38cd8381245a7f49967a6010e77dbf3680bd3627c0fe4362dd693b404c7f8", size = 15581825 }, | |
{ url = "https://files.pythonhosted.org/packages/80/1f/2b6fcd636e848053f5b57712a7d1880b1565eec35a637fdfd0a30d5e738d/numpy-2.2.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5a0ac90e46fdb5649ab6369d1ab6104bfe5854ab19b645bf5cda0127a13034ae", size = 17899600 }, | |
{ url = "https://files.pythonhosted.org/packages/ec/87/36801f4dc2623d76a0a3835975524a84bd2b18fe0f8835d45c8eae2f9ff2/numpy-2.2.5-cp312-cp312-win32.whl", hash = "sha256:0cd48122a6b7eab8f06404805b1bd5856200e3ed6f8a1b9a194f9d9054631beb", size = 6312626 }, | |
{ url = "https://files.pythonhosted.org/packages/8b/09/4ffb4d6cfe7ca6707336187951992bd8a8b9142cf345d87ab858d2d7636a/numpy-2.2.5-cp312-cp312-win_amd64.whl", hash = "sha256:ced69262a8278547e63409b2653b372bf4baff0870c57efa76c5703fd6543282", size = 12645715 }, | |
{ url = "https://files.pythonhosted.org/packages/e2/a0/0aa7f0f4509a2e07bd7a509042967c2fab635690d4f48c6c7b3afd4f448c/numpy-2.2.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:059b51b658f4414fff78c6d7b1b4e18283ab5fa56d270ff212d5ba0c561846f4", size = 20935102 }, | |
{ url = "https://files.pythonhosted.org/packages/7e/e4/a6a9f4537542912ec513185396fce52cdd45bdcf3e9d921ab02a93ca5aa9/numpy-2.2.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:47f9ed103af0bc63182609044b0490747e03bd20a67e391192dde119bf43d52f", size = 14191709 }, | |
{ url = "https://files.pythonhosted.org/packages/be/65/72f3186b6050bbfe9c43cb81f9df59ae63603491d36179cf7a7c8d216758/numpy-2.2.5-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:261a1ef047751bb02f29dfe337230b5882b54521ca121fc7f62668133cb119c9", size = 5149173 }, | |
{ url = "https://files.pythonhosted.org/packages/e5/e9/83e7a9432378dde5802651307ae5e9ea07bb72b416728202218cd4da2801/numpy-2.2.5-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:4520caa3807c1ceb005d125a75e715567806fed67e315cea619d5ec6e75a4191", size = 6684502 }, | |
{ url = "https://files.pythonhosted.org/packages/ea/27/b80da6c762394c8ee516b74c1f686fcd16c8f23b14de57ba0cad7349d1d2/numpy-2.2.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3d14b17b9be5f9c9301f43d2e2a4886a33b53f4e6fdf9ca2f4cc60aeeee76372", size = 14084417 }, | |
{ url = "https://files.pythonhosted.org/packages/aa/fc/ebfd32c3e124e6a1043e19c0ab0769818aa69050ce5589b63d05ff185526/numpy-2.2.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ba321813a00e508d5421104464510cc962a6f791aa2fca1c97b1e65027da80d", size = 16133807 }, | |
{ url = "https://files.pythonhosted.org/packages/bf/9b/4cc171a0acbe4666f7775cfd21d4eb6bb1d36d3a0431f48a73e9212d2278/numpy-2.2.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a4cbdef3ddf777423060c6f81b5694bad2dc9675f110c4b2a60dc0181543fac7", size = 15575611 }, | |
{ url = "https://files.pythonhosted.org/packages/a3/45/40f4135341850df48f8edcf949cf47b523c404b712774f8855a64c96ef29/numpy-2.2.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:54088a5a147ab71a8e7fdfd8c3601972751ded0739c6b696ad9cb0343e21ab73", size = 17895747 }, | |
{ url = "https://files.pythonhosted.org/packages/f8/4c/b32a17a46f0ffbde8cc82df6d3daeaf4f552e346df143e1b188a701a8f09/numpy-2.2.5-cp313-cp313-win32.whl", hash = "sha256:c8b82a55ef86a2d8e81b63da85e55f5537d2157165be1cb2ce7cfa57b6aef38b", size = 6309594 }, | |
{ url = "https://files.pythonhosted.org/packages/13/ae/72e6276feb9ef06787365b05915bfdb057d01fceb4a43cb80978e518d79b/numpy-2.2.5-cp313-cp313-win_amd64.whl", hash = "sha256:d8882a829fd779f0f43998e931c466802a77ca1ee0fe25a3abe50278616b1471", size = 12638356 }, | |
{ url = "https://files.pythonhosted.org/packages/79/56/be8b85a9f2adb688e7ded6324e20149a03541d2b3297c3ffc1a73f46dedb/numpy-2.2.5-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:e8b025c351b9f0e8b5436cf28a07fa4ac0204d67b38f01433ac7f9b870fa38c6", size = 20963778 }, | |
{ url = "https://files.pythonhosted.org/packages/ff/77/19c5e62d55bff507a18c3cdff82e94fe174957bad25860a991cac719d3ab/numpy-2.2.5-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:8dfa94b6a4374e7851bbb6f35e6ded2120b752b063e6acdd3157e4d2bb922eba", size = 14207279 }, | |
{ url = "https://files.pythonhosted.org/packages/75/22/aa11f22dc11ff4ffe4e849d9b63bbe8d4ac6d5fae85ddaa67dfe43be3e76/numpy-2.2.5-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:97c8425d4e26437e65e1d189d22dff4a079b747ff9c2788057bfb8114ce1e133", size = 5199247 }, | |
{ url = "https://files.pythonhosted.org/packages/4f/6c/12d5e760fc62c08eded0394f62039f5a9857f758312bf01632a81d841459/numpy-2.2.5-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:352d330048c055ea6db701130abc48a21bec690a8d38f8284e00fab256dc1376", size = 6711087 }, | |
{ url = "https://files.pythonhosted.org/packages/ef/94/ece8280cf4218b2bee5cec9567629e61e51b4be501e5c6840ceb593db945/numpy-2.2.5-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b4c0773b6ada798f51f0f8e30c054d32304ccc6e9c5d93d46cb26f3d385ab19", size = 14059964 }, | |
{ url = "https://files.pythonhosted.org/packages/39/41/c5377dac0514aaeec69115830a39d905b1882819c8e65d97fc60e177e19e/numpy-2.2.5-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:55f09e00d4dccd76b179c0f18a44f041e5332fd0e022886ba1c0bbf3ea4a18d0", size = 16121214 }, | |
{ url = "https://files.pythonhosted.org/packages/db/54/3b9f89a943257bc8e187145c6bc0eb8e3d615655f7b14e9b490b053e8149/numpy-2.2.5-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:02f226baeefa68f7d579e213d0f3493496397d8f1cff5e2b222af274c86a552a", size = 15575788 }, | |
{ url = "https://files.pythonhosted.org/packages/b1/c4/2e407e85df35b29f79945751b8f8e671057a13a376497d7fb2151ba0d290/numpy-2.2.5-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c26843fd58f65da9491165072da2cccc372530681de481ef670dcc8e27cfb066", size = 17893672 }, | |
{ url = "https://files.pythonhosted.org/packages/29/7e/d0b44e129d038dba453f00d0e29ebd6eaf2f06055d72b95b9947998aca14/numpy-2.2.5-cp313-cp313t-win32.whl", hash = "sha256:1a161c2c79ab30fe4501d5a2bbfe8b162490757cf90b7f05be8b80bc02f7bb8e", size = 6377102 }, | |
{ url = "https://files.pythonhosted.org/packages/63/be/b85e4aa4bf42c6502851b971f1c326d583fcc68227385f92089cf50a7b45/numpy-2.2.5-cp313-cp313t-win_amd64.whl", hash = "sha256:d403c84991b5ad291d3809bace5e85f4bbf44a04bdc9a88ed2bb1807b3360bb8", size = 12750096 }, | |
] | |
[[package]] | |
name = "nvidia-cublas-cu12" | |
version = "12.6.4.1" | |
source = { registry = "https://pypi.org/simple" } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/af/eb/ff4b8c503fa1f1796679dce648854d58751982426e4e4b37d6fce49d259c/nvidia_cublas_cu12-12.6.4.1-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:08ed2686e9875d01b58e3cb379c6896df8e76c75e0d4a7f7dace3d7b6d9ef8eb", size = 393138322 }, | |
] | |
[[package]] | |
name = "nvidia-cuda-cupti-cu12" | |
version = "12.6.80" | |
source = { registry = "https://pypi.org/simple" } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/49/60/7b6497946d74bcf1de852a21824d63baad12cd417db4195fc1bfe59db953/nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6768bad6cab4f19e8292125e5f1ac8aa7d1718704012a0e3272a6f61c4bce132", size = 8917980 }, | |
{ url = "https://files.pythonhosted.org/packages/a5/24/120ee57b218d9952c379d1e026c4479c9ece9997a4fb46303611ee48f038/nvidia_cuda_cupti_cu12-12.6.80-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a3eff6cdfcc6a4c35db968a06fcadb061cbc7d6dde548609a941ff8701b98b73", size = 8917972 }, | |
] | |
[[package]] | |
name = "nvidia-cuda-nvrtc-cu12" | |
version = "12.6.77" | |
source = { registry = "https://pypi.org/simple" } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/75/2e/46030320b5a80661e88039f59060d1790298b4718944a65a7f2aeda3d9e9/nvidia_cuda_nvrtc_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:35b0cc6ee3a9636d5409133e79273ce1f3fd087abb0532d2d2e8fff1fe9efc53", size = 23650380 }, | |
] | |
[[package]] | |
name = "nvidia-cuda-runtime-cu12" | |
version = "12.6.77" | |
source = { registry = "https://pypi.org/simple" } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/e1/23/e717c5ac26d26cf39a27fbc076240fad2e3b817e5889d671b67f4f9f49c5/nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ba3b56a4f896141e25e19ab287cd71e52a6a0f4b29d0d31609f60e3b4d5219b7", size = 897690 }, | |
{ url = "https://files.pythonhosted.org/packages/f0/62/65c05e161eeddbafeca24dc461f47de550d9fa8a7e04eb213e32b55cfd99/nvidia_cuda_runtime_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a84d15d5e1da416dd4774cb42edf5e954a3e60cc945698dc1d5be02321c44dc8", size = 897678 }, | |
] | |
[[package]] | |
name = "nvidia-cudnn-cu12" | |
version = "9.5.1.17" | |
source = { registry = "https://pypi.org/simple" } | |
dependencies = [ | |
{ name = "nvidia-cublas-cu12" }, | |
] | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/2a/78/4535c9c7f859a64781e43c969a3a7e84c54634e319a996d43ef32ce46f83/nvidia_cudnn_cu12-9.5.1.17-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:30ac3869f6db17d170e0e556dd6cc5eee02647abc31ca856634d5a40f82c15b2", size = 570988386 }, | |
] | |
[[package]] | |
name = "nvidia-cufft-cu12" | |
version = "11.3.0.4" | |
source = { registry = "https://pypi.org/simple" } | |
dependencies = [ | |
{ name = "nvidia-nvjitlink-cu12" }, | |
] | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/8f/16/73727675941ab8e6ffd86ca3a4b7b47065edcca7a997920b831f8147c99d/nvidia_cufft_cu12-11.3.0.4-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ccba62eb9cef5559abd5e0d54ceed2d9934030f51163df018532142a8ec533e5", size = 200221632 }, | |
{ url = "https://files.pythonhosted.org/packages/60/de/99ec247a07ea40c969d904fc14f3a356b3e2a704121675b75c366b694ee1/nvidia_cufft_cu12-11.3.0.4-py3-none-manylinux2014_x86_64.whl", hash = "sha256:768160ac89f6f7b459bee747e8d175dbf53619cfe74b2a5636264163138013ca", size = 200221622 }, | |
] | |
[[package]] | |
name = "nvidia-cufile-cu12" | |
version = "1.11.1.6" | |
source = { registry = "https://pypi.org/simple" } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/b2/66/cc9876340ac68ae71b15c743ddb13f8b30d5244af344ec8322b449e35426/nvidia_cufile_cu12-1.11.1.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cc23469d1c7e52ce6c1d55253273d32c565dd22068647f3aa59b3c6b005bf159", size = 1142103 }, | |
] | |
[[package]] | |
name = "nvidia-curand-cu12" | |
version = "10.3.7.77" | |
source = { registry = "https://pypi.org/simple" } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/73/1b/44a01c4e70933637c93e6e1a8063d1e998b50213a6b65ac5a9169c47e98e/nvidia_curand_cu12-10.3.7.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a42cd1344297f70b9e39a1e4f467a4e1c10f1da54ff7a85c12197f6c652c8bdf", size = 56279010 }, | |
{ url = "https://files.pythonhosted.org/packages/4a/aa/2c7ff0b5ee02eaef890c0ce7d4f74bc30901871c5e45dee1ae6d0083cd80/nvidia_curand_cu12-10.3.7.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:99f1a32f1ac2bd134897fc7a203f779303261268a65762a623bf30cc9fe79117", size = 56279000 }, | |
] | |
[[package]] | |
name = "nvidia-cusolver-cu12" | |
version = "11.7.1.2" | |
source = { registry = "https://pypi.org/simple" } | |
dependencies = [ | |
{ name = "nvidia-cublas-cu12" }, | |
{ name = "nvidia-cusparse-cu12" }, | |
{ name = "nvidia-nvjitlink-cu12" }, | |
] | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/f0/6e/c2cf12c9ff8b872e92b4a5740701e51ff17689c4d726fca91875b07f655d/nvidia_cusolver_cu12-11.7.1.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e9e49843a7707e42022babb9bcfa33c29857a93b88020c4e4434656a655b698c", size = 158229790 }, | |
{ url = "https://files.pythonhosted.org/packages/9f/81/baba53585da791d043c10084cf9553e074548408e04ae884cfe9193bd484/nvidia_cusolver_cu12-11.7.1.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:6cf28f17f64107a0c4d7802be5ff5537b2130bfc112f25d5a30df227058ca0e6", size = 158229780 }, | |
] | |
[[package]] | |
name = "nvidia-cusparse-cu12" | |
version = "12.5.4.2" | |
source = { registry = "https://pypi.org/simple" } | |
dependencies = [ | |
{ name = "nvidia-nvjitlink-cu12" }, | |
] | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/06/1e/b8b7c2f4099a37b96af5c9bb158632ea9e5d9d27d7391d7eb8fc45236674/nvidia_cusparse_cu12-12.5.4.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7556d9eca156e18184b94947ade0fba5bb47d69cec46bf8660fd2c71a4b48b73", size = 216561367 }, | |
{ url = "https://files.pythonhosted.org/packages/43/ac/64c4316ba163e8217a99680c7605f779accffc6a4bcd0c778c12948d3707/nvidia_cusparse_cu12-12.5.4.2-py3-none-manylinux2014_x86_64.whl", hash = "sha256:23749a6571191a215cb74d1cdbff4a86e7b19f1200c071b3fcf844a5bea23a2f", size = 216561357 }, | |
] | |
[[package]] | |
name = "nvidia-cusparselt-cu12" | |
version = "0.6.3" | |
source = { registry = "https://pypi.org/simple" } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/3b/9a/72ef35b399b0e183bc2e8f6f558036922d453c4d8237dab26c666a04244b/nvidia_cusparselt_cu12-0.6.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:e5c8a26c36445dd2e6812f1177978a24e2d37cacce7e090f297a688d1ec44f46", size = 156785796 }, | |
] | |
[[package]] | |
name = "nvidia-nccl-cu12" | |
version = "2.26.2" | |
source = { registry = "https://pypi.org/simple" } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/67/ca/f42388aed0fddd64ade7493dbba36e1f534d4e6fdbdd355c6a90030ae028/nvidia_nccl_cu12-2.26.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:694cf3879a206553cc9d7dbda76b13efaf610fdb70a50cba303de1b0d1530ac6", size = 201319755 }, | |
] | |
[[package]] | |
name = "nvidia-nvjitlink-cu12" | |
version = "12.6.85" | |
source = { registry = "https://pypi.org/simple" } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/9d/d7/c5383e47c7e9bf1c99d5bd2a8c935af2b6d705ad831a7ec5c97db4d82f4f/nvidia_nvjitlink_cu12-12.6.85-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl", hash = "sha256:eedc36df9e88b682efe4309aa16b5b4e78c2407eac59e8c10a6a47535164369a", size = 19744971 }, | |
] | |
[[package]] | |
name = "nvidia-nvtx-cu12" | |
version = "12.6.77" | |
source = { registry = "https://pypi.org/simple" } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/56/9a/fff8376f8e3d084cd1530e1ef7b879bb7d6d265620c95c1b322725c694f4/nvidia_nvtx_cu12-12.6.77-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b90bed3df379fa79afbd21be8e04a0314336b8ae16768b58f2d34cb1d04cd7d2", size = 89276 }, | |
{ url = "https://files.pythonhosted.org/packages/9e/4e/0d0c945463719429b7bd21dece907ad0bde437a2ff12b9b12fee94722ab0/nvidia_nvtx_cu12-12.6.77-py3-none-manylinux2014_x86_64.whl", hash = "sha256:6574241a3ec5fdc9334353ab8c479fe75841dbe8f4532a8fc97ce63503330ba1", size = 89265 }, | |
] | |
[[package]] | |
name = "pillow" | |
version = "11.2.1" | |
source = { registry = "https://pypi.org/simple" } | |
sdist = { url = "https://files.pythonhosted.org/packages/af/cb/bb5c01fcd2a69335b86c22142b2bccfc3464087efb7fd382eee5ffc7fdf7/pillow-11.2.1.tar.gz", hash = "sha256:a64dd61998416367b7ef979b73d3a85853ba9bec4c2925f74e588879a58716b6", size = 47026707 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/c7/40/052610b15a1b8961f52537cc8326ca6a881408bc2bdad0d852edeb6ed33b/pillow-11.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:78afba22027b4accef10dbd5eed84425930ba41b3ea0a86fa8d20baaf19d807f", size = 3190185 }, | |
{ url = "https://files.pythonhosted.org/packages/e5/7e/b86dbd35a5f938632093dc40d1682874c33dcfe832558fc80ca56bfcb774/pillow-11.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:78092232a4ab376a35d68c4e6d5e00dfd73454bd12b230420025fbe178ee3b0b", size = 3030306 }, | |
{ url = "https://files.pythonhosted.org/packages/a4/5c/467a161f9ed53e5eab51a42923c33051bf8d1a2af4626ac04f5166e58e0c/pillow-11.2.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25a5f306095c6780c52e6bbb6109624b95c5b18e40aab1c3041da3e9e0cd3e2d", size = 4416121 }, | |
{ url = "https://files.pythonhosted.org/packages/62/73/972b7742e38ae0e2ac76ab137ca6005dcf877480da0d9d61d93b613065b4/pillow-11.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c7b29dbd4281923a2bfe562acb734cee96bbb129e96e6972d315ed9f232bef4", size = 4501707 }, | |
{ url = "https://files.pythonhosted.org/packages/e4/3a/427e4cb0b9e177efbc1a84798ed20498c4f233abde003c06d2650a6d60cb/pillow-11.2.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:3e645b020f3209a0181a418bffe7b4a93171eef6c4ef6cc20980b30bebf17b7d", size = 4522921 }, | |
{ url = "https://files.pythonhosted.org/packages/fe/7c/d8b1330458e4d2f3f45d9508796d7caf0c0d3764c00c823d10f6f1a3b76d/pillow-11.2.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:b2dbea1012ccb784a65349f57bbc93730b96e85b42e9bf7b01ef40443db720b4", size = 4612523 }, | |
{ url = "https://files.pythonhosted.org/packages/b3/2f/65738384e0b1acf451de5a573d8153fe84103772d139e1e0bdf1596be2ea/pillow-11.2.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:da3104c57bbd72948d75f6a9389e6727d2ab6333c3617f0a89d72d4940aa0443", size = 4587836 }, | |
{ url = "https://files.pythonhosted.org/packages/6a/c5/e795c9f2ddf3debb2dedd0df889f2fe4b053308bb59a3cc02a0cd144d641/pillow-11.2.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:598174aef4589af795f66f9caab87ba4ff860ce08cd5bb447c6fc553ffee603c", size = 4669390 }, | |
{ url = "https://files.pythonhosted.org/packages/96/ae/ca0099a3995976a9fce2f423166f7bff9b12244afdc7520f6ed38911539a/pillow-11.2.1-cp312-cp312-win32.whl", hash = "sha256:1d535df14716e7f8776b9e7fee118576d65572b4aad3ed639be9e4fa88a1cad3", size = 2332309 }, | |
{ url = "https://files.pythonhosted.org/packages/7c/18/24bff2ad716257fc03da964c5e8f05d9790a779a8895d6566e493ccf0189/pillow-11.2.1-cp312-cp312-win_amd64.whl", hash = "sha256:14e33b28bf17c7a38eede290f77db7c664e4eb01f7869e37fa98a5aa95978941", size = 2676768 }, | |
{ url = "https://files.pythonhosted.org/packages/da/bb/e8d656c9543276517ee40184aaa39dcb41e683bca121022f9323ae11b39d/pillow-11.2.1-cp312-cp312-win_arm64.whl", hash = "sha256:21e1470ac9e5739ff880c211fc3af01e3ae505859392bf65458c224d0bf283eb", size = 2415087 }, | |
{ url = "https://files.pythonhosted.org/packages/36/9c/447528ee3776e7ab8897fe33697a7ff3f0475bb490c5ac1456a03dc57956/pillow-11.2.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:fdec757fea0b793056419bca3e9932eb2b0ceec90ef4813ea4c1e072c389eb28", size = 3190098 }, | |
{ url = "https://files.pythonhosted.org/packages/b5/09/29d5cd052f7566a63e5b506fac9c60526e9ecc553825551333e1e18a4858/pillow-11.2.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:b0e130705d568e2f43a17bcbe74d90958e8a16263868a12c3e0d9c8162690830", size = 3030166 }, | |
{ url = "https://files.pythonhosted.org/packages/71/5d/446ee132ad35e7600652133f9c2840b4799bbd8e4adba881284860da0a36/pillow-11.2.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7bdb5e09068332578214cadd9c05e3d64d99e0e87591be22a324bdbc18925be0", size = 4408674 }, | |
{ url = "https://files.pythonhosted.org/packages/69/5f/cbe509c0ddf91cc3a03bbacf40e5c2339c4912d16458fcb797bb47bcb269/pillow-11.2.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d189ba1bebfbc0c0e529159631ec72bb9e9bc041f01ec6d3233d6d82eb823bc1", size = 4496005 }, | |
{ url = "https://files.pythonhosted.org/packages/f9/b3/dd4338d8fb8a5f312021f2977fb8198a1184893f9b00b02b75d565c33b51/pillow-11.2.1-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:191955c55d8a712fab8934a42bfefbf99dd0b5875078240943f913bb66d46d9f", size = 4518707 }, | |
{ url = "https://files.pythonhosted.org/packages/13/eb/2552ecebc0b887f539111c2cd241f538b8ff5891b8903dfe672e997529be/pillow-11.2.1-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:ad275964d52e2243430472fc5d2c2334b4fc3ff9c16cb0a19254e25efa03a155", size = 4610008 }, | |
{ url = "https://files.pythonhosted.org/packages/72/d1/924ce51bea494cb6e7959522d69d7b1c7e74f6821d84c63c3dc430cbbf3b/pillow-11.2.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:750f96efe0597382660d8b53e90dd1dd44568a8edb51cb7f9d5d918b80d4de14", size = 4585420 }, | |
{ url = "https://files.pythonhosted.org/packages/43/ab/8f81312d255d713b99ca37479a4cb4b0f48195e530cdc1611990eb8fd04b/pillow-11.2.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:fe15238d3798788d00716637b3d4e7bb6bde18b26e5d08335a96e88564a36b6b", size = 4667655 }, | |
{ url = "https://files.pythonhosted.org/packages/94/86/8f2e9d2dc3d308dfd137a07fe1cc478df0a23d42a6c4093b087e738e4827/pillow-11.2.1-cp313-cp313-win32.whl", hash = "sha256:3fe735ced9a607fee4f481423a9c36701a39719252a9bb251679635f99d0f7d2", size = 2332329 }, | |
{ url = "https://files.pythonhosted.org/packages/6d/ec/1179083b8d6067a613e4d595359b5fdea65d0a3b7ad623fee906e1b3c4d2/pillow-11.2.1-cp313-cp313-win_amd64.whl", hash = "sha256:74ee3d7ecb3f3c05459ba95eed5efa28d6092d751ce9bf20e3e253a4e497e691", size = 2676388 }, | |
{ url = "https://files.pythonhosted.org/packages/23/f1/2fc1e1e294de897df39fa8622d829b8828ddad938b0eaea256d65b84dd72/pillow-11.2.1-cp313-cp313-win_arm64.whl", hash = "sha256:5119225c622403afb4b44bad4c1ca6c1f98eed79db8d3bc6e4e160fc6339d66c", size = 2414950 }, | |
{ url = "https://files.pythonhosted.org/packages/c4/3e/c328c48b3f0ead7bab765a84b4977acb29f101d10e4ef57a5e3400447c03/pillow-11.2.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:8ce2e8411c7aaef53e6bb29fe98f28cd4fbd9a1d9be2eeea434331aac0536b22", size = 3192759 }, | |
{ url = "https://files.pythonhosted.org/packages/18/0e/1c68532d833fc8b9f404d3a642991441d9058eccd5606eab31617f29b6d4/pillow-11.2.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:9ee66787e095127116d91dea2143db65c7bb1e232f617aa5957c0d9d2a3f23a7", size = 3033284 }, | |
{ url = "https://files.pythonhosted.org/packages/b7/cb/6faf3fb1e7705fd2db74e070f3bf6f88693601b0ed8e81049a8266de4754/pillow-11.2.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9622e3b6c1d8b551b6e6f21873bdcc55762b4b2126633014cea1803368a9aa16", size = 4445826 }, | |
{ url = "https://files.pythonhosted.org/packages/07/94/8be03d50b70ca47fb434a358919d6a8d6580f282bbb7af7e4aa40103461d/pillow-11.2.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:63b5dff3a68f371ea06025a1a6966c9a1e1ee452fc8020c2cd0ea41b83e9037b", size = 4527329 }, | |
{ url = "https://files.pythonhosted.org/packages/fd/a4/bfe78777076dc405e3bd2080bc32da5ab3945b5a25dc5d8acaa9de64a162/pillow-11.2.1-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:31df6e2d3d8fc99f993fd253e97fae451a8db2e7207acf97859732273e108406", size = 4549049 }, | |
{ url = "https://files.pythonhosted.org/packages/65/4d/eaf9068dc687c24979e977ce5677e253624bd8b616b286f543f0c1b91662/pillow-11.2.1-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:062b7a42d672c45a70fa1f8b43d1d38ff76b63421cbbe7f88146b39e8a558d91", size = 4635408 }, | |
{ url = "https://files.pythonhosted.org/packages/1d/26/0fd443365d9c63bc79feb219f97d935cd4b93af28353cba78d8e77b61719/pillow-11.2.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4eb92eca2711ef8be42fd3f67533765d9fd043b8c80db204f16c8ea62ee1a751", size = 4614863 }, | |
{ url = "https://files.pythonhosted.org/packages/49/65/dca4d2506be482c2c6641cacdba5c602bc76d8ceb618fd37de855653a419/pillow-11.2.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f91ebf30830a48c825590aede79376cb40f110b387c17ee9bd59932c961044f9", size = 4692938 }, | |
{ url = "https://files.pythonhosted.org/packages/b3/92/1ca0c3f09233bd7decf8f7105a1c4e3162fb9142128c74adad0fb361b7eb/pillow-11.2.1-cp313-cp313t-win32.whl", hash = "sha256:e0b55f27f584ed623221cfe995c912c61606be8513bfa0e07d2c674b4516d9dd", size = 2335774 }, | |
{ url = "https://files.pythonhosted.org/packages/a5/ac/77525347cb43b83ae905ffe257bbe2cc6fd23acb9796639a1f56aa59d191/pillow-11.2.1-cp313-cp313t-win_amd64.whl", hash = "sha256:36d6b82164c39ce5482f649b437382c0fb2395eabc1e2b1702a6deb8ad647d6e", size = 2681895 }, | |
{ url = "https://files.pythonhosted.org/packages/67/32/32dc030cfa91ca0fc52baebbba2e009bb001122a1daa8b6a79ad830b38d3/pillow-11.2.1-cp313-cp313t-win_arm64.whl", hash = "sha256:225c832a13326e34f212d2072982bb1adb210e0cc0b153e688743018c94a2681", size = 2417234 }, | |
] | |
[[package]] | |
name = "pygments" | |
version = "2.19.1" | |
source = { registry = "https://pypi.org/simple" } | |
sdist = { url = "https://files.pythonhosted.org/packages/7c/2d/c3338d48ea6cc0feb8446d8e6937e1408088a72a39937982cc6111d17f84/pygments-2.19.1.tar.gz", hash = "sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f", size = 4968581 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/8a/0b/9fcc47d19c48b59121088dd6da2488a49d5f72dacf8262e2790a1d2c7d15/pygments-2.19.1-py3-none-any.whl", hash = "sha256:9ea1544ad55cecf4b8242fab6dd35a93bbce657034b0611ee383099054ab6d8c", size = 1225293 }, | |
] | |
[[package]] | |
name = "rich" | |
version = "14.0.0" | |
source = { registry = "https://pypi.org/simple" } | |
dependencies = [ | |
{ name = "markdown-it-py" }, | |
{ name = "pygments" }, | |
] | |
sdist = { url = "https://files.pythonhosted.org/packages/a1/53/830aa4c3066a8ab0ae9a9955976fb770fe9c6102117c8ec4ab3ea62d89e8/rich-14.0.0.tar.gz", hash = "sha256:82f1bc23a6a21ebca4ae0c45af9bdbc492ed20231dcb63f297d6d1021a9d5725", size = 224078 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/0d/9b/63f4c7ebc259242c89b3acafdb37b41d1185c07ff0011164674e9076b491/rich-14.0.0-py3-none-any.whl", hash = "sha256:1c9491e1951aac09caffd42f448ee3d04e58923ffe14993f6e83068dc395d7e0", size = 243229 }, | |
] | |
[[package]] | |
name = "setuptools" | |
version = "79.0.0" | |
source = { registry = "https://pypi.org/simple" } | |
sdist = { url = "https://files.pythonhosted.org/packages/7d/19/fecb7e2825616270f34512b3394cdcf6f45a79b5b6d94fdbd86a509e67b5/setuptools-79.0.0.tar.gz", hash = "sha256:9828422e7541213b0aacb6e10bbf9dd8febeaa45a48570e09b6d100e063fc9f9", size = 1367685 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/cc/ea/d53f2f8897c46a36df085964d07761ea4c2d1f2cf92019693b6742b7aabb/setuptools-79.0.0-py3-none-any.whl", hash = "sha256:b9ab3a104bedb292323f53797b00864e10e434a3ab3906813a7169e4745b912a", size = 1256065 }, | |
] | |
[[package]] | |
name = "sympy" | |
version = "1.13.3" | |
source = { registry = "https://pypi.org/simple" } | |
dependencies = [ | |
{ name = "mpmath" }, | |
] | |
sdist = { url = "https://files.pythonhosted.org/packages/11/8a/5a7fd6284fa8caac23a26c9ddf9c30485a48169344b4bd3b0f02fef1890f/sympy-1.13.3.tar.gz", hash = "sha256:b27fd2c6530e0ab39e275fc9b683895367e51d5da91baa8d3d64db2565fec4d9", size = 7533196 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/99/ff/c87e0622b1dadea79d2fb0b25ade9ed98954c9033722eb707053d310d4f3/sympy-1.13.3-py3-none-any.whl", hash = "sha256:54612cf55a62755ee71824ce692986f23c88ffa77207b30c1368eda4a7060f73", size = 6189483 }, | |
] | |
[[package]] | |
name = "torch" | |
version = "2.7.0" | |
source = { registry = "https://pypi.org/simple" } | |
dependencies = [ | |
{ name = "filelock" }, | |
{ name = "fsspec" }, | |
{ name = "jinja2" }, | |
{ name = "networkx" }, | |
{ name = "nvidia-cublas-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, | |
{ name = "nvidia-cuda-cupti-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, | |
{ name = "nvidia-cuda-nvrtc-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, | |
{ name = "nvidia-cuda-runtime-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, | |
{ name = "nvidia-cudnn-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, | |
{ name = "nvidia-cufft-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, | |
{ name = "nvidia-cufile-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, | |
{ name = "nvidia-curand-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, | |
{ name = "nvidia-cusolver-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, | |
{ name = "nvidia-cusparse-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, | |
{ name = "nvidia-cusparselt-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, | |
{ name = "nvidia-nccl-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, | |
{ name = "nvidia-nvjitlink-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, | |
{ name = "nvidia-nvtx-cu12", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, | |
{ name = "setuptools" }, | |
{ name = "sympy" }, | |
{ name = "triton", marker = "platform_machine == 'x86_64' and sys_platform == 'linux'" }, | |
{ name = "typing-extensions" }, | |
] | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/aa/5e/ac759f4c0ab7c01feffa777bd68b43d2ac61560a9770eeac074b450f81d4/torch-2.7.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:36a6368c7ace41ad1c0f69f18056020b6a5ca47bedaca9a2f3b578f5a104c26c", size = 99013250 }, | |
{ url = "https://files.pythonhosted.org/packages/9c/58/2d245b6f1ef61cf11dfc4aceeaacbb40fea706ccebac3f863890c720ab73/torch-2.7.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:15aab3e31c16feb12ae0a88dba3434a458874636f360c567caa6a91f6bfba481", size = 865042157 }, | |
{ url = "https://files.pythonhosted.org/packages/44/80/b353c024e6b624cd9ce1d66dcb9d24e0294680f95b369f19280e241a0159/torch-2.7.0-cp312-cp312-win_amd64.whl", hash = "sha256:f56d4b2510934e072bab3ab8987e00e60e1262fb238176168f5e0c43a1320c6d", size = 212482262 }, | |
{ url = "https://files.pythonhosted.org/packages/ee/8d/b2939e5254be932db1a34b2bd099070c509e8887e0c5a90c498a917e4032/torch-2.7.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:30b7688a87239a7de83f269333651d8e582afffce6f591fff08c046f7787296e", size = 68574294 }, | |
{ url = "https://files.pythonhosted.org/packages/14/24/720ea9a66c29151b315ea6ba6f404650834af57a26b2a04af23ec246b2d5/torch-2.7.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:868ccdc11798535b5727509480cd1d86d74220cfdc42842c4617338c1109a205", size = 99015553 }, | |
{ url = "https://files.pythonhosted.org/packages/4b/27/285a8cf12bd7cd71f9f211a968516b07dcffed3ef0be585c6e823675ab91/torch-2.7.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:9b52347118116cf3dff2ab5a3c3dd97c719eb924ac658ca2a7335652076df708", size = 865046389 }, | |
{ url = "https://files.pythonhosted.org/packages/74/c8/2ab2b6eadc45554af8768ae99668c5a8a8552e2012c7238ded7e9e4395e1/torch-2.7.0-cp313-cp313-win_amd64.whl", hash = "sha256:434cf3b378340efc87c758f250e884f34460624c0523fe5c9b518d205c91dd1b", size = 212490304 }, | |
{ url = "https://files.pythonhosted.org/packages/28/fd/74ba6fde80e2b9eef4237fe668ffae302c76f0e4221759949a632ca13afa/torch-2.7.0-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:edad98dddd82220465b106506bb91ee5ce32bd075cddbcf2b443dfaa2cbd83bf", size = 68856166 }, | |
{ url = "https://files.pythonhosted.org/packages/cb/b4/8df3f9fe6bdf59e56a0e538592c308d18638eb5f5dc4b08d02abb173c9f0/torch-2.7.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:2a885fc25afefb6e6eb18a7d1e8bfa01cc153e92271d980a49243b250d5ab6d9", size = 99091348 }, | |
{ url = "https://files.pythonhosted.org/packages/9d/f5/0bd30e9da04c3036614aa1b935a9f7e505a9e4f1f731b15e165faf8a4c74/torch-2.7.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:176300ff5bc11a5f5b0784e40bde9e10a35c4ae9609beed96b4aeb46a27f5fae", size = 865104023 }, | |
{ url = "https://files.pythonhosted.org/packages/d1/b7/2235d0c3012c596df1c8d39a3f4afc1ee1b6e318d469eda4c8bb68566448/torch-2.7.0-cp313-cp313t-win_amd64.whl", hash = "sha256:d0ca446a93f474985d81dc866fcc8dccefb9460a29a456f79d99c29a78a66993", size = 212750916 }, | |
{ url = "https://files.pythonhosted.org/packages/90/48/7e6477cf40d48cc0a61fa0d41ee9582b9a316b12772fcac17bc1a40178e7/torch-2.7.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:27f5007bdf45f7bb7af7f11d1828d5c2487e030690afb3d89a651fd7036a390e", size = 68575074 }, | |
] | |
[[package]] | |
name = "torch-distributed-debug" | |
version = "0.1.0" | |
source = { virtual = "." } | |
dependencies = [ | |
{ name = "rich" }, | |
{ name = "torch" }, | |
{ name = "torchvision" }, | |
{ name = "tqdm" }, | |
] | |
[package.metadata] | |
requires-dist = [ | |
{ name = "rich", specifier = ">=14.0.0" }, | |
{ name = "torch", specifier = ">=2.7.0" }, | |
{ name = "torchvision", specifier = ">=0.22.0" }, | |
{ name = "tqdm", specifier = ">=4.67.1" }, | |
] | |
[[package]] | |
name = "torchvision" | |
version = "0.22.0" | |
source = { registry = "https://pypi.org/simple" } | |
dependencies = [ | |
{ name = "numpy" }, | |
{ name = "pillow" }, | |
{ name = "torch" }, | |
] | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/cb/ea/887d1d61cf4431a46280972de665f350af1898ce5006cd046326e5d0a2f2/torchvision-0.22.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:31c3165418fe21c3d81fe3459e51077c2f948801b8933ed18169f54652796a0f", size = 1947826 }, | |
{ url = "https://files.pythonhosted.org/packages/72/ef/21f8b6122e13ae045b8e49658029c695fd774cd21083b3fa5c3f9c5d3e35/torchvision-0.22.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:8f116bc82e0c076e70ba7776e611ed392b9666aa443662e687808b08993d26af", size = 2514571 }, | |
{ url = "https://files.pythonhosted.org/packages/7c/48/5f7617f6c60d135f86277c53f9d5682dfa4e66f4697f505f1530e8b69fb1/torchvision-0.22.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:ce4dc334ebd508de2c534817c9388e928bc2500cf981906ae8d6e2ca3bf4727a", size = 7446522 }, | |
{ url = "https://files.pythonhosted.org/packages/99/94/a015e93955f5d3a68689cc7c385a3cfcd2d62b84655d18b61f32fb04eb67/torchvision-0.22.0-cp312-cp312-win_amd64.whl", hash = "sha256:24b8c9255c209ca419cc7174906da2791c8b557b75c23496663ec7d73b55bebf", size = 1716664 }, | |
{ url = "https://files.pythonhosted.org/packages/e1/2a/9b34685599dcb341d12fc2730055155623db7a619d2415a8d31f17050952/torchvision-0.22.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:ece17995857dd328485c9c027c0b20ffc52db232e30c84ff6c95ab77201112c5", size = 1947823 }, | |
{ url = "https://files.pythonhosted.org/packages/77/77/88f64879483d66daf84f1d1c4d5c31ebb08e640411139042a258d5f7dbfe/torchvision-0.22.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:471c6dd75bb984c6ebe4f60322894a290bf3d4b195e769d80754f3689cd7f238", size = 2471592 }, | |
{ url = "https://files.pythonhosted.org/packages/f7/82/2f813eaae7c1fae1f9d9e7829578f5a91f39ef48d6c1c588a8900533dd3d/torchvision-0.22.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:2b839ac0610a38f56bef115ee5b9eaca5f9c2da3c3569a68cc62dbcc179c157f", size = 7446333 }, | |
{ url = "https://files.pythonhosted.org/packages/58/19/ca7a4f8907a56351dfe6ae0a708f4e6b3569b5c61d282e3e7f61cf42a4ce/torchvision-0.22.0-cp313-cp313-win_amd64.whl", hash = "sha256:4ada1c08b2f761443cd65b7c7b4aec9e2fc28f75b0d4e1b1ebc9d3953ebccc4d", size = 1716693 }, | |
{ url = "https://files.pythonhosted.org/packages/6f/a7/f43e9c8d13118b4ffbaebea664c9338ab20fa115a908125afd2238ff16e7/torchvision-0.22.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:cdc96daa4658b47ce9384154c86ed1e70cba9d972a19f5de6e33f8f94a626790", size = 2137621 }, | |
{ url = "https://files.pythonhosted.org/packages/6a/9a/2b59f5758ba7e3f23bc84e16947493bbce97392ec6d18efba7bdf0a3b10e/torchvision-0.22.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:753d3c84eeadd5979a33b3b73a25ecd0aa4af44d6b45ed2c70d44f5e0ac68312", size = 2476555 }, | |
{ url = "https://files.pythonhosted.org/packages/7d/40/a7bc2ab9b1e56d10a7fd9ae83191bb425fa308caa23d148f1c568006e02c/torchvision-0.22.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:b30e3ed29e4a61f7499bca50f57d8ebd23dfc52b14608efa17a534a55ee59a03", size = 7617924 }, | |
{ url = "https://files.pythonhosted.org/packages/c1/7b/30d423bdb2546250d719d7821aaf9058cc093d165565b245b159c788a9dd/torchvision-0.22.0-cp313-cp313t-win_amd64.whl", hash = "sha256:e5d680162694fac4c8a374954e261ddfb4eb0ce103287b0f693e4e9c579ef957", size = 1638621 }, | |
] | |
[[package]] | |
name = "tqdm" | |
version = "4.67.1" | |
source = { registry = "https://pypi.org/simple" } | |
dependencies = [ | |
{ name = "colorama", marker = "sys_platform == 'win32'" }, | |
] | |
sdist = { url = "https://files.pythonhosted.org/packages/a8/4b/29b4ef32e036bb34e4ab51796dd745cdba7ed47ad142a9f4a1eb8e0c744d/tqdm-4.67.1.tar.gz", hash = "sha256:f8aef9c52c08c13a65f30ea34f4e5aac3fd1a34959879d7e59e63027286627f2", size = 169737 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540 }, | |
] | |
[[package]] | |
name = "triton" | |
version = "3.3.0" | |
source = { registry = "https://pypi.org/simple" } | |
dependencies = [ | |
{ name = "setuptools" }, | |
] | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/11/53/ce18470914ab6cfbec9384ee565d23c4d1c55f0548160b1c7b33000b11fd/triton-3.3.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b68c778f6c4218403a6bd01be7484f6dc9e20fe2083d22dd8aef33e3b87a10a3", size = 156504509 }, | |
{ url = "https://files.pythonhosted.org/packages/7d/74/4bf2702b65e93accaa20397b74da46fb7a0356452c1bb94dbabaf0582930/triton-3.3.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:47bc87ad66fa4ef17968299acacecaab71ce40a238890acc6ad197c3abe2b8f1", size = 156516468 }, | |
{ url = "https://files.pythonhosted.org/packages/0a/93/f28a696fa750b9b608baa236f8225dd3290e5aff27433b06143adc025961/triton-3.3.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ce4700fc14032af1e049005ae94ba908e71cd6c2df682239aed08e49bc71b742", size = 156580729 }, | |
] | |
[[package]] | |
name = "typing-extensions" | |
version = "4.13.2" | |
source = { registry = "https://pypi.org/simple" } | |
sdist = { url = "https://files.pythonhosted.org/packages/f6/37/23083fcd6e35492953e8d2aaaa68b860eb422b34627b13f2ce3eb6106061/typing_extensions-4.13.2.tar.gz", hash = "sha256:e6c81219bd689f51865d9e372991c540bda33a0379d5573cddb9a3a23f7caaef", size = 106967 } | |
wheels = [ | |
{ url = "https://files.pythonhosted.org/packages/8b/54/b1ae86c0973cc6f0210b53d508ca3641fb6d0c56823f288d108bc7ab3cc8/typing_extensions-4.13.2-py3-none-any.whl", hash = "sha256:a439e7c04b49fec3e5d3e2beaa21755cadbbdc391694e28ccdd36ca4a1408f8c", size = 45806 }, | |
] |