Created August 14, 2024 02:08
OOM failure logs
[root@dev-rhel-ai-training-client-11 ~]# cat /var/mnt/inststg1/instructlab/job/checkpoints/skills/full_logs_global0.log | |
W0814 01:44:19.387000 139736190685632 torch/distributed/run.py:757] | |
W0814 01:44:19.387000 139736190685632 torch/distributed/run.py:757] ***************************************** | |
W0814 01:44:19.387000 139736190685632 torch/distributed/run.py:757] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. | |
W0814 01:44:19.387000 139736190685632 torch/distributed/run.py:757] ***************************************** | |
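torchrun caps OMP_NUM_THREADS at 1 per worker by default, as the banner above notes. A minimal sketch of re-launching the same 8-process job with a larger per-worker thread count, assuming torchrun is driven from a small wrapper; the real training entry point is not visible in this log, so main.py below is a placeholder and 4 threads is only an example value:

import os
import subprocess

# Illustrative re-launch only: raise the OpenMP thread count for each of the
# 8 workers. "main.py" stands in for the real training entry point, which this
# log does not show.
env = dict(os.environ, OMP_NUM_THREADS="4")  # example value; tune for the host
subprocess.run(
    ["torchrun", "--nnodes=1", "--nproc_per_node=8", "main.py"],
    env=env,
    check=True,
)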
[2024-08-14 01:44:22,434] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) | |
[2024-08-14 01:44:22,675] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) | |
[2024-08-14 01:44:22,722] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) | |
[2024-08-14 01:44:22,758] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) | |
[2024-08-14 01:44:22,793] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) | |
[2024-08-14 01:44:22,799] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) | |
[2024-08-14 01:44:22,814] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) | |
[2024-08-14 01:44:22,830] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect) | |
[WARNING] async_io requires the dev libaio .so object and headers but these were not found. | |
[WARNING] async_io: please install the libaio-devel package with yum | |
[WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. | |
[WARNING] Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH | |
[WARNING] async_io requires the dev libaio .so object and headers but these were not found. | |
[WARNING] async_io: please install the libaio-devel package with yum | |
[WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. | |
[WARNING] Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH | |
[WARNING] async_io requires the dev libaio .so object and headers but these were not found. | |
[WARNING] async_io: please install the libaio-devel package with yum | |
[WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. | |
[WARNING] Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH | |
[WARNING] async_io requires the dev libaio .so object and headers but these were not found. | |
[WARNING] async_io: please install the libaio-devel package with yum | |
[WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. | |
[WARNING] Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH | |
[WARNING] async_io requires the dev libaio .so object and headers but these were not found. | |
[WARNING] async_io requires the dev libaio .so object and headers but these were not found. | |
[WARNING] async_io: please install the libaio-devel package with yum | |
[WARNING] async_io: please install the libaio-devel package with yum | |
[WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. | |
[WARNING] Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH | |
[WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. | |
[WARNING] Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH | |
[WARNING] async_io requires the dev libaio .so object and headers but these were not found. | |
[WARNING] async_io: please install the libaio-devel package with yum | |
[WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. | |
[WARNING] Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH | |
[WARNING] async_io requires the dev libaio .so object and headers but these were not found. | |
[WARNING] async_io: please install the libaio-devel package with yum | |
[WARNING] If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found. | |
[WARNING] Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH | |
[WARNING] sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.3 | |
[WARNING] using untested triton version (2.3.1), only 1.0.0 is known to be compatible | |
[WARNING] sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.3 | |
[WARNING] using untested triton version (2.3.1), only 1.0.0 is known to be compatible | |
[WARNING] sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.3 | |
[WARNING] using untested triton version (2.3.1), only 1.0.0 is known to be compatible | |
[WARNING] sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.3 | |
[WARNING] using untested triton version (2.3.1), only 1.0.0 is known to be compatible | |
[WARNING] sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.3 | |
[WARNING] using untested triton version (2.3.1), only 1.0.0 is known to be compatible | |
[WARNING] sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.3 | |
[WARNING] using untested triton version (2.3.1), only 1.0.0 is known to be compatible | |
[WARNING] sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.3 | |
[WARNING] using untested triton version (2.3.1), only 1.0.0 is known to be compatible | |
[WARNING] sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.3 | |
[WARNING] using untested triton version (2.3.1), only 1.0.0 is known to be compatible | |
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. | |
[2024-08-14 01:44:26,525] [INFO] [comm.py:637:init_distributed] cdb=None | |
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. | |
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. | |
model_name_or_path: /var/mnt/inststg1/instructlab/job/checkpoints/knowledge/hf_format/samples_1792 | |
data_path: /root/.local/share/instructlab/internal/data.jsonl | |
output_dir: /var/mnt/inststg1/instructlab/job/checkpoints/skills | |
num_epochs: 10 | |
last_step: 0 | |
effective_batch_size: 128 | |
learning_rate: 2.0e-05 | |
lr_scheduler: cosine | |
num_warmup_steps: 25 | |
save_samples: 6404 | |
save_samples_ds: null | |
save_last: false | |
log_level: INFO | |
seed: 42 | |
mock_data: false | |
mock_len: 2600 | |
sharding_strategy: FULL_SHARD | |
is_granite: false | |
lora_r: 0 | |
lora_alpha: 32 | |
lora_dropout: 0.1 | |
lora_quant_bits: null | |
lora_target_modules: null | |
max_batch_len: 16 | |
cpu_offload_optimizer: false | |
cpu_offload_optimizer_pin_memory: false | |
cpu_offload_optimizer_ratio: 1.0 | |
NEFTune_alpha: null | |
chat_tmpl_path: /opt/python3.11/venv/lib64/python3.11/site-packages/instructlab/training/chat_templates/ibm_generic_tmpl.py | |
disable_flash_attn: false | |
{ | |
"script_params": { | |
"model_name_or_path": "/var/mnt/inststg1/instructlab/job/checkpoints/knowledge/hf_format/samples_1792", | |
"data_path": "/root/.local/share/instructlab/internal/data.jsonl", | |
"output_dir": "/var/mnt/inststg1/instructlab/job/checkpoints/skills", | |
"num_epochs": 10, | |
"last_step": 0, | |
"effective_batch_size": 128, | |
"learning_rate": 2e-05, | |
"lr_scheduler": "cosine", | |
"num_warmup_steps": 25, | |
"save_samples": 6404, | |
"save_samples_ds": null, | |
"save_last": false, | |
"log_level": "INFO", | |
"seed": 42, | |
"mock_data": false, | |
"mock_len": 2600, | |
"sharding_strategy": "FULL_SHARD", | |
"is_granite": false, | |
"lora_r": 0, | |
"lora_alpha": 32, | |
"lora_dropout": 0.1, | |
"lora_quant_bits": null, | |
"lora_target_modules": null, | |
"max_batch_len": 16, | |
"cpu_offload_optimizer": false, | |
"cpu_offload_optimizer_pin_memory": false, | |
"cpu_offload_optimizer_ratio": 1.0, | |
"NEFTune_alpha": null, | |
"chat_tmpl_path": "/opt/python3.11/venv/lib64/python3.11/site-packages/instructlab/training/chat_templates/ibm_generic_tmpl.py", | |
"disable_flash_attn": false | |
}, | |
"timestamp": "2024-08-14T01:44:26.724399" | |
} | |
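The script_params block above is the easiest place to check the memory-relevant settings when diagnosing the OOM (effective_batch_size, max_batch_len, sharding_strategy, cpu_offload_optimizer). A minimal sketch for pulling that JSON object back out of the rank-0 log and printing those fields; it assumes the object appears in the log file exactly in the layout shown above:

import json
import re

LOG = "/var/mnt/inststg1/instructlab/job/checkpoints/skills/full_logs_global0.log"

with open(LOG) as f:
    text = f.read()

# Grab the first {"script_params": {...}, "timestamp": ...} object in the log.
match = re.search(r'\{\s*"script_params":.*?"timestamp":[^}]*\}', text, re.DOTALL)
params = json.loads(match.group(0))["script_params"]

# Settings most relevant to GPU memory pressure in this run.
for key in ("effective_batch_size", "max_batch_len", "sharding_strategy",
            "cpu_offload_optimizer", "num_epochs"):
    print(f"{key}: {params[key]}")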
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. | |
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. | |
[2024-08-14 01:44:26,800] [INFO] [comm.py:637:init_distributed] cdb=None | |
[2024-08-14 01:44:26,800] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl | |
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. | |
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. | |
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained. | |
[2024-08-14 01:44:27,465] [INFO] [comm.py:637:init_distributed] cdb=None | |
[2024-08-14 01:44:27,475] [INFO] [comm.py:637:init_distributed] cdb=None | |
[2024-08-14 01:44:27,592] [INFO] [comm.py:637:init_distributed] cdb=None | |
dev-rhel-ai-training-client-11:592:592 [0] NCCL INFO Bootstrap : Using enp8s0:192.168.48.69<0> | |
dev-rhel-ai-training-client-11:592:592 [0] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation | |
dev-rhel-ai-training-client-11:592:592 [0] NCCL INFO cudaDriverVersion 12040 | |
NCCL version 2.20.5+cuda12.4 | |
dev-rhel-ai-training-client-11:593:593 [1] NCCL INFO cudaDriverVersion 12040 | |
dev-rhel-ai-training-client-11:593:593 [1] NCCL INFO Bootstrap : Using enp8s0:192.168.48.69<0> | |
dev-rhel-ai-training-client-11:593:593 [1] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation | |
dev-rhel-ai-training-client-11:599:599 [7] NCCL INFO cudaDriverVersion 12040 | |
dev-rhel-ai-training-client-11:599:599 [7] NCCL INFO Bootstrap : Using enp8s0:192.168.48.69<0> | |
dev-rhel-ai-training-client-11:599:599 [7] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation | |
dev-rhel-ai-training-client-11:597:597 [5] NCCL INFO cudaDriverVersion 12040 | |
dev-rhel-ai-training-client-11:597:597 [5] NCCL INFO Bootstrap : Using enp8s0:192.168.48.69<0> | |
dev-rhel-ai-training-client-11:597:597 [5] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation | |
[2024-08-14 01:44:27,702] [INFO] [comm.py:637:init_distributed] cdb=None | |
dev-rhel-ai-training-client-11:594:594 [2] NCCL INFO cudaDriverVersion 12040 | |
dev-rhel-ai-training-client-11:594:594 [2] NCCL INFO Bootstrap : Using enp8s0:192.168.48.69<0> | |
dev-rhel-ai-training-client-11:594:594 [2] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation | |
[2024-08-14 01:44:27,709] [INFO] [comm.py:637:init_distributed] cdb=None | |
dev-rhel-ai-training-client-11:595:595 [3] NCCL INFO cudaDriverVersion 12040 | |
dev-rhel-ai-training-client-11:595:595 [3] NCCL INFO Bootstrap : Using enp8s0:192.168.48.69<0> | |
dev-rhel-ai-training-client-11:595:595 [3] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation | |
dev-rhel-ai-training-client-11:598:598 [6] NCCL INFO cudaDriverVersion 12040 | |
dev-rhel-ai-training-client-11:598:598 [6] NCCL INFO Bootstrap : Using enp8s0:192.168.48.69<0> | |
dev-rhel-ai-training-client-11:598:598 [6] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation | |
[2024-08-14 01:44:27,783] [INFO] [comm.py:637:init_distributed] cdb=None | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO NET/IB : No device found. | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO NET/Socket : Using [0]enp8s0:192.168.48.69<0> [1]podman0:10.88.0.1<0> [2]veth0:fe80::9480:4ff:fe5b:f62%veth0<0> | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Using non-device net plugin version 0 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Using network Socket | |
dev-rhel-ai-training-client-11:596:596 [4] NCCL INFO cudaDriverVersion 12040 | |
dev-rhel-ai-training-client-11:596:596 [4] NCCL INFO Bootstrap : Using enp8s0:192.168.48.69<0> | |
dev-rhel-ai-training-client-11:596:596 [4] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO NET/IB : No device found. | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO NET/IB : No device found. | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO NET/Socket : Using [0]enp8s0:192.168.48.69<0> [1]podman0:10.88.0.1<0> [2]veth0:fe80::9480:4ff:fe5b:f62%veth0<0> | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO NET/Socket : Using [0]enp8s0:192.168.48.69<0> [1]podman0:10.88.0.1<0> [2]veth0:fe80::9480:4ff:fe5b:f62%veth0<0> | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Using non-device net plugin version 0 | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Using network Socket | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Using non-device net plugin version 0 | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Using network Socket | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO NET/IB : No device found. | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO NET/Socket : Using [0]enp8s0:192.168.48.69<0> [1]podman0:10.88.0.1<0> [2]veth0:fe80::9480:4ff:fe5b:f62%veth0<0> | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Using non-device net plugin version 0 | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Using network Socket | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO NET/IB : No device found. | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO NET/Socket : Using [0]enp8s0:192.168.48.69<0> [1]podman0:10.88.0.1<0> [2]veth0:fe80::9480:4ff:fe5b:f62%veth0<0> | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Using non-device net plugin version 0 | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Using network Socket | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO NET/IB : No device found. | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO NET/IB : No device found. | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO NET/Socket : Using [0]enp8s0:192.168.48.69<0> [1]podman0:10.88.0.1<0> [2]veth0:fe80::9480:4ff:fe5b:f62%veth0<0> | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO NET/Socket : Using [0]enp8s0:192.168.48.69<0> [1]podman0:10.88.0.1<0> [2]veth0:fe80::9480:4ff:fe5b:f62%veth0<0> | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Using non-device net plugin version 0 | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Using network Socket | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Using non-device net plugin version 0 | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Using network Socket | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO NET/IB : No device found. | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO NET/Socket : Using [0]enp8s0:192.168.48.69<0> [1]podman0:10.88.0.1<0> [2]veth0:fe80::9480:4ff:fe5b:f62%veth0<0> | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Using non-device net plugin version 0 | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Using network Socket | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO comm 0x557cf4282240 rank 3 nranks 8 cudaDev 3 nvmlDev 3 busId a040 commId 0xa04025dda3be1a2 - Init START | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO comm 0x55eedd14c5a0 rank 2 nranks 8 cudaDev 2 nvmlDev 2 busId a030 commId 0xa04025dda3be1a2 - Init START | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO comm 0x562c7d22bea0 rank 1 nranks 8 cudaDev 1 nvmlDev 1 busId 8020 commId 0xa04025dda3be1a2 - Init START | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO comm 0x56220841d1b0 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 8010 commId 0xa04025dda3be1a2 - Init START | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO comm 0x56407da8d960 rank 7 nranks 8 cudaDev 7 nvmlDev 7 busId e080 commId 0xa04025dda3be1a2 - Init START | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO comm 0x558e0e5f32b0 rank 6 nranks 8 cudaDev 6 nvmlDev 6 busId e070 commId 0xa04025dda3be1a2 - Init START | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO comm 0x55bec8e39f90 rank 5 nranks 8 cudaDev 5 nvmlDev 5 busId c060 commId 0xa04025dda3be1a2 - Init START | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO comm 0x555b32df49b0 rank 4 nranks 8 cudaDev 4 nvmlDev 4 busId c050 commId 0xa04025dda3be1a2 - Init START | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Setting affinity for GPU 2 to ff,ffffffff | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO NVLS multicast support is not available on dev 2 | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Setting affinity for GPU 4 to ffff,ffffff00,00000000 | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO NVLS multicast support is not available on dev 4 | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Setting affinity for GPU 1 to ff,ffffffff | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO NVLS multicast support is not available on dev 1 | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Setting affinity for GPU 3 to ff,ffffffff | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO NVLS multicast support is not available on dev 3 | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Setting affinity for GPU 7 to ffff,ffffff00,00000000 | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO NVLS multicast support is not available on dev 7 | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Setting affinity for GPU 6 to ffff,ffffff00,00000000 | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO NVLS multicast support is not available on dev 6 | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Setting affinity for GPU 5 to ffff,ffffff00,00000000 | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO NVLS multicast support is not available on dev 5 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Setting affinity for GPU 0 to ff,ffffffff | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO NVLS multicast support is not available on dev 0 | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO comm 0x56407da8d960 rank 7 nRanks 8 nNodes 1 localRanks 8 localRank 7 MNNVL 0 | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO comm 0x555b32df49b0 rank 4 nRanks 8 nNodes 1 localRanks 8 localRank 4 MNNVL 0 | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO comm 0x558e0e5f32b0 rank 6 nRanks 8 nNodes 1 localRanks 8 localRank 6 MNNVL 0 | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO comm 0x55bec8e39f90 rank 5 nRanks 8 nNodes 1 localRanks 8 localRank 5 MNNVL 0 | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] -1/-1/-1->7->6 [2] -1/-1/-1->7->6 [3] -1/-1/-1->7->6 [4] -1/-1/-1->7->6 [5] -1/-1/-1->7->6 [6] -1/-1/-1->7->6 [7] -1/-1/-1->7->6 [8] -1/-1/-1->7->6 [9] -1/-1/-1->7->6 [10] -1/-1/-1->7->6 [11] -1/-1/-1->7->6 [12] -1/-1/-1->7->6 [13] -1/-1/-1->7->6 [14] -1/-1/-1->7->6 [15] -1/-1/-1->7->6 [16] -1/-1/-1->7->6 [17] -1/-1/-1->7->6 [18] -1/-1/-1->7->6 [19] -1/-1/-1->7->6 [20] -1/-1/-1->7->6 [21] -1/-1/-1->7->6 [22] -1/-1/-1->7->6 [23] -1/-1/-1->7->6 | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO P2P Chunksize set to 524288 | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Trees [0] 5/-1/-1->4->3 [1] 5/-1/-1->4->3 [2] 5/-1/-1->4->3 [3] 5/-1/-1->4->3 [4] 5/-1/-1->4->3 [5] 5/-1/-1->4->3 [6] 5/-1/-1->4->3 [7] 5/-1/-1->4->3 [8] 5/-1/-1->4->3 [9] 5/-1/-1->4->3 [10] 5/-1/-1->4->3 [11] 5/-1/-1->4->3 [12] 5/-1/-1->4->3 [13] 5/-1/-1->4->3 [14] 5/-1/-1->4->3 [15] 5/-1/-1->4->3 [16] 5/-1/-1->4->3 [17] 5/-1/-1->4->3 [18] 5/-1/-1->4->3 [19] 5/-1/-1->4->3 [20] 5/-1/-1->4->3 [21] 5/-1/-1->4->3 [22] 5/-1/-1->4->3 [23] 5/-1/-1->4->3 | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO P2P Chunksize set to 524288 | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 [2] 7/-1/-1->6->5 [3] 7/-1/-1->6->5 [4] 7/-1/-1->6->5 [5] 7/-1/-1->6->5 [6] 7/-1/-1->6->5 [7] 7/-1/-1->6->5 [8] 7/-1/-1->6->5 [9] 7/-1/-1->6->5 [10] 7/-1/-1->6->5 [11] 7/-1/-1->6->5 [12] 7/-1/-1->6->5 [13] 7/-1/-1->6->5 [14] 7/-1/-1->6->5 [15] 7/-1/-1->6->5 [16] 7/-1/-1->6->5 [17] 7/-1/-1->6->5 [18] 7/-1/-1->6->5 [19] 7/-1/-1->6->5 [20] 7/-1/-1->6->5 [21] 7/-1/-1->6->5 [22] 7/-1/-1->6->5 [23] 7/-1/-1->6->5 | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO P2P Chunksize set to 524288 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO comm 0x56220841d1b0 rank 0 nRanks 8 nNodes 1 localRanks 8 localRank 0 MNNVL 0 | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/-1/-1->5->4 [2] 6/-1/-1->5->4 [3] 6/-1/-1->5->4 [4] 6/-1/-1->5->4 [5] 6/-1/-1->5->4 [6] 6/-1/-1->5->4 [7] 6/-1/-1->5->4 [8] 6/-1/-1->5->4 [9] 6/-1/-1->5->4 [10] 6/-1/-1->5->4 [11] 6/-1/-1->5->4 [12] 6/-1/-1->5->4 [13] 6/-1/-1->5->4 [14] 6/-1/-1->5->4 [15] 6/-1/-1->5->4 [16] 6/-1/-1->5->4 [17] 6/-1/-1->5->4 [18] 6/-1/-1->5->4 [19] 6/-1/-1->5->4 [20] 6/-1/-1->5->4 [21] 6/-1/-1->5->4 [22] 6/-1/-1->5->4 [23] 6/-1/-1->5->4 | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO P2P Chunksize set to 524288 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 00/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO comm 0x557cf4282240 rank 3 nRanks 8 nNodes 1 localRanks 8 localRank 3 MNNVL 0 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 01/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO comm 0x55eedd14c5a0 rank 2 nRanks 8 nNodes 1 localRanks 8 localRank 2 MNNVL 0 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 02/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 03/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 04/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 05/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 06/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 07/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 08/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 09/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 10/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 11/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 12/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 13/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 14/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 15/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 16/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 17/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 18/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 19/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 20/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 21/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 22/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 23/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO P2P Chunksize set to 524288 | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Trees [0] 4/-1/-1->3->2 [1] 4/-1/-1->3->2 [2] 4/-1/-1->3->2 [3] 4/-1/-1->3->2 [4] 4/-1/-1->3->2 [5] 4/-1/-1->3->2 [6] 4/-1/-1->3->2 [7] 4/-1/-1->3->2 [8] 4/-1/-1->3->2 [9] 4/-1/-1->3->2 [10] 4/-1/-1->3->2 [11] 4/-1/-1->3->2 [12] 4/-1/-1->3->2 [13] 4/-1/-1->3->2 [14] 4/-1/-1->3->2 [15] 4/-1/-1->3->2 [16] 4/-1/-1->3->2 [17] 4/-1/-1->3->2 [18] 4/-1/-1->3->2 [19] 4/-1/-1->3->2 [20] 4/-1/-1->3->2 [21] 4/-1/-1->3->2 [22] 4/-1/-1->3->2 [23] 4/-1/-1->3->2 | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO P2P Chunksize set to 524288 | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO comm 0x562c7d22bea0 rank 1 nRanks 8 nNodes 1 localRanks 8 localRank 1 MNNVL 0 | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1 | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO P2P Chunksize set to 524288 | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0 | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO P2P Chunksize set to 524288 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 00/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 00/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 00/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 01/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 01/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 01/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 02/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 02/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 02/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 03/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 03/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 03/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 00/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 04/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 00/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 04/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 04/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 01/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 05/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 01/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 05/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 02/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 05/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 06/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 02/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 09/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 06/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 03/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 06/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 07/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 03/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 07/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 04/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 08/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 07/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 04/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 10/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 03/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 05/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 08/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 09/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 05/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 08/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 06/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 09/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 10/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 06/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 09/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 13/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 07/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 10/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 11/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 07/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 10/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 14/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 13/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 08/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 12/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 08/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 11/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 11/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 14/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 09/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 13/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 09/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 12/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 16/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 12/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 10/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 14/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 10/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 13/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 17/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 16/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 13/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 11/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 15/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 11/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 14/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 18/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 17/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 14/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 12/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 16/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 12/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 15/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 19/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 18/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 15/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 13/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 11/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 17/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 13/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 16/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 20/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 19/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 14/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 16/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 18/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 14/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 21/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 17/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 20/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 15/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 17/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 13/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 19/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 15/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 22/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 21/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 18/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 16/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 18/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 14/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 20/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 16/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Channel 23/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 22/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 19/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 17/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 19/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 21/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 17/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 23/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 20/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 18/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 20/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 22/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 18/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 16/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 21/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 19/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 21/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 23/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 19/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 17/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 22/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 20/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 20/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 22/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 18/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 21/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 23/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 21/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 23/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 19/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 22/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 22/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 20/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 23/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 23/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 21/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 22/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 23/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Connected all rings | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Connected all rings | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Connected all rings | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Connected all rings | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Connected all rings | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 00/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Connected all rings | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Connected all rings | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 00/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Connected all rings | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 01/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 01/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 02/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 02/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 03/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 03/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 04/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 04/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 05/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 05/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 06/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 06/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 07/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 07/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 08/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 08/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 09/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 09/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 00/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 10/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 10/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 01/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 11/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 11/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 02/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 12/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 12/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 03/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 13/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 13/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 04/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 14/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 14/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 05/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 15/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 15/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 06/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 16/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 16/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 07/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 17/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 17/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 08/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 18/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 18/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 09/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 19/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 19/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 10/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 20/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 20/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 11/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 21/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 21/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 12/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 22/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 22/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 13/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Channel 23/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Channel 23/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 14/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 00/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 00/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 00/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 00/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 15/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 01/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 01/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 16/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 01/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 01/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 02/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 02/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 17/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 02/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 02/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 03/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 03/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 18/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 03/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 03/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 04/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 04/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 19/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 04/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 05/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 04/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 20/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 05/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 05/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 06/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 21/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 05/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 06/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 06/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 07/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 22/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 06/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 07/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 08/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 07/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Channel 23/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 07/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 08/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 09/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 08/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 08/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 10/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 09/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 09/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 09/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 11/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 10/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 10/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 12/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 10/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 11/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 11/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 13/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 11/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 12/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 12/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 14/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 12/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 13/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 13/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 15/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 13/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 14/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 14/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 16/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 15/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 14/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 17/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 15/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 16/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 15/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 18/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 16/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 17/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 19/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 16/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 17/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 18/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 20/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 17/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 18/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 19/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 21/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 18/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 19/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 22/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 20/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 19/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Channel 23/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 20/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 21/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 20/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 21/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 22/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 21/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 22/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Channel 23/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Channel 23/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 22/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Channel 23/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO Connected all trees | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO 24 coll channels, 0 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO NCCL_WORK_FIFO_DEPTH set by environment to 4194304. | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO Connected all trees | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO 24 coll channels, 0 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO Connected all trees | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO 24 coll channels, 0 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO NCCL_WORK_FIFO_DEPTH set by environment to 4194304. | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO NCCL_WORK_FIFO_DEPTH set by environment to 4194304. | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO Connected all trees | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO 24 coll channels, 0 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO NCCL_WORK_FIFO_DEPTH set by environment to 4194304. | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO Connected all trees | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO 24 coll channels, 0 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO Connected all trees | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO 24 coll channels, 0 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO Connected all trees | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO 24 coll channels, 0 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO NCCL_WORK_FIFO_DEPTH set by environment to 4194304. | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO Connected all trees | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO 24 coll channels, 0 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO NCCL_WORK_FIFO_DEPTH set by environment to 4194304. | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO NCCL_WORK_FIFO_DEPTH set by environment to 4194304. | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO NCCL_WORK_FIFO_DEPTH set by environment to 4194304. | |
dev-rhel-ai-training-client-11:596:1360 [4] NCCL INFO comm 0x555b32df49b0 rank 4 nranks 8 cudaDev 4 nvmlDev 4 busId c050 commId 0xa04025dda3be1a2 - Init COMPLETE | |
dev-rhel-ai-training-client-11:592:1347 [0] NCCL INFO comm 0x56220841d1b0 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 8010 commId 0xa04025dda3be1a2 - Init COMPLETE | |
dev-rhel-ai-training-client-11:597:1352 [5] NCCL INFO comm 0x55bec8e39f90 rank 5 nranks 8 cudaDev 5 nvmlDev 5 busId c060 commId 0xa04025dda3be1a2 - Init COMPLETE | |
dev-rhel-ai-training-client-11:598:1357 [6] NCCL INFO comm 0x558e0e5f32b0 rank 6 nranks 8 cudaDev 6 nvmlDev 6 busId e070 commId 0xa04025dda3be1a2 - Init COMPLETE | |
dev-rhel-ai-training-client-11:593:1348 [1] NCCL INFO comm 0x562c7d22bea0 rank 1 nranks 8 cudaDev 1 nvmlDev 1 busId 8020 commId 0xa04025dda3be1a2 - Init COMPLETE | |
dev-rhel-ai-training-client-11:594:1355 [2] NCCL INFO comm 0x55eedd14c5a0 rank 2 nranks 8 cudaDev 2 nvmlDev 2 busId a030 commId 0xa04025dda3be1a2 - Init COMPLETE | |
dev-rhel-ai-training-client-11:599:1349 [7] NCCL INFO comm 0x56407da8d960 rank 7 nranks 8 cudaDev 7 nvmlDev 7 busId e080 commId 0xa04025dda3be1a2 - Init COMPLETE | |
dev-rhel-ai-training-client-11:595:1356 [3] NCCL INFO comm 0x557cf4282240 rank 3 nranks 8 cudaDev 3 nvmlDev 3 busId a040 commId 0xa04025dda3be1a2 - Init COMPLETE | |
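The eight "Init COMPLETE" lines above close out the first NCCL communicator; note that NCCL_WORK_FIFO_DEPTH was picked up from the environment (4194304) on every rank. A minimal sketch, assuming the variable is exported before torch.distributed initializes under torchrun, of where such NCCL settings would typically be applied; the code is illustrative only and is not taken from this job's scripts:

# Illustrative only: NCCL reads these variables when the communicator is created,
# so they must be in the environment before init_process_group() runs.
import os

os.environ.setdefault("NCCL_WORK_FIFO_DEPTH", "4194304")  # value reported in the log above
os.environ.setdefault("NCCL_DEBUG", "INFO")               # produces the "NCCL INFO" lines seen here

import torch.distributed as dist

# Under torchrun, MASTER_ADDR/MASTER_PORT/RANK/WORLD_SIZE are already set.
if not dist.is_initialized():
    dist.init_process_group(backend="nccl")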
Generating train split: 6404 examples [00:00, 6975.37 examples/s] | |
Data length calculation: 100%|██████████| 6404/6404 [00:03<00:00, 1813.12it/s] | |
Data length calculation: 100%|██████████| 6404/6404 [00:03<00:00, 1772.79it/s] | |
Data length calculation: 100%|██████████| 6404/6404 [00:03<00:00, 1769.79it/s] | |
Data length calculation: 100%|██████████| 6404/6404 [00:03<00:00, 1663.53it/s] | |
Data length calculation: 100%|██████████| 6404/6404 [00:04<00:00, 1591.99it/s] | |
Data length calculation: 100%|██████████| 6404/6404 [00:04<00:00, 1591.63it/s] | |
Data length calculation: 100%|██████████| 6404/6404 [00:04<00:00, 1593.99it/s] | |
Data length calculation: 100%|██████████| 6404/6404 [00:04<00:00, 1536.17it/s] | |
Effective batch size is too low for multipack sampling, max sample length=4077 and min packing length=3689. Switching to naive distributed sampling. | |
{ | |
"num_gpus": 8, | |
"avg_sample_len": 691.7688944409743, | |
"effective_batch_size": 128, | |
"max_batch_len_per_gpu": 16, | |
"packing_max_batch_len": null, | |
"grad_accum": 1, | |
"num_batches": 51, | |
"avg_samples_per_batch": 125.56862745098039, | |
"samples_per_gpu": 16, | |
"timestamp": "2024-08-14T01:44:43.669431" | |
} | |
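The warning and JSON block above explain the sampler fallback: with an effective batch size of 128 across 8 GPUs, the minimum packing length the multipack sampler could use (3689 tokens) is smaller than the longest sample in the data (4077 tokens), so the run switches to naive distributed sampling and packing_max_batch_len stays null. A minimal sketch of that kind of viability check, using the numbers printed above; the function and its threshold are illustrative, not the training library's actual implementation:

# Hypothetical viability check reconstructed from the log message above;
# not the actual library code.
def can_use_multipack(max_sample_len: int, min_packing_len: int) -> bool:
    # Packing only works if the longest single sample fits inside the
    # smallest per-GPU packing budget the sampler can offer.
    return max_sample_len <= min_packing_len

max_sample_len = 4077    # "max sample length" from the log
min_packing_len = 3689   # "min packing length" from the log

if not can_use_multipack(max_sample_len, min_packing_len):
    print(
        "Effective batch size is too low for multipack sampling, "
        f"max sample length={max_sample_len} and min packing length={min_packing_len}. "
        "Switching to naive distributed sampling."
    )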
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. | |
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. | |
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. | |
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. | |
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. | |
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. | |
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. | |
You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`. | |
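The eight identical warnings above (one per rank) come from Transformers when a model configured for Flash Attention 2 is still on CPU at load time. A minimal sketch of the pattern the warning itself recommends; the checkpoint path is a placeholder and the dtype is an assumption, not something recorded in this log:

# Illustrative only: load on CPU with Flash Attention 2 enabled, then move to GPU,
# as the warning above suggests. "/path/to/checkpoint" and bfloat16 are assumptions.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "/path/to/checkpoint",                    # hypothetical model path
    attn_implementation="flash_attention_2",  # requires flash-attn to be installed
    torch_dtype=torch.bfloat16,               # FA2 expects fp16/bf16 weights
)
model.to("cuda")  # model is on GPU before the first forward pass, so the warning no longer applies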
Using /var/mnt/inststg1/instructlab/.cache/torch_extensions/py311_cu121 as PyTorch extensions root... | |
Using /var/mnt/inststg1/instructlab/.cache/torch_extensions/py311_cu121 as PyTorch extensions root... | |
Detected CUDA files, patching ldflags | |
Emitting ninja build file /var/mnt/inststg1/instructlab/.cache/torch_extensions/py311_cu121/fused_adam/build.ninja... | |
/opt/python3.11/venv/lib64/python3.11/site-packages/torch/utils/cpp_extension.py:1967: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. | |
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. | |
warnings.warn( | |
Building extension module fused_adam... | |
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) | |
Using /var/mnt/inststg1/instructlab/.cache/torch_extensions/py311_cu121 as PyTorch extensions root... | |
Using /var/mnt/inststg1/instructlab/.cache/torch_extensions/py311_cu121 as PyTorch extensions root... | |
Using /var/mnt/inststg1/instructlab/.cache/torch_extensions/py311_cu121 as PyTorch extensions root... | |
ninja: no work to do. | |
Loading extension module fused_adam... | |
Time to load fused_adam op: 0.16361117362976074 seconds | |
[2024-08-14 01:44:49,806] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.14.4+d254d75, git-hash=d254d75, git-branch=HEAD | |
[2024-08-14 01:44:49,806] [INFO] [comm.py:662:init_distributed] Distributed backend already initialized | |
Loading extension module fused_adam... | |
Time to load fused_adam op: 0.10302138328552246 seconds | |
Loading extension module fused_adam... | |
Time to load fused_adam op: 0.10244607925415039 seconds | |
Loading extension module fused_adam... | |
Time to load fused_adam op: 0.20242667198181152 seconds | |
Loading extension module fused_adam... | |
Time to load fused_adam op: 0.10235929489135742 seconds | |
Using /var/mnt/inststg1/instructlab/.cache/torch_extensions/py311_cu121 as PyTorch extensions root... | |
Detected CUDA files, patching ldflags | |
Emitting ninja build file /var/mnt/inststg1/instructlab/.cache/torch_extensions/py311_cu121/fused_adam/build.ninja... | |
/opt/python3.11/venv/lib64/python3.11/site-packages/torch/utils/cpp_extension.py:1967: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. | |
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. | |
warnings.warn( | |
Building extension module fused_adam... | |
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) | |
ninja: no work to do. | |
Loading extension module fused_adam... | |
Time to load fused_adam op: 0.1345815658569336 seconds | |
Using /var/mnt/inststg1/instructlab/.cache/torch_extensions/py311_cu121 as PyTorch extensions root... | |
Detected CUDA files, patching ldflags | |
Emitting ninja build file /var/mnt/inststg1/instructlab/.cache/torch_extensions/py311_cu121/fused_adam/build.ninja... | |
/opt/python3.11/venv/lib64/python3.11/site-packages/torch/utils/cpp_extension.py:1967: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. | |
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. | |
warnings.warn( | |
Building extension module fused_adam... | |
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) | |
ninja: no work to do. | |
Loading extension module fused_adam... | |
Time to load fused_adam op: 0.13213324546813965 seconds | |
Using /var/mnt/inststg1/instructlab/.cache/torch_extensions/py311_cu121 as PyTorch extensions root... | |
Detected CUDA files, patching ldflags | |
Emitting ninja build file /var/mnt/inststg1/instructlab/.cache/torch_extensions/py311_cu121/fused_adam/build.ninja... | |
/opt/python3.11/venv/lib64/python3.11/site-packages/torch/utils/cpp_extension.py:1967: UserWarning: TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. | |
If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST']. | |
warnings.warn( | |
Building extension module fused_adam... | |
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) | |
ninja: no work to do. | |
Loading extension module fused_adam... | |
Time to load fused_adam op: 0.13481664657592773 seconds | |
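Each rank's ninja/fused_adam block above also prints a UserWarning that TORCH_CUDA_ARCH_LIST is unset, so the JIT build targets every visible architecture. A minimal sketch of the remedy the warning points at, assuming A100-class GPUs (compute capability 8.0); the value is an assumption and should match the actual cards:

# Illustrative only: pin the target architecture before DeepSpeed JIT-builds
# fused_adam, so cpp_extension stops warning about TORCH_CUDA_ARCH_LIST.
import os

os.environ["TORCH_CUDA_ARCH_LIST"] = "8.0"  # assumed A100; adjust to the real GPUs

import torch
from deepspeed.ops.adam import FusedAdam

# Constructing the optimizer is what triggers the fused_adam build/load step
# logged above ("Building extension module fused_adam...").
params = [torch.nn.Parameter(torch.zeros(8, device="cuda"))]
optimizer = FusedAdam(params, lr=1e-5)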
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Using non-device net plugin version 0 | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Using network Socket | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Using non-device net plugin version 0 | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Using network Socket | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Using non-device net plugin version 0 | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Using network Socket | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Using non-device net plugin version 0 | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Using network Socket | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Using non-device net plugin version 0 | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Using network Socket | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Using non-device net plugin version 0 | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Using network Socket | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Using non-device net plugin version 0 | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Using network Socket | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Using non-device net plugin version 0 | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Using network Socket | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO bootstrapSplit: comm 0x562c7eda1ec0 parent 0x562c7d22bea0 rank 1 nranks 8 color -934961569 key 1 prev 0 next 2 - DONE | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO bootstrapSplit: comm 0x562209f07c50 parent 0x56220841d1b0 rank 0 nranks 8 color -934961569 key 0 prev 7 next 1 - DONE | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO bootstrapSplit: comm 0x558e1010fdc0 parent 0x558e0e5f32b0 rank 6 nranks 8 color -934961569 key 6 prev 5 next 7 - DONE | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO bootstrapSplit: comm 0x56407f612870 parent 0x56407da8d960 rank 7 nranks 8 color -934961569 key 7 prev 6 next 0 - DONE | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO comm 0x562c7eda1ec0 rank 1 nranks 8 cudaDev 1 nvmlDev 1 busId 8020 commId 0x71579157c6f8d946 - Init START | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO comm 0x562209f07c50 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 8010 commId 0x71579157c6f8d946 - Init START | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO bootstrapSplit: comm 0x55eedd549910 parent 0x55eedd14c5a0 rank 2 nranks 8 color -934961569 key 2 prev 1 next 3 - DONE | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO comm 0x558e1010fdc0 rank 6 nranks 8 cudaDev 6 nvmlDev 6 busId e070 commId 0x71579157c6f8d946 - Init START | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO comm 0x56407f612870 rank 7 nranks 8 cudaDev 7 nvmlDev 7 busId e080 commId 0x71579157c6f8d946 - Init START | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO bootstrapSplit: comm 0x55beca91b930 parent 0x55bec8e39f90 rank 5 nranks 8 color -934961569 key 5 prev 4 next 6 - DONE | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO comm 0x55eedd549910 rank 2 nranks 8 cudaDev 2 nvmlDev 2 busId a030 commId 0x71579157c6f8d946 - Init START | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO bootstrapSplit: comm 0x555b34850780 parent 0x555b32df49b0 rank 4 nranks 8 color -934961569 key 4 prev 3 next 5 - DONE | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO comm 0x55beca91b930 rank 5 nranks 8 cudaDev 5 nvmlDev 5 busId c060 commId 0x71579157c6f8d946 - Init START | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO comm 0x555b34850780 rank 4 nranks 8 cudaDev 4 nvmlDev 4 busId c050 commId 0x71579157c6f8d946 - Init START | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO bootstrapSplit: comm 0x557cf5c800a0 parent 0x557cf4282240 rank 3 nranks 8 color -934961569 key 3 prev 2 next 4 - DONE | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO comm 0x557cf5c800a0 rank 3 nranks 8 cudaDev 3 nvmlDev 3 busId a040 commId 0x71579157c6f8d946 - Init START | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Setting affinity for GPU 2 to ff,ffffffff | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO NVLS multicast support is not available on dev 2 | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Setting affinity for GPU 4 to ffff,ffffff00,00000000 | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO NVLS multicast support is not available on dev 4 | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Setting affinity for GPU 6 to ffff,ffffff00,00000000 | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO NVLS multicast support is not available on dev 6 | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Setting affinity for GPU 7 to ffff,ffffff00,00000000 | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO NVLS multicast support is not available on dev 7 | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Setting affinity for GPU 5 to ffff,ffffff00,00000000 | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO NVLS multicast support is not available on dev 5 | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Setting affinity for GPU 0 to ff,ffffffff | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO NVLS multicast support is not available on dev 0 | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Setting affinity for GPU 1 to ff,ffffffff | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO NVLS multicast support is not available on dev 1 | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Setting affinity for GPU 3 to ff,ffffffff | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO NVLS multicast support is not available on dev 3 | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO comm 0x557cf5c800a0 rank 3 nRanks 8 nNodes 1 localRanks 8 localRank 3 MNNVL 0 | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO comm 0x55beca91b930 rank 5 nRanks 8 nNodes 1 localRanks 8 localRank 5 MNNVL 0 | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO comm 0x558e1010fdc0 rank 6 nRanks 8 nNodes 1 localRanks 8 localRank 6 MNNVL 0 | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Trees [0] 4/-1/-1->3->2 [1] 4/-1/-1->3->2 [2] 4/-1/-1->3->2 [3] 4/-1/-1->3->2 [4] 4/-1/-1->3->2 [5] 4/-1/-1->3->2 [6] 4/-1/-1->3->2 [7] 4/-1/-1->3->2 [8] 4/-1/-1->3->2 [9] 4/-1/-1->3->2 [10] 4/-1/-1->3->2 [11] 4/-1/-1->3->2 [12] 4/-1/-1->3->2 [13] 4/-1/-1->3->2 [14] 4/-1/-1->3->2 [15] 4/-1/-1->3->2 [16] 4/-1/-1->3->2 [17] 4/-1/-1->3->2 [18] 4/-1/-1->3->2 [19] 4/-1/-1->3->2 [20] 4/-1/-1->3->2 [21] 4/-1/-1->3->2 [22] 4/-1/-1->3->2 [23] 4/-1/-1->3->2 | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO P2P Chunksize set to 524288 | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO comm 0x562209f07c50 rank 0 nRanks 8 nNodes 1 localRanks 8 localRank 0 MNNVL 0 | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO comm 0x555b34850780 rank 4 nRanks 8 nNodes 1 localRanks 8 localRank 4 MNNVL 0 | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 00/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 01/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/-1/-1->5->4 [2] 6/-1/-1->5->4 [3] 6/-1/-1->5->4 [4] 6/-1/-1->5->4 [5] 6/-1/-1->5->4 [6] 6/-1/-1->5->4 [7] 6/-1/-1->5->4 [8] 6/-1/-1->5->4 [9] 6/-1/-1->5->4 [10] 6/-1/-1->5->4 [11] 6/-1/-1->5->4 [12] 6/-1/-1->5->4 [13] 6/-1/-1->5->4 [14] 6/-1/-1->5->4 [15] 6/-1/-1->5->4 [16] 6/-1/-1->5->4 [17] 6/-1/-1->5->4 [18] 6/-1/-1->5->4 [19] 6/-1/-1->5->4 [20] 6/-1/-1->5->4 [21] 6/-1/-1->5->4 [22] 6/-1/-1->5->4 [23] 6/-1/-1->5->4 | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 02/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO P2P Chunksize set to 524288 | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 03/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 [2] 7/-1/-1->6->5 [3] 7/-1/-1->6->5 [4] 7/-1/-1->6->5 [5] 7/-1/-1->6->5 [6] 7/-1/-1->6->5 [7] 7/-1/-1->6->5 [8] 7/-1/-1->6->5 [9] 7/-1/-1->6->5 [10] 7/-1/-1->6->5 [11] 7/-1/-1->6->5 [12] 7/-1/-1->6->5 [13] 7/-1/-1->6->5 [14] 7/-1/-1->6->5 [15] 7/-1/-1->6->5 [16] 7/-1/-1->6->5 [17] 7/-1/-1->6->5 [18] 7/-1/-1->6->5 [19] 7/-1/-1->6->5 [20] 7/-1/-1->6->5 [21] 7/-1/-1->6->5 [22] 7/-1/-1->6->5 [23] 7/-1/-1->6->5 | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 04/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO P2P Chunksize set to 524288 | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 05/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 06/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 07/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 08/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO comm 0x56407f612870 rank 7 nRanks 8 nNodes 1 localRanks 8 localRank 7 MNNVL 0 | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 09/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 10/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 11/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 12/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Trees [0] 5/-1/-1->4->3 [1] 5/-1/-1->4->3 [2] 5/-1/-1->4->3 [3] 5/-1/-1->4->3 [4] 5/-1/-1->4->3 [5] 5/-1/-1->4->3 [6] 5/-1/-1->4->3 [7] 5/-1/-1->4->3 [8] 5/-1/-1->4->3 [9] 5/-1/-1->4->3 [10] 5/-1/-1->4->3 [11] 5/-1/-1->4->3 [12] 5/-1/-1->4->3 [13] 5/-1/-1->4->3 [14] 5/-1/-1->4->3 [15] 5/-1/-1->4->3 [16] 5/-1/-1->4->3 [17] 5/-1/-1->4->3 [18] 5/-1/-1->4->3 [19] 5/-1/-1->4->3 [20] 5/-1/-1->4->3 [21] 5/-1/-1->4->3 [22] 5/-1/-1->4->3 [23] 5/-1/-1->4->3 | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 13/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 14/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 15/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 16/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 17/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 18/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 19/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 20/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 21/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 22/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 23/24 : 0 1 2 3 4 5 6 7 | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] -1/-1/-1->7->6 [2] -1/-1/-1->7->6 [3] -1/-1/-1->7->6 [4] -1/-1/-1->7->6 [5] -1/-1/-1->7->6 [6] -1/-1/-1->7->6 [7] -1/-1/-1->7->6 [8] -1/-1/-1->7->6 [9] -1/-1/-1->7->6 [10] -1/-1/-1->7->6 [11] -1/-1/-1->7->6 [12] -1/-1/-1->7->6 [13] -1/-1/-1->7->6 [14] -1/-1/-1->7->6 [15] -1/-1/-1->7->6 [16] -1/-1/-1->7->6 [17] -1/-1/-1->7->6 [18] -1/-1/-1->7->6 [19] -1/-1/-1->7->6 [20] -1/-1/-1->7->6 [21] -1/-1/-1->7->6 [22] -1/-1/-1->7->6 [23] -1/-1/-1->7->6 | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1 | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO P2P Chunksize set to 524288 | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO P2P Chunksize set to 524288 | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO P2P Chunksize set to 524288 | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO comm 0x55eedd549910 rank 2 nRanks 8 nNodes 1 localRanks 8 localRank 2 MNNVL 0 | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO comm 0x562c7eda1ec0 rank 1 nRanks 8 nNodes 1 localRanks 8 localRank 1 MNNVL 0 | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1 | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO P2P Chunksize set to 524288 | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0 | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO P2P Chunksize set to 524288 | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 00/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 01/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 02/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 00/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 03/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 01/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 04/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 02/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 05/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 06/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 03/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 07/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 04/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 08/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 05/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 09/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 06/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 10/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 07/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 00/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 11/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 08/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 01/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 12/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 09/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 09/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 00/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 02/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 13/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 10/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 01/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 00/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 03/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 14/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 11/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 02/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 04/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 01/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 15/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 12/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 03/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 05/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 02/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 16/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 13/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 13/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 04/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 06/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 03/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 17/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 14/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 14/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 05/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 07/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 04/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 18/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 10/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 15/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 06/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 08/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 05/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 19/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 16/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 16/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 07/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 09/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 06/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 20/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 03/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 17/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 17/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 08/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 10/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 07/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 21/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 18/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 13/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 18/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 09/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 11/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 08/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 22/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 19/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 14/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 19/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 10/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 12/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 09/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 23/0 : 5[5] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 20/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 20/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 11/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 13/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 10/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 21/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 16/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 21/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 14/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 12/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 11/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 22/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 17/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 22/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 15/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 13/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 12/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Channel 23/0 : 0[0] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 18/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 23/0 : 7[7] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 16/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 14/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 13/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 19/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 17/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 15/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 14/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 11/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 20/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 18/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 16/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 15/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 21/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 19/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 17/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 16/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 13/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 22/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 20/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 18/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 17/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 14/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 23/0 : 1[1] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 21/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 19/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 18/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 22/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 20/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 19/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 16/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 23/0 : 6[6] -> 7[7] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 20/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 17/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 21/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 21/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 18/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 22/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 19/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 22/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 23/0 : 4[4] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 23/0 : 3[3] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 20/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 21/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 22/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 23/0 : 2[2] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Connected all rings | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Connected all rings | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Connected all rings | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Connected all rings | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Connected all rings | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 00/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 01/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 02/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 03/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 04/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 05/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 06/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 07/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Connected all rings | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Connected all rings | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Connected all rings | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 08/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 09/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 10/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 11/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 12/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 13/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 14/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 15/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 16/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 17/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 00/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 00/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 00/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 01/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 18/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 01/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 01/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 02/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 19/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 02/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 02/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 03/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 20/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 03/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 03/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 04/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 04/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 04/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 21/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 05/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 05/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 05/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 06/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 22/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 06/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 07/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 06/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Channel 23/0 : 7[7] -> 6[6] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 07/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 08/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 07/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 00/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 08/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 09/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 08/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 01/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 09/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 10/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 09/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 02/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 00/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 10/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 10/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 11/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 00/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 03/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 01/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 11/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 12/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 11/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 04/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 01/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 02/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 12/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 13/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 12/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 05/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 02/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 03/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 13/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 14/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 13/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 06/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 03/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 14/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 15/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 14/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 07/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 15/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 04/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 04/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 16/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 15/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 08/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 16/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 05/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 05/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 17/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 16/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 09/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 17/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 18/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 06/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 06/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 17/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 10/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 18/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 19/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 07/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 07/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 18/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 11/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 19/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 20/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 08/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 08/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 19/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 12/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 20/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 21/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 09/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 09/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 13/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 20/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 21/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 22/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 10/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 10/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 14/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Channel 23/0 : 1[1] -> 0[0] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 21/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 22/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 11/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 11/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 15/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Channel 23/0 : 5[5] -> 4[4] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 22/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 12/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 12/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 16/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Channel 23/0 : 6[6] -> 5[5] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 13/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 13/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 14/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 14/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 17/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 15/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 15/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 18/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 16/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 19/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 16/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 17/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 17/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 20/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 18/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 18/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 21/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 22/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 19/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Channel 23/0 : 3[3] -> 2[2] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 19/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 20/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 20/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 21/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 21/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 22/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Channel 23/0 : 4[4] -> 3[3] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 22/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Channel 23/0 : 2[2] -> 1[1] via P2P/CUMEM/read | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO Connected all trees | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO 24 coll channels, 0 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO Connected all trees | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO 24 coll channels, 0 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO Connected all trees | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO Connected all trees | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO 24 coll channels, 0 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO 24 coll channels, 0 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO Connected all trees | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO 24 coll channels, 0 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO Connected all trees | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO Connected all trees | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO Connected all trees | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO 24 coll channels, 0 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO 24 coll channels, 0 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO 24 coll channels, 0 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
dev-rhel-ai-training-client-11:597:1456 [5] NCCL INFO comm 0x55beca91b930 rank 5 nranks 8 cudaDev 5 nvmlDev 5 busId c060 commId 0x71579157c6f8d946 - Init COMPLETE | |
dev-rhel-ai-training-client-11:593:1444 [1] NCCL INFO comm 0x562c7eda1ec0 rank 1 nranks 8 cudaDev 1 nvmlDev 1 busId 8020 commId 0x71579157c6f8d946 - Init COMPLETE | |
dev-rhel-ai-training-client-11:595:1450 [3] NCCL INFO comm 0x557cf5c800a0 rank 3 nranks 8 cudaDev 3 nvmlDev 3 busId a040 commId 0x71579157c6f8d946 - Init COMPLETE | |
dev-rhel-ai-training-client-11:599:1476 [7] NCCL INFO comm 0x56407f612870 rank 7 nranks 8 cudaDev 7 nvmlDev 7 busId e080 commId 0x71579157c6f8d946 - Init COMPLETE | |
dev-rhel-ai-training-client-11:594:1473 [2] NCCL INFO comm 0x55eedd549910 rank 2 nranks 8 cudaDev 2 nvmlDev 2 busId a030 commId 0x71579157c6f8d946 - Init COMPLETE | |
dev-rhel-ai-training-client-11:598:1447 [6] NCCL INFO comm 0x558e1010fdc0 rank 6 nranks 8 cudaDev 6 nvmlDev 6 busId e070 commId 0x71579157c6f8d946 - Init COMPLETE | |
dev-rhel-ai-training-client-11:596:1453 [4] NCCL INFO comm 0x555b34850780 rank 4 nranks 8 cudaDev 4 nvmlDev 4 busId c050 commId 0x71579157c6f8d946 - Init COMPLETE | |
dev-rhel-ai-training-client-11:592:1443 [0] NCCL INFO comm 0x562209f07c50 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 8010 commId 0x71579157c6f8d946 - Init COMPLETE | |
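Note: the eight "Init COMPLETE" lines above show a single NCCL communicator covering all 8 GPUs on this node (nranks 8), using P2P/CUMEM transport on 24 collective channels. Verbose NCCL output like this normally appears when NCCL_DEBUG=INFO is set. Below is a minimal sketch of the kind of process-group setup that produces these lines; the environment variables and names are illustrative, not taken from this job's launcher.

    import os
    import torch
    import torch.distributed as dist

    # Assumption: torchrun sets RANK, LOCAL_RANK and WORLD_SIZE for each worker.
    os.environ.setdefault("NCCL_DEBUG", "INFO")  # produces the NCCL INFO lines seen in this log

    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    torch.cuda.set_device(local_rank)

    # The per-rank "Init COMPLETE" lines correspond to this call finishing on every rank.
    dist.init_process_group(backend="nccl")
    print(f"rank {dist.get_rank()} / {dist.get_world_size()} ready on cuda:{local_rank}")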
[2024-08-14 01:45:01,702] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False | |
[2024-08-14 01:45:01,704] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer | |
[2024-08-14 01:45:01,704] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer | |
[2024-08-14 01:45:01,716] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam | |
[2024-08-14 01:45:01,717] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type=<class 'deepspeed.ops.adam.fused_adam.FusedAdam'> | |
[2024-08-14 01:45:01,717] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 2 optimizer | |
[2024-08-14 01:45:01,717] [INFO] [stage_1_and_2.py:148:__init__] Reduce bucket size 500,000,000 | |
[2024-08-14 01:45:01,717] [INFO] [stage_1_and_2.py:149:__init__] Allgather bucket size 500,000,000 | |
[2024-08-14 01:45:01,717] [INFO] [stage_1_and_2.py:150:__init__] CPU Offload: False | |
[2024-08-14 01:45:01,717] [INFO] [stage_1_and_2.py:151:__init__] Round robin gradient partitioning: False | |
[2024-08-14 01:45:14,130] [WARNING] [engine.py:2749:load_checkpoint] Unable to find latest file at /var/mnt/inststg1/instructlab/job/checkpoints/skills/ds_native/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. | |
[2024-08-14 01:45:14,572] [WARNING] [engine.py:2749:load_checkpoint] Unable to find latest file at /var/mnt/inststg1/instructlab/job/checkpoints/skills/ds_native/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. | |
[2024-08-14 01:45:15,623] [WARNING] [engine.py:2749:load_checkpoint] Unable to find latest file at /var/mnt/inststg1/instructlab/job/checkpoints/skills/ds_native/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. | |
[2024-08-14 01:45:16,072] [WARNING] [engine.py:2749:load_checkpoint] Unable to find latest file at /var/mnt/inststg1/instructlab/job/checkpoints/skills/ds_native/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. | |
[2024-08-14 01:45:16,114] [WARNING] [engine.py:2749:load_checkpoint] Unable to find latest file at /var/mnt/inststg1/instructlab/job/checkpoints/skills/ds_native/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. | |
[2024-08-14 01:45:16,307] [WARNING] [engine.py:2749:load_checkpoint] Unable to find latest file at /var/mnt/inststg1/instructlab/job/checkpoints/skills/ds_native/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. | |
[2024-08-14 01:45:16,466] [WARNING] [engine.py:2749:load_checkpoint] Unable to find latest file at /var/mnt/inststg1/instructlab/job/checkpoints/skills/ds_native/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. | |
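Note: these warnings are printed once per rank and mean that no prior DeepSpeed checkpoint exists under ds_native/ yet; the "latest" file is a small text file holding the tag of the most recent save, so on a first run its absence is expected and training starts from the freshly loaded model. A minimal sketch of resuming explicitly once a checkpoint has been written follows; `engine` and the tag "global_step100" are placeholders, not values from this job.

    # Assumption: `engine` is the DeepSpeedEngine returned by deepspeed.initialize().
    ckpt_dir = "/var/mnt/inststg1/instructlab/job/checkpoints/skills/ds_native"

    # With no tag, DeepSpeed reads <ckpt_dir>/latest to find the newest save;
    # passing an explicit tag avoids this warning when that file is absent.
    load_path, client_state = engine.load_checkpoint(ckpt_dir, tag="global_step100")
    if load_path is None:
        print("no checkpoint found, starting from scratch")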
[2024-08-14 01:45:16,736] [INFO] [utils.py:781:see_memory_usage] Before initializing optimizer states | |
[2024-08-14 01:45:16,737] [INFO] [utils.py:782:see_memory_usage] MA 15.69 GB Max_MA 17.26 GB CA 17.26 GB Max_CA 17 GB | |
[2024-08-14 01:45:16,738] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 65.71 GB, percent = 5.2% | |
[2024-08-14 01:45:16,955] [INFO] [utils.py:781:see_memory_usage] After initializing optimizer states | |
[2024-08-14 01:45:16,955] [INFO] [utils.py:782:see_memory_usage] MA 15.69 GB Max_MA 18.83 GB CA 20.4 GB Max_CA 20 GB | |
[2024-08-14 01:45:16,956] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 65.7 GB, percent = 5.2% | |
[2024-08-14 01:45:16,956] [INFO] [stage_1_and_2.py:543:__init__] optimizer state initialized | |
[2024-08-14 01:45:17,160] [INFO] [utils.py:781:see_memory_usage] After initializing ZeRO optimizer | |
[2024-08-14 01:45:17,161] [INFO] [utils.py:782:see_memory_usage] MA 15.69 GB Max_MA 15.69 GB CA 20.4 GB Max_CA 20 GB | |
[2024-08-14 01:45:17,161] [INFO] [utils.py:789:see_memory_usage] CPU Virtual Memory: used = 65.71 GB, percent = 5.2% | |
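Note: the see_memory_usage lines above are the key numbers for an OOM investigation. MA/Max_MA correspond to torch.cuda.memory_allocated() / max_memory_allocated() and CA/Max_CA to the reserved (cached) pool, so building the FusedAdam + ZeRO-2 optimizer state briefly pushed peak allocation from about 17.3 GB to 18.8 GB while steady-state allocation stayed near 15.7 GB per GPU. A rough sketch of printing the same counters around a suspect region (the helper name is mine, not from this codebase):

    import torch

    GB = 1024 ** 3

    def report(tag: str) -> None:
        # Mirrors the MA / Max_MA / CA / Max_CA fields printed by DeepSpeed's see_memory_usage.
        print(
            f"{tag}: MA {torch.cuda.memory_allocated() / GB:.2f} GB "
            f"Max_MA {torch.cuda.max_memory_allocated() / GB:.2f} GB "
            f"CA {torch.cuda.memory_reserved() / GB:.2f} GB "
            f"Max_CA {torch.cuda.max_memory_reserved() / GB:.2f} GB"
        )

    report("Before initializing optimizer states")
    # ... build optimizer / run forward-backward here ...
    report("After initializing optimizer states")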
[2024-08-14 01:45:17,163] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = DeepSpeedZeroOptimizer | |
[2024-08-14 01:45:17,163] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler | |
[2024-08-14 01:45:17,163] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = <torch.optim.lr_scheduler.LambdaLR object at 0x7ff7f40e3ed0> | |
[2024-08-14 01:45:17,163] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0], mom=[(0.9, 0.95)] | |
[2024-08-14 01:45:17,164] [INFO] [config.py:997:print] DeepSpeedEngine configuration: | |
[2024-08-14 01:45:17,165] [INFO] [config.py:1001:print] activation_checkpointing_config { | |
"partition_activations": false, | |
"contiguous_memory_optimization": false, | |
"cpu_checkpointing": false, | |
"number_checkpoints": null, | |
"synchronize_checkpoint_boundary": false, | |
"profile": false | |
} | |
[2024-08-14 01:45:17,165] [INFO] [config.py:1001:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} | |
[2024-08-14 01:45:17,165] [INFO] [config.py:1001:print] amp_enabled .................. False | |
[2024-08-14 01:45:17,165] [INFO] [config.py:1001:print] amp_params ................... False | |
[2024-08-14 01:45:17,165] [INFO] [config.py:1001:print] autotuning_config ............ { | |
"enabled": false, | |
"start_step": null, | |
"end_step": null, | |
"metric_path": null, | |
"arg_mappings": null, | |
"metric": "throughput", | |
"model_info": null, | |
"results_dir": "autotuning_results", | |
"exps_dir": "autotuning_exps", | |
"overwrite": true, | |
"fast": true, | |
"start_profile_step": 3, | |
"end_profile_step": 5, | |
"tuner_type": "gridsearch", | |
"tuner_early_stopping": 5, | |
"tuner_num_trials": 50, | |
"model_info_path": null, | |
"mp_size": 1, | |
"max_train_batch_size": null, | |
"min_train_batch_size": 1, | |
"max_train_micro_batch_size_per_gpu": 1.024000e+03, | |
"min_train_micro_batch_size_per_gpu": 1, | |
"num_tuning_micro_batch_sizes": 3 | |
} | |
[2024-08-14 01:45:17,165] [INFO] [config.py:1001:print] bfloat16_enabled ............. True | |
[2024-08-14 01:45:17,165] [INFO] [config.py:1001:print] bfloat16_immediate_grad_update False | |
[2024-08-14 01:45:17,165] [INFO] [config.py:1001:print] checkpoint_parallel_write_pipeline False | |
[2024-08-14 01:45:17,165] [INFO] [config.py:1001:print] checkpoint_tag_validation_enabled True | |
[2024-08-14 01:45:17,165] [INFO] [config.py:1001:print] checkpoint_tag_validation_fail False | |
[2024-08-14 01:45:17,165] [INFO] [config.py:1001:print] comms_config ................. <deepspeed.comm.config.DeepSpeedCommsConfig object at 0x7ff7b02b5d10> | |
[2024-08-14 01:45:17,165] [INFO] [config.py:1001:print] communication_data_type ...... None | |
[2024-08-14 01:45:17,165] [INFO] [config.py:1001:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} | |
[2024-08-14 01:45:17,165] [INFO] [config.py:1001:print] curriculum_enabled_legacy .... False | |
[2024-08-14 01:45:17,165] [INFO] [config.py:1001:print] curriculum_params_legacy ..... False | |
[2024-08-14 01:45:17,165] [INFO] [config.py:1001:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} | |
[2024-08-14 01:45:17,165] [INFO] [config.py:1001:print] data_efficiency_enabled ...... False | |
[2024-08-14 01:45:17,165] [INFO] [config.py:1001:print] dataloader_drop_last ......... False | |
[2024-08-14 01:45:17,165] [INFO] [config.py:1001:print] disable_allgather ............ False | |
[2024-08-14 01:45:17,165] [INFO] [config.py:1001:print] dump_state ................... False | |
[2024-08-14 01:45:17,165] [INFO] [config.py:1001:print] dynamic_loss_scale_args ...... None | |
[2024-08-14 01:45:17,165] [INFO] [config.py:1001:print] eigenvalue_enabled ........... False | |
[2024-08-14 01:45:17,165] [INFO] [config.py:1001:print] eigenvalue_gas_boundary_resolution 1 | |
[2024-08-14 01:45:17,165] [INFO] [config.py:1001:print] eigenvalue_layer_name ........ bert.encoder.layer | |
[2024-08-14 01:45:17,165] [INFO] [config.py:1001:print] eigenvalue_layer_num ......... 0 | |
[2024-08-14 01:45:17,165] [INFO] [config.py:1001:print] eigenvalue_max_iter .......... 100 | |
[2024-08-14 01:45:17,165] [INFO] [config.py:1001:print] eigenvalue_stability ......... 1e-06 | |
[2024-08-14 01:45:17,165] [INFO] [config.py:1001:print] eigenvalue_tol ............... 0.01 | |
[2024-08-14 01:45:17,165] [INFO] [config.py:1001:print] eigenvalue_verbose ........... False | |
[2024-08-14 01:45:17,165] [INFO] [config.py:1001:print] elasticity_enabled ........... False | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] flops_profiler_config ........ { | |
"enabled": false, | |
"recompute_fwd_factor": 0.0, | |
"profile_step": 1, | |
"module_depth": -1, | |
"top_modules": 1, | |
"detailed": true, | |
"output_file": null | |
} | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] fp16_auto_cast ............... None | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] fp16_enabled ................. False | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] fp16_master_weights_and_gradients False | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] global_rank .................. 0 | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] grad_accum_dtype ............. None | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] gradient_accumulation_steps .. 1 | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] gradient_clipping ............ 1.0 | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] gradient_predivide_factor .... 1.0 | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] graph_harvesting ............. False | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] initial_dynamic_scale ........ 1 | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] load_universal_checkpoint .... False | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] loss_scale ................... 1.0 | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] memory_breakdown ............. False | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] mics_hierarchial_params_gather False | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] mics_shard_size .............. -1 | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') comet=CometConfig(enabled=False, samples_log_interval=100, project=None, workspace=None, api_key=None, experiment_name=None, experiment_key=None, online=None, mode=None) wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] nebula_config ................ { | |
"enabled": false, | |
"persistent_storage_path": null, | |
"persistent_time_interval": 100, | |
"num_of_version_in_retention": 2, | |
"enable_nebula_load": true, | |
"load_path": null | |
} | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] optimizer_legacy_fusion ...... False | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] optimizer_name ............... None | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] optimizer_params ............. None | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] pld_enabled .................. False | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] pld_params ................... False | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] prescale_gradients ........... False | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] scheduler_name ............... None | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] scheduler_params ............. None | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] seq_parallel_communication_data_type torch.float32 | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] sparse_attention ............. None | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] sparse_gradients_enabled ..... False | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] steps_per_print .............. 1 | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] timers_config ................ enabled=True synchronized=True | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] train_batch_size ............. 128 | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] train_micro_batch_size_per_gpu 16 | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] use_data_before_expert_parallel_ False | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] use_node_local_storage ....... False | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] wall_clock_breakdown ......... False | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] weight_quantization_config ... None | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] world_size ................... 8 | |
[2024-08-14 01:45:17,166] [INFO] [config.py:1001:print] zero_allow_untested_optimizer False | |
[2024-08-14 01:45:17,167] [INFO] [config.py:1001:print] zero_config .................. stage=2 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500,000,000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500,000,000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='none', nvme_path=None, buffer_count=5, buffer_size=100,000,000, max_in_cpu=1,000,000,000, pin_memory=False) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='none', nvme_path=None, buffer_count=4, pin_memory=False, pipeline=False, pipeline_read=False, pipeline_write=False, fast_init=False, ratio=1.0) sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False use_all_reduce_for_fetch_params=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True | |
[2024-08-14 01:45:17,167] [INFO] [config.py:1001:print] zero_enabled ................. True | |
[2024-08-14 01:45:17,167] [INFO] [config.py:1001:print] zero_force_ds_cpu_optimizer .. True | |
[2024-08-14 01:45:17,167] [INFO] [config.py:1001:print] zero_optimization_stage ...... 2 | |
[2024-08-14 01:45:17,167] [INFO] [config.py:987:print_user_config] json = { | |
"train_batch_size": 128, | |
"gradient_accumulation_steps": 1, | |
"train_micro_batch_size_per_gpu": 16, | |
"steps_per_print": 1, | |
"zero_optimization": { | |
"stage": 2, | |
"offload_param": { | |
"device": "none" | |
}, | |
"offload_optimizer": { | |
"device": "none" | |
} | |
}, | |
"bf16": { | |
"enabled": true | |
}, | |
"gradient_clipping": 1.0, | |
"prescale_gradients": false, | |
"wall_clock_breakdown": false | |
} | |
[2024-08-14 01:45:17,167] [WARNING] [engine.py:2749:load_checkpoint] Unable to find latest file at /var/mnt/inststg1/instructlab/job/checkpoints/skills/ds_native/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. | |
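Note: the JSON block above is the user-supplied DeepSpeed config echoed back by the engine: bf16 training, ZeRO stage 2 with no parameter or optimizer offload, a global batch of 128 split as 16 per GPU across the 8 ranks with no gradient accumulation, and gradient clipping at 1.0. A minimal sketch of how a config like this is typically handed to DeepSpeed; the model and optimizer below are placeholders, not the job's actual ones, and the batch-size settings assume the same 8-rank world size as this job.

    import deepspeed
    import torch

    ds_config = {
        "train_batch_size": 128,
        "gradient_accumulation_steps": 1,
        "train_micro_batch_size_per_gpu": 16,
        "steps_per_print": 1,
        "zero_optimization": {
            "stage": 2,
            "offload_param": {"device": "none"},
            "offload_optimizer": {"device": "none"},
        },
        "bf16": {"enabled": True},
        "gradient_clipping": 1.0,
        "prescale_gradients": False,
        "wall_clock_breakdown": False,
    }

    model = torch.nn.Linear(1024, 1024)                        # placeholder model
    optimizer = torch.optim.AdamW(model.parameters(), lr=8e-7)  # placeholder client optimizer

    # DeepSpeed wraps the client optimizer (FusedAdam in this log) in a ZeRO stage-2 optimizer.
    engine, optimizer, _, scheduler = deepspeed.initialize(
        model=model, optimizer=optimizer, config=ds_config
    )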
Number of samples per save: 6400 | |

Epoch 0: 0%| | 0/51 [00:00<?, ?it/s] | |
total tokens: 21792 num samples: 16 num padding tokens: 15537 - rank: 7 max len: 1362 min len: 38 avg len: 390.9375 num_loss_counted_tokens: 2699 | |
total tokens: 24320 num samples: 16 num padding tokens: 14577 - rank: 7 max len: 1520 min len: 38 avg len: 608.9375 num_loss_counted_tokens: 2116 | |
total tokens: 22928 num samples: 16 num padding tokens: 15843 - rank: 7 max len: 1433 min len: 38 avg len: 442.8125 num_loss_counted_tokens: 3184 | |
total tokens: 23168 num samples: 16 num padding tokens: 12949 - rank: 7 max len: 1448 min len: 33 avg len: 638.6875 num_loss_counted_tokens: 2746 | |
total tokens: 22032 num samples: 16 num padding tokens: 13003 - rank: 2 max len: 1377 min len: 43 avg len: 564.3125 num_loss_counted_tokens: 2812 | |
total tokens: 26176 num samples: 16 num padding tokens: 16168 - rank: 7 max len: 1636 min len: 43 avg len: 625.5 num_loss_counted_tokens: 1791 | |
total tokens: 28720 num samples: 16 num padding tokens: 19930 - rank: 7 max len: 1795 min len: 45 avg len: 549.375 num_loss_counted_tokens: 2282 | |
total tokens: 33808 num samples: 16 num padding tokens: 23031 - rank: 2 max len: 2113 min len: 47 avg len: 673.5625 num_loss_counted_tokens: 2282 | |
total tokens: 46608 num samples: 16 num padding tokens: 35016 - rank: 2 max len: 2913 min len: 49 avg len: 724.5 num_loss_counted_tokens: 1399 | |
total tokens: 32272 num samples: 16 num padding tokens: 17537 - rank: 2 max len: 2017 min len: 105 avg len: 920.9375 num_loss_counted_tokens: 2400 | |
total tokens: 22704 num samples: 16 num padding tokens: 12179 - rank: 4 max len: 1419 min len: 47 avg len: 657.8125 num_loss_counted_tokens: 2868 | |
total tokens: 20672 num samples: 16 num padding tokens: 11497 - rank: 4 max len: 1292 min len: 44 avg len: 573.4375 num_loss_counted_tokens: 1618 | |
total tokens: 32464 num samples: 16 num padding tokens: 19304 - rank: 4 max len: 2029 min len: 50 avg len: 822.5 num_loss_counted_tokens: 2534 | |
total tokens: 20656 num samples: 16 num padding tokens: 13082 - rank: 5 max len: 1291 min len: 43 avg len: 473.375 num_loss_counted_tokens: 2355 | |
total tokens: 22704 num samples: 16 num padding tokens: 13824 - rank: 2 max len: 1419 min len: 50 avg len: 555.0 num_loss_counted_tokens: 2467 | |
total tokens: 22016 num samples: 16 num padding tokens: 12716 - rank: 7 max len: 1376 min len: 35 avg len: 581.25 num_loss_counted_tokens: 1886 | |
total tokens: 61488 num samples: 16 num padding tokens: 47978 - rank: 7 max len: 3843 min len: 25 avg len: 844.375 num_loss_counted_tokens: 1882 | |
total tokens: 27968 num samples: 16 num padding tokens: 16457 - rank: 7 max len: 1748 min len: 46 avg len: 719.4375 num_loss_counted_tokens: 1488 | |
total tokens: 22000 num samples: 16 num padding tokens: 10160 - rank: 7 max len: 1375 min len: 39 avg len: 740.0 num_loss_counted_tokens: 2771 | |
total tokens: 25584 num samples: 16 num padding tokens: 14081 - rank: 7 max len: 1599 min len: 49 avg len: 718.9375 num_loss_counted_tokens: 1585 | |
total tokens: 54192 num samples: 16 num padding tokens: 42366 - rank: 7 max len: 3387 min len: 71 avg len: 739.125 num_loss_counted_tokens: 1836 | |
total tokens: 24256 num samples: 16 num padding tokens: 13077 - rank: 5 max len: 1516 min len: 40 avg len: 698.6875 num_loss_counted_tokens: 1475 | |
total tokens: 35520 num samples: 16 num padding tokens: 21717 - rank: 5 max len: 2220 min len: 68 avg len: 862.6875 num_loss_counted_tokens: 2541 | |
total tokens: 27072 num samples: 16 num padding tokens: 17916 - rank: 2 max len: 1692 min len: 46 avg len: 572.25 num_loss_counted_tokens: 2075 | |
total tokens: 29056 num samples: 16 num padding tokens: 15692 - rank: 4 max len: 1816 min len: 38 avg len: 835.25 num_loss_counted_tokens: 1684 | |
total tokens: 21664 num samples: 16 num padding tokens: 12035 - rank: 2 max len: 1354 min len: 23 avg len: 601.8125 num_loss_counted_tokens: 1390 | |
total tokens: 17824 num samples: 16 num padding tokens: 10827 - rank: 5 max len: 1114 min len: 40 avg len: 437.3125 num_loss_counted_tokens: 1794 | |
total tokens: 23280 num samples: 16 num padding tokens: 11825 - rank: 5 max len: 1455 min len: 29 avg len: 715.9375 num_loss_counted_tokens: 2334 | |
total tokens: 23440 num samples: 16 num padding tokens: 11253 - rank: 2 max len: 1465 min len: 45 avg len: 761.6875 num_loss_counted_tokens: 2146 | |
total tokens: 18512 num samples: 16 num padding tokens: 10505 - rank: 1 max len: 1157 min len: 39 avg len: 500.4375 num_loss_counted_tokens: 2036 | |
total tokens: 36160 num samples: 16 num padding tokens: 19060 - rank: 4 max len: 2260 min len: 101 avg len: 1068.75 num_loss_counted_tokens: 2828 | |
total tokens: 22016 num samples: 16 num padding tokens: 12263 - rank: 2 max len: 1376 min len: 45 avg len: 609.5625 num_loss_counted_tokens: 1678 | |
total tokens: 32288 num samples: 16 num padding tokens: 21423 - rank: 7 max len: 2018 min len: 45 avg len: 679.0625 num_loss_counted_tokens: 2816 | |
total tokens: 19856 num samples: 16 num padding tokens: 12240 - rank: 4 max len: 1241 min len: 55 avg len: 476.0 num_loss_counted_tokens: 1763 | |
total tokens: 22656 num samples: 16 num padding tokens: 8794 - rank: 1 max len: 1416 min len: 34 avg len: 866.375 num_loss_counted_tokens: 2046 | |
total tokens: 49456 num samples: 16 num padding tokens: 36945 - rank: 2 max len: 3091 min len: 42 avg len: 781.9375 num_loss_counted_tokens: 2106 | |
total tokens: 26768 num samples: 16 num padding tokens: 18148 - rank: 4 max len: 1673 min len: 39 avg len: 538.75 num_loss_counted_tokens: 2142 | |
total tokens: 50992 num samples: 16 num padding tokens: 39728 - rank: 2 max len: 3187 min len: 90 avg len: 704.0 num_loss_counted_tokens: 1354 | |
total tokens: 27792 num samples: 16 num padding tokens: 19027 - rank: 7 max len: 1737 min len: 70 avg len: 547.8125 num_loss_counted_tokens: 2063 | |
total tokens: 29376 num samples: 16 num padding tokens: 15634 - rank: 2 max len: 1836 min len: 63 avg len: 858.875 num_loss_counted_tokens: 2026 | |
total tokens: 27488 num samples: 16 num padding tokens: 17232 - rank: 2 max len: 1718 min len: 55 avg len: 641.0 num_loss_counted_tokens: 2252 | |
total tokens: 47760 num samples: 16 num padding tokens: 33112 - rank: 7 max len: 2985 min len: 61 avg len: 915.5 num_loss_counted_tokens: 1638 | |
total tokens: 50832 num samples: 16 num padding tokens: 35796 - rank: 5 max len: 3177 min len: 40 avg len: 939.75 num_loss_counted_tokens: 1795 | |
total tokens: 25696 num samples: 16 num padding tokens: 12397 - rank: 4 max len: 1606 min len: 43 avg len: 831.1875 num_loss_counted_tokens: 2561 | |
total tokens: 61024 num samples: 16 num padding tokens: 47241 - rank: 1 max len: 3814 min len: 24 avg len: 861.4375 num_loss_counted_tokens: 1823 | |
total tokens: 48544 num samples: 16 num padding tokens: 34266 - rank: 5 max len: 3034 min len: 40 avg len: 892.375 num_loss_counted_tokens: 1372 | |
total tokens: 21168 num samples: 16 num padding tokens: 11020 - rank: 4 max len: 1323 min len: 57 avg len: 634.25 num_loss_counted_tokens: 2681 | |
total tokens: 29920 num samples: 16 num padding tokens: 18288 - rank: 4 max len: 1870 min len: 54 avg len: 727.0 num_loss_counted_tokens: 2835 | |
total tokens: 45248 num samples: 16 num padding tokens: 32728 - rank: 1 max len: 2828 min len: 64 avg len: 782.5 num_loss_counted_tokens: 2456 | |
total tokens: 33488 num samples: 16 num padding tokens: 21788 - rank: 7 max len: 2093 min len: 36 avg len: 731.25 num_loss_counted_tokens: 1081 | |
total tokens: 27136 num samples: 16 num padding tokens: 20160 - rank: 4 max len: 1696 min len: 42 avg len: 436.0 num_loss_counted_tokens: 2419 | |
total tokens: 46176 num samples: 16 num padding tokens: 30240 - rank: 1 max len: 2886 min len: 77 avg len: 996.0 num_loss_counted_tokens: 2499 | |
total tokens: 22256 num samples: 16 num padding tokens: 11571 - rank: 5 max len: 1391 min len: 46 avg len: 667.8125 num_loss_counted_tokens: 2061 | |
total tokens: 39824 num samples: 16 num padding tokens: 27382 - rank: 2 max len: 2489 min len: 54 avg len: 777.625 num_loss_counted_tokens: 1090 | |
total tokens: 22592 num samples: 16 num padding tokens: 14520 - rank: 5 max len: 1412 min len: 31 avg len: 504.5 num_loss_counted_tokens: 1767 | |
total tokens: 18128 num samples: 16 num padding tokens: 10417 - rank: 5 max len: 1133 min len: 24 avg len: 481.9375 num_loss_counted_tokens: 1230 | |
total tokens: 29232 num samples: 16 num padding tokens: 17299 - rank: 5 max len: 1827 min len: 42 avg len: 745.8125 num_loss_counted_tokens: 2065 | |
total tokens: 24704 num samples: 16 num padding tokens: 15052 - rank: 2 max len: 1544 min len: 45 avg len: 603.25 num_loss_counted_tokens: 2382 | |
total tokens: 23120 num samples: 16 num padding tokens: 14590 - rank: 5 max len: 1445 min len: 46 avg len: 533.125 num_loss_counted_tokens: 2168 | |
total tokens: 59504 num samples: 16 num padding tokens: 47275 - rank: 7 max len: 3719 min len: 32 avg len: 764.3125 num_loss_counted_tokens: 1511 | |
total tokens: 27456 num samples: 16 num padding tokens: 18656 - rank: 1 max len: 1716 min len: 36 avg len: 550.0 num_loss_counted_tokens: 1502 | |
total tokens: 21760 num samples: 16 num padding tokens: 11638 - rank: 2 max len: 1360 min len: 47 avg len: 632.625 num_loss_counted_tokens: 2473 | |
total tokens: 46096 num samples: 16 num padding tokens: 33319 - rank: 5 max len: 2881 min len: 61 avg len: 798.5625 num_loss_counted_tokens: 1982 | |
total tokens: 25040 num samples: 16 num padding tokens: 14670 - rank: 4 max len: 1565 min len: 26 avg len: 648.125 num_loss_counted_tokens: 1623 | |
total tokens: 33248 num samples: 16 num padding tokens: 23096 - rank: 1 max len: 2078 min len: 46 avg len: 634.5 num_loss_counted_tokens: 1489 | |
total tokens: 37808 num samples: 16 num padding tokens: 28732 - rank: 4 max len: 2363 min len: 63 avg len: 567.25 num_loss_counted_tokens: 2415 | |
total tokens: 23984 num samples: 16 num padding tokens: 13252 - rank: 5 max len: 1499 min len: 62 avg len: 670.75 num_loss_counted_tokens: 2194 | |
total tokens: 22224 num samples: 16 num padding tokens: 11762 - rank: 1 max len: 1389 min len: 44 avg len: 653.875 num_loss_counted_tokens: 1747 | |
total tokens: 46752 num samples: 16 num padding tokens: 32809 - rank: 4 max len: 2922 min len: 52 avg len: 871.4375 num_loss_counted_tokens: 1606 | |
total tokens: 46544 num samples: 16 num padding tokens: 33825 - rank: 1 max len: 2909 min len: 50 avg len: 794.9375 num_loss_counted_tokens: 2515 | |
total tokens: 19920 num samples: 16 num padding tokens: 9089 - rank: 4 max len: 1245 min len: 68 avg len: 676.9375 num_loss_counted_tokens: 3124 | |
total tokens: 61376 num samples: 16 num padding tokens: 48833 - rank: 2 max len: 3836 min len: 25 avg len: 783.9375 num_loss_counted_tokens: 1455 | |
total tokens: 28224 num samples: 16 num padding tokens: 16347 - rank: 1 max len: 1764 min len: 50 avg len: 742.3125 num_loss_counted_tokens: 1516 | |
total tokens: 25264 num samples: 16 num padding tokens: 15027 - rank: 4 max len: 1579 min len: 28 avg len: 639.8125 num_loss_counted_tokens: 1200 | |
total tokens: 24608 num samples: 16 num padding tokens: 15212 - rank: 0 max len: 1538 min len: 52 avg len: 587.25 num_loss_counted_tokens: 1858 | |
total tokens: 22528 num samples: 16 num padding tokens: 13635 - rank: 0 max len: 1408 min len: 139 avg len: 555.8125 num_loss_counted_tokens: 3126 | |
total tokens: 25568 num samples: 16 num padding tokens: 13320 - rank: 5 max len: 1598 min len: 51 avg len: 765.5 num_loss_counted_tokens: 2257 | |
total tokens: 35040 num samples: 16 num padding tokens: 21359 - rank: 1 max len: 2190 min len: 99 avg len: 855.0625 num_loss_counted_tokens: 1913 | |
total tokens: 20048 num samples: 16 num padding tokens: 10241 - rank: 0 max len: 1253 min len: 25 avg len: 612.9375 num_loss_counted_tokens: 1003 | |
total tokens: 21120 num samples: 16 num padding tokens: 11957 - rank: 0 max len: 1320 min len: 45 avg len: 572.6875 num_loss_counted_tokens: 3093 | |
total tokens: 51456 num samples: 16 num padding tokens: 38197 - rank: 1 max len: 3216 min len: 51 avg len: 828.6875 num_loss_counted_tokens: 1835 | |
total tokens: 26112 num samples: 16 num padding tokens: 16647 - rank: 6 max len: 1632 min len: 44 avg len: 591.5625 num_loss_counted_tokens: 2699 | |
total tokens: 32736 num samples: 16 num padding tokens: 21702 - rank: 1 max len: 2046 min len: 25 avg len: 689.625 num_loss_counted_tokens: 2019 | |
total tokens: 21040 num samples: 16 num padding tokens: 11645 - rank: 1 max len: 1315 min len: 44 avg len: 587.1875 num_loss_counted_tokens: 1594 | |
total tokens: 36640 num samples: 16 num padding tokens: 26034 - rank: 6 max len: 2290 min len: 39 avg len: 662.875 num_loss_counted_tokens: 2439 | |
total tokens: 34656 num samples: 16 num padding tokens: 20164 - rank: 0 max len: 2166 min len: 94 avg len: 905.75 num_loss_counted_tokens: 1896 | |
total tokens: 32672 num samples: 16 num padding tokens: 19996 - rank: 1 max len: 2042 min len: 64 avg len: 792.25 num_loss_counted_tokens: 1398 | |
total tokens: 28976 num samples: 16 num padding tokens: 19248 - rank: 6 max len: 1811 min len: 48 avg len: 608.0 num_loss_counted_tokens: 1578 | |
total tokens: 58544 num samples: 16 num padding tokens: 45656 - rank: 0 max len: 3659 min len: 48 avg len: 805.5 num_loss_counted_tokens: 2208 | |
total tokens: 29584 num samples: 16 num padding tokens: 16885 - rank: 5 max len: 1849 min len: 26 avg len: 793.6875 num_loss_counted_tokens: 2171 | |
total tokens: 29424 num samples: 16 num padding tokens: 13482 - rank: 6 max len: 1839 min len: 149 avg len: 996.375 num_loss_counted_tokens: 2907 | |
total tokens: 26352 num samples: 16 num padding tokens: 15754 - rank: 6 max len: 1647 min len: 102 avg len: 662.375 num_loss_counted_tokens: 1628 | |
total tokens: 28608 num samples: 16 num padding tokens: 18345 - rank: 5 max len: 1788 min len: 25 avg len: 641.4375 num_loss_counted_tokens: 1701 | |
total tokens: 23552 num samples: 16 num padding tokens: 13352 - rank: 4 max len: 1472 min len: 92 avg len: 637.5 num_loss_counted_tokens: 2378 | |
total tokens: 23328 num samples: 16 num padding tokens: 11702 - rank: 3 max len: 1458 min len: 41 avg len: 726.625 num_loss_counted_tokens: 2060 | |
total tokens: 23104 num samples: 16 num padding tokens: 12999 - rank: 3 max len: 1444 min len: 37 avg len: 631.5625 num_loss_counted_tokens: 2421 | |
total tokens: 33056 num samples: 16 num padding tokens: 25347 - rank: 1 max len: 2066 min len: 28 avg len: 481.8125 num_loss_counted_tokens: 2532 | |
total tokens: 22128 num samples: 16 num padding tokens: 14822 - rank: 3 max len: 1383 min len: 44 avg len: 456.625 num_loss_counted_tokens: 847 | |
total tokens: 20896 num samples: 16 num padding tokens: 13643 - rank: 3 max len: 1306 min len: 34 avg len: 453.3125 num_loss_counted_tokens: 2790 | |
total tokens: 27728 num samples: 16 num padding tokens: 16207 - rank: 0 max len: 1733 min len: 25 avg len: 720.0625 num_loss_counted_tokens: 2094 | |
total tokens: 26144 num samples: 16 num padding tokens: 15239 - rank: 3 max len: 1634 min len: 43 avg len: 681.5625 num_loss_counted_tokens: 1818 | |
total tokens: 23888 num samples: 16 num padding tokens: 13951 - rank: 6 max len: 1493 min len: 44 avg len: 621.0625 num_loss_counted_tokens: 2239 | |
total tokens: 44064 num samples: 16 num padding tokens: 28283 - rank: 3 max len: 2754 min len: 40 avg len: 986.3125 num_loss_counted_tokens: 1756 | |
total tokens: 30784 num samples: 16 num padding tokens: 21767 - rank: 0 max len: 1924 min len: 35 avg len: 563.5625 num_loss_counted_tokens: 1439 | |
total tokens: 29888 num samples: 16 num padding tokens: 17662 - rank: 3 max len: 1868 min len: 79 avg len: 764.125 num_loss_counted_tokens: 1545 | |
total tokens: 45792 num samples: 16 num padding tokens: 28869 - rank: 6 max len: 2862 min len: 68 avg len: 1057.6875 num_loss_counted_tokens: 2244 | |
total tokens: 48128 num samples: 16 num padding tokens: 35579 - rank: 1 max len: 3008 min len: 49 avg len: 784.3125 num_loss_counted_tokens: 1822 | |
total tokens: 24640 num samples: 16 num padding tokens: 17365 - rank: 3 max len: 1540 min len: 23 avg len: 454.6875 num_loss_counted_tokens: 1554 | |
total tokens: 21520 num samples: 16 num padding tokens: 10345 - rank: 6 max len: 1345 min len: 40 avg len: 698.4375 num_loss_counted_tokens: 1994 | |
total tokens: 33952 num samples: 16 num padding tokens: 23227 - rank: 0 max len: 2122 min len: 28 avg len: 670.3125 num_loss_counted_tokens: 1048 | |
total tokens: 27104 num samples: 16 num padding tokens: 17829 - rank: 0 max len: 1694 min len: 40 avg len: 579.6875 num_loss_counted_tokens: 1596 | |
total tokens: 28512 num samples: 16 num padding tokens: 18425 - rank: 0 max len: 1782 min len: 78 avg len: 630.4375 num_loss_counted_tokens: 2139 | |
total tokens: 21056 num samples: 16 num padding tokens: 10079 - rank: 0 max len: 1316 min len: 54 avg len: 686.0625 num_loss_counted_tokens: 2155 | |
total tokens: 26592 num samples: 16 num padding tokens: 13334 - rank: 0 max len: 1662 min len: 43 avg len: 828.625 num_loss_counted_tokens: 1734 | |
total tokens: 27200 num samples: 16 num padding tokens: 11739 - rank: 6 max len: 1700 min len: 60 avg len: 966.3125 num_loss_counted_tokens: 1522 | |
total tokens: 27888 num samples: 16 num padding tokens: 15163 - rank: 0 max len: 1743 min len: 45 avg len: 795.3125 num_loss_counted_tokens: 3376 | |
total tokens: 43600 num samples: 16 num padding tokens: 29026 - rank: 6 max len: 2725 min len: 28 avg len: 910.875 num_loss_counted_tokens: 2007 | |
total tokens: 25104 num samples: 16 num padding tokens: 18082 - rank: 6 max len: 1569 min len: 40 avg len: 438.875 num_loss_counted_tokens: 2118 | |
total tokens: 39024 num samples: 16 num padding tokens: 29402 - rank: 6 max len: 2439 min len: 44 avg len: 601.375 num_loss_counted_tokens: 3089 | |
total tokens: 29056 num samples: 16 num padding tokens: 16367 - rank: 6 max len: 1816 min len: 53 avg len: 793.0625 num_loss_counted_tokens: 1584 | |
total tokens: 20848 num samples: 16 num padding tokens: 11329 - rank: 3 max len: 1303 min len: 36 avg len: 594.9375 num_loss_counted_tokens: 3612 | |
total tokens: 37120 num samples: 16 num padding tokens: 23416 - rank: 3 max len: 2320 min len: 41 avg len: 856.5 num_loss_counted_tokens: 1989 | |
total tokens: 24096 num samples: 16 num padding tokens: 14590 - rank: 3 max len: 1506 min len: 41 avg len: 594.125 num_loss_counted_tokens: 3425 | |
total tokens: 28960 num samples: 16 num padding tokens: 18715 - rank: 0 max len: 1810 min len: 42 avg len: 640.3125 num_loss_counted_tokens: 2582 | |
total tokens: 29232 num samples: 16 num padding tokens: 18283 - rank: 0 max len: 1827 min len: 31 avg len: 684.3125 num_loss_counted_tokens: 1859 | |
total tokens: 27488 num samples: 16 num padding tokens: 17819 - rank: 3 max len: 1718 min len: 53 avg len: 604.3125 num_loss_counted_tokens: 1370 | |
total tokens: 47216 num samples: 16 num padding tokens: 36241 - rank: 6 max len: 2951 min len: 47 avg len: 685.9375 num_loss_counted_tokens: 1592 | |
total tokens: 20736 num samples: 16 num padding tokens: 10992 - rank: 6 max len: 1296 min len: 75 avg len: 609.0 num_loss_counted_tokens: 1759 | |
total tokens: 23344 num samples: 16 num padding tokens: 13570 - rank: 0 max len: 1459 min len: 27 avg len: 610.875 num_loss_counted_tokens: 2906 | |
total tokens: 28160 num samples: 16 num padding tokens: 16435 - rank: 3 max len: 1760 min len: 25 avg len: 732.8125 num_loss_counted_tokens: 1378 | |
total tokens: 27536 num samples: 16 num padding tokens: 18149 - rank: 3 max len: 1721 min len: 44 avg len: 586.6875 num_loss_counted_tokens: 836 | |
total tokens: 22272 num samples: 16 num padding tokens: 9760 - rank: 3 max len: 1392 min len: 33 avg len: 782.0 num_loss_counted_tokens: 2193 | |
total tokens: 22448 num samples: 16 num padding tokens: 13598 - rank: 6 max len: 1403 min len: 83 avg len: 553.125 num_loss_counted_tokens: 1609 | |
total tokens: 40096 num samples: 16 num padding tokens: 26342 - rank: 3 max len: 2506 min len: 33 avg len: 859.625 num_loss_counted_tokens: 2911 | |
total tokens: 18992 num samples: 16 num padding tokens: 9570 - rank: 6 max len: 1187 min len: 40 avg len: 588.875 num_loss_counted_tokens: 3087 | |
total tokens: 55408 num samples: 16 num padding tokens: 40860 - rank: 3 max len: 3463 min len: 48 avg len: 909.25 num_loss_counted_tokens: 1997 | |
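Note: the batcher lines above look internally consistent — total tokens equals num samples times max len (each micro-batch appears to be padded to its longest sample), and num padding tokens is total tokens minus the sum of the real sample lengths. A minimal arithmetic sketch using the first line of this block (rank 3, 16 samples, max len 1444); the padding rule is inferred from these numbers, not taken from the training code:

# Sketch: reconstruct the batcher stats from one logged line.
num_samples = 16
max_len = 1444
avg_len = 631.5625                                  # as logged
total_tokens = num_samples * max_len                # 23104, matches the log
real_tokens = int(avg_len * num_samples)            # 10105
num_padding_tokens = total_tokens - real_tokens     # 12999, matches the log
print(total_tokens, num_padding_tokens)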
Per-token loss scaled by world size: 0.00046148421824909747 | |
Per-token loss scaled by world size: 0.0003078000736422837 | |
Per-token loss scaled by world size: 0.0005909534520469606 | |
Epoch: 0, Step: 1, Rank: 4, loss = 0.9456965327262878 | |
Per-token loss scaled by world size: 0.00037035910645499825 | |
Epoch: 0, Step: 1, Rank: 3, loss = 0.6307592988014221 | |
Epoch: 0, Step: 1, Rank: 0, loss = 1.2110114097595215 | |
Epoch: 0, Step: 1, Rank: 1, loss = 0.7589583992958069 | |
Per-token loss scaled by world size: 0.00047682944568805397 | |
Epoch: 0, Step: 1, Rank: 7, loss = 0.9771427512168884 | |
Per-token loss scaled by world size: 0.0003861629811581224 | |
Epoch: 0, Step: 1, Rank: 5, loss = 0.7913444638252258 | |
Per-token loss scaled by world size: 0.00029413917218334973 | |
Epoch: 0, Step: 1, Rank: 6, loss = 0.6027647256851196 | |
Per-token loss scaled by world size: 0.0002454993373248726 | |
Epoch: 0, Step: 1, Rank: 2, loss = 0.5030894875526428 | |
[2024-08-14 01:45:30,196] [INFO] [logging.py:96:log_dist] [Rank 0] step=1, skipped=0, lr=[8.000000000000001e-07], mom=[(0.9, 0.95)] | |
Epoch 0: 2%|▏ | 1/51 [00:13<10:54, 13.10s/it] | |
total tokens: 25024 num samples: 16 num padding tokens: 14643 - rank: 1 max len: 1564 min len: 37 avg len: 648.8125 num_loss_counted_tokens: 3176 | |
{ | |
"epoch": 0, | |
"step": 1, | |
"rank": 0, | |
"loss": 1.2110114097595215, | |
"overall_throughput": 10.194154033715293, | |
"lr": 8.000000000000001e-07, | |
"cuda_mem_allocated": 24.50210428237915, | |
"cuda_malloc_retries": 0, | |
"num_loss_counted_tokens": 16394, | |
"batch_size": 128, | |
"total_loss": 0.802595853805542, | |
"gradnorm": 2.6339778900146484, | |
"weight_norm": 393.45489501953125, | |
"timestamp": "2024-08-14T01:45:30.338877" | |
} | |
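Note: the step-1 numbers are consistent with each rank's reported loss being its "per-token loss scaled by world size" times num_loss_counted_tokens / world_size, and with total_loss being the mean of the eight per-rank losses. A small check under that assumed formula (inferred from the printed values, not from the training code); one of the per-token values above, 0.0005909534520469606, reproduces rank 0's loss this way:

# Reconstruct rank 0's loss and the reported total_loss from the step-1 log values.
world_size = 8
num_loss_counted_tokens = 16394                      # from the metrics block above
per_token_scaled = 0.0005909534520469606
rank0_loss = per_token_scaled * num_loss_counted_tokens / world_size
# ~1.2110, matching "Rank: 0, loss = 1.2110114097595215"

rank_losses = [0.9456965327262878, 0.6307592988014221, 1.2110114097595215,
               0.7589583992958069, 0.9771427512168884, 0.7913444638252258,
               0.6027647256851196, 0.5030894875526428]
total_loss = sum(rank_losses) / len(rank_losses)
# ~0.8026, matching "total_loss": 0.802595853805542
print(rank0_loss, total_loss)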
total tokens: 25952 num samples: 16 num padding tokens: 18060 - rank: 0 max len: 1622 min len: 38 avg len: 493.25 num_loss_counted_tokens: 2633 | |
total tokens: 15984 num samples: 16 num padding tokens: 10463 - rank: 4 max len: 999 min len: 26 avg len: 345.0625 num_loss_counted_tokens: 1996 | |
total tokens: 24960 num samples: 16 num padding tokens: 16798 - rank: 3 max len: 1560 min len: 43 avg len: 510.125 num_loss_counted_tokens: 2483 | |
total tokens: 28064 num samples: 16 num padding tokens: 18504 - rank: 7 max len: 1754 min len: 55 avg len: 597.5 num_loss_counted_tokens: 2635 | |
total tokens: 22656 num samples: 16 num padding tokens: 12379 - rank: 2 max len: 1416 min len: 43 avg len: 642.3125 num_loss_counted_tokens: 1277 | |
total tokens: 49584 num samples: 16 num padding tokens: 35511 - rank: 5 max len: 3099 min len: 146 avg len: 879.5625 num_loss_counted_tokens: 2425 | |
total tokens: 22608 num samples: 16 num padding tokens: 13837 - rank: 6 max len: 1413 min len: 43 avg len: 548.1875 num_loss_counted_tokens: 1060 | |
Per-token loss scaled by world size: 0.0003556730807758868 | |
Per-token loss scaled by world size: 0.00022773849195800722 | |
Per-token loss scaled by world size: 0.0006776505615562201 | |
Per-token loss scaled by world size: 0.00044580132816918194 | |
Per-token loss scaled by world size: 0.0005896483198739588 | |
Per-token loss scaled by world size: 0.0004393818380776793 | |
Per-token loss scaled by world size: 0.0001734526886139065 | |
Per-token loss scaled by world size: 0.0004239437112119049 | |
Epoch: 0, Step: 2, Rank: 5, loss = 0.892494261264801 | |
Epoch: 0, Step: 2, Rank: 3, loss = 0.45593246817588806 | |
Epoch: 0, Step: 2, Rank: 0, loss = 0.3472522795200348 | |
Epoch: 0, Step: 2, Rank: 7, loss = 1.3566564321517944 | |
Epoch: 0, Step: 2, Rank: 4, loss = 0.8796424269676208 | |
Epoch: 0, Step: 2, Rank: 2, loss = 1.1804759502410889 | |
Epoch: 0, Step: 2, Rank: 1, loss = 0.7120575308799744 | |
Epoch: 0, Step: 2, Rank: 6, loss = 0.8487353324890137 | |
[2024-08-14 01:45:42,522] [INFO] [logging.py:96:log_dist] [Rank 0] step=2, skipped=0, lr=[1.6000000000000001e-06], mom=[(0.9, 0.95)] | |
Epoch 0: 4%|▍ | 2/51 [00:25<10:19, 12.65s/it] | |
total tokens: 20784 num samples: 16 num padding tokens: 12242 - rank: 5 max len: 1299 min len: 51 avg len: 533.875 num_loss_counted_tokens: 3500 | |
total tokens: 20432 num samples: 16 num padding tokens: 12007 - rank: 7 max len: 1277 min len: 32 avg len: 526.5625 num_loss_counted_tokens: 2069 | |
total tokens: 24128 num samples: 16 num padding tokens: 12641 - rank: 3 max len: 1508 min len: 78 avg len: 717.9375 num_loss_counted_tokens: 2061 | |
{ | |
"epoch": 0, | |
"step": 2, | |
"rank": 0, | |
"loss": 0.3472522795200348, | |
"overall_throughput": 10.509736979631187, | |
"lr": 1.6000000000000001e-06, | |
"cuda_mem_allocated": 24.37437677383423, | |
"cuda_malloc_retries": 0, | |
"num_loss_counted_tokens": 16016, | |
"batch_size": 128, | |
"total_loss": 0.8341558575630188, | |
"gradnorm": 2.4150538444519043, | |
"weight_norm": 393.4549255371094, | |
"timestamp": "2024-08-14T01:45:42.601302" | |
} | |
total tokens: 30864 num samples: 16 num padding tokens: 21875 - rank: 0 max len: 1929 min len: 65 avg len: 561.8125 num_loss_counted_tokens: 1091 | |
total tokens: 23152 num samples: 16 num padding tokens: 10565 - rank: 1 max len: 1447 min len: 44 avg len: 786.6875 num_loss_counted_tokens: 1894 | |
total tokens: 23440 num samples: 16 num padding tokens: 13783 - rank: 4 max len: 1465 min len: 40 avg len: 603.5625 num_loss_counted_tokens: 1728 | |
total tokens: 61712 num samples: 16 num padding tokens: 45301 - rank: 6 max len: 3857 min len: 64 avg len: 1025.6875 num_loss_counted_tokens: 1165 | |
total tokens: 24416 num samples: 16 num padding tokens: 16350 - rank: 2 max len: 1526 min len: 44 avg len: 504.125 num_loss_counted_tokens: 2226 | |
Per-token loss scaled by world size: 0.0003701472014654428 | |
Per-token loss scaled by world size: 0.0004194203356746584 | |
Per-token loss scaled by world size: 0.00027423648862168193 | |
Per-token loss scaled by world size: 0.00026851141592487693 | |
Per-token loss scaled by world size: 0.00023882849200163037 | |
Per-token loss scaled by world size: 0.00040941991028375924 | |
Per-token loss scaled by world size: 0.000399620650568977 | |
Epoch: 0, Step: 3, Rank: 3, loss = 0.988835871219635 | |
Epoch: 0, Step: 3, Rank: 5, loss = 0.8726682662963867 | |
Epoch: 0, Step: 3, Rank: 6, loss = 0.9652585983276367 | |
Epoch: 0, Step: 3, Rank: 7, loss = 0.6465467810630798 | |
Epoch: 0, Step: 3, Rank: 4, loss = 0.5630680322647095 | |
Per-token loss scaled by world size: 0.00032270338851958513 | |
Epoch: 0, Step: 3, Rank: 2, loss = 0.6330492496490479 | |
Epoch: 0, Step: 3, Rank: 0, loss = 0.942155659198761 | |
Epoch: 0, Step: 3, Rank: 1, loss = 0.7608135938644409 | |
[2024-08-14 01:45:54,602] [INFO] [logging.py:96:log_dist] [Rank 0] step=3, skipped=0, lr=[2.4000000000000003e-06], mom=[(0.9, 0.95)] | |
[2024-08-14 01:45:54,679] [INFO] [timer.py:258:stop] epoch=0/micro_step=3/global_step=3, RunningAvgSamplesPerSec=10.69846342292299, CurrSamplesPerSec=10.69846342292299, MemAllocated=24.67GB, MaxMemAllocated=41.3GB | |
Epoch 0: 6%|▌ | 3/51 [00:37<09:54, 12.39s/it] | |
total tokens: 19616 num samples: 16 num padding tokens: 10092 - rank: 5 max len: 1226 min len: 44 avg len: 595.25 num_loss_counted_tokens: 2658 | |
total tokens: 24192 num samples: 16 num padding tokens: 15649 - rank: 3 max len: 1512 min len: 38 avg len: 533.9375 num_loss_counted_tokens: 1510 | |
{ | |
"epoch": 0, | |
"step": 3, | |
"rank": 0, | |
"loss": 0.942155659198761, | |
"overall_throughput": 10.695139924074537, | |
"lr": 2.4000000000000003e-06, | |
"cuda_mem_allocated": 24.670024394989014, | |
"cuda_malloc_retries": 0, | |
"num_loss_counted_tokens": 18861, | |
"batch_size": 128, | |
"total_loss": 0.7965494990348816, | |
"gradnorm": 2.716317653656006, | |
"weight_norm": 393.4549560546875, | |
"timestamp": "2024-08-14T01:45:54.786824" | |
} | |
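Note: overall_throughput appears to be samples per second; with batch_size 128, a throughput of ~10.7 implies roughly 12 s per optimizer step, which lines up with the ~12.4 s/it shown by the progress bar. A rough consistency check under that assumed interpretation of the metric:

# Rough check: seconds per optimizer step implied by the step-3 throughput.
batch_size = 128
overall_throughput = 10.695139924074537    # samples/sec as logged for step 3
seconds_per_step = batch_size / overall_throughput   # ~11.97 s
print(seconds_per_step)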
total tokens: 33488 num samples: 16 num padding tokens: 19481 - rank: 4 max len: 2093 min len: 35 avg len: 875.4375 num_loss_counted_tokens: 1420 | |
total tokens: 36288 num samples: 16 num padding tokens: 24061 - rank: 2 max len: 2268 min len: 64 avg len: 764.1875 num_loss_counted_tokens: 2846 | |
total tokens: 36688 num samples: 16 num padding tokens: 24862 - rank: 0 max len: 2293 min len: 55 avg len: 739.125 num_loss_counted_tokens: 2645 | |
total tokens: 19168 num samples: 16 num padding tokens: 10794 - rank: 6 max len: 1198 min len: 51 avg len: 523.375 num_loss_counted_tokens: 2526 | |
total tokens: 22064 num samples: 16 num padding tokens: 15484 - rank: 7 max len: 1379 min len: 42 avg len: 411.25 num_loss_counted_tokens: 2059 | |
total tokens: 48816 num samples: 16 num padding tokens: 36369 - rank: 1 max len: 3051 min len: 47 avg len: 777.9375 num_loss_counted_tokens: 1535 | |
Per-token loss scaled by world size: 0.0002930409100372344 | |
Per-token loss scaled by world size: 0.0002132939698640257 | |
Per-token loss scaled by world size: 0.0002520309644751251 | |
Per-token loss scaled by world size: 0.00039175417623482645 | |
Per-token loss scaled by world size: 0.0002627032808959484 | |
Per-token loss scaled by world size: 0.0006507682264782488 | |
Per-token loss scaled by world size: 0.00046710125752724707 | |
Epoch: 0, Step: 4, Rank: 1, loss = 0.6686460971832275 | |
Epoch: 0, Step: 4, Rank: 3, loss = 0.48668351769447327 | |
Epoch: 0, Step: 4, Rank: 0, loss = 0.5750716328620911 | |
Epoch: 0, Step: 4, Rank: 2, loss = 0.5994232296943665 | |
Epoch: 0, Step: 4, Rank: 4, loss = 0.89388507604599 | |
Epoch: 0, Step: 4, Rank: 7, loss = 1.484890341758728 | |
Epoch: 0, Step: 4, Rank: 6, loss = 1.0658082962036133 | |
Per-token loss scaled by world size: 0.000269142648903653 | |
Epoch: 0, Step: 4, Rank: 5, loss = 0.6141162514686584 | |
[2024-08-14 01:46:08,952] [INFO] [logging.py:96:log_dist] [Rank 0] step=4, skipped=0, lr=[3.2000000000000003e-06], mom=[(0.9, 0.95)] | |
[2024-08-14 01:46:09,031] [INFO] [timer.py:258:stop] epoch=0/micro_step=4/global_step=4, RunningAvgSamplesPerSec=9.770269785795474, CurrSamplesPerSec=8.990277548873404, MemAllocated=24.92GB, MaxMemAllocated=43.16GB | |
Epoch 0: 8%|▊ | 4/51 [00:51<10:18, 13.16s/it] | |
total tokens: 20768 num samples: 16 num padding tokens: 13571 - rank: 7 max len: 1298 min len: 57 avg len: 449.8125 num_loss_counted_tokens: 1599 | |
total tokens: 31072 num samples: 16 num padding tokens: 21390 - rank: 5 max len: 1942 min len: 99 avg len: 605.125 num_loss_counted_tokens: 3255 | |
total tokens: 48336 num samples: 16 num padding tokens: 36619 - rank: 1 max len: 3021 min len: 42 avg len: 732.3125 num_loss_counted_tokens: 1139 | |
total tokens: 61856 num samples: 16 num padding tokens: 46385 - rank: 2 max len: 3866 min len: 44 avg len: 966.9375 num_loss_counted_tokens: 1497 | |
{ | |
"epoch": 0, | |
"step": 4, | |
"rank": 0, | |
"loss": 0.5750716328620911, | |
"overall_throughput": 8.987825933695113, | |
"lr": 3.2000000000000003e-06, | |
"cuda_mem_allocated": 24.918088912963867, | |
"cuda_malloc_retries": 0, | |
"num_loss_counted_tokens": 18254, | |
"batch_size": 128, | |
"total_loss": 0.7985655665397644, | |
"gradnorm": 2.3222267627716064, | |
"weight_norm": 393.4549865722656, | |
"timestamp": "2024-08-14T01:46:09.176830" | |
} | |
total tokens: 22288 num samples: 16 num padding tokens: 14419 - rank: 6 max len: 1393 min len: 88 avg len: 491.8125 num_loss_counted_tokens: 2238 | |
total tokens: 19632 num samples: 16 num padding tokens: 9136 - rank: 4 max len: 1227 min len: 48 avg len: 656.0 num_loss_counted_tokens: 2828 | |
total tokens: 26016 num samples: 16 num padding tokens: 17983 - rank: 0 max len: 1626 min len: 24 avg len: 502.0625 num_loss_counted_tokens: 2230 | |
total tokens: 26320 num samples: 16 num padding tokens: 12921 - rank: 3 max len: 1645 min len: 49 avg len: 837.4375 num_loss_counted_tokens: 2190 | |
[rank1]: Traceback (most recent call last): | |
[rank1]: File "/opt/python3.11/venv/lib64/python3.11/site-packages/instructlab/training/main_ds.py", line 781, in <module> | |
[rank1]: main(args) | |
[rank1]: File "/opt/python3.11/venv/lib64/python3.11/site-packages/instructlab/training/main_ds.py", line 563, in main | |
[rank1]: train(args, model, tokenizer, train_loader, grad_accum, metric_logger) | |
[rank1]: File "/opt/python3.11/venv/lib64/python3.11/site-packages/instructlab/training/main_ds.py", line 383, in train | |
[rank1]: output = model( | |
[rank1]: ^^^^^^ | |
[rank1]: File "/opt/python3.11/venv/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl | |
[rank1]: return self._call_impl(*args, **kwargs) | |
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank1]: File "/opt/python3.11/venv/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl | |
[rank1]: return forward_call(*args, **kwargs) | |
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank1]: File "/opt/python3.11/venv/lib64/python3.11/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn | |
[rank1]: ret_val = func(*args, **kwargs) | |
[rank1]: ^^^^^^^^^^^^^^^^^^^^^ | |
[rank1]: File "/opt/python3.11/venv/lib64/python3.11/site-packages/deepspeed/runtime/engine.py", line 1846, in forward | |
[rank1]: loss = self.module(*inputs, **kwargs) | |
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank1]: File "/opt/python3.11/venv/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl | |
[rank1]: return self._call_impl(*args, **kwargs) | |
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank1]: File "/opt/python3.11/venv/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl | |
[rank1]: return forward_call(*args, **kwargs) | |
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank1]: File "/opt/python3.11/venv/lib64/python3.11/site-packages/instructlab/training/utils.py", line 274, in reduce_sum_forward | |
[rank1]: loss = loss_fct(shift_logits, shift_labels) | |
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank1]: File "/opt/python3.11/venv/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl | |
[rank1]: return self._call_impl(*args, **kwargs) | |
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank1]: File "/opt/python3.11/venv/lib64/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl | |
[rank1]: return forward_call(*args, **kwargs) | |
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank1]: File "/opt/python3.11/venv/lib64/python3.11/site-packages/torch/nn/modules/loss.py", line 1185, in forward | |
[rank1]: return F.cross_entropy(input, target, weight=self.weight, | |
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank1]: File "/opt/python3.11/venv/lib64/python3.11/site-packages/torch/nn/functional.py", line 3086, in cross_entropy | |
[rank1]: return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing) | |
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank1]: torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 7.28 GiB. GPU has a total capacity of 79.14 GiB of which 5.14 GiB is free. Including non-PyTorch memory, this process has 0 bytes memory in use. Of the allocated memory 66.08 GiB is allocated by PyTorch, and 5.27 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) | |
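Note: the failing allocation happens inside F.cross_entropy on the shifted logits, so its size is roughly tokens × vocab_size × 4 bytes for the fp32 logits buffer. A back-of-the-envelope estimate with an assumed ~32k vocabulary (the vocab size and the exact micro-batch that triggered the failure are not stated anywhere in this log) lands in the same range as the failed 7.28 GiB request; the ~60k-token padded micro-batches logged just above this step are the likely culprits:

# Back-of-the-envelope estimate of the cross-entropy logits buffer.
padded_tokens = 61856          # e.g. the 16 x 3866 micro-batch logged above
vocab_size = 32_000            # assumption, not taken from this log
bytes_fp32 = 4
gib = padded_tokens * vocab_size * bytes_fp32 / 2**30
print(f"~{gib:.2f} GiB")       # ~7.37 GiB, same order as the failed 7.28 GiB alloc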
dev-rhel-ai-training-client-11:593:1371 [1] NCCL INFO [Service thread] Connection closed by localRank 1 | |
dev-rhel-ai-training-client-11:593:5794 [0] NCCL INFO comm 0x562c7d22bea0 rank 1 nranks 8 cudaDev 1 busId 8020 - Abort COMPLETE | |
W0814 01:46:19.402000 139736190685632 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 592 closing signal SIGTERM | |
W0814 01:46:19.405000 139736190685632 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 594 closing signal SIGTERM | |
W0814 01:46:19.428000 139736190685632 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 595 closing signal SIGTERM | |
W0814 01:46:19.445000 139736190685632 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 596 closing signal SIGTERM | |
W0814 01:46:19.447000 139736190685632 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 597 closing signal SIGTERM | |
W0814 01:46:19.451000 139736190685632 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 598 closing signal SIGTERM | |
W0814 01:46:19.453000 139736190685632 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 599 closing signal SIGTERM | |
E0814 01:46:19.488000 139736190685632 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 1 (pid: 593) of binary: /opt/python3.11/venv/bin/python3.11 | |
Traceback (most recent call last): | |
File "/opt/python3.11/venv/bin/torchrun", line 8, in <module> | |
sys.exit(main()) | |
^^^^^^ | |
File "/opt/python3.11/venv/lib64/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper | |
return f(*args, **kwargs) | |
^^^^^^^^^^^^^^^^^^ | |
File "/opt/python3.11/venv/lib64/python3.11/site-packages/torch/distributed/run.py", line 879, in main | |
run(args) | |
File "/opt/python3.11/venv/lib64/python3.11/site-packages/torch/distributed/run.py", line 870, in run | |
elastic_launch( | |
File "/opt/python3.11/venv/lib64/python3.11/site-packages/torch/distributed/launcher/api.py", line 132, in __call__ | |
return launch_agent(self._config, self._entrypoint, list(args)) | |
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
File "/opt/python3.11/venv/lib64/python3.11/site-packages/torch/distributed/launcher/api.py", line 263, in launch_agent | |
raise ChildFailedError( | |
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: | |
============================================================ | |
/opt/python3.11/venv/lib64/python3.11/site-packages/instructlab/training/main_ds.py FAILED | |
------------------------------------------------------------ | |
Failures: | |
<NO_OTHER_FAILURES> | |
------------------------------------------------------------ | |
Root Cause (first observed failure): | |
[0]: | |
time : 2024-08-14_01:46:19 | |
host : dev-rhel-ai-training-client-11 | |
rank : 1 (local_rank: 1) | |
exitcode : 1 (pid: 593) | |
error_file: <N/A> | |
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html | |
============================================================ | |
[root@dev-rhel-ai-training-client-11 ~]# |
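Note: the OOM message itself suggests trying PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to reduce fragmentation, although the 5.27 GiB of reserved-but-unallocated memory is smaller than the 7.28 GiB request, so capping the padded tokens per rank (fewer or shorter samples per micro-batch) is the more reliable fix. A minimal sketch of setting the allocator option from Python before CUDA is first touched; in practice it would normally just be exported in the shell that launches torchrun, and whether it is sufficient for this run is untested:

import os

# Must be set before the first CUDA allocation in the process.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch
assert torch.cuda.is_available()
# ... then launch training as before; lowering the effective batch size or the
# maximum sequence length remains the safer mitigation for allocations this large.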