Created
April 10, 2025 12:59
-
-
Save brianlechthaler/0c42c28e1d6f761619639abe4cdb2dbb to your computer and use it in GitHub Desktop.
Log output for torchtune run for commit fe2ad3cbcbe670c70e84c6572a961cb2916f160b in brianlechthaler/torchtune
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Node IP: 10.0.1.84 | |
| W0410 11:54:35.549000 544254 torch/distributed/run.py:793] | |
| W0410 11:54:35.549000 544254 torch/distributed/run.py:793] ***************************************** | |
| W0410 11:54:35.549000 544254 torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. | |
| W0410 11:54:35.549000 544254 torch/distributed/run.py:793] ***************************************** | |
| I0410 11:54:35.549000 544254 torch/distributed/launcher/api.py:194] Starting elastic_operator with launch configs: | |
| I0410 11:54:35.549000 544254 torch/distributed/launcher/api.py:194] entrypoint : /home/brian/venv/lib/python3.10/site-packages/recipes/full_finetune_distributed.py | |
| I0410 11:54:35.549000 544254 torch/distributed/launcher/api.py:194] min_nodes : 2 | |
| I0410 11:54:35.549000 544254 torch/distributed/launcher/api.py:194] max_nodes : 2 | |
| I0410 11:54:35.549000 544254 torch/distributed/launcher/api.py:194] nproc_per_node : 8 | |
| I0410 11:54:35.549000 544254 torch/distributed/launcher/api.py:194] run_id : 101 | |
| I0410 11:54:35.549000 544254 torch/distributed/launcher/api.py:194] rdzv_backend : c10d | |
| I0410 11:54:35.549000 544254 torch/distributed/launcher/api.py:194] rdzv_endpoint : 10.0.1.84:29500 | |
| I0410 11:54:35.549000 544254 torch/distributed/launcher/api.py:194] rdzv_configs : {'timeout': 900} | |
| I0410 11:54:35.549000 544254 torch/distributed/launcher/api.py:194] max_restarts : 0 | |
| I0410 11:54:35.549000 544254 torch/distributed/launcher/api.py:194] monitor_interval : 0.1 | |
| I0410 11:54:35.549000 544254 torch/distributed/launcher/api.py:194] log_dir : /tmp/torchelastic_baotuic1 | |
| I0410 11:54:35.549000 544254 torch/distributed/launcher/api.py:194] metrics_cfg : {} | |
| I0410 11:54:35.549000 544254 torch/distributed/launcher/api.py:194] | |
| I0410 11:54:35.560000 544254 torch/distributed/elastic/agent/server/api.py:845] [default] starting workers for entrypoint: python | |
| I0410 11:54:35.561000 544254 torch/distributed/elastic/agent/server/api.py:662] [default] Rendezvous'ing worker group | |
| W0410 11:54:36.850000 544220 torch/distributed/run.py:793] | |
| W0410 11:54:36.850000 544220 torch/distributed/run.py:793] ***************************************** | |
| W0410 11:54:36.850000 544220 torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. | |
| W0410 11:54:36.850000 544220 torch/distributed/run.py:793] ***************************************** | |
| I0410 11:54:36.851000 544220 torch/distributed/launcher/api.py:194] Starting elastic_operator with launch configs: | |
| I0410 11:54:36.851000 544220 torch/distributed/launcher/api.py:194] entrypoint : /home/brian/venv/lib/python3.10/site-packages/recipes/full_finetune_distributed.py | |
| I0410 11:54:36.851000 544220 torch/distributed/launcher/api.py:194] min_nodes : 2 | |
| I0410 11:54:36.851000 544220 torch/distributed/launcher/api.py:194] max_nodes : 2 | |
| I0410 11:54:36.851000 544220 torch/distributed/launcher/api.py:194] nproc_per_node : 8 | |
| I0410 11:54:36.851000 544220 torch/distributed/launcher/api.py:194] run_id : 101 | |
| I0410 11:54:36.851000 544220 torch/distributed/launcher/api.py:194] rdzv_backend : c10d | |
| I0410 11:54:36.851000 544220 torch/distributed/launcher/api.py:194] rdzv_endpoint : 10.0.1.84:29500 | |
| I0410 11:54:36.851000 544220 torch/distributed/launcher/api.py:194] rdzv_configs : {'timeout': 900} | |
| I0410 11:54:36.851000 544220 torch/distributed/launcher/api.py:194] max_restarts : 0 | |
| I0410 11:54:36.851000 544220 torch/distributed/launcher/api.py:194] monitor_interval : 0.1 | |
| I0410 11:54:36.851000 544220 torch/distributed/launcher/api.py:194] log_dir : /tmp/torchelastic__gi9rtbw | |
| I0410 11:54:36.851000 544220 torch/distributed/launcher/api.py:194] metrics_cfg : {} | |
| I0410 11:54:36.851000 544220 torch/distributed/launcher/api.py:194] | |
| I0410 11:54:36.860000 544220 torch/distributed/elastic/agent/server/api.py:845] [default] starting workers for entrypoint: python | |
| I0410 11:54:36.861000 544220 torch/distributed/elastic/agent/server/api.py:662] [default] Rendezvous'ing worker group | |
| I0410 11:54:37.903000 544254 torch/distributed/elastic/agent/server/api.py:525] [default] Rendezvous complete for workers. Result: | |
| I0410 11:54:37.903000 544254 torch/distributed/elastic/agent/server/api.py:525] restart_count=0 | |
| I0410 11:54:37.903000 544254 torch/distributed/elastic/agent/server/api.py:525] master_addr=10.0.1.84 | |
| I0410 11:54:37.903000 544254 torch/distributed/elastic/agent/server/api.py:525] master_port=29500 | |
| I0410 11:54:37.903000 544254 torch/distributed/elastic/agent/server/api.py:525] group_rank=0 | |
| I0410 11:54:37.903000 544254 torch/distributed/elastic/agent/server/api.py:525] group_world_size=2 | |
| I0410 11:54:37.903000 544254 torch/distributed/elastic/agent/server/api.py:525] local_ranks=[0, 1, 2, 3, 4, 5, 6, 7] | |
| I0410 11:54:37.903000 544254 torch/distributed/elastic/agent/server/api.py:525] role_ranks=[0, 1, 2, 3, 4, 5, 6, 7] | |
| I0410 11:54:37.903000 544254 torch/distributed/elastic/agent/server/api.py:525] global_ranks=[0, 1, 2, 3, 4, 5, 6, 7] | |
| I0410 11:54:37.903000 544254 torch/distributed/elastic/agent/server/api.py:525] role_world_sizes=[16, 16, 16, 16, 16, 16, 16, 16] | |
| I0410 11:54:37.903000 544254 torch/distributed/elastic/agent/server/api.py:525] global_world_sizes=[16, 16, 16, 16, 16, 16, 16, 16] | |
| I0410 11:54:37.903000 544254 torch/distributed/elastic/agent/server/api.py:525] | |
| I0410 11:54:37.903000 544254 torch/distributed/elastic/agent/server/api.py:670] [default] Starting worker group | |
| I0410 11:54:37.903000 544254 torch/distributed/elastic/agent/server/local_elastic_agent.py:291] use_agent_store: True | |
| I0410 11:54:37.904000 544254 torch/distributed/elastic/agent/server/local_elastic_agent.py:192] Environment variable 'TORCHELASTIC_ENABLE_FILE_TIMER' not found. Do not start FileTimerServer. | |
| I0410 11:54:37.904000 544254 torch/distributed/elastic/agent/server/local_elastic_agent.py:229] Environment variable 'TORCHELASTIC_HEALTH_CHECK_PORT' not found. Do not start health check. | |
| I0410 11:54:37.904000 544220 torch/distributed/elastic/agent/server/api.py:525] [default] Rendezvous complete for workers. Result: | |
| I0410 11:54:37.904000 544220 torch/distributed/elastic/agent/server/api.py:525] restart_count=0 | |
| I0410 11:54:37.904000 544220 torch/distributed/elastic/agent/server/api.py:525] master_addr=10.0.1.84 | |
| I0410 11:54:37.904000 544220 torch/distributed/elastic/agent/server/api.py:525] master_port=29500 | |
| I0410 11:54:37.904000 544220 torch/distributed/elastic/agent/server/api.py:525] group_rank=1 | |
| I0410 11:54:37.904000 544220 torch/distributed/elastic/agent/server/api.py:525] group_world_size=2 | |
| I0410 11:54:37.904000 544220 torch/distributed/elastic/agent/server/api.py:525] local_ranks=[0, 1, 2, 3, 4, 5, 6, 7] | |
| I0410 11:54:37.904000 544220 torch/distributed/elastic/agent/server/api.py:525] role_ranks=[8, 9, 10, 11, 12, 13, 14, 15] | |
| I0410 11:54:37.904000 544220 torch/distributed/elastic/agent/server/api.py:525] global_ranks=[8, 9, 10, 11, 12, 13, 14, 15] | |
| I0410 11:54:37.904000 544220 torch/distributed/elastic/agent/server/api.py:525] role_world_sizes=[16, 16, 16, 16, 16, 16, 16, 16] | |
| I0410 11:54:37.904000 544220 torch/distributed/elastic/agent/server/api.py:525] global_world_sizes=[16, 16, 16, 16, 16, 16, 16, 16] | |
| I0410 11:54:37.904000 544220 torch/distributed/elastic/agent/server/api.py:525] | |
| I0410 11:54:37.905000 544220 torch/distributed/elastic/agent/server/api.py:670] [default] Starting worker group | |
| I0410 11:54:37.905000 544220 torch/distributed/elastic/agent/server/local_elastic_agent.py:291] use_agent_store: True | |
| I0410 11:54:37.905000 544220 torch/distributed/elastic/agent/server/local_elastic_agent.py:192] Environment variable 'TORCHELASTIC_ENABLE_FILE_TIMER' not found. Do not start FileTimerServer. | |
| I0410 11:54:37.905000 544220 torch/distributed/elastic/agent/server/local_elastic_agent.py:229] Environment variable 'TORCHELASTIC_HEALTH_CHECK_PORT' not found. Do not start health check. | |
| INFO:torchtune.utils._logging:Running FullFinetuneRecipeDistributed with resolved config: | |
| batch_size: 4 | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpointer: | |
| _component_: torchtune.training.FullModelHFCheckpointer | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpoint_files: | |
| filename_format: model-{}-of-{}.safetensors | |
| max_filename: '00030' | |
| model_type: LLAMA3 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| recipe_checkpoint: null | |
| clip_grad_norm: null | |
| compile: true | |
| custom_sharded_layers: | |
| - tok_embeddings | |
| - output | |
| data_parallel_replicate_dim: 1 | |
| data_parallel_shard_dim: -1 | |
| dataset: | |
| _component_: torchtune.datasets.alpaca_dataset | |
| packed: true | |
| device: cuda | |
| dtype: bf16 | |
| enable_activation_checkpointing: true | |
| enable_activation_offloading: false | |
| epochs: 1 | |
| fsdp_cpu_offload: false | |
| gradient_accumulation_steps: 1 | |
| log_every_n_steps: 1 | |
| log_peak_memory_stats: true | |
| loss: | |
| _component_: torchtune.modules.loss.CEWithChunkedOutputLoss | |
| max_steps_per_epoch: null | |
| metric_logger: | |
| _component_: torchtune.training.metric_logging.DiskLogger | |
| log_dir: /home/brian/model/Llama3.3-70B-fft-output/logs | |
| model: | |
| _component_: torchtune.models.llama3_3.llama3_3_70b | |
| optimizer: | |
| _component_: torch.optim.AdamW | |
| fused: false | |
| lr: 2.0e-05 | |
| optimizer_in_bwd: false | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| profiler: | |
| _component_: torchtune.training.setup_torch_profiler | |
| active_steps: 2 | |
| cpu: true | |
| cuda: true | |
| enabled: false | |
| num_cycles: 1 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output/profiling_outputs | |
| profile_memory: false | |
| record_shapes: true | |
| wait_steps: 5 | |
| warmup_steps: 3 | |
| with_flops: false | |
| with_stack: false | |
| resume_from_checkpoint: false | |
| seed: null | |
| shuffle: true | |
| tensor_parallel_dim: 8 | |
| tensor_parallel_plan: | |
| _component_: torchtune.models.llama3.base_llama_tp_plan | |
| tokenizer: | |
| _component_: torchtune.models.llama3.llama3_tokenizer | |
| max_seq_len: 1024 | |
| path: /home/brian/model/Llama-3.3-70B-Instruct/original/tokenizer.model | |
| INFO:torchtune.utils._logging:Running FullFinetuneRecipeDistributed with resolved config: | |
| batch_size: 4 | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpointer: | |
| _component_: torchtune.training.FullModelHFCheckpointer | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpoint_files: | |
| filename_format: model-{}-of-{}.safetensors | |
| max_filename: '00030' | |
| model_type: LLAMA3 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| recipe_checkpoint: null | |
| clip_grad_norm: null | |
| compile: true | |
| custom_sharded_layers: | |
| - tok_embeddings | |
| - output | |
| data_parallel_replicate_dim: 1 | |
| data_parallel_shard_dim: -1 | |
| dataset: | |
| _component_: torchtune.datasets.alpaca_dataset | |
| packed: true | |
| device: cuda | |
| dtype: bf16 | |
| enable_activation_checkpointing: true | |
| enable_activation_offloading: false | |
| epochs: 1 | |
| fsdp_cpu_offload: false | |
| gradient_accumulation_steps: 1 | |
| log_every_n_steps: 1 | |
| log_peak_memory_stats: true | |
| loss: | |
| _component_: torchtune.modules.loss.CEWithChunkedOutputLoss | |
| max_steps_per_epoch: null | |
| metric_logger: | |
| _component_: torchtune.training.metric_logging.DiskLogger | |
| log_dir: /home/brian/model/Llama3.3-70B-fft-output/logs | |
| model: | |
| _component_: torchtune.models.llama3_3.llama3_3_70b | |
| optimizer: | |
| _component_: torch.optim.AdamW | |
| fused: false | |
| lr: 2.0e-05 | |
| optimizer_in_bwd: false | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| profiler: | |
| _component_: torchtune.training.setup_torch_profiler | |
| active_steps: 2 | |
| cpu: true | |
| cuda: true | |
| enabled: false | |
| num_cycles: 1 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output/profiling_outputs | |
| profile_memory: false | |
| record_shapes: true | |
| wait_steps: 5 | |
| warmup_steps: 3 | |
| with_flops: false | |
| with_stack: false | |
| resume_from_checkpoint: false | |
| seed: null | |
| shuffle: true | |
| tensor_parallel_dim: 8 | |
| tensor_parallel_plan: | |
| _component_: torchtune.models.llama3.base_llama_tp_plan | |
| tokenizer: | |
| _component_: torchtune.models.llama3.llama3_tokenizer | |
| max_seq_len: 1024 | |
| path: /home/brian/model/Llama-3.3-70B-Instruct/original/tokenizer.model | |
| INFO:torchtune.utils._logging:Running FullFinetuneRecipeDistributed with resolved config: | |
| batch_size: 4 | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpointer: | |
| _component_: torchtune.training.FullModelHFCheckpointer | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpoint_files: | |
| filename_format: model-{}-of-{}.safetensors | |
| max_filename: '00030' | |
| model_type: LLAMA3 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| recipe_checkpoint: null | |
| clip_grad_norm: null | |
| compile: true | |
| custom_sharded_layers: | |
| - tok_embeddings | |
| - output | |
| data_parallel_replicate_dim: 1 | |
| data_parallel_shard_dim: -1 | |
| dataset: | |
| _component_: torchtune.datasets.alpaca_dataset | |
| packed: true | |
| device: cuda | |
| dtype: bf16 | |
| enable_activation_checkpointing: true | |
| enable_activation_offloading: false | |
| epochs: 1 | |
| fsdp_cpu_offload: false | |
| gradient_accumulation_steps: 1 | |
| log_every_n_steps: 1 | |
| log_peak_memory_stats: true | |
| loss: | |
| _component_: torchtune.modules.loss.CEWithChunkedOutputLoss | |
| max_steps_per_epoch: null | |
| metric_logger: | |
| _component_: torchtune.training.metric_logging.DiskLogger | |
| log_dir: /home/brian/model/Llama3.3-70B-fft-output/logs | |
| model: | |
| _component_: torchtune.models.llama3_3.llama3_3_70b | |
| optimizer: | |
| _component_: torch.optim.AdamW | |
| fused: false | |
| lr: 2.0e-05 | |
| optimizer_in_bwd: false | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| profiler: | |
| _component_: torchtune.training.setup_torch_profiler | |
| active_steps: 2 | |
| cpu: true | |
| cuda: true | |
| enabled: false | |
| num_cycles: 1 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output/profiling_outputs | |
| profile_memory: false | |
| record_shapes: true | |
| wait_steps: 5 | |
| warmup_steps: 3 | |
| with_flops: false | |
| with_stack: false | |
| resume_from_checkpoint: false | |
| seed: null | |
| shuffle: true | |
| tensor_parallel_dim: 8 | |
| tensor_parallel_plan: | |
| _component_: torchtune.models.llama3.base_llama_tp_plan | |
| tokenizer: | |
| _component_: torchtune.models.llama3.llama3_tokenizer | |
| max_seq_len: 1024 | |
| path: /home/brian/model/Llama-3.3-70B-Instruct/original/tokenizer.model | |
| INFO:torchtune.utils._logging:Running FullFinetuneRecipeDistributed with resolved config: | |
| batch_size: 4 | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpointer: | |
| _component_: torchtune.training.FullModelHFCheckpointer | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpoint_files: | |
| filename_format: model-{}-of-{}.safetensors | |
| max_filename: '00030' | |
| model_type: LLAMA3 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| recipe_checkpoint: null | |
| clip_grad_norm: null | |
| compile: true | |
| custom_sharded_layers: | |
| - tok_embeddings | |
| - output | |
| data_parallel_replicate_dim: 1 | |
| data_parallel_shard_dim: -1 | |
| dataset: | |
| _component_: torchtune.datasets.alpaca_dataset | |
| packed: true | |
| device: cuda | |
| dtype: bf16 | |
| enable_activation_checkpointing: true | |
| enable_activation_offloading: false | |
| epochs: 1 | |
| fsdp_cpu_offload: false | |
| gradient_accumulation_steps: 1 | |
| log_every_n_steps: 1 | |
| log_peak_memory_stats: true | |
| loss: | |
| _component_: torchtune.modules.loss.CEWithChunkedOutputLoss | |
| max_steps_per_epoch: null | |
| metric_logger: | |
| _component_: torchtune.training.metric_logging.DiskLogger | |
| log_dir: /home/brian/model/Llama3.3-70B-fft-output/logs | |
| model: | |
| _component_: torchtune.models.llama3_3.llama3_3_70b | |
| optimizer: | |
| _component_: torch.optim.AdamW | |
| fused: false | |
| lr: 2.0e-05 | |
| optimizer_in_bwd: false | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| profiler: | |
| _component_: torchtune.training.setup_torch_profiler | |
| active_steps: 2 | |
| cpu: true | |
| cuda: true | |
| enabled: false | |
| num_cycles: 1 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output/profiling_outputs | |
| profile_memory: false | |
| record_shapes: true | |
| wait_steps: 5 | |
| warmup_steps: 3 | |
| with_flops: false | |
| with_stack: false | |
| resume_from_checkpoint: false | |
| seed: null | |
| shuffle: true | |
| tensor_parallel_dim: 8 | |
| tensor_parallel_plan: | |
| _component_: torchtune.models.llama3.base_llama_tp_plan | |
| tokenizer: | |
| _component_: torchtune.models.llama3.llama3_tokenizer | |
| max_seq_len: 1024 | |
| path: /home/brian/model/Llama-3.3-70B-Instruct/original/tokenizer.model | |
| INFO:torchtune.utils._logging:Running FullFinetuneRecipeDistributed with resolved config: | |
| batch_size: 4 | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpointer: | |
| _component_: torchtune.training.FullModelHFCheckpointer | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpoint_files: | |
| filename_format: model-{}-of-{}.safetensors | |
| max_filename: '00030' | |
| model_type: LLAMA3 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| recipe_checkpoint: null | |
| clip_grad_norm: null | |
| compile: true | |
| custom_sharded_layers: | |
| - tok_embeddings | |
| - output | |
| data_parallel_replicate_dim: 1 | |
| data_parallel_shard_dim: -1 | |
| dataset: | |
| _component_: torchtune.datasets.alpaca_dataset | |
| packed: true | |
| device: cuda | |
| dtype: bf16 | |
| enable_activation_checkpointing: true | |
| enable_activation_offloading: false | |
| epochs: 1 | |
| fsdp_cpu_offload: false | |
| gradient_accumulation_steps: 1 | |
| log_every_n_steps: 1 | |
| log_peak_memory_stats: true | |
| loss: | |
| _component_: torchtune.modules.loss.CEWithChunkedOutputLoss | |
| max_steps_per_epoch: null | |
| metric_logger: | |
| _component_: torchtune.training.metric_logging.DiskLogger | |
| log_dir: /home/brian/model/Llama3.3-70B-fft-output/logs | |
| model: | |
| _component_: torchtune.models.llama3_3.llama3_3_70b | |
| optimizer: | |
| _component_: torch.optim.AdamW | |
| fused: false | |
| lr: 2.0e-05 | |
| optimizer_in_bwd: false | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| profiler: | |
| _component_: torchtune.training.setup_torch_profiler | |
| active_steps: 2 | |
| cpu: true | |
| cuda: true | |
| enabled: false | |
| num_cycles: 1 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output/profiling_outputs | |
| profile_memory: false | |
| record_shapes: true | |
| wait_steps: 5 | |
| warmup_steps: 3 | |
| with_flops: false | |
| with_stack: false | |
| resume_from_checkpoint: false | |
| seed: null | |
| shuffle: true | |
| tensor_parallel_dim: 8 | |
| tensor_parallel_plan: | |
| _component_: torchtune.models.llama3.base_llama_tp_plan | |
| tokenizer: | |
| _component_: torchtune.models.llama3.llama3_tokenizer | |
| max_seq_len: 1024 | |
| path: /home/brian/model/Llama-3.3-70B-Instruct/original/tokenizer.model | |
| INFO:torchtune.utils._logging:Running FullFinetuneRecipeDistributed with resolved config: | |
| batch_size: 4 | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpointer: | |
| _component_: torchtune.training.FullModelHFCheckpointer | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpoint_files: | |
| filename_format: model-{}-of-{}.safetensors | |
| max_filename: '00030' | |
| model_type: LLAMA3 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| recipe_checkpoint: null | |
| clip_grad_norm: null | |
| compile: true | |
| custom_sharded_layers: | |
| - tok_embeddings | |
| - output | |
| data_parallel_replicate_dim: 1 | |
| data_parallel_shard_dim: -1 | |
| dataset: | |
| _component_: torchtune.datasets.alpaca_dataset | |
| packed: true | |
| device: cuda | |
| dtype: bf16 | |
| enable_activation_checkpointing: true | |
| enable_activation_offloading: false | |
| epochs: 1 | |
| fsdp_cpu_offload: false | |
| gradient_accumulation_steps: 1 | |
| log_every_n_steps: 1 | |
| log_peak_memory_stats: true | |
| loss: | |
| _component_: torchtune.modules.loss.CEWithChunkedOutputLoss | |
| max_steps_per_epoch: null | |
| metric_logger: | |
| _component_: torchtune.training.metric_logging.DiskLogger | |
| log_dir: /home/brian/model/Llama3.3-70B-fft-output/logs | |
| model: | |
| _component_: torchtune.models.llama3_3.llama3_3_70b | |
| optimizer: | |
| _component_: torch.optim.AdamW | |
| fused: false | |
| lr: 2.0e-05 | |
| optimizer_in_bwd: false | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| profiler: | |
| _component_: torchtune.training.setup_torch_profiler | |
| active_steps: 2 | |
| cpu: true | |
| cuda: true | |
| enabled: false | |
| num_cycles: 1 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output/profiling_outputs | |
| profile_memory: false | |
| record_shapes: true | |
| wait_steps: 5 | |
| warmup_steps: 3 | |
| with_flops: false | |
| with_stack: false | |
| resume_from_checkpoint: false | |
| seed: null | |
| shuffle: true | |
| tensor_parallel_dim: 8 | |
| tensor_parallel_plan: | |
| _component_: torchtune.models.llama3.base_llama_tp_plan | |
| tokenizer: | |
| _component_: torchtune.models.llama3.llama3_tokenizer | |
| max_seq_len: 1024 | |
| path: /home/brian/model/Llama-3.3-70B-Instruct/original/tokenizer.model | |
| INFO:torchtune.utils._logging:Running FullFinetuneRecipeDistributed with resolved config: | |
| batch_size: 4 | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpointer: | |
| _component_: torchtune.training.FullModelHFCheckpointer | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpoint_files: | |
| filename_format: model-{}-of-{}.safetensors | |
| max_filename: '00030' | |
| model_type: LLAMA3 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| recipe_checkpoint: null | |
| clip_grad_norm: null | |
| compile: true | |
| custom_sharded_layers: | |
| - tok_embeddings | |
| - output | |
| data_parallel_replicate_dim: 1 | |
| data_parallel_shard_dim: -1 | |
| dataset: | |
| _component_: torchtune.datasets.alpaca_dataset | |
| packed: true | |
| device: cuda | |
| dtype: bf16 | |
| enable_activation_checkpointing: true | |
| enable_activation_offloading: false | |
| epochs: 1 | |
| fsdp_cpu_offload: false | |
| gradient_accumulation_steps: 1 | |
| log_every_n_steps: 1 | |
| log_peak_memory_stats: true | |
| loss: | |
| _component_: torchtune.modules.loss.CEWithChunkedOutputLoss | |
| max_steps_per_epoch: null | |
| metric_logger: | |
| _component_: torchtune.training.metric_logging.DiskLogger | |
| log_dir: /home/brian/model/Llama3.3-70B-fft-output/logs | |
| model: | |
| _component_: torchtune.models.llama3_3.llama3_3_70b | |
| optimizer: | |
| _component_: torch.optim.AdamW | |
| fused: false | |
| lr: 2.0e-05 | |
| optimizer_in_bwd: false | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| profiler: | |
| _component_: torchtune.training.setup_torch_profiler | |
| active_steps: 2 | |
| cpu: true | |
| cuda: true | |
| enabled: false | |
| num_cycles: 1 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output/profiling_outputs | |
| profile_memory: false | |
| record_shapes: true | |
| wait_steps: 5 | |
| warmup_steps: 3 | |
| with_flops: false | |
| with_stack: false | |
| resume_from_checkpoint: false | |
| seed: null | |
| shuffle: true | |
| tensor_parallel_dim: 8 | |
| tensor_parallel_plan: | |
| _component_: torchtune.models.llama3.base_llama_tp_plan | |
| tokenizer: | |
| _component_: torchtune.models.llama3.llama3_tokenizer | |
| max_seq_len: 1024 | |
| path: /home/brian/model/Llama-3.3-70B-Instruct/original/tokenizer.model | |
| INFO:torchtune.utils._logging:Running FullFinetuneRecipeDistributed with resolved config: | |
| batch_size: 4 | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpointer: | |
| _component_: torchtune.training.FullModelHFCheckpointer | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpoint_files: | |
| filename_format: model-{}-of-{}.safetensors | |
| max_filename: '00030' | |
| model_type: LLAMA3 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| recipe_checkpoint: null | |
| clip_grad_norm: null | |
| compile: true | |
| custom_sharded_layers: | |
| - tok_embeddings | |
| - output | |
| data_parallel_replicate_dim: 1 | |
| data_parallel_shard_dim: -1 | |
| dataset: | |
| _component_: torchtune.datasets.alpaca_dataset | |
| packed: true | |
| device: cuda | |
| dtype: bf16 | |
| enable_activation_checkpointing: true | |
| enable_activation_offloading: false | |
| epochs: 1 | |
| fsdp_cpu_offload: false | |
| gradient_accumulation_steps: 1 | |
| log_every_n_steps: 1 | |
| log_peak_memory_stats: true | |
| loss: | |
| _component_: torchtune.modules.loss.CEWithChunkedOutputLoss | |
| max_steps_per_epoch: null | |
| metric_logger: | |
| _component_: torchtune.training.metric_logging.DiskLogger | |
| log_dir: /home/brian/model/Llama3.3-70B-fft-output/logs | |
| model: | |
| _component_: torchtune.models.llama3_3.llama3_3_70b | |
| optimizer: | |
| _component_: torch.optim.AdamW | |
| fused: false | |
| lr: 2.0e-05 | |
| optimizer_in_bwd: false | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| profiler: | |
| _component_: torchtune.training.setup_torch_profiler | |
| active_steps: 2 | |
| cpu: true | |
| cuda: true | |
| enabled: false | |
| num_cycles: 1 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output/profiling_outputs | |
| profile_memory: false | |
| record_shapes: true | |
| wait_steps: 5 | |
| warmup_steps: 3 | |
| with_flops: false | |
| with_stack: false | |
| resume_from_checkpoint: false | |
| seed: null | |
| shuffle: true | |
| tensor_parallel_dim: 8 | |
| tensor_parallel_plan: | |
| _component_: torchtune.models.llama3.base_llama_tp_plan | |
| tokenizer: | |
| _component_: torchtune.models.llama3.llama3_tokenizer | |
| max_seq_len: 1024 | |
| path: /home/brian/model/Llama-3.3-70B-Instruct/original/tokenizer.model | |
| INFO:torchtune.utils._logging:Running FullFinetuneRecipeDistributed with resolved config: | |
| batch_size: 4 | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpointer: | |
| _component_: torchtune.training.FullModelHFCheckpointer | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpoint_files: | |
| filename_format: model-{}-of-{}.safetensors | |
| max_filename: '00030' | |
| model_type: LLAMA3 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| recipe_checkpoint: null | |
| clip_grad_norm: null | |
| compile: true | |
| custom_sharded_layers: | |
| - tok_embeddings | |
| - output | |
| data_parallel_replicate_dim: 1 | |
| data_parallel_shard_dim: -1 | |
| dataset: | |
| _component_: torchtune.datasets.alpaca_dataset | |
| packed: true | |
| device: cuda | |
| dtype: bf16 | |
| enable_activation_checkpointing: true | |
| enable_activation_offloading: false | |
| epochs: 1 | |
| fsdp_cpu_offload: false | |
| gradient_accumulation_steps: 1 | |
| log_every_n_steps: 1 | |
| log_peak_memory_stats: true | |
| loss: | |
| _component_: torchtune.modules.loss.CEWithChunkedOutputLoss | |
| max_steps_per_epoch: null | |
| metric_logger: | |
| _component_: torchtune.training.metric_logging.DiskLogger | |
| log_dir: /home/brian/model/Llama3.3-70B-fft-output/logs | |
| model: | |
| _component_: torchtune.models.llama3_3.llama3_3_70b | |
| optimizer: | |
| _component_: torch.optim.AdamW | |
| fused: false | |
| lr: 2.0e-05 | |
| optimizer_in_bwd: false | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| profiler: | |
| _component_: torchtune.training.setup_torch_profiler | |
| active_steps: 2 | |
| cpu: true | |
| cuda: true | |
| enabled: false | |
| num_cycles: 1 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output/profiling_outputs | |
| profile_memory: false | |
| record_shapes: true | |
| wait_steps: 5 | |
| warmup_steps: 3 | |
| with_flops: false | |
| with_stack: false | |
| resume_from_checkpoint: false | |
| seed: null | |
| shuffle: true | |
| tensor_parallel_dim: 8 | |
| tensor_parallel_plan: | |
| _component_: torchtune.models.llama3.base_llama_tp_plan | |
| tokenizer: | |
| _component_: torchtune.models.llama3.llama3_tokenizer | |
| max_seq_len: 1024 | |
| path: /home/brian/model/Llama-3.3-70B-Instruct/original/tokenizer.model | |
| INFO:torchtune.utils._logging:Running FullFinetuneRecipeDistributed with resolved config: | |
| batch_size: 4 | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpointer: | |
| _component_: torchtune.training.FullModelHFCheckpointer | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpoint_files: | |
| filename_format: model-{}-of-{}.safetensors | |
| max_filename: '00030' | |
| model_type: LLAMA3 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| recipe_checkpoint: null | |
| clip_grad_norm: null | |
| compile: true | |
| custom_sharded_layers: | |
| - tok_embeddings | |
| - output | |
| data_parallel_replicate_dim: 1 | |
| data_parallel_shard_dim: -1 | |
| dataset: | |
| _component_: torchtune.datasets.alpaca_dataset | |
| packed: true | |
| device: cuda | |
| dtype: bf16 | |
| enable_activation_checkpointing: true | |
| enable_activation_offloading: false | |
| epochs: 1 | |
| fsdp_cpu_offload: false | |
| gradient_accumulation_steps: 1 | |
| log_every_n_steps: 1 | |
| log_peak_memory_stats: true | |
| loss: | |
| _component_: torchtune.modules.loss.CEWithChunkedOutputLoss | |
| max_steps_per_epoch: null | |
| metric_logger: | |
| _component_: torchtune.training.metric_logging.DiskLogger | |
| log_dir: /home/brian/model/Llama3.3-70B-fft-output/logs | |
| model: | |
| _component_: torchtune.models.llama3_3.llama3_3_70b | |
| optimizer: | |
| _component_: torch.optim.AdamW | |
| fused: false | |
| lr: 2.0e-05 | |
| optimizer_in_bwd: false | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| profiler: | |
| _component_: torchtune.training.setup_torch_profiler | |
| active_steps: 2 | |
| cpu: true | |
| cuda: true | |
| enabled: false | |
| num_cycles: 1 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output/profiling_outputs | |
| profile_memory: false | |
| record_shapes: true | |
| wait_steps: 5 | |
| warmup_steps: 3 | |
| with_flops: false | |
| with_stack: false | |
| resume_from_checkpoint: false | |
| seed: null | |
| shuffle: true | |
| tensor_parallel_dim: 8 | |
| tensor_parallel_plan: | |
| _component_: torchtune.models.llama3.base_llama_tp_plan | |
| tokenizer: | |
| _component_: torchtune.models.llama3.llama3_tokenizer | |
| max_seq_len: 1024 | |
| path: /home/brian/model/Llama-3.3-70B-Instruct/original/tokenizer.model | |
| INFO:torchtune.utils._logging:Running FullFinetuneRecipeDistributed with resolved config: | |
| batch_size: 4 | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpointer: | |
| _component_: torchtune.training.FullModelHFCheckpointer | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpoint_files: | |
| filename_format: model-{}-of-{}.safetensors | |
| max_filename: '00030' | |
| model_type: LLAMA3 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| recipe_checkpoint: null | |
| clip_grad_norm: null | |
| compile: true | |
| custom_sharded_layers: | |
| - tok_embeddings | |
| - output | |
| data_parallel_replicate_dim: 1 | |
| data_parallel_shard_dim: -1 | |
| dataset: | |
| _component_: torchtune.datasets.alpaca_dataset | |
| packed: true | |
| device: cuda | |
| dtype: bf16 | |
| enable_activation_checkpointing: true | |
| enable_activation_offloading: false | |
| epochs: 1 | |
| fsdp_cpu_offload: false | |
| gradient_accumulation_steps: 1 | |
| log_every_n_steps: 1 | |
| log_peak_memory_stats: true | |
| loss: | |
| _component_: torchtune.modules.loss.CEWithChunkedOutputLoss | |
| max_steps_per_epoch: null | |
| metric_logger: | |
| _component_: torchtune.training.metric_logging.DiskLogger | |
| log_dir: /home/brian/model/Llama3.3-70B-fft-output/logs | |
| model: | |
| _component_: torchtune.models.llama3_3.llama3_3_70b | |
| optimizer: | |
| _component_: torch.optim.AdamW | |
| fused: false | |
| lr: 2.0e-05 | |
| optimizer_in_bwd: false | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| profiler: | |
| _component_: torchtune.training.setup_torch_profiler | |
| active_steps: 2 | |
| cpu: true | |
| cuda: true | |
| enabled: false | |
| num_cycles: 1 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output/profiling_outputs | |
| profile_memory: false | |
| record_shapes: true | |
| wait_steps: 5 | |
| warmup_steps: 3 | |
| with_flops: false | |
| with_stack: false | |
| resume_from_checkpoint: false | |
| seed: null | |
| shuffle: true | |
| tensor_parallel_dim: 8 | |
| tensor_parallel_plan: | |
| _component_: torchtune.models.llama3.base_llama_tp_plan | |
| tokenizer: | |
| _component_: torchtune.models.llama3.llama3_tokenizer | |
| max_seq_len: 1024 | |
| path: /home/brian/model/Llama-3.3-70B-Instruct/original/tokenizer.model | |
| INFO:torchtune.utils._logging:Running FullFinetuneRecipeDistributed with resolved config: | |
| batch_size: 4 | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpointer: | |
| _component_: torchtune.training.FullModelHFCheckpointer | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpoint_files: | |
| filename_format: model-{}-of-{}.safetensors | |
| max_filename: '00030' | |
| model_type: LLAMA3 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| recipe_checkpoint: null | |
| clip_grad_norm: null | |
| compile: true | |
| custom_sharded_layers: | |
| - tok_embeddings | |
| - output | |
| data_parallel_replicate_dim: 1 | |
| data_parallel_shard_dim: -1 | |
| dataset: | |
| _component_: torchtune.datasets.alpaca_dataset | |
| packed: true | |
| device: cuda | |
| dtype: bf16 | |
| enable_activation_checkpointing: true | |
| enable_activation_offloading: false | |
| epochs: 1 | |
| fsdp_cpu_offload: false | |
| gradient_accumulation_steps: 1 | |
| log_every_n_steps: 1 | |
| log_peak_memory_stats: true | |
| loss: | |
| _component_: torchtune.modules.loss.CEWithChunkedOutputLoss | |
| max_steps_per_epoch: null | |
| metric_logger: | |
| _component_: torchtune.training.metric_logging.DiskLogger | |
| log_dir: /home/brian/model/Llama3.3-70B-fft-output/logs | |
| model: | |
| _component_: torchtune.models.llama3_3.llama3_3_70b | |
| optimizer: | |
| _component_: torch.optim.AdamW | |
| fused: false | |
| lr: 2.0e-05 | |
| optimizer_in_bwd: false | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| profiler: | |
| _component_: torchtune.training.setup_torch_profiler | |
| active_steps: 2 | |
| cpu: true | |
| cuda: true | |
| enabled: false | |
| num_cycles: 1 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output/profiling_outputs | |
| profile_memory: false | |
| record_shapes: true | |
| wait_steps: 5 | |
| warmup_steps: 3 | |
| with_flops: false | |
| with_stack: false | |
| resume_from_checkpoint: false | |
| seed: null | |
| shuffle: true | |
| tensor_parallel_dim: 8 | |
| tensor_parallel_plan: | |
| _component_: torchtune.models.llama3.base_llama_tp_plan | |
| tokenizer: | |
| _component_: torchtune.models.llama3.llama3_tokenizer | |
| max_seq_len: 1024 | |
| path: /home/brian/model/Llama-3.3-70B-Instruct/original/tokenizer.model | |
| INFO:torchtune.utils._logging:Running FullFinetuneRecipeDistributed with resolved config: | |
| batch_size: 4 | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpointer: | |
| _component_: torchtune.training.FullModelHFCheckpointer | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpoint_files: | |
| filename_format: model-{}-of-{}.safetensors | |
| max_filename: '00030' | |
| model_type: LLAMA3 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| recipe_checkpoint: null | |
| clip_grad_norm: null | |
| compile: true | |
| custom_sharded_layers: | |
| - tok_embeddings | |
| - output | |
| data_parallel_replicate_dim: 1 | |
| data_parallel_shard_dim: -1 | |
| dataset: | |
| _component_: torchtune.datasets.alpaca_dataset | |
| packed: true | |
| device: cuda | |
| dtype: bf16 | |
| enable_activation_checkpointing: true | |
| enable_activation_offloading: false | |
| epochs: 1 | |
| fsdp_cpu_offload: false | |
| gradient_accumulation_steps: 1 | |
| log_every_n_steps: 1 | |
| log_peak_memory_stats: true | |
| loss: | |
| _component_: torchtune.modules.loss.CEWithChunkedOutputLoss | |
| max_steps_per_epoch: null | |
| metric_logger: | |
| _component_: torchtune.training.metric_logging.DiskLogger | |
| log_dir: /home/brian/model/Llama3.3-70B-fft-output/logs | |
| model: | |
| _component_: torchtune.models.llama3_3.llama3_3_70b | |
| optimizer: | |
| _component_: torch.optim.AdamW | |
| fused: false | |
| lr: 2.0e-05 | |
| optimizer_in_bwd: false | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| profiler: | |
| _component_: torchtune.training.setup_torch_profiler | |
| active_steps: 2 | |
| cpu: true | |
| cuda: true | |
| enabled: false | |
| num_cycles: 1 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output/profiling_outputs | |
| profile_memory: false | |
| record_shapes: true | |
| wait_steps: 5 | |
| warmup_steps: 3 | |
| with_flops: false | |
| with_stack: false | |
| resume_from_checkpoint: false | |
| seed: null | |
| shuffle: true | |
| tensor_parallel_dim: 8 | |
| tensor_parallel_plan: | |
| _component_: torchtune.models.llama3.base_llama_tp_plan | |
| tokenizer: | |
| _component_: torchtune.models.llama3.llama3_tokenizer | |
| max_seq_len: 1024 | |
| path: /home/brian/model/Llama-3.3-70B-Instruct/original/tokenizer.model | |
| INFO:torchtune.utils._logging:Running FullFinetuneRecipeDistributed with resolved config: | |
| batch_size: 4 | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpointer: | |
| _component_: torchtune.training.FullModelHFCheckpointer | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpoint_files: | |
| filename_format: model-{}-of-{}.safetensors | |
| max_filename: '00030' | |
| model_type: LLAMA3 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| recipe_checkpoint: null | |
| clip_grad_norm: null | |
| compile: true | |
| custom_sharded_layers: | |
| - tok_embeddings | |
| - output | |
| data_parallel_replicate_dim: 1 | |
| data_parallel_shard_dim: -1 | |
| dataset: | |
| _component_: torchtune.datasets.alpaca_dataset | |
| packed: true | |
| device: cuda | |
| dtype: bf16 | |
| enable_activation_checkpointing: true | |
| enable_activation_offloading: false | |
| epochs: 1 | |
| fsdp_cpu_offload: false | |
| gradient_accumulation_steps: 1 | |
| log_every_n_steps: 1 | |
| log_peak_memory_stats: true | |
| loss: | |
| _component_: torchtune.modules.loss.CEWithChunkedOutputLoss | |
| max_steps_per_epoch: null | |
| metric_logger: | |
| _component_: torchtune.training.metric_logging.DiskLogger | |
| log_dir: /home/brian/model/Llama3.3-70B-fft-output/logs | |
| model: | |
| _component_: torchtune.models.llama3_3.llama3_3_70b | |
| optimizer: | |
| _component_: torch.optim.AdamW | |
| fused: false | |
| lr: 2.0e-05 | |
| optimizer_in_bwd: false | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| profiler: | |
| _component_: torchtune.training.setup_torch_profiler | |
| active_steps: 2 | |
| cpu: true | |
| cuda: true | |
| enabled: false | |
| num_cycles: 1 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output/profiling_outputs | |
| profile_memory: false | |
| record_shapes: true | |
| wait_steps: 5 | |
| warmup_steps: 3 | |
| with_flops: false | |
| with_stack: false | |
| resume_from_checkpoint: false | |
| seed: null | |
| shuffle: true | |
| tensor_parallel_dim: 8 | |
| tensor_parallel_plan: | |
| _component_: torchtune.models.llama3.base_llama_tp_plan | |
| tokenizer: | |
| _component_: torchtune.models.llama3.llama3_tokenizer | |
| max_seq_len: 1024 | |
| path: /home/brian/model/Llama-3.3-70B-Instruct/original/tokenizer.model | |
| INFO:torchtune.utils._logging:Running FullFinetuneRecipeDistributed with resolved config: | |
| batch_size: 4 | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpointer: | |
| _component_: torchtune.training.FullModelHFCheckpointer | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpoint_files: | |
| filename_format: model-{}-of-{}.safetensors | |
| max_filename: '00030' | |
| model_type: LLAMA3 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| recipe_checkpoint: null | |
| clip_grad_norm: null | |
| compile: true | |
| custom_sharded_layers: | |
| - tok_embeddings | |
| - output | |
| data_parallel_replicate_dim: 1 | |
| data_parallel_shard_dim: -1 | |
| dataset: | |
| _component_: torchtune.datasets.alpaca_dataset | |
| packed: true | |
| device: cuda | |
| dtype: bf16 | |
| enable_activation_checkpointing: true | |
| enable_activation_offloading: false | |
| epochs: 1 | |
| fsdp_cpu_offload: false | |
| gradient_accumulation_steps: 1 | |
| log_every_n_steps: 1 | |
| log_peak_memory_stats: true | |
| loss: | |
| _component_: torchtune.modules.loss.CEWithChunkedOutputLoss | |
| max_steps_per_epoch: null | |
| metric_logger: | |
| _component_: torchtune.training.metric_logging.DiskLogger | |
| log_dir: /home/brian/model/Llama3.3-70B-fft-output/logs | |
| model: | |
| _component_: torchtune.models.llama3_3.llama3_3_70b | |
| optimizer: | |
| _component_: torch.optim.AdamW | |
| fused: false | |
| lr: 2.0e-05 | |
| optimizer_in_bwd: false | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| profiler: | |
| _component_: torchtune.training.setup_torch_profiler | |
| active_steps: 2 | |
| cpu: true | |
| cuda: true | |
| enabled: false | |
| num_cycles: 1 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output/profiling_outputs | |
| profile_memory: false | |
| record_shapes: true | |
| wait_steps: 5 | |
| warmup_steps: 3 | |
| with_flops: false | |
| with_stack: false | |
| resume_from_checkpoint: false | |
| seed: null | |
| shuffle: true | |
| tensor_parallel_dim: 8 | |
| tensor_parallel_plan: | |
| _component_: torchtune.models.llama3.base_llama_tp_plan | |
| tokenizer: | |
| _component_: torchtune.models.llama3.llama3_tokenizer | |
| max_seq_len: 1024 | |
| path: /home/brian/model/Llama-3.3-70B-Instruct/original/tokenizer.model | |
| INFO:torchtune.utils._logging:Running FullFinetuneRecipeDistributed with resolved config: | |
| batch_size: 4 | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpointer: | |
| _component_: torchtune.training.FullModelHFCheckpointer | |
| checkpoint_dir: /home/brian/model/Llama-3.3-70B-Instruct | |
| checkpoint_files: | |
| filename_format: model-{}-of-{}.safetensors | |
| max_filename: '00030' | |
| model_type: LLAMA3 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| recipe_checkpoint: null | |
| clip_grad_norm: null | |
| compile: true | |
| custom_sharded_layers: | |
| - tok_embeddings | |
| - output | |
| data_parallel_replicate_dim: 1 | |
| data_parallel_shard_dim: -1 | |
| dataset: | |
| _component_: torchtune.datasets.alpaca_dataset | |
| packed: true | |
| device: cuda | |
| dtype: bf16 | |
| enable_activation_checkpointing: true | |
| enable_activation_offloading: false | |
| epochs: 1 | |
| fsdp_cpu_offload: false | |
| gradient_accumulation_steps: 1 | |
| log_every_n_steps: 1 | |
| log_peak_memory_stats: true | |
| loss: | |
| _component_: torchtune.modules.loss.CEWithChunkedOutputLoss | |
| max_steps_per_epoch: null | |
| metric_logger: | |
| _component_: torchtune.training.metric_logging.DiskLogger | |
| log_dir: /home/brian/model/Llama3.3-70B-fft-output/logs | |
| model: | |
| _component_: torchtune.models.llama3_3.llama3_3_70b | |
| optimizer: | |
| _component_: torch.optim.AdamW | |
| fused: false | |
| lr: 2.0e-05 | |
| optimizer_in_bwd: false | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output | |
| profiler: | |
| _component_: torchtune.training.setup_torch_profiler | |
| active_steps: 2 | |
| cpu: true | |
| cuda: true | |
| enabled: false | |
| num_cycles: 1 | |
| output_dir: /home/brian/model/Llama3.3-70B-fft-output/profiling_outputs | |
| profile_memory: false | |
| record_shapes: true | |
| wait_steps: 5 | |
| warmup_steps: 3 | |
| with_flops: false | |
| with_stack: false | |
| resume_from_checkpoint: false | |
| seed: null | |
| shuffle: true | |
| tensor_parallel_dim: 8 | |
| tensor_parallel_plan: | |
| _component_: torchtune.models.llama3.base_llama_tp_plan | |
| tokenizer: | |
| _component_: torchtune.models.llama3.llama3_tokenizer | |
| max_seq_len: 1024 | |
| path: /home/brian/model/Llama-3.3-70B-Instruct/original/tokenizer.model | |
| INFO:torchtune.utils._logging:Hint: enable_activation_checkpointing is True, but enable_activation_offloading isn't. Enabling activation offloading should reduce memory further. | |
| worker-0:544571:544571 [0] NCCL INFO Bootstrap : Using eth0:10.0.1.84<0> | |
| worker-0:544571:544571 [0] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) | |
| worker-0:544571:544571 [0] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so | |
| worker-0:544571:544571 [0] NCCL INFO NET/Plugin: Using internal network plugin. | |
| worker-0:544571:544571 [0] NCCL INFO cudaDriverVersion 12040 | |
| NCCL version 2.21.5+cuda12.4 | |
| worker-0:544574:544574 [3] NCCL INFO cudaDriverVersion 12040 | |
| worker-0:544574:544574 [3] NCCL INFO Bootstrap : Using eth0:10.0.1.84<0> | |
| worker-0:544574:544574 [3] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) | |
| worker-0:544574:544574 [3] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so | |
| worker-0:544574:544574 [3] NCCL INFO NET/Plugin: Using internal network plugin. | |
| worker-0:544573:544573 [2] NCCL INFO cudaDriverVersion 12040 | |
| worker-0:544576:544576 [5] NCCL INFO cudaDriverVersion 12040 | |
| worker-0:544573:544573 [2] NCCL INFO Bootstrap : Using eth0:10.0.1.84<0> | |
| worker-0:544576:544576 [5] NCCL INFO Bootstrap : Using eth0:10.0.1.84<0> | |
| worker-0:544577:544577 [6] NCCL INFO cudaDriverVersion 12040 | |
| worker-0:544577:544577 [6] NCCL INFO Bootstrap : Using eth0:10.0.1.84<0> | |
| worker-0:544575:544575 [4] NCCL INFO cudaDriverVersion 12040 | |
| worker-0:544575:544575 [4] NCCL INFO Bootstrap : Using eth0:10.0.1.84<0> | |
| worker-0:544572:544572 [1] NCCL INFO cudaDriverVersion 12040 | |
| worker-0:544572:544572 [1] NCCL INFO Bootstrap : Using eth0:10.0.1.84<0> | |
| worker-0:544578:544578 [7] NCCL INFO cudaDriverVersion 12040 | |
| worker-0:544578:544578 [7] NCCL INFO Bootstrap : Using eth0:10.0.1.84<0> | |
| worker-0:544576:544576 [5] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) | |
| worker-0:544576:544576 [5] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so | |
| worker-0:544576:544576 [5] NCCL INFO NET/Plugin: Using internal network plugin. | |
| worker-0:544573:544573 [2] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) | |
| worker-0:544573:544573 [2] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so | |
| worker-0:544573:544573 [2] NCCL INFO NET/Plugin: Using internal network plugin. | |
| worker-0:544575:544575 [4] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) | |
| worker-0:544575:544575 [4] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so | |
| worker-0:544575:544575 [4] NCCL INFO NET/Plugin: Using internal network plugin. | |
| worker-0:544577:544577 [6] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) | |
| worker-0:544577:544577 [6] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so | |
| worker-0:544577:544577 [6] NCCL INFO NET/Plugin: Using internal network plugin. | |
| worker-0:544578:544578 [7] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) | |
| worker-0:544578:544578 [7] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so | |
| worker-0:544578:544578 [7] NCCL INFO NET/Plugin: Using internal network plugin. | |
| worker-0:544572:544572 [1] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) | |
| worker-0:544572:544572 [1] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so | |
| worker-0:544572:544572 [1] NCCL INFO NET/Plugin: Using internal network plugin. | |
| worker-1:544442:544442 [3] NCCL INFO cudaDriverVersion 12040 | |
| worker-1:544442:544442 [3] NCCL INFO Bootstrap : Using eth0:10.55.121.131<0> | |
| worker-1:544441:544441 [2] NCCL INFO cudaDriverVersion 12040 | |
| worker-1:544441:544441 [2] NCCL INFO Bootstrap : Using eth0:10.55.121.131<0> | |
| worker-1:544440:544440 [1] NCCL INFO cudaDriverVersion 12040 | |
| worker-1:544446:544446 [7] NCCL INFO cudaDriverVersion 12040 | |
| worker-1:544440:544440 [1] NCCL INFO Bootstrap : Using eth0:10.55.121.131<0> | |
| worker-1:544446:544446 [7] NCCL INFO Bootstrap : Using eth0:10.55.121.131<0> | |
| worker-1:544439:544439 [0] NCCL INFO cudaDriverVersion 12040 | |
| worker-1:544445:544445 [6] NCCL INFO cudaDriverVersion 12040 | |
| worker-1:544439:544439 [0] NCCL INFO Bootstrap : Using eth0:10.55.121.131<0> | |
| worker-1:544445:544445 [6] NCCL INFO Bootstrap : Using eth0:10.55.121.131<0> | |
| worker-1:544442:544442 [3] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) | |
| worker-1:544442:544442 [3] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so | |
| worker-1:544442:544442 [3] NCCL INFO NET/Plugin: Using internal network plugin. | |
| worker-1:544444:544444 [5] NCCL INFO cudaDriverVersion 12040 | |
| worker-1:544443:544443 [4] NCCL INFO cudaDriverVersion 12040 | |
| worker-1:544444:544444 [5] NCCL INFO Bootstrap : Using eth0:10.55.121.131<0> | |
| worker-1:544443:544443 [4] NCCL INFO Bootstrap : Using eth0:10.55.121.131<0> | |
| worker-1:544441:544441 [2] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) | |
| worker-1:544441:544441 [2] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so | |
| worker-1:544441:544441 [2] NCCL INFO NET/Plugin: Using internal network plugin. | |
| worker-1:544439:544439 [0] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) | |
| worker-1:544439:544439 [0] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so | |
| worker-1:544439:544439 [0] NCCL INFO NET/Plugin: Using internal network plugin. | |
| worker-1:544440:544440 [1] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) | |
| worker-1:544440:544440 [1] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so | |
| worker-1:544440:544440 [1] NCCL INFO NET/Plugin: Using internal network plugin. | |
| worker-1:544445:544445 [6] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) | |
| worker-1:544445:544445 [6] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so | |
| worker-1:544445:544445 [6] NCCL INFO NET/Plugin: Using internal network plugin. | |
| worker-1:544443:544443 [4] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) | |
| worker-1:544443:544443 [4] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so | |
| worker-1:544443:544443 [4] NCCL INFO NET/Plugin: Using internal network plugin. | |
| worker-1:544446:544446 [7] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) | |
| worker-1:544446:544446 [7] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so | |
| worker-1:544446:544446 [7] NCCL INFO NET/Plugin: Using internal network plugin. | |
| worker-1:544444:544444 [5] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) | |
| worker-1:544444:544444 [5] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so | |
| worker-1:544444:544444 [5] NCCL INFO NET/Plugin: Using internal network plugin. | |
| worker-0:544571:544795 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [4]mlx5_4:1/IB [5]mlx5_5:1/IB [6]mlx5_6:1/IB [7]mlx5_7:1/IB [RO]; OOB eth0:10.0.1.84<0> | |
| worker-0:544571:544795 [0] NCCL INFO Using non-device net plugin version 0 | |
| worker-0:544571:544795 [0] NCCL INFO Using network IB | |
| worker-0:544574:544799 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [4]mlx5_4:1/IB [5]mlx5_5:1/IB [6]mlx5_6:1/IB [7]mlx5_7:1/IB [RO]; OOB eth0:10.0.1.84<0> | |
| worker-0:544574:544799 [3] NCCL INFO Using non-device net plugin version 0 | |
| worker-0:544574:544799 [3] NCCL INFO Using network IB | |
| worker-0:544577:544803 [6] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [4]mlx5_4:1/IB [5]mlx5_5:1/IB [6]mlx5_6:1/IB [7]mlx5_7:1/IB [RO]; OOB eth0:10.0.1.84<0> | |
| worker-0:544577:544803 [6] NCCL INFO Using non-device net plugin version 0 | |
| worker-0:544577:544803 [6] NCCL INFO Using network IB | |
| worker-0:544576:544800 [5] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [4]mlx5_4:1/IB [5]mlx5_5:1/IB [6]mlx5_6:1/IB [7]mlx5_7:1/IB [RO]; OOB eth0:10.0.1.84<0> | |
| worker-0:544576:544800 [5] NCCL INFO Using non-device net plugin version 0 | |
| worker-0:544576:544800 [5] NCCL INFO Using network IB | |
| worker-0:544573:544801 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [4]mlx5_4:1/IB [5]mlx5_5:1/IB [6]mlx5_6:1/IB [7]mlx5_7:1/IB [RO]; OOB eth0:10.0.1.84<0> | |
| worker-0:544573:544801 [2] NCCL INFO Using non-device net plugin version 0 | |
| worker-0:544573:544801 [2] NCCL INFO Using network IB | |
| worker-0:544575:544802 [4] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [4]mlx5_4:1/IB [5]mlx5_5:1/IB [6]mlx5_6:1/IB [7]mlx5_7:1/IB [RO]; OOB eth0:10.0.1.84<0> | |
| worker-0:544575:544802 [4] NCCL INFO Using non-device net plugin version 0 | |
| worker-0:544575:544802 [4] NCCL INFO Using network IB | |
| worker-1:544442:544776 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [4]mlx5_4:1/IB [5]mlx5_5:1/IB [6]mlx5_6:1/IB [7]mlx5_7:1/IB [RO]; OOB eth0:10.55.121.131<0> | |
| worker-1:544442:544776 [3] NCCL INFO Using non-device net plugin version 0 | |
| worker-1:544442:544776 [3] NCCL INFO Using network IB | |
| worker-0:544578:544804 [7] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [4]mlx5_4:1/IB [5]mlx5_5:1/IB [6]mlx5_6:1/IB [7]mlx5_7:1/IB [RO]; OOB eth0:10.0.1.84<0> | |
| worker-0:544578:544804 [7] NCCL INFO Using non-device net plugin version 0 | |
| worker-0:544578:544804 [7] NCCL INFO Using network IB | |
| worker-0:544572:544805 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [4]mlx5_4:1/IB [5]mlx5_5:1/IB [6]mlx5_6:1/IB [7]mlx5_7:1/IB [RO]; OOB eth0:10.0.1.84<0> | |
| worker-0:544572:544805 [1] NCCL INFO Using non-device net plugin version 0 | |
| worker-0:544572:544805 [1] NCCL INFO Using network IB | |
| worker-1:544439:544778 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [4]mlx5_4:1/IB [5]mlx5_5:1/IB [6]mlx5_6:1/IB [7]mlx5_7:1/IB [RO]; OOB eth0:10.55.121.131<0> | |
| worker-1:544439:544778 [0] NCCL INFO Using non-device net plugin version 0 | |
| worker-1:544439:544778 [0] NCCL INFO Using network IB | |
| worker-1:544446:544783 [7] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [4]mlx5_4:1/IB [5]mlx5_5:1/IB [6]mlx5_6:1/IB [7]mlx5_7:1/IB [RO]; OOB eth0:10.55.121.131<0> | |
| worker-1:544446:544783 [7] NCCL INFO Using non-device net plugin version 0 | |
| worker-1:544446:544783 [7] NCCL INFO Using network IB | |
| worker-1:544443:544782 [4] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [4]mlx5_4:1/IB [5]mlx5_5:1/IB [6]mlx5_6:1/IB [7]mlx5_7:1/IB [RO]; OOB eth0:10.55.121.131<0> | |
| worker-1:544443:544782 [4] NCCL INFO Using non-device net plugin version 0 | |
| worker-1:544443:544782 [4] NCCL INFO Using network IB | |
| worker-1:544444:544780 [5] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [4]mlx5_4:1/IB [5]mlx5_5:1/IB [6]mlx5_6:1/IB [7]mlx5_7:1/IB [RO]; OOB eth0:10.55.121.131<0> | |
| worker-1:544444:544780 [5] NCCL INFO Using non-device net plugin version 0 | |
| worker-1:544444:544780 [5] NCCL INFO Using network IB | |
| worker-1:544440:544779 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [4]mlx5_4:1/IB [5]mlx5_5:1/IB [6]mlx5_6:1/IB [7]mlx5_7:1/IB [RO]; OOB eth0:10.55.121.131<0> | |
| worker-1:544440:544779 [1] NCCL INFO Using non-device net plugin version 0 | |
| worker-1:544440:544779 [1] NCCL INFO Using network IB | |
| worker-1:544441:544777 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [4]mlx5_4:1/IB [5]mlx5_5:1/IB [6]mlx5_6:1/IB [7]mlx5_7:1/IB [RO]; OOB eth0:10.55.121.131<0> | |
| worker-1:544441:544777 [2] NCCL INFO Using non-device net plugin version 0 | |
| worker-1:544441:544777 [2] NCCL INFO Using network IB | |
| worker-1:544445:544781 [6] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [4]mlx5_4:1/IB [5]mlx5_5:1/IB [6]mlx5_6:1/IB [7]mlx5_7:1/IB [RO]; OOB eth0:10.55.121.131<0> | |
| worker-1:544445:544781 [6] NCCL INFO Using non-device net plugin version 0 | |
| worker-1:544445:544781 [6] NCCL INFO Using network IB | |
| worker-0:544573:544801 [2] NCCL INFO ncclCommInitRank comm 0x56405f55b420 rank 2 nranks 16 cudaDev 2 nvmlDev 2 busId 95000 commId 0x9d8e2a3242367e4a - Init START | |
| worker-0:544576:544800 [5] NCCL INFO ncclCommInitRank comm 0x55c64f7ca4d0 rank 5 nranks 16 cudaDev 5 nvmlDev 5 busId af000 commId 0x9d8e2a3242367e4a - Init START | |
| worker-0:544577:544803 [6] NCCL INFO ncclCommInitRank comm 0x559a407dbca0 rank 6 nranks 16 cudaDev 6 nvmlDev 6 busId b3000 commId 0x9d8e2a3242367e4a - Init START | |
| worker-0:544575:544802 [4] NCCL INFO ncclCommInitRank comm 0x5571099efd80 rank 4 nranks 16 cudaDev 4 nvmlDev 4 busId ab000 commId 0x9d8e2a3242367e4a - Init START | |
| worker-0:544578:544804 [7] NCCL INFO ncclCommInitRank comm 0x55a940ed73e0 rank 7 nranks 16 cudaDev 7 nvmlDev 7 busId b7000 commId 0x9d8e2a3242367e4a - Init START | |
| worker-0:544574:544799 [3] NCCL INFO ncclCommInitRank comm 0x5558c9e593a0 rank 3 nranks 16 cudaDev 3 nvmlDev 3 busId 99000 commId 0x9d8e2a3242367e4a - Init START | |
| worker-0:544572:544805 [1] NCCL INFO ncclCommInitRank comm 0x55691b0c7fb0 rank 1 nranks 16 cudaDev 1 nvmlDev 1 busId 91000 commId 0x9d8e2a3242367e4a - Init START | |
| worker-0:544571:544795 [0] NCCL INFO ncclCommInitRank comm 0x55ec0bcf2090 rank 0 nranks 16 cudaDev 0 nvmlDev 0 busId 8d000 commId 0x9d8e2a3242367e4a - Init START | |
| worker-1:544445:544781 [6] NCCL INFO ncclCommInitRank comm 0x558c0bcb8dd0 rank 14 nranks 16 cudaDev 6 nvmlDev 6 busId b3000 commId 0x9d8e2a3242367e4a - Init START | |
| worker-1:544440:544779 [1] NCCL INFO ncclCommInitRank comm 0x55a1402a7ca0 rank 9 nranks 16 cudaDev 1 nvmlDev 1 busId 91000 commId 0x9d8e2a3242367e4a - Init START | |
| worker-1:544441:544777 [2] NCCL INFO ncclCommInitRank comm 0x560fbf6ae730 rank 10 nranks 16 cudaDev 2 nvmlDev 2 busId 95000 commId 0x9d8e2a3242367e4a - Init START | |
| worker-1:544442:544776 [3] NCCL INFO ncclCommInitRank comm 0x55c0f3540b00 rank 11 nranks 16 cudaDev 3 nvmlDev 3 busId 99000 commId 0x9d8e2a3242367e4a - Init START | |
| worker-1:544444:544780 [5] NCCL INFO ncclCommInitRank comm 0x55ca04bc7280 rank 13 nranks 16 cudaDev 5 nvmlDev 5 busId af000 commId 0x9d8e2a3242367e4a - Init START | |
| worker-1:544443:544782 [4] NCCL INFO ncclCommInitRank comm 0x5645e5c93110 rank 12 nranks 16 cudaDev 4 nvmlDev 4 busId ab000 commId 0x9d8e2a3242367e4a - Init START | |
| worker-1:544446:544783 [7] NCCL INFO ncclCommInitRank comm 0x561dd66d3a80 rank 15 nranks 16 cudaDev 7 nvmlDev 7 busId b7000 commId 0x9d8e2a3242367e4a - Init START | |
| worker-1:544439:544778 [0] NCCL INFO ncclCommInitRank comm 0x55d1a2434c10 rank 8 nranks 16 cudaDev 0 nvmlDev 0 busId 8d000 commId 0x9d8e2a3242367e4a - Init START | |
| worker-1:544446:544783 [7] NCCL INFO MNNVL busId 0xb7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-1:544439:544778 [0] NCCL INFO MNNVL busId 0x8d000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-1:544445:544781 [6] NCCL INFO MNNVL busId 0xb3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-1:544444:544780 [5] NCCL INFO MNNVL busId 0xaf000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-1:544442:544776 [3] NCCL INFO MNNVL busId 0x99000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-1:544443:544782 [4] NCCL INFO MNNVL busId 0xab000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-1:544441:544777 [2] NCCL INFO MNNVL busId 0x95000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-1:544440:544779 [1] NCCL INFO MNNVL busId 0x91000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-0:544577:544803 [6] NCCL INFO MNNVL busId 0xb3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-0:544576:544800 [5] NCCL INFO MNNVL busId 0xaf000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-0:544578:544804 [7] NCCL INFO MNNVL busId 0xb7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-0:544575:544802 [4] NCCL INFO MNNVL busId 0xab000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-0:544573:544801 [2] NCCL INFO MNNVL busId 0x95000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-0:544574:544799 [3] NCCL INFO MNNVL busId 0x99000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-0:544572:544805 [1] NCCL INFO MNNVL busId 0x91000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-0:544571:544795 [0] NCCL INFO MNNVL busId 0x8d000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-1:544445:544781 [6] NCCL INFO Setting affinity for GPU 6 to ffff,ffffffff,00000000,00000000 | |
| worker-1:544445:544781 [6] NCCL INFO NVLS multicast support is available on dev 6 | |
| worker-0:544572:544805 [1] NCCL INFO Setting affinity for GPU 1 to ffff,ffffffff | |
| worker-0:544572:544805 [1] NCCL INFO NVLS multicast support is available on dev 1 | |
| worker-1:544439:544778 [0] NCCL INFO Setting affinity for GPU 0 to ffff,ffffffff | |
| worker-1:544439:544778 [0] NCCL INFO NVLS multicast support is available on dev 0 | |
| worker-1:544443:544782 [4] NCCL INFO Setting affinity for GPU 4 to ffff,ffffffff,00000000,00000000 | |
| worker-1:544443:544782 [4] NCCL INFO NVLS multicast support is available on dev 4 | |
| worker-1:544444:544780 [5] NCCL INFO Setting affinity for GPU 5 to ffff,ffffffff,00000000,00000000 | |
| worker-1:544444:544780 [5] NCCL INFO NVLS multicast support is available on dev 5 | |
| worker-1:544440:544779 [1] NCCL INFO Setting affinity for GPU 1 to ffff,ffffffff | |
| worker-1:544440:544779 [1] NCCL INFO NVLS multicast support is available on dev 1 | |
| worker-1:544442:544776 [3] NCCL INFO Setting affinity for GPU 3 to ffff,ffffffff | |
| worker-1:544442:544776 [3] NCCL INFO NVLS multicast support is available on dev 3 | |
| worker-1:544446:544783 [7] NCCL INFO Setting affinity for GPU 7 to ffff,ffffffff,00000000,00000000 | |
| worker-1:544446:544783 [7] NCCL INFO NVLS multicast support is available on dev 7 | |
| worker-1:544441:544777 [2] NCCL INFO Setting affinity for GPU 2 to ffff,ffffffff | |
| worker-1:544441:544777 [2] NCCL INFO NVLS multicast support is available on dev 2 | |
| worker-0:544575:544802 [4] NCCL INFO Setting affinity for GPU 4 to ffff,ffffffff,00000000,00000000 | |
| worker-0:544575:544802 [4] NCCL INFO NVLS multicast support is available on dev 4 | |
| worker-0:544576:544800 [5] NCCL INFO Setting affinity for GPU 5 to ffff,ffffffff,00000000,00000000 | |
| worker-0:544576:544800 [5] NCCL INFO NVLS multicast support is available on dev 5 | |
| worker-0:544571:544795 [0] NCCL INFO Setting affinity for GPU 0 to ffff,ffffffff | |
| worker-0:544571:544795 [0] NCCL INFO NVLS multicast support is available on dev 0 | |
| worker-0:544573:544801 [2] NCCL INFO Setting affinity for GPU 2 to ffff,ffffffff | |
| worker-0:544573:544801 [2] NCCL INFO NVLS multicast support is available on dev 2 | |
| worker-0:544578:544804 [7] NCCL INFO Setting affinity for GPU 7 to ffff,ffffffff,00000000,00000000 | |
| worker-0:544578:544804 [7] NCCL INFO NVLS multicast support is available on dev 7 | |
| worker-0:544577:544803 [6] NCCL INFO Setting affinity for GPU 6 to ffff,ffffffff,00000000,00000000 | |
| worker-0:544577:544803 [6] NCCL INFO NVLS multicast support is available on dev 6 | |
| worker-0:544574:544799 [3] NCCL INFO Setting affinity for GPU 3 to ffff,ffffffff | |
| worker-0:544574:544799 [3] NCCL INFO NVLS multicast support is available on dev 3 | |
| worker-0:544578:544804 [7] NCCL INFO comm 0x55a940ed73e0 rank 7 nRanks 16 nNodes 2 localRanks 8 localRank 7 MNNVL 0 | |
| worker-0:544577:544803 [6] NCCL INFO comm 0x559a407dbca0 rank 6 nRanks 16 nNodes 2 localRanks 8 localRank 6 MNNVL 0 | |
| worker-0:544576:544800 [5] NCCL INFO comm 0x55c64f7ca4d0 rank 5 nRanks 16 nNodes 2 localRanks 8 localRank 5 MNNVL 0 | |
| worker-0:544578:544804 [7] NCCL INFO NVLS Head 0: 0 8 | |
| worker-0:544577:544803 [6] NCCL INFO NVLS Head 0: 0 8 | |
| worker-0:544578:544804 [7] NCCL INFO NVLS Head 1: 1 9 | |
| worker-0:544577:544803 [6] NCCL INFO NVLS Head 1: 1 9 | |
| worker-0:544578:544804 [7] NCCL INFO NVLS Head 2: 2 10 | |
| worker-0:544577:544803 [6] NCCL INFO NVLS Head 2: 2 10 | |
| worker-0:544578:544804 [7] NCCL INFO NVLS Head 3: 3 11 | |
| worker-0:544577:544803 [6] NCCL INFO NVLS Head 3: 3 11 | |
| worker-0:544578:544804 [7] NCCL INFO NVLS Head 4: 4 12 | |
| worker-0:544577:544803 [6] NCCL INFO NVLS Head 4: 4 12 | |
| worker-0:544578:544804 [7] NCCL INFO NVLS Head 5: 5 13 | |
| worker-0:544577:544803 [6] NCCL INFO NVLS Head 5: 5 13 | |
| worker-0:544578:544804 [7] NCCL INFO NVLS Head 6: 6 14 | |
| worker-0:544577:544803 [6] NCCL INFO NVLS Head 6: 6 14 | |
| worker-0:544578:544804 [7] NCCL INFO NVLS Head 7: 7 15 | |
| worker-0:544577:544803 [6] NCCL INFO NVLS Head 7: 7 15 | |
| worker-0:544576:544800 [5] NCCL INFO NVLS Head 0: 0 8 | |
| worker-0:544576:544800 [5] NCCL INFO NVLS Head 1: 1 9 | |
| worker-0:544576:544800 [5] NCCL INFO NVLS Head 2: 2 10 | |
| worker-0:544576:544800 [5] NCCL INFO NVLS Head 3: 3 11 | |
| worker-0:544576:544800 [5] NCCL INFO NVLS Head 4: 4 12 | |
| worker-0:544576:544800 [5] NCCL INFO NVLS Head 5: 5 13 | |
| worker-0:544576:544800 [5] NCCL INFO NVLS Head 6: 6 14 | |
| worker-0:544576:544800 [5] NCCL INFO NVLS Head 7: 7 15 | |
| worker-0:544578:544804 [7] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] 0/-1/-1->7->6 [2] 0/-1/-1->7->6 [3] 0/-1/-1->7->6 [4] 0/-1/-1->7->6 [5] 0/-1/-1->7->6 [6] 0/-1/-1->7->6 [7] 0/15/-1->7->-1 [8] -1/-1/-1->7->6 [9] 0/-1/-1->7->6 [10] 0/-1/-1->7->6 [11] 0/-1/-1->7->6 [12] 0/-1/-1->7->6 [13] 0/-1/-1->7->6 [14] 0/-1/-1->7->6 [15] 0/-1/-1->7->15 | |
| worker-0:544577:544803 [6] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 [2] 7/-1/-1->6->5 [3] 7/-1/-1->6->5 [4] 7/-1/-1->6->5 [5] 7/-1/-1->6->5 [6] 7/14/-1->6->-1 [7] -1/-1/-1->6->5 [8] 7/-1/-1->6->5 [9] 7/-1/-1->6->5 [10] 7/-1/-1->6->5 [11] 7/-1/-1->6->5 [12] 7/-1/-1->6->5 [13] 7/-1/-1->6->5 [14] 7/-1/-1->6->14 [15] -1/-1/-1->6->5 | |
| worker-0:544578:544804 [7] NCCL INFO P2P Chunksize set to 131072 | |
| worker-0:544577:544803 [6] NCCL INFO P2P Chunksize set to 131072 | |
| worker-0:544576:544800 [5] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/-1/-1->5->4 [2] 6/-1/-1->5->4 [3] 6/-1/-1->5->4 [4] 6/-1/-1->5->4 [5] 6/13/-1->5->-1 [6] -1/-1/-1->5->4 [7] 6/-1/-1->5->4 [8] 6/-1/-1->5->4 [9] 6/-1/-1->5->4 [10] 6/-1/-1->5->4 [11] 6/-1/-1->5->4 [12] 6/-1/-1->5->4 [13] 6/-1/-1->5->13 [14] -1/-1/-1->5->4 [15] 6/-1/-1->5->4 | |
| worker-0:544576:544800 [5] NCCL INFO P2P Chunksize set to 131072 | |
| worker-0:544574:544799 [3] NCCL INFO comm 0x5558c9e593a0 rank 3 nRanks 16 nNodes 2 localRanks 8 localRank 3 MNNVL 0 | |
| worker-0:544573:544801 [2] NCCL INFO comm 0x56405f55b420 rank 2 nRanks 16 nNodes 2 localRanks 8 localRank 2 MNNVL 0 | |
| worker-0:544575:544802 [4] NCCL INFO comm 0x5571099efd80 rank 4 nRanks 16 nNodes 2 localRanks 8 localRank 4 MNNVL 0 | |
| worker-0:544575:544802 [4] NCCL INFO NVLS Head 0: 0 8 | |
| worker-0:544575:544802 [4] NCCL INFO NVLS Head 1: 1 9 | |
| worker-0:544575:544802 [4] NCCL INFO NVLS Head 2: 2 10 | |
| worker-0:544575:544802 [4] NCCL INFO NVLS Head 3: 3 11 | |
| worker-0:544575:544802 [4] NCCL INFO NVLS Head 4: 4 12 | |
| worker-0:544575:544802 [4] NCCL INFO NVLS Head 5: 5 13 | |
| worker-0:544575:544802 [4] NCCL INFO NVLS Head 6: 6 14 | |
| worker-0:544575:544802 [4] NCCL INFO NVLS Head 7: 7 15 | |
| worker-0:544574:544799 [3] NCCL INFO NVLS Head 0: 0 8 | |
| worker-0:544574:544799 [3] NCCL INFO NVLS Head 1: 1 9 | |
| worker-0:544574:544799 [3] NCCL INFO NVLS Head 2: 2 10 | |
| worker-0:544574:544799 [3] NCCL INFO NVLS Head 3: 3 11 | |
| worker-0:544574:544799 [3] NCCL INFO NVLS Head 4: 4 12 | |
| worker-0:544574:544799 [3] NCCL INFO NVLS Head 5: 5 13 | |
| worker-0:544574:544799 [3] NCCL INFO NVLS Head 6: 6 14 | |
| worker-0:544575:544802 [4] NCCL INFO Trees [0] 5/-1/-1->4->3 [1] 5/-1/-1->4->3 [2] 5/-1/-1->4->3 [3] 5/-1/-1->4->3 [4] 5/12/-1->4->-1 [5] -1/-1/-1->4->3 [6] 5/-1/-1->4->3 [7] 5/-1/-1->4->3 [8] 5/-1/-1->4->3 [9] 5/-1/-1->4->3 [10] 5/-1/-1->4->3 [11] 5/-1/-1->4->3 [12] 5/-1/-1->4->12 [13] -1/-1/-1->4->3 [14] 5/-1/-1->4->3 [15] 5/-1/-1->4->3 | |
| worker-0:544574:544799 [3] NCCL INFO NVLS Head 7: 7 15 | |
| worker-0:544575:544802 [4] NCCL INFO P2P Chunksize set to 131072 | |
| worker-0:544573:544801 [2] NCCL INFO NVLS Head 0: 0 8 | |
| worker-0:544573:544801 [2] NCCL INFO NVLS Head 1: 1 9 | |
| worker-0:544573:544801 [2] NCCL INFO NVLS Head 2: 2 10 | |
| worker-0:544573:544801 [2] NCCL INFO NVLS Head 3: 3 11 | |
| worker-0:544573:544801 [2] NCCL INFO NVLS Head 4: 4 12 | |
| worker-0:544573:544801 [2] NCCL INFO NVLS Head 5: 5 13 | |
| worker-0:544573:544801 [2] NCCL INFO NVLS Head 6: 6 14 | |
| worker-0:544573:544801 [2] NCCL INFO NVLS Head 7: 7 15 | |
| worker-0:544574:544799 [3] NCCL INFO Trees [0] 4/-1/-1->3->2 [1] 4/-1/-1->3->2 [2] 4/-1/-1->3->2 [3] 4/11/-1->3->-1 [4] -1/-1/-1->3->2 [5] 4/-1/-1->3->2 [6] 4/-1/-1->3->2 [7] 4/-1/-1->3->2 [8] 4/-1/-1->3->2 [9] 4/-1/-1->3->2 [10] 4/-1/-1->3->2 [11] 4/-1/-1->3->11 [12] -1/-1/-1->3->2 [13] 4/-1/-1->3->2 [14] 4/-1/-1->3->2 [15] 4/-1/-1->3->2 | |
| worker-0:544574:544799 [3] NCCL INFO P2P Chunksize set to 131072 | |
| worker-0:544573:544801 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/10/-1->2->-1 [3] -1/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->10 [11] -1/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 | |
| worker-0:544573:544801 [2] NCCL INFO P2P Chunksize set to 131072 | |
| worker-0:544571:544795 [0] NCCL INFO comm 0x55ec0bcf2090 rank 0 nRanks 16 nNodes 2 localRanks 8 localRank 0 MNNVL 0 | |
| worker-0:544572:544805 [1] NCCL INFO comm 0x55691b0c7fb0 rank 1 nRanks 16 nNodes 2 localRanks 8 localRank 1 MNNVL 0 | |
| worker-1:544439:544778 [0] NCCL INFO comm 0x55d1a2434c10 rank 8 nRanks 16 nNodes 2 localRanks 8 localRank 0 MNNVL 0 | |
| worker-1:544440:544779 [1] NCCL INFO comm 0x55a1402a7ca0 rank 9 nRanks 16 nNodes 2 localRanks 8 localRank 1 MNNVL 0 | |
| worker-1:544441:544777 [2] NCCL INFO comm 0x560fbf6ae730 rank 10 nRanks 16 nNodes 2 localRanks 8 localRank 2 MNNVL 0 | |
| worker-1:544442:544776 [3] NCCL INFO comm 0x55c0f3540b00 rank 11 nRanks 16 nNodes 2 localRanks 8 localRank 3 MNNVL 0 | |
| worker-1:544442:544776 [3] NCCL INFO Trees [0] 12/-1/-1->11->10 [1] 12/-1/-1->11->10 [2] 12/-1/-1->11->10 [3] 12/-1/-1->11->3 [4] -1/-1/-1->11->10 [5] 12/-1/-1->11->10 [6] 12/-1/-1->11->10 [7] 12/-1/-1->11->10 [8] 12/-1/-1->11->10 [9] 12/-1/-1->11->10 [10] 12/-1/-1->11->10 [11] 12/3/-1->11->-1 [12] -1/-1/-1->11->10 [13] 12/-1/-1->11->10 [14] 12/-1/-1->11->10 [15] 12/-1/-1->11->10 | |
| worker-1:544439:544778 [0] NCCL INFO Trees [0] 9/-1/-1->8->0 [1] -1/-1/-1->8->15 [2] 9/-1/-1->8->15 [3] 9/-1/-1->8->15 [4] 9/-1/-1->8->15 [5] 9/-1/-1->8->15 [6] 9/-1/-1->8->15 [7] 9/-1/-1->8->15 [8] 9/0/-1->8->-1 [9] -1/-1/-1->8->15 [10] 9/-1/-1->8->15 [11] 9/-1/-1->8->15 [12] 9/-1/-1->8->15 [13] 9/-1/-1->8->15 [14] 9/-1/-1->8->15 [15] 9/-1/-1->8->15 | |
| worker-1:544440:544779 [1] NCCL INFO Trees [0] 10/-1/-1->9->8 [1] 10/-1/-1->9->1 [2] -1/-1/-1->9->8 [3] 10/-1/-1->9->8 [4] 10/-1/-1->9->8 [5] 10/-1/-1->9->8 [6] 10/-1/-1->9->8 [7] 10/-1/-1->9->8 [8] 10/-1/-1->9->8 [9] 10/1/-1->9->-1 [10] -1/-1/-1->9->8 [11] 10/-1/-1->9->8 [12] 10/-1/-1->9->8 [13] 10/-1/-1->9->8 [14] 10/-1/-1->9->8 [15] 10/-1/-1->9->8 | |
| worker-1:544442:544776 [3] NCCL INFO P2P Chunksize set to 131072 | |
| worker-1:544441:544777 [2] NCCL INFO Trees [0] 11/-1/-1->10->9 [1] 11/-1/-1->10->9 [2] 11/-1/-1->10->2 [3] -1/-1/-1->10->9 [4] 11/-1/-1->10->9 [5] 11/-1/-1->10->9 [6] 11/-1/-1->10->9 [7] 11/-1/-1->10->9 [8] 11/-1/-1->10->9 [9] 11/-1/-1->10->9 [10] 11/2/-1->10->-1 [11] -1/-1/-1->10->9 [12] 11/-1/-1->10->9 [13] 11/-1/-1->10->9 [14] 11/-1/-1->10->9 [15] 11/-1/-1->10->9 | |
| worker-1:544439:544778 [0] NCCL INFO P2P Chunksize set to 131072 | |
| worker-1:544440:544779 [1] NCCL INFO P2P Chunksize set to 131072 | |
| worker-1:544441:544777 [2] NCCL INFO P2P Chunksize set to 131072 | |
| worker-1:544446:544783 [7] NCCL INFO comm 0x561dd66d3a80 rank 15 nRanks 16 nNodes 2 localRanks 8 localRank 7 MNNVL 0 | |
| worker-1:544444:544780 [5] NCCL INFO comm 0x55ca04bc7280 rank 13 nRanks 16 nNodes 2 localRanks 8 localRank 5 MNNVL 0 | |
| worker-1:544446:544783 [7] NCCL INFO Trees [0] -1/-1/-1->15->14 [1] 8/-1/-1->15->14 [2] 8/-1/-1->15->14 [3] 8/-1/-1->15->14 [4] 8/-1/-1->15->14 [5] 8/-1/-1->15->14 [6] 8/-1/-1->15->14 [7] 8/-1/-1->15->7 [8] -1/-1/-1->15->14 [9] 8/-1/-1->15->14 [10] 8/-1/-1->15->14 [11] 8/-1/-1->15->14 [12] 8/-1/-1->15->14 [13] 8/-1/-1->15->14 [14] 8/-1/-1->15->14 [15] 8/7/-1->15->-1 | |
| worker-1:544446:544783 [7] NCCL INFO P2P Chunksize set to 131072 | |
| worker-1:544443:544782 [4] NCCL INFO comm 0x5645e5c93110 rank 12 nRanks 16 nNodes 2 localRanks 8 localRank 4 MNNVL 0 | |
| worker-1:544445:544781 [6] NCCL INFO comm 0x558c0bcb8dd0 rank 14 nRanks 16 nNodes 2 localRanks 8 localRank 6 MNNVL 0 | |
| worker-1:544444:544780 [5] NCCL INFO Trees [0] 14/-1/-1->13->12 [1] 14/-1/-1->13->12 [2] 14/-1/-1->13->12 [3] 14/-1/-1->13->12 [4] 14/-1/-1->13->12 [5] 14/-1/-1->13->5 [6] -1/-1/-1->13->12 [7] 14/-1/-1->13->12 [8] 14/-1/-1->13->12 [9] 14/-1/-1->13->12 [10] 14/-1/-1->13->12 [11] 14/-1/-1->13->12 [12] 14/-1/-1->13->12 [13] 14/5/-1->13->-1 [14] -1/-1/-1->13->12 [15] 14/-1/-1->13->12 | |
| worker-1:544444:544780 [5] NCCL INFO P2P Chunksize set to 131072 | |
| worker-1:544443:544782 [4] NCCL INFO Trees [0] 13/-1/-1->12->11 [1] 13/-1/-1->12->11 [2] 13/-1/-1->12->11 [3] 13/-1/-1->12->11 [4] 13/-1/-1->12->4 [5] -1/-1/-1->12->11 [6] 13/-1/-1->12->11 [7] 13/-1/-1->12->11 [8] 13/-1/-1->12->11 [9] 13/-1/-1->12->11 [10] 13/-1/-1->12->11 [11] 13/-1/-1->12->11 [12] 13/4/-1->12->-1 [13] -1/-1/-1->12->11 [14] 13/-1/-1->12->11 [15] 13/-1/-1->12->11 | |
| worker-0:544571:544795 [0] NCCL INFO NVLS Head 0: 0 8 | |
| worker-0:544571:544795 [0] NCCL INFO NVLS Head 1: 1 9 | |
| worker-0:544571:544795 [0] NCCL INFO NVLS Head 2: 2 10 | |
| worker-0:544571:544795 [0] NCCL INFO NVLS Head 3: 3 11 | |
| worker-0:544571:544795 [0] NCCL INFO NVLS Head 4: 4 12 | |
| worker-0:544571:544795 [0] NCCL INFO NVLS Head 5: 5 13 | |
| worker-0:544571:544795 [0] NCCL INFO NVLS Head 6: 6 14 | |
| worker-0:544571:544795 [0] NCCL INFO NVLS Head 7: 7 15 | |
| worker-1:544443:544782 [4] NCCL INFO P2P Chunksize set to 131072 | |
| worker-1:544445:544781 [6] NCCL INFO Trees [0] 15/-1/-1->14->13 [1] 15/-1/-1->14->13 [2] 15/-1/-1->14->13 [3] 15/-1/-1->14->13 [4] 15/-1/-1->14->13 [5] 15/-1/-1->14->13 [6] 15/-1/-1->14->6 [7] -1/-1/-1->14->13 [8] 15/-1/-1->14->13 [9] 15/-1/-1->14->13 [10] 15/-1/-1->14->13 [11] 15/-1/-1->14->13 [12] 15/-1/-1->14->13 [13] 15/-1/-1->14->13 [14] 15/6/-1->14->-1 [15] -1/-1/-1->14->13 | |
| worker-1:544445:544781 [6] NCCL INFO P2P Chunksize set to 131072 | |
| worker-0:544571:544795 [0] NCCL INFO Channel 00/16 : 0 7 6 5 4 3 2 1 9 10 11 12 13 14 15 8 | |
| worker-0:544571:544795 [0] NCCL INFO Channel 01/16 : 0 8 15 14 13 12 11 10 9 1 2 3 4 5 6 7 | |
| worker-0:544572:544805 [1] NCCL INFO NVLS Head 0: 0 8 | |
| worker-0:544571:544795 [0] NCCL INFO Channel 02/16 : 0 7 6 5 4 3 11 12 13 14 15 8 9 10 2 1 | |
| worker-0:544572:544805 [1] NCCL INFO NVLS Head 1: 1 9 | |
| worker-0:544571:544795 [0] NCCL INFO Channel 03/16 : 0 1 2 10 9 8 15 14 13 12 11 3 4 5 6 7 | |
| worker-0:544572:544805 [1] NCCL INFO NVLS Head 2: 2 10 | |
| worker-0:544572:544805 [1] NCCL INFO NVLS Head 3: 3 11 | |
| worker-0:544571:544795 [0] NCCL INFO Channel 04/16 : 0 7 6 5 13 14 15 8 9 10 11 12 4 3 2 1 | |
| worker-0:544572:544805 [1] NCCL INFO NVLS Head 4: 4 12 | |
| worker-0:544571:544795 [0] NCCL INFO Channel 05/16 : 0 1 2 3 4 12 11 10 9 8 15 14 13 5 6 7 | |
| worker-0:544572:544805 [1] NCCL INFO NVLS Head 5: 5 13 | |
| worker-0:544571:544795 [0] NCCL INFO Channel 06/16 : 0 7 15 8 9 10 11 12 13 14 6 5 4 3 2 1 | |
| worker-0:544572:544805 [1] NCCL INFO NVLS Head 6: 6 14 | |
| worker-0:544572:544805 [1] NCCL INFO NVLS Head 7: 7 15 | |
| worker-0:544571:544795 [0] NCCL INFO Channel 07/16 : 0 1 2 3 4 5 6 14 13 12 11 10 9 8 15 7 | |
| worker-0:544571:544795 [0] NCCL INFO Channel 08/16 : 0 7 6 5 4 3 2 1 9 10 11 12 13 14 15 8 | |
| worker-0:544571:544795 [0] NCCL INFO Channel 09/16 : 0 8 15 14 13 12 11 10 9 1 2 3 4 5 6 7 | |
| worker-0:544571:544795 [0] NCCL INFO Channel 10/16 : 0 7 6 5 4 3 11 12 13 14 15 8 9 10 2 1 | |
| worker-0:544571:544795 [0] NCCL INFO Channel 11/16 : 0 1 2 10 9 8 15 14 13 12 11 3 4 5 6 7 | |
| worker-0:544571:544795 [0] NCCL INFO Channel 12/16 : 0 7 6 5 13 14 15 8 9 10 11 12 4 3 2 1 | |
| worker-0:544571:544795 [0] NCCL INFO Channel 13/16 : 0 1 2 3 4 12 11 10 9 8 15 14 13 5 6 7 | |
| worker-0:544571:544795 [0] NCCL INFO Channel 14/16 : 0 7 15 8 9 10 11 12 13 14 6 5 4 3 2 1 | |
| worker-0:544572:544805 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/9/-1->1->-1 [2] -1/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->9 [10] -1/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 | |
| worker-0:544571:544795 [0] NCCL INFO Channel 15/16 : 0 1 2 3 4 5 6 14 13 12 11 10 9 8 15 7 | |
| worker-0:544572:544805 [1] NCCL INFO P2P Chunksize set to 131072 | |
| worker-0:544571:544795 [0] NCCL INFO Trees [0] 1/8/-1->0->-1 [1] -1/-1/-1->0->7 [2] 1/-1/-1->0->7 [3] 1/-1/-1->0->7 [4] 1/-1/-1->0->7 [5] 1/-1/-1->0->7 [6] 1/-1/-1->0->7 [7] 1/-1/-1->0->7 [8] 1/-1/-1->0->8 [9] -1/-1/-1->0->7 [10] 1/-1/-1->0->7 [11] 1/-1/-1->0->7 [12] 1/-1/-1->0->7 [13] 1/-1/-1->0->7 [14] 1/-1/-1->0->7 [15] 1/-1/-1->0->7 | |
| worker-0:544571:544795 [0] NCCL INFO P2P Chunksize set to 131072 | |
| worker-0:544571:544795 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:544795 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:544795 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:544795 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:544795 [0] NCCL INFO Channel 13/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 13/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:544795 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544574:544799 [3] NCCL INFO Channel 01/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544572:544805 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544572:544805 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544572:544805 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544572:544805 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544572:544805 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544572:544805 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544572:544805 [1] NCCL INFO Channel 13/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544572:544805 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:544795 [0] NCCL INFO Channel 00/0 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:544795 [0] NCCL INFO Channel 02/0 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:544795 [0] NCCL INFO Channel 04/0 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544572:544805 [1] NCCL INFO Channel 01/0 : 9[1] -> 1[1] [receive] via NET/IB/5/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 09/0 : 9[1] -> 1[1] [receive] via NET/IB/5/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 00/0 : 1[1] -> 9[1] [send] via NET/IB/5/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 08/0 : 1[1] -> 9[1] [send] via NET/IB/5/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 06/0 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544441:544777 [2] NCCL INFO Channel 00/0 : 10[2] -> 11[3] via P2P/CUMEM | |
| worker-1:544441:544777 [2] NCCL INFO Channel 04/0 : 10[2] -> 11[3] via P2P/CUMEM | |
| worker-1:544441:544777 [2] NCCL INFO Channel 06/0 : 10[2] -> 11[3] via P2P/CUMEM | |
| worker-1:544441:544777 [2] NCCL INFO Channel 08/0 : 10[2] -> 11[3] via P2P/CUMEM | |
| worker-0:544571:544795 [0] NCCL INFO Channel 08/0 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544441:544777 [2] NCCL INFO Channel 12/0 : 10[2] -> 11[3] via P2P/CUMEM | |
| worker-0:544571:544795 [0] NCCL INFO Channel 10/0 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:544795 [0] NCCL INFO Channel 12/0 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544441:544777 [2] NCCL INFO Channel 14/0 : 10[2] -> 11[3] via P2P/CUMEM | |
| worker-0:544571:544795 [0] NCCL INFO Channel 14/0 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544574:544799 [3] NCCL INFO Channel 03/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544577:544803 [6] NCCL INFO Channel 01/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544577:544803 [6] NCCL INFO Channel 03/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544574:544799 [3] NCCL INFO Channel 05/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544577:544803 [6] NCCL INFO Channel 05/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544574:544799 [3] NCCL INFO Channel 07/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 00/0 : 11[3] -> 12[4] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 02/0 : 11[3] -> 12[4] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 04/0 : 11[3] -> 12[4] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 06/0 : 11[3] -> 12[4] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 08/0 : 11[3] -> 12[4] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 10/0 : 11[3] -> 12[4] via P2P/CUMEM | |
| worker-0:544577:544803 [6] NCCL INFO Channel 09/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 12/0 : 11[3] -> 12[4] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 14/0 : 11[3] -> 12[4] via P2P/CUMEM | |
| worker-1:544445:544781 [6] NCCL INFO Channel 00/0 : 14[6] -> 15[7] via P2P/CUMEM | |
| worker-1:544445:544781 [6] NCCL INFO Channel 02/0 : 14[6] -> 15[7] via P2P/CUMEM | |
| worker-1:544445:544781 [6] NCCL INFO Channel 04/0 : 14[6] -> 15[7] via P2P/CUMEM | |
| worker-1:544445:544781 [6] NCCL INFO Channel 08/0 : 14[6] -> 15[7] via P2P/CUMEM | |
| worker-0:544574:544799 [3] NCCL INFO Channel 09/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544574:544799 [3] NCCL INFO Channel 11/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544574:544799 [3] NCCL INFO Channel 13/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544574:544799 [3] NCCL INFO Channel 15/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Channel 01/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Channel 03/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544577:544803 [6] NCCL INFO Channel 11/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544577:544803 [6] NCCL INFO Channel 13/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 02/0 : 10[2] -> 2[2] [receive] via NET/IB/6/GDRDMA | |
| worker-0:544573:544801 [2] NCCL INFO Channel 10/0 : 10[2] -> 2[2] [receive] via NET/IB/6/GDRDMA | |
| worker-0:544573:544801 [2] NCCL INFO Channel 03/0 : 2[2] -> 10[2] [send] via NET/IB/6/GDRDMA | |
| worker-0:544573:544801 [2] NCCL INFO Channel 11/0 : 2[2] -> 10[2] [send] via NET/IB/6/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 10/0 : 14[6] -> 15[7] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Channel 07/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544445:544781 [6] NCCL INFO Channel 12/0 : 14[6] -> 15[7] via P2P/CUMEM | |
| worker-1:544439:544778 [0] NCCL INFO Channel 02/0 : 8[0] -> 9[1] via P2P/CUMEM | |
| worker-1:544444:544780 [5] NCCL INFO Channel 00/0 : 13[5] -> 14[6] via P2P/CUMEM | |
| worker-1:544439:544778 [0] NCCL INFO Channel 04/0 : 8[0] -> 9[1] via P2P/CUMEM | |
| worker-1:544444:544780 [5] NCCL INFO Channel 02/0 : 13[5] -> 14[6] via P2P/CUMEM | |
| worker-1:544439:544778 [0] NCCL INFO Channel 06/0 : 8[0] -> 9[1] via P2P/CUMEM | |
| worker-1:544444:544780 [5] NCCL INFO Channel 04/0 : 13[5] -> 14[6] via P2P/CUMEM | |
| worker-1:544439:544778 [0] NCCL INFO Channel 10/0 : 8[0] -> 9[1] via P2P/CUMEM | |
| worker-1:544444:544780 [5] NCCL INFO Channel 06/0 : 13[5] -> 14[6] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Channel 09/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544439:544778 [0] NCCL INFO Channel 12/0 : 8[0] -> 9[1] via P2P/CUMEM | |
| worker-1:544444:544780 [5] NCCL INFO Channel 08/0 : 13[5] -> 14[6] via P2P/CUMEM | |
| worker-1:544439:544778 [0] NCCL INFO Channel 14/0 : 8[0] -> 9[1] via P2P/CUMEM | |
| worker-1:544444:544780 [5] NCCL INFO Channel 10/0 : 13[5] -> 14[6] via P2P/CUMEM | |
| worker-1:544444:544780 [5] NCCL INFO Channel 12/0 : 13[5] -> 14[6] via P2P/CUMEM | |
| worker-1:544444:544780 [5] NCCL INFO Channel 14/0 : 13[5] -> 14[6] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Channel 11/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544445:544781 [6] NCCL INFO Channel 07/0 : 6[6] -> 14[6] [receive] via NET/IB/2/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 15/0 : 6[6] -> 14[6] [receive] via NET/IB/2/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 06/0 : 14[6] -> 6[6] [send] via NET/IB/2/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 14/0 : 14[6] -> 6[6] [send] via NET/IB/2/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 15/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 00/0 : 9[1] -> 10[2] via P2P/CUMEM | |
| worker-0:544574:544799 [3] NCCL INFO Channel 03/0 : 11[3] -> 3[3] [receive] via NET/IB/7/GDRDMA | |
| worker-0:544574:544799 [3] NCCL INFO Channel 11/0 : 11[3] -> 3[3] [receive] via NET/IB/7/GDRDMA | |
| worker-0:544574:544799 [3] NCCL INFO Channel 02/0 : 3[3] -> 11[3] [send] via NET/IB/7/GDRDMA | |
| worker-0:544574:544799 [3] NCCL INFO Channel 10/0 : 3[3] -> 11[3] [send] via NET/IB/7/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 02/0 : 9[1] -> 10[2] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 04/0 : 9[1] -> 10[2] via P2P/CUMEM | |
| worker-0:544576:544800 [5] NCCL INFO Channel 01/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 06/0 : 9[1] -> 10[2] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 08/0 : 9[1] -> 10[2] via P2P/CUMEM | |
| worker-0:544576:544800 [5] NCCL INFO Channel 03/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544578:544804 [7] NCCL INFO Channel 07/0 : 15[7] -> 7[7] [receive] via NET/IB/3/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 15/0 : 15[7] -> 7[7] [receive] via NET/IB/3/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 06/0 : 7[7] -> 15[7] [send] via NET/IB/3/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 14/0 : 7[7] -> 15[7] [send] via NET/IB/3/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 00/0 : 8[0] -> 0[0] [receive] via NET/IB/4/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 08/0 : 8[0] -> 0[0] [receive] via NET/IB/4/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 01/0 : 0[0] -> 8[0] [send] via NET/IB/4/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 09/0 : 0[0] -> 8[0] [send] via NET/IB/4/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 05/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 10/0 : 9[1] -> 10[2] via P2P/CUMEM | |
| worker-0:544576:544800 [5] NCCL INFO Channel 07/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544576:544800 [5] NCCL INFO Channel 09/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 12/0 : 9[1] -> 10[2] via P2P/CUMEM | |
| worker-0:544576:544800 [5] NCCL INFO Channel 11/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544576:544800 [5] NCCL INFO Channel 13/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 14/0 : 9[1] -> 10[2] via P2P/CUMEM | |
| worker-1:544439:544778 [0] NCCL INFO Channel 01/0 : 8[0] -> 15[7] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 00/0 : 1[1] -> 9[1] [receive] via NET/IB/5/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 08/0 : 1[1] -> 9[1] [receive] via NET/IB/5/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 01/0 : 9[1] -> 1[1] [send] via NET/IB/5/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 09/0 : 9[1] -> 1[1] [send] via NET/IB/5/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 03/0 : 2[2] -> 10[2] [receive] via NET/IB/6/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 11/0 : 2[2] -> 10[2] [receive] via NET/IB/6/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 02/0 : 10[2] -> 2[2] [send] via NET/IB/6/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 10/0 : 10[2] -> 2[2] [send] via NET/IB/6/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 03/0 : 8[0] -> 15[7] via P2P/CUMEM | |
| worker-0:544576:544800 [5] NCCL INFO Channel 15/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-1:544439:544778 [0] NCCL INFO Channel 05/0 : 8[0] -> 15[7] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Channel 04/0 : 12[4] -> 4[4] [receive] via NET/IB/0/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 12/0 : 12[4] -> 4[4] [receive] via NET/IB/0/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 05/0 : 4[4] -> 12[4] [send] via NET/IB/0/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 05/0 : 13[5] -> 5[5] [receive] via NET/IB/1/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 13/0 : 4[4] -> 12[4] [send] via NET/IB/0/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 13/0 : 13[5] -> 5[5] [receive] via NET/IB/1/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 04/0 : 5[5] -> 13[5] [send] via NET/IB/1/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 12/0 : 5[5] -> 13[5] [send] via NET/IB/1/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 06/0 : 14[6] -> 6[6] [receive] via NET/IB/2/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 14/0 : 14[6] -> 6[6] [receive] via NET/IB/2/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 07/0 : 6[6] -> 14[6] [send] via NET/IB/2/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 15/0 : 6[6] -> 14[6] [send] via NET/IB/2/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 07/0 : 8[0] -> 15[7] via P2P/CUMEM | |
| worker-1:544439:544778 [0] NCCL INFO Channel 09/0 : 8[0] -> 15[7] via P2P/CUMEM | |
| worker-1:544439:544778 [0] NCCL INFO Channel 11/0 : 8[0] -> 15[7] via P2P/CUMEM | |
| worker-1:544443:544782 [4] NCCL INFO Channel 00/0 : 12[4] -> 13[5] via P2P/CUMEM | |
| worker-1:544439:544778 [0] NCCL INFO Channel 13/0 : 8[0] -> 15[7] via P2P/CUMEM | |
| worker-1:544443:544782 [4] NCCL INFO Channel 02/0 : 12[4] -> 13[5] via P2P/CUMEM | |
| worker-1:544439:544778 [0] NCCL INFO Channel 15/0 : 8[0] -> 15[7] via P2P/CUMEM | |
| worker-1:544441:544777 [2] NCCL INFO Channel 01/0 : 10[2] -> 9[1] via P2P/CUMEM | |
| worker-1:544443:544782 [4] NCCL INFO Channel 06/0 : 12[4] -> 13[5] via P2P/CUMEM | |
| worker-1:544441:544777 [2] NCCL INFO Channel 03/0 : 10[2] -> 9[1] via P2P/CUMEM | |
| worker-1:544443:544782 [4] NCCL INFO Channel 08/0 : 12[4] -> 13[5] via P2P/CUMEM | |
| worker-1:544441:544777 [2] NCCL INFO Channel 05/0 : 10[2] -> 9[1] via P2P/CUMEM | |
| worker-1:544443:544782 [4] NCCL INFO Channel 10/0 : 12[4] -> 13[5] via P2P/CUMEM | |
| worker-1:544439:544778 [0] NCCL INFO Channel 01/0 : 0[0] -> 8[0] [receive] via NET/IB/4/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 09/0 : 0[0] -> 8[0] [receive] via NET/IB/4/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 14/0 : 12[4] -> 13[5] via P2P/CUMEM | |
| worker-1:544441:544777 [2] NCCL INFO Channel 07/0 : 10[2] -> 9[1] via P2P/CUMEM | |
| worker-1:544439:544778 [0] NCCL INFO Channel 00/0 : 8[0] -> 0[0] [send] via NET/IB/4/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 08/0 : 8[0] -> 0[0] [send] via NET/IB/4/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 06/0 : 7[7] -> 15[7] [receive] via NET/IB/3/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 14/0 : 7[7] -> 15[7] [receive] via NET/IB/3/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 07/0 : 15[7] -> 7[7] [send] via NET/IB/3/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 15/0 : 15[7] -> 7[7] [send] via NET/IB/3/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 01/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Channel 00/0 : 15[7] -> 8[0] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 03/0 : 9[1] -> 8[0] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 05/0 : 9[1] -> 8[0] via P2P/CUMEM | |
| worker-1:544441:544777 [2] NCCL INFO Channel 09/0 : 10[2] -> 9[1] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 07/0 : 9[1] -> 8[0] via P2P/CUMEM | |
| worker-0:544578:544804 [7] NCCL INFO Channel 03/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 02/0 : 3[3] -> 11[3] [receive] via NET/IB/7/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 05/0 : 4[4] -> 12[4] [receive] via NET/IB/0/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 10/0 : 3[3] -> 11[3] [receive] via NET/IB/7/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 13/0 : 4[4] -> 12[4] [receive] via NET/IB/0/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 04/0 : 12[4] -> 4[4] [send] via NET/IB/0/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 03/0 : 11[3] -> 3[3] [send] via NET/IB/7/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 12/0 : 12[4] -> 4[4] [send] via NET/IB/0/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 11/0 : 11[3] -> 3[3] [send] via NET/IB/7/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 04/0 : 5[5] -> 13[5] [receive] via NET/IB/1/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 12/0 : 5[5] -> 13[5] [receive] via NET/IB/1/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 05/0 : 13[5] -> 5[5] [send] via NET/IB/1/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 13/0 : 13[5] -> 5[5] [send] via NET/IB/1/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 02/0 : 15[7] -> 8[0] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 11/0 : 9[1] -> 8[0] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Channel 04/0 : 15[7] -> 8[0] via P2P/CUMEM | |
| worker-1:544441:544777 [2] NCCL INFO Channel 11/0 : 10[2] -> 9[1] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 13/0 : 9[1] -> 8[0] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Channel 06/0 : 15[7] -> 8[0] via P2P/CUMEM | |
| worker-0:544572:544805 [1] NCCL INFO Channel 02/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544572:544805 [1] NCCL INFO Channel 04/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544577:544803 [6] NCCL INFO Channel 00/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544572:544805 [1] NCCL INFO Channel 06/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544577:544803 [6] NCCL INFO Channel 02/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544572:544805 [1] NCCL INFO Channel 10/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 15/0 : 9[1] -> 8[0] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Channel 08/0 : 15[7] -> 8[0] via P2P/CUMEM | |
| worker-1:544441:544777 [2] NCCL INFO Channel 13/0 : 10[2] -> 9[1] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Channel 10/0 : 15[7] -> 8[0] via P2P/CUMEM | |
| worker-1:544441:544777 [2] NCCL INFO Channel 15/0 : 10[2] -> 9[1] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Channel 12/0 : 15[7] -> 8[0] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Channel 14/0 : 15[7] -> 8[0] via P2P/CUMEM | |
| worker-0:544572:544805 [1] NCCL INFO Channel 12/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544577:544803 [6] NCCL INFO Channel 04/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544578:544804 [7] NCCL INFO Channel 05/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544572:544805 [1] NCCL INFO Channel 14/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544577:544803 [6] NCCL INFO Channel 06/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544578:544804 [7] NCCL INFO Channel 07/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544577:544803 [6] NCCL INFO Channel 08/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544578:544804 [7] NCCL INFO Channel 09/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544577:544803 [6] NCCL INFO Channel 10/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544578:544804 [7] NCCL INFO Channel 11/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544574:544799 [3] NCCL INFO Channel 00/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544577:544803 [6] NCCL INFO Channel 12/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544578:544804 [7] NCCL INFO Channel 13/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544574:544799 [3] NCCL INFO Channel 04/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544577:544803 [6] NCCL INFO Channel 14/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544578:544804 [7] NCCL INFO Channel 15/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544574:544799 [3] NCCL INFO Channel 06/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544576:544800 [5] NCCL INFO Channel 00/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-0:544574:544799 [3] NCCL INFO Channel 08/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544576:544800 [5] NCCL INFO Channel 02/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544443:544782 [4] NCCL INFO Channel 01/0 : 12[4] -> 11[3] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Channel 01/0 : 15[7] -> 14[6] via P2P/CUMEM | |
| worker-0:544574:544799 [3] NCCL INFO Channel 12/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544576:544800 [5] NCCL INFO Channel 06/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Channel 00/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 00/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-1:544443:544782 [4] NCCL INFO Channel 03/0 : 12[4] -> 11[3] via P2P/CUMEM | |
| worker-1:544443:544782 [4] NCCL INFO Channel 05/0 : 12[4] -> 11[3] via P2P/CUMEM | |
| worker-1:544443:544782 [4] NCCL INFO Channel 07/0 : 12[4] -> 11[3] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Channel 03/0 : 15[7] -> 14[6] via P2P/CUMEM | |
| worker-1:544443:544782 [4] NCCL INFO Channel 09/0 : 12[4] -> 11[3] via P2P/CUMEM | |
| worker-0:544576:544800 [5] NCCL INFO Channel 08/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-0:544574:544799 [3] NCCL INFO Channel 14/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Channel 02/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Channel 05/0 : 15[7] -> 14[6] via P2P/CUMEM | |
| worker-1:544443:544782 [4] NCCL INFO Channel 11/0 : 12[4] -> 11[3] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Channel 09/0 : 15[7] -> 14[6] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Channel 11/0 : 15[7] -> 14[6] via P2P/CUMEM | |
| worker-1:544445:544781 [6] NCCL INFO Channel 01/0 : 14[6] -> 13[5] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Channel 13/0 : 15[7] -> 14[6] via P2P/CUMEM | |
| worker-1:544445:544781 [6] NCCL INFO Channel 03/0 : 14[6] -> 13[5] via P2P/CUMEM | |
| worker-1:544444:544780 [5] NCCL INFO Channel 01/0 : 13[5] -> 12[4] via P2P/CUMEM | |
| worker-1:544445:544781 [6] NCCL INFO Channel 05/0 : 14[6] -> 13[5] via P2P/CUMEM | |
| worker-1:544444:544780 [5] NCCL INFO Channel 03/0 : 13[5] -> 12[4] via P2P/CUMEM | |
| worker-1:544445:544781 [6] NCCL INFO Channel 07/0 : 14[6] -> 13[5] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 01/0 : 11[3] -> 10[2] via P2P/CUMEM | |
| worker-1:544444:544780 [5] NCCL INFO Channel 07/0 : 13[5] -> 12[4] via P2P/CUMEM | |
| worker-1:544443:544782 [4] NCCL INFO Channel 13/0 : 12[4] -> 11[3] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 02/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544576:544800 [5] NCCL INFO Channel 10/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-0:544578:544804 [7] NCCL INFO Channel 00/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 04/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544578:544804 [7] NCCL INFO Channel 02/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 06/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Channel 04/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544578:544804 [7] NCCL INFO Channel 04/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 08/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Channel 06/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544578:544804 [7] NCCL INFO Channel 08/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 10/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Channel 08/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544578:544804 [7] NCCL INFO Channel 10/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 12/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Channel 10/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544578:544804 [7] NCCL INFO Channel 12/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 14/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Channel 12/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-1:544445:544781 [6] NCCL INFO Channel 09/0 : 14[6] -> 13[5] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Channel 14/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-1:544444:544780 [5] NCCL INFO Channel 09/0 : 13[5] -> 12[4] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 05/0 : 11[3] -> 10[2] via P2P/CUMEM | |
| worker-1:544445:544781 [6] NCCL INFO Channel 11/0 : 14[6] -> 13[5] via P2P/CUMEM | |
| worker-1:544444:544780 [5] NCCL INFO Channel 11/0 : 13[5] -> 12[4] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 07/0 : 11[3] -> 10[2] via P2P/CUMEM | |
| worker-1:544445:544781 [6] NCCL INFO Channel 13/0 : 14[6] -> 13[5] via P2P/CUMEM | |
| worker-1:544444:544780 [5] NCCL INFO Channel 15/0 : 13[5] -> 12[4] via P2P/CUMEM | |
| worker-1:544443:544782 [4] NCCL INFO Channel 15/0 : 12[4] -> 11[3] via P2P/CUMEM | |
| worker-0:544576:544800 [5] NCCL INFO Channel 14/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 09/0 : 11[3] -> 10[2] via P2P/CUMEM | |
| worker-1:544445:544781 [6] NCCL INFO Channel 15/0 : 14[6] -> 13[5] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 13/0 : 11[3] -> 10[2] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 15/0 : 11[3] -> 10[2] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Connected all rings | |
| worker-0:544574:544799 [3] NCCL INFO Connected all rings | |
| worker-1:544441:544777 [2] NCCL INFO Connected all rings | |
| worker-1:544442:544776 [3] NCCL INFO Connected all rings | |
| worker-0:544573:544801 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544574:544799 [3] NCCL INFO Channel 00/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544574:544799 [3] NCCL INFO Channel 02/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544574:544799 [3] NCCL INFO Channel 06/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544574:544799 [3] NCCL INFO Channel 08/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544574:544799 [3] NCCL INFO Channel 10/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 14/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544574:544799 [3] NCCL INFO Channel 14/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Connected all rings | |
| worker-0:544576:544800 [5] NCCL INFO Connected all rings | |
| worker-1:544443:544782 [4] NCCL INFO Connected all rings | |
| worker-1:544444:544780 [5] NCCL INFO Connected all rings | |
| worker-0:544577:544803 [6] NCCL INFO Connected all rings | |
| worker-0:544578:544804 [7] NCCL INFO Connected all rings | |
| worker-0:544571:544795 [0] NCCL INFO Connected all rings | |
| worker-0:544571:544795 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Channel 00/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544572:544805 [1] NCCL INFO Connected all rings | |
| worker-0:544575:544802 [4] NCCL INFO Channel 02/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544439:544778 [0] NCCL INFO Connected all rings | |
| worker-1:544445:544781 [6] NCCL INFO Connected all rings | |
| worker-1:544440:544779 [1] NCCL INFO Connected all rings | |
| worker-1:544439:544778 [0] NCCL INFO Channel 00/0 : 8[0] -> 9[1] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Connected all rings | |
| worker-0:544571:544795 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Channel 04/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:544795 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Channel 06/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544576:544800 [5] NCCL INFO Channel 00/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:544795 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Channel 08/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544576:544800 [5] NCCL INFO Channel 02/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:544795 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Channel 10/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544576:544800 [5] NCCL INFO Channel 04/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:544795 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Channel 12/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544577:544803 [6] NCCL INFO Channel 00/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544576:544800 [5] NCCL INFO Channel 08/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:544795 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Channel 14/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544577:544803 [6] NCCL INFO Channel 02/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544576:544800 [5] NCCL INFO Channel 10/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:544795 [0] NCCL INFO Channel 14/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544577:544803 [6] NCCL INFO Channel 04/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544574:544799 [3] NCCL INFO Channel 03/0 : 3[3] -> 11[3] [send] via NET/IB/7/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 12/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544574:544799 [3] NCCL INFO Channel 11/0 : 3[3] -> 11[3] [send] via NET/IB/7/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 06/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544572:544805 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544577:544803 [6] NCCL INFO Channel 08/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Channel 04/0 : 4[4] -> 12[4] [send] via NET/IB/0/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 12/0 : 4[4] -> 12[4] [send] via NET/IB/0/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544577:544803 [6] NCCL INFO Channel 10/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544572:544805 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544577:544803 [6] NCCL INFO Channel 12/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544572:544805 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544577:544803 [6] NCCL INFO Channel 14/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544572:544805 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544576:544800 [5] NCCL INFO Channel 05/0 : 5[5] -> 13[5] [send] via NET/IB/1/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 06/0 : 6[6] -> 14[6] [send] via NET/IB/2/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 13/0 : 5[5] -> 13[5] [send] via NET/IB/1/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 14/0 : 6[6] -> 14[6] [send] via NET/IB/2/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 14/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:544778 [0] NCCL INFO Channel 03/0 : 8[0] -> 9[1] via P2P/CUMEM | |
| worker-1:544439:544778 [0] NCCL INFO Channel 05/0 : 8[0] -> 9[1] via P2P/CUMEM | |
| worker-1:544441:544777 [2] NCCL INFO Channel 01/0 : 10[2] -> 11[3] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 01/0 : 11[3] -> 12[4] via P2P/CUMEM | |
| worker-1:544441:544777 [2] NCCL INFO Channel 02/0 : 10[2] -> 11[3] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 03/0 : 11[3] -> 12[4] via P2P/CUMEM | |
| worker-1:544439:544778 [0] NCCL INFO Channel 07/0 : 8[0] -> 9[1] via P2P/CUMEM | |
| worker-1:544441:544777 [2] NCCL INFO Channel 05/0 : 10[2] -> 11[3] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 05/0 : 11[3] -> 12[4] via P2P/CUMEM | |
| worker-1:544439:544778 [0] NCCL INFO Channel 08/0 : 8[0] -> 9[1] via P2P/CUMEM | |
| worker-1:544441:544777 [2] NCCL INFO Channel 07/0 : 10[2] -> 11[3] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 07/0 : 11[3] -> 12[4] via P2P/CUMEM | |
| worker-1:544439:544778 [0] NCCL INFO Channel 11/0 : 8[0] -> 9[1] via P2P/CUMEM | |
| worker-1:544441:544777 [2] NCCL INFO Channel 09/0 : 10[2] -> 11[3] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 09/0 : 11[3] -> 12[4] via P2P/CUMEM | |
| worker-1:544439:544778 [0] NCCL INFO Channel 13/0 : 8[0] -> 9[1] via P2P/CUMEM | |
| worker-1:544441:544777 [2] NCCL INFO Channel 10/0 : 10[2] -> 11[3] via P2P/CUMEM | |
| worker-1:544443:544782 [4] NCCL INFO Channel 01/0 : 12[4] -> 13[5] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 11/0 : 11[3] -> 12[4] via P2P/CUMEM | |
| worker-0:544571:544795 [0] NCCL INFO Channel 01/0 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:544778 [0] NCCL INFO Channel 15/0 : 8[0] -> 9[1] via P2P/CUMEM | |
| worker-0:544572:544805 [1] NCCL INFO Channel 01/0 : 1[1] -> 9[1] [send] via NET/IB/5/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 13/0 : 10[2] -> 11[3] via P2P/CUMEM | |
| worker-0:544572:544805 [1] NCCL INFO Channel 09/0 : 1[1] -> 9[1] [send] via NET/IB/5/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 03/0 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 02/0 : 2[2] -> 10[2] [send] via NET/IB/6/GDRDMA | |
| worker-0:544573:544801 [2] NCCL INFO Channel 10/0 : 2[2] -> 10[2] [send] via NET/IB/6/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 05/0 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:544795 [0] NCCL INFO Channel 07/0 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:544795 [0] NCCL INFO Channel 09/0 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:544795 [0] NCCL INFO Channel 11/0 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:544795 [0] NCCL INFO Channel 13/0 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:544795 [0] NCCL INFO Channel 15/0 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:544795 [0] NCCL INFO Channel 00/0 : 0[0] -> 8[0] [send] via NET/IB/4/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 07/0 : 7[7] -> 15[7] [send] via NET/IB/3/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 08/0 : 0[0] -> 8[0] [send] via NET/IB/4/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 15/0 : 7[7] -> 15[7] [send] via NET/IB/3/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 01/0 : 14[6] -> 15[7] via P2P/CUMEM | |
| worker-1:544443:544782 [4] NCCL INFO Channel 03/0 : 12[4] -> 13[5] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 13/0 : 11[3] -> 12[4] via P2P/CUMEM | |
| worker-1:544445:544781 [6] NCCL INFO Channel 03/0 : 14[6] -> 15[7] via P2P/CUMEM | |
| worker-1:544443:544782 [4] NCCL INFO Channel 04/0 : 12[4] -> 13[5] via P2P/CUMEM | |
| worker-1:544445:544781 [6] NCCL INFO Channel 05/0 : 14[6] -> 15[7] via P2P/CUMEM | |
| worker-1:544443:544782 [4] NCCL INFO Channel 07/0 : 12[4] -> 13[5] via P2P/CUMEM | |
| worker-1:544445:544781 [6] NCCL INFO Channel 06/0 : 14[6] -> 15[7] via P2P/CUMEM | |
| worker-1:544441:544777 [2] NCCL INFO Channel 15/0 : 10[2] -> 11[3] via P2P/CUMEM | |
| worker-1:544443:544782 [4] NCCL INFO Channel 09/0 : 12[4] -> 13[5] via P2P/CUMEM | |
| worker-1:544444:544780 [5] NCCL INFO Channel 01/0 : 13[5] -> 14[6] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 15/0 : 11[3] -> 12[4] via P2P/CUMEM | |
| worker-1:544443:544782 [4] NCCL INFO Channel 11/0 : 12[4] -> 13[5] via P2P/CUMEM | |
| worker-1:544444:544780 [5] NCCL INFO Channel 03/0 : 13[5] -> 14[6] via P2P/CUMEM | |
| worker-1:544443:544782 [4] NCCL INFO Channel 12/0 : 12[4] -> 13[5] via P2P/CUMEM | |
| worker-1:544444:544780 [5] NCCL INFO Channel 05/0 : 13[5] -> 14[6] via P2P/CUMEM | |
| worker-1:544443:544782 [4] NCCL INFO Channel 15/0 : 12[4] -> 13[5] via P2P/CUMEM | |
| worker-1:544445:544781 [6] NCCL INFO Channel 09/0 : 14[6] -> 15[7] via P2P/CUMEM | |
| worker-1:544444:544780 [5] NCCL INFO Channel 07/0 : 13[5] -> 14[6] via P2P/CUMEM | |
| worker-1:544445:544781 [6] NCCL INFO Channel 11/0 : 14[6] -> 15[7] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 03/0 : 3[3] -> 11[3] [receive] via NET/IB/7/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 11/0 : 3[3] -> 11[3] [receive] via NET/IB/7/GDRDMA | |
| worker-0:544574:544799 [3] NCCL INFO Channel 01/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544574:544799 [3] NCCL INFO Channel 02/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544574:544799 [3] NCCL INFO Channel 05/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544574:544799 [3] NCCL INFO Channel 07/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544574:544799 [3] NCCL INFO Channel 09/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544574:544799 [3] NCCL INFO Channel 10/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544574:544799 [3] NCCL INFO Channel 13/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544574:544799 [3] NCCL INFO Channel 15/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-1:544444:544780 [5] NCCL INFO Channel 09/0 : 13[5] -> 14[6] via P2P/CUMEM | |
| worker-1:544445:544781 [6] NCCL INFO Channel 13/0 : 14[6] -> 15[7] via P2P/CUMEM | |
| worker-1:544444:544780 [5] NCCL INFO Channel 11/0 : 13[5] -> 14[6] via P2P/CUMEM | |
| worker-1:544445:544781 [6] NCCL INFO Channel 14/0 : 14[6] -> 15[7] via P2P/CUMEM | |
| worker-1:544444:544780 [5] NCCL INFO Channel 13/0 : 13[5] -> 14[6] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 01/0 : 9[1] -> 10[2] via P2P/CUMEM | |
| worker-1:544444:544780 [5] NCCL INFO Channel 15/0 : 13[5] -> 14[6] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 03/0 : 9[1] -> 10[2] via P2P/CUMEM | |
| worker-1:544443:544782 [4] NCCL INFO Channel 04/0 : 4[4] -> 12[4] [receive] via NET/IB/0/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 06/0 : 6[6] -> 14[6] [receive] via NET/IB/2/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 05/0 : 5[5] -> 13[5] [receive] via NET/IB/1/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 14/0 : 6[6] -> 14[6] [receive] via NET/IB/2/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 12/0 : 4[4] -> 12[4] [receive] via NET/IB/0/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 13/0 : 5[5] -> 13[5] [receive] via NET/IB/1/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 01/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-0:544577:544803 [6] NCCL INFO Channel 01/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544576:544800 [5] NCCL INFO Channel 03/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-0:544577:544803 [6] NCCL INFO Channel 03/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544576:544800 [5] NCCL INFO Channel 04/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-0:544577:544803 [6] NCCL INFO Channel 05/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544576:544800 [5] NCCL INFO Channel 07/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-0:544577:544803 [6] NCCL INFO Channel 07/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544576:544800 [5] NCCL INFO Channel 09/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-0:544577:544803 [6] NCCL INFO Channel 09/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544576:544800 [5] NCCL INFO Channel 11/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-0:544577:544803 [6] NCCL INFO Channel 11/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544576:544800 [5] NCCL INFO Channel 12/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Channel 01/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544577:544803 [6] NCCL INFO Channel 13/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544576:544800 [5] NCCL INFO Channel 15/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Channel 03/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544577:544803 [6] NCCL INFO Channel 15/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Channel 05/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Channel 07/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Channel 09/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Channel 11/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Channel 13/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544575:544802 [4] NCCL INFO Channel 15/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 05/0 : 9[1] -> 10[2] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 07/0 : 9[1] -> 10[2] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 09/0 : 9[1] -> 10[2] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 00/0 : 11[3] -> 10[2] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 11/0 : 9[1] -> 10[2] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 02/0 : 11[3] -> 10[2] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 13/0 : 9[1] -> 10[2] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 04/0 : 11[3] -> 10[2] via P2P/CUMEM | |
| worker-1:544444:544780 [5] NCCL INFO Channel 00/0 : 13[5] -> 12[4] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 15/0 : 9[1] -> 10[2] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 06/0 : 11[3] -> 10[2] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 08/0 : 11[3] -> 10[2] via P2P/CUMEM | |
| worker-1:544444:544780 [5] NCCL INFO Channel 02/0 : 13[5] -> 12[4] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 10/0 : 11[3] -> 10[2] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 01/0 : 1[1] -> 9[1] [receive] via NET/IB/5/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 09/0 : 1[1] -> 9[1] [receive] via NET/IB/5/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 04/0 : 13[5] -> 12[4] via P2P/CUMEM | |
| worker-1:544442:544776 [3] NCCL INFO Channel 12/0 : 11[3] -> 10[2] via P2P/CUMEM | |
| worker-1:544444:544780 [5] NCCL INFO Channel 06/0 : 13[5] -> 12[4] via P2P/CUMEM | |
| worker-1:544445:544781 [6] NCCL INFO Channel 00/0 : 14[6] -> 13[5] via P2P/CUMEM | |
| worker-1:544443:544782 [4] NCCL INFO Channel 00/0 : 12[4] -> 11[3] via P2P/CUMEM | |
| worker-1:544439:544778 [0] NCCL INFO Channel 02/0 : 8[0] -> 15[7] via P2P/CUMEM | |
| worker-1:544441:544777 [2] NCCL INFO Channel 02/0 : 2[2] -> 10[2] [receive] via NET/IB/6/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 10/0 : 2[2] -> 10[2] [receive] via NET/IB/6/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 14/0 : 11[3] -> 10[2] via P2P/CUMEM | |
| worker-1:544444:544780 [5] NCCL INFO Channel 08/0 : 13[5] -> 12[4] via P2P/CUMEM | |
| worker-1:544445:544781 [6] NCCL INFO Channel 02/0 : 14[6] -> 13[5] via P2P/CUMEM | |
| worker-1:544444:544780 [5] NCCL INFO Channel 10/0 : 13[5] -> 12[4] via P2P/CUMEM | |
| worker-1:544443:544782 [4] NCCL INFO Channel 02/0 : 12[4] -> 11[3] via P2P/CUMEM | |
| worker-1:544439:544778 [0] NCCL INFO Channel 04/0 : 8[0] -> 15[7] via P2P/CUMEM | |
| worker-1:544445:544781 [6] NCCL INFO Channel 04/0 : 14[6] -> 13[5] via P2P/CUMEM | |
| worker-1:544443:544782 [4] NCCL INFO Channel 06/0 : 12[4] -> 11[3] via P2P/CUMEM | |
| worker-1:544444:544780 [5] NCCL INFO Channel 12/0 : 13[5] -> 12[4] via P2P/CUMEM | |
| worker-1:544439:544778 [0] NCCL INFO Channel 06/0 : 8[0] -> 15[7] via P2P/CUMEM | |
| worker-1:544445:544781 [6] NCCL INFO Channel 08/0 : 14[6] -> 13[5] via P2P/CUMEM | |
| worker-1:544443:544782 [4] NCCL INFO Channel 08/0 : 12[4] -> 11[3] via P2P/CUMEM | |
| worker-1:544444:544780 [5] NCCL INFO Channel 14/0 : 13[5] -> 12[4] via P2P/CUMEM | |
| worker-1:544439:544778 [0] NCCL INFO Channel 10/0 : 8[0] -> 15[7] via P2P/CUMEM | |
| worker-1:544445:544781 [6] NCCL INFO Channel 10/0 : 14[6] -> 13[5] via P2P/CUMEM | |
| worker-1:544443:544782 [4] NCCL INFO Channel 10/0 : 12[4] -> 11[3] via P2P/CUMEM | |
| worker-1:544439:544778 [0] NCCL INFO Channel 12/0 : 8[0] -> 15[7] via P2P/CUMEM | |
| worker-1:544445:544781 [6] NCCL INFO Channel 12/0 : 14[6] -> 13[5] via P2P/CUMEM | |
| worker-1:544443:544782 [4] NCCL INFO Channel 14/0 : 12[4] -> 11[3] via P2P/CUMEM | |
| worker-0:544572:544805 [1] NCCL INFO Channel 00/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544572:544805 [1] NCCL INFO Channel 03/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544572:544805 [1] NCCL INFO Channel 05/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-1:544439:544778 [0] NCCL INFO Channel 14/0 : 8[0] -> 15[7] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Channel 07/0 : 7[7] -> 15[7] [receive] via NET/IB/3/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 00/0 : 0[0] -> 8[0] [receive] via NET/IB/4/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 08/0 : 0[0] -> 8[0] [receive] via NET/IB/4/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 15/0 : 7[7] -> 15[7] [receive] via NET/IB/3/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 01/0 : 15[7] -> 8[0] via P2P/CUMEM | |
| worker-0:544578:544804 [7] NCCL INFO Channel 02/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 01/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544572:544805 [1] NCCL INFO Channel 07/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544578:544804 [7] NCCL INFO Channel 04/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544572:544805 [1] NCCL INFO Channel 08/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544578:544804 [7] NCCL INFO Channel 06/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 03/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Channel 03/0 : 15[7] -> 8[0] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Channel 05/0 : 15[7] -> 8[0] via P2P/CUMEM | |
| worker-1:544441:544777 [2] NCCL INFO Channel 00/0 : 10[2] -> 9[1] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Channel 07/0 : 15[7] -> 8[0] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 05/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544572:544805 [1] NCCL INFO Channel 11/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 07/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544572:544805 [1] NCCL INFO Channel 13/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 09/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544578:544804 [7] NCCL INFO Channel 10/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544572:544805 [1] NCCL INFO Channel 15/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544578:544804 [7] NCCL INFO Channel 12/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544578:544804 [7] NCCL INFO Channel 14/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 11/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-1:544441:544777 [2] NCCL INFO Channel 04/0 : 10[2] -> 9[1] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Channel 09/0 : 15[7] -> 8[0] via P2P/CUMEM | |
| worker-1:544441:544777 [2] NCCL INFO Channel 06/0 : 10[2] -> 9[1] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Channel 11/0 : 15[7] -> 8[0] via P2P/CUMEM | |
| worker-1:544441:544777 [2] NCCL INFO Channel 08/0 : 10[2] -> 9[1] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Channel 13/0 : 15[7] -> 8[0] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 13/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-1:544441:544777 [2] NCCL INFO Channel 12/0 : 10[2] -> 9[1] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Channel 15/0 : 15[7] -> 8[0] via P2P/CUMEM | |
| worker-0:544578:544804 [7] NCCL INFO Channel 01/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544573:544801 [2] NCCL INFO Channel 15/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544578:544804 [7] NCCL INFO Channel 03/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544578:544804 [7] NCCL INFO Channel 05/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544578:544804 [7] NCCL INFO Channel 06/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544578:544804 [7] NCCL INFO Channel 09/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 00/0 : 9[1] -> 8[0] via P2P/CUMEM | |
| worker-1:544441:544777 [2] NCCL INFO Channel 14/0 : 10[2] -> 9[1] via P2P/CUMEM | |
| worker-0:544578:544804 [7] NCCL INFO Channel 11/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Channel 00/0 : 15[7] -> 14[6] via P2P/CUMEM | |
| worker-0:544578:544804 [7] NCCL INFO Channel 13/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 02/0 : 9[1] -> 8[0] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Channel 02/0 : 15[7] -> 14[6] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 04/0 : 9[1] -> 8[0] via P2P/CUMEM | |
| worker-0:544578:544804 [7] NCCL INFO Channel 14/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 06/0 : 9[1] -> 8[0] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 08/0 : 9[1] -> 8[0] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Channel 04/0 : 15[7] -> 14[6] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Channel 06/0 : 15[7] -> 14[6] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 10/0 : 9[1] -> 8[0] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Channel 08/0 : 15[7] -> 14[6] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Channel 10/0 : 15[7] -> 14[6] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 12/0 : 9[1] -> 8[0] via P2P/CUMEM | |
| worker-1:544440:544779 [1] NCCL INFO Channel 14/0 : 9[1] -> 8[0] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Channel 12/0 : 15[7] -> 14[6] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Channel 14/0 : 15[7] -> 14[6] via P2P/CUMEM | |
| worker-1:544446:544783 [7] NCCL INFO Connected all trees | |
| worker-1:544439:544778 [0] NCCL INFO Connected all trees | |
| worker-0:544578:544804 [7] NCCL INFO Connected all trees | |
| worker-0:544571:544795 [0] NCCL INFO Connected all trees | |
| worker-0:544578:544804 [7] NCCL INFO NVLS comm 0x55a940ed73e0 headRank 7 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 100663296 nvlsTotalSize 805306368 | |
| worker-0:544571:544795 [0] NCCL INFO NVLS comm 0x55ec0bcf2090 headRank 0 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 100663296 nvlsTotalSize 805306368 | |
| worker-0:544572:544805 [1] NCCL INFO Connected all trees | |
| worker-1:544440:544779 [1] NCCL INFO Connected all trees | |
| worker-0:544572:544805 [1] NCCL INFO NVLS comm 0x55691b0c7fb0 headRank 1 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 100663296 nvlsTotalSize 805306368 | |
| worker-1:544439:544778 [0] NCCL INFO NVLS comm 0x55d1a2434c10 headRank 0 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 100663296 nvlsTotalSize 805306368 | |
| worker-1:544445:544781 [6] NCCL INFO Connected all trees | |
| worker-1:544444:544780 [5] NCCL INFO Connected all trees | |
| worker-1:544441:544777 [2] NCCL INFO Connected all trees | |
| worker-1:544443:544782 [4] NCCL INFO Connected all trees | |
| worker-1:544442:544776 [3] NCCL INFO Connected all trees | |
| worker-0:544573:544801 [2] NCCL INFO Connected all trees | |
| worker-0:544574:544799 [3] NCCL INFO Connected all trees | |
| worker-0:544575:544802 [4] NCCL INFO Connected all trees | |
| worker-0:544577:544803 [6] NCCL INFO Connected all trees | |
| worker-0:544576:544800 [5] NCCL INFO Connected all trees | |
| worker-0:544573:544801 [2] NCCL INFO NVLS comm 0x56405f55b420 headRank 2 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 100663296 nvlsTotalSize 805306368 | |
| worker-0:544574:544799 [3] NCCL INFO NVLS comm 0x5558c9e593a0 headRank 3 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 100663296 nvlsTotalSize 805306368 | |
| worker-0:544575:544802 [4] NCCL INFO NVLS comm 0x5571099efd80 headRank 4 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 100663296 nvlsTotalSize 805306368 | |
| worker-0:544577:544803 [6] NCCL INFO NVLS comm 0x559a407dbca0 headRank 6 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 100663296 nvlsTotalSize 805306368 | |
| worker-1:544446:544783 [7] NCCL INFO NVLS comm 0x561dd66d3a80 headRank 7 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 100663296 nvlsTotalSize 805306368 | |
| worker-1:544440:544779 [1] NCCL INFO NVLS comm 0x55a1402a7ca0 headRank 1 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 100663296 nvlsTotalSize 805306368 | |
| worker-1:544442:544776 [3] NCCL INFO NVLS comm 0x55c0f3540b00 headRank 3 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 100663296 nvlsTotalSize 805306368 | |
| worker-1:544443:544782 [4] NCCL INFO NVLS comm 0x5645e5c93110 headRank 4 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 100663296 nvlsTotalSize 805306368 | |
| worker-1:544444:544780 [5] NCCL INFO NVLS comm 0x55ca04bc7280 headRank 5 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 100663296 nvlsTotalSize 805306368 | |
| worker-0:544576:544800 [5] NCCL INFO NVLS comm 0x55c64f7ca4d0 headRank 5 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 100663296 nvlsTotalSize 805306368 | |
| worker-1:544441:544777 [2] NCCL INFO NVLS comm 0x560fbf6ae730 headRank 2 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 100663296 nvlsTotalSize 805306368 | |
| worker-1:544445:544781 [6] NCCL INFO NVLS comm 0x558c0bcb8dd0 headRank 6 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 100663296 nvlsTotalSize 805306368 | |
| worker-0:544577:544803 [6] NCCL INFO Channel 00/0 : 14[6] -> 6[6] [receive] via NET/IB/2/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 01/0 : 8[0] -> 0[0] [receive] via NET/IB/4/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 01/0 : 14[6] -> 6[6] [receive] via NET/IB/2/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 02/0 : 14[6] -> 6[6] [receive] via NET/IB/2/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 03/0 : 14[6] -> 6[6] [receive] via NET/IB/2/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 02/0 : 8[0] -> 0[0] [receive] via NET/IB/4/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 04/0 : 14[6] -> 6[6] [receive] via NET/IB/2/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 03/0 : 8[0] -> 0[0] [receive] via NET/IB/4/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 05/0 : 14[6] -> 6[6] [receive] via NET/IB/2/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 04/0 : 8[0] -> 0[0] [receive] via NET/IB/4/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 07/0 : 14[6] -> 6[6] [receive] via NET/IB/2/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 05/0 : 8[0] -> 0[0] [receive] via NET/IB/4/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 08/0 : 14[6] -> 6[6] [receive] via NET/IB/2/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 06/0 : 8[0] -> 0[0] [receive] via NET/IB/4/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 09/0 : 14[6] -> 6[6] [receive] via NET/IB/2/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 07/0 : 8[0] -> 0[0] [receive] via NET/IB/4/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 10/0 : 14[6] -> 6[6] [receive] via NET/IB/2/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 09/0 : 8[0] -> 0[0] [receive] via NET/IB/4/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 11/0 : 14[6] -> 6[6] [receive] via NET/IB/2/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 10/0 : 8[0] -> 0[0] [receive] via NET/IB/4/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 12/0 : 14[6] -> 6[6] [receive] via NET/IB/2/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 11/0 : 8[0] -> 0[0] [receive] via NET/IB/4/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 13/0 : 14[6] -> 6[6] [receive] via NET/IB/2/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 12/0 : 8[0] -> 0[0] [receive] via NET/IB/4/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 15/0 : 14[6] -> 6[6] [receive] via NET/IB/2/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 13/0 : 8[0] -> 0[0] [receive] via NET/IB/4/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 00/0 : 6[6] -> 14[6] [send] via NET/IB/2/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 14/0 : 8[0] -> 0[0] [receive] via NET/IB/4/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 01/0 : 6[6] -> 14[6] [send] via NET/IB/2/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 15/0 : 8[0] -> 0[0] [receive] via NET/IB/4/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 02/0 : 6[6] -> 14[6] [send] via NET/IB/2/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 02/0 : 0[0] -> 8[0] [send] via NET/IB/4/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 03/0 : 6[6] -> 14[6] [send] via NET/IB/2/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 03/0 : 0[0] -> 8[0] [send] via NET/IB/4/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 04/0 : 6[6] -> 14[6] [send] via NET/IB/2/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 04/0 : 0[0] -> 8[0] [send] via NET/IB/4/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 05/0 : 6[6] -> 14[6] [send] via NET/IB/2/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 05/0 : 0[0] -> 8[0] [send] via NET/IB/4/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 08/0 : 6[6] -> 14[6] [send] via NET/IB/2/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 06/0 : 0[0] -> 8[0] [send] via NET/IB/4/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 09/0 : 6[6] -> 14[6] [send] via NET/IB/2/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 07/0 : 0[0] -> 8[0] [send] via NET/IB/4/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 10/0 : 6[6] -> 14[6] [send] via NET/IB/2/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 10/0 : 0[0] -> 8[0] [send] via NET/IB/4/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 11/0 : 0[0] -> 8[0] [send] via NET/IB/4/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 11/0 : 6[6] -> 14[6] [send] via NET/IB/2/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 12/0 : 0[0] -> 8[0] [send] via NET/IB/4/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 12/0 : 6[6] -> 14[6] [send] via NET/IB/2/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 13/0 : 0[0] -> 8[0] [send] via NET/IB/4/GDRDMA | |
| worker-0:544577:544803 [6] NCCL INFO Channel 13/0 : 6[6] -> 14[6] [send] via NET/IB/2/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 14/0 : 0[0] -> 8[0] [send] via NET/IB/4/GDRDMA | |
| worker-0:544571:544795 [0] NCCL INFO Channel 15/0 : 0[0] -> 8[0] [send] via NET/IB/4/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 00/0 : 12[4] -> 4[4] [receive] via NET/IB/0/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 01/0 : 12[4] -> 4[4] [receive] via NET/IB/0/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 02/0 : 12[4] -> 4[4] [receive] via NET/IB/0/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 03/0 : 12[4] -> 4[4] [receive] via NET/IB/0/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 05/0 : 12[4] -> 4[4] [receive] via NET/IB/0/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 06/0 : 12[4] -> 4[4] [receive] via NET/IB/0/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 07/0 : 12[4] -> 4[4] [receive] via NET/IB/0/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 08/0 : 12[4] -> 4[4] [receive] via NET/IB/0/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 09/0 : 12[4] -> 4[4] [receive] via NET/IB/0/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 10/0 : 12[4] -> 4[4] [receive] via NET/IB/0/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 11/0 : 12[4] -> 4[4] [receive] via NET/IB/0/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 13/0 : 12[4] -> 4[4] [receive] via NET/IB/0/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 14/0 : 12[4] -> 4[4] [receive] via NET/IB/0/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 15/0 : 12[4] -> 4[4] [receive] via NET/IB/0/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 00/0 : 4[4] -> 12[4] [send] via NET/IB/0/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 01/0 : 4[4] -> 12[4] [send] via NET/IB/0/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 02/0 : 4[4] -> 12[4] [send] via NET/IB/0/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 03/0 : 4[4] -> 12[4] [send] via NET/IB/0/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 06/0 : 4[4] -> 12[4] [send] via NET/IB/0/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 07/0 : 4[4] -> 12[4] [send] via NET/IB/0/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 08/0 : 4[4] -> 12[4] [send] via NET/IB/0/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 09/0 : 4[4] -> 12[4] [send] via NET/IB/0/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 10/0 : 4[4] -> 12[4] [send] via NET/IB/0/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 11/0 : 4[4] -> 12[4] [send] via NET/IB/0/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 14/0 : 4[4] -> 12[4] [send] via NET/IB/0/GDRDMA | |
| worker-0:544575:544802 [4] NCCL INFO Channel 15/0 : 4[4] -> 12[4] [send] via NET/IB/0/GDRDMA | |
| worker-0:544573:544801 [2] NCCL INFO Channel 00/0 : 10[2] -> 2[2] [receive] via NET/IB/6/GDRDMA | |
| worker-0:544573:544801 [2] NCCL INFO Channel 01/0 : 10[2] -> 2[2] [receive] via NET/IB/6/GDRDMA | |
| worker-0:544573:544801 [2] NCCL INFO Channel 03/0 : 10[2] -> 2[2] [receive] via NET/IB/6/GDRDMA | |
| worker-0:544573:544801 [2] NCCL INFO Channel 04/0 : 10[2] -> 2[2] [receive] via NET/IB/6/GDRDMA | |
| worker-0:544573:544801 [2] NCCL INFO Channel 05/0 : 10[2] -> 2[2] [receive] via NET/IB/6/GDRDMA | |
| worker-0:544573:544801 [2] NCCL INFO Channel 06/0 : 10[2] -> 2[2] [receive] via NET/IB/6/GDRDMA | |
| worker-0:544573:544801 [2] NCCL INFO Channel 07/0 : 10[2] -> 2[2] [receive] via NET/IB/6/GDRDMA | |
| worker-0:544573:544801 [2] NCCL INFO Channel 08/0 : 10[2] -> 2[2] [receive] via NET/IB/6/GDRDMA | |
| worker-0:544573:544801 [2] NCCL INFO Channel 09/0 : 10[2] -> 2[2] [receive] via NET/IB/6/GDRDMA | |
| worker-0:544573:544801 [2] NCCL INFO Channel 11/0 : 10[2] -> 2[2] [receive] via NET/IB/6/GDRDMA | |
| worker-0:544573:544801 [2] NCCL INFO Channel 12/0 : 10[2] -> 2[2] [receive] via NET/IB/6/GDRDMA | |
| worker-0:544573:544801 [2] NCCL INFO Channel 13/0 : 10[2] -> 2[2] [receive] via NET/IB/6/GDRDMA | |
| worker-0:544573:544801 [2] NCCL INFO Channel 14/0 : 10[2] -> 2[2] [receive] via NET/IB/6/GDRDMA | |
| worker-0:544573:544801 [2] NCCL INFO Channel 15/0 : 10[2] -> 2[2] [receive] via NET/IB/6/GDRDMA | |
| worker-0:544573:544801 [2] NCCL INFO Channel 00/0 : 2[2] -> 10[2] [send] via NET/IB/6/GDRDMA | |
| worker-0:544573:544801 [2] NCCL INFO Channel 01/0 : 2[2] -> 10[2] [send] via NET/IB/6/GDRDMA | |
| worker-0:544573:544801 [2] NCCL INFO Channel 04/0 : 2[2] -> 10[2] [send] via NET/IB/6/GDRDMA | |
| worker-0:544573:544801 [2] NCCL INFO Channel 05/0 : 2[2] -> 10[2] [send] via NET/IB/6/GDRDMA | |
| worker-0:544573:544801 [2] NCCL INFO Channel 06/0 : 2[2] -> 10[2] [send] via NET/IB/6/GDRDMA | |
| worker-0:544573:544801 [2] NCCL INFO Channel 07/0 : 2[2] -> 10[2] [send] via NET/IB/6/GDRDMA | |
| worker-0:544573:544801 [2] NCCL INFO Channel 08/0 : 2[2] -> 10[2] [send] via NET/IB/6/GDRDMA | |
| worker-0:544573:544801 [2] NCCL INFO Channel 09/0 : 2[2] -> 10[2] [send] via NET/IB/6/GDRDMA | |
| worker-0:544573:544801 [2] NCCL INFO Channel 12/0 : 2[2] -> 10[2] [send] via NET/IB/6/GDRDMA | |
| worker-0:544573:544801 [2] NCCL INFO Channel 13/0 : 2[2] -> 10[2] [send] via NET/IB/6/GDRDMA | |
| worker-0:544573:544801 [2] NCCL INFO Channel 14/0 : 2[2] -> 10[2] [send] via NET/IB/6/GDRDMA | |
| worker-0:544573:544801 [2] NCCL INFO Channel 15/0 : 2[2] -> 10[2] [send] via NET/IB/6/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 00/0 : 9[1] -> 1[1] [receive] via NET/IB/5/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 02/0 : 9[1] -> 1[1] [receive] via NET/IB/5/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 03/0 : 9[1] -> 1[1] [receive] via NET/IB/5/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 04/0 : 9[1] -> 1[1] [receive] via NET/IB/5/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 05/0 : 9[1] -> 1[1] [receive] via NET/IB/5/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 06/0 : 9[1] -> 1[1] [receive] via NET/IB/5/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 07/0 : 9[1] -> 1[1] [receive] via NET/IB/5/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 08/0 : 9[1] -> 1[1] [receive] via NET/IB/5/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 10/0 : 9[1] -> 1[1] [receive] via NET/IB/5/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 11/0 : 9[1] -> 1[1] [receive] via NET/IB/5/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 12/0 : 9[1] -> 1[1] [receive] via NET/IB/5/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 13/0 : 9[1] -> 1[1] [receive] via NET/IB/5/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 14/0 : 9[1] -> 1[1] [receive] via NET/IB/5/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 15/0 : 9[1] -> 1[1] [receive] via NET/IB/5/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 02/0 : 1[1] -> 9[1] [send] via NET/IB/5/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 03/0 : 1[1] -> 9[1] [send] via NET/IB/5/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 04/0 : 1[1] -> 9[1] [send] via NET/IB/5/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 05/0 : 1[1] -> 9[1] [send] via NET/IB/5/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 06/0 : 1[1] -> 9[1] [send] via NET/IB/5/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 07/0 : 1[1] -> 9[1] [send] via NET/IB/5/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 10/0 : 1[1] -> 9[1] [send] via NET/IB/5/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 11/0 : 1[1] -> 9[1] [send] via NET/IB/5/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 12/0 : 1[1] -> 9[1] [send] via NET/IB/5/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 13/0 : 1[1] -> 9[1] [send] via NET/IB/5/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 14/0 : 1[1] -> 9[1] [send] via NET/IB/5/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Channel 15/0 : 1[1] -> 9[1] [send] via NET/IB/5/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 00/0 : 13[5] -> 5[5] [receive] via NET/IB/1/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 01/0 : 13[5] -> 5[5] [receive] via NET/IB/1/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 02/0 : 13[5] -> 5[5] [receive] via NET/IB/1/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 03/0 : 13[5] -> 5[5] [receive] via NET/IB/1/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 00/0 : 15[7] -> 7[7] [receive] via NET/IB/3/GDRDMA | |
| worker-0:544574:544799 [3] NCCL INFO Channel 00/0 : 11[3] -> 3[3] [receive] via NET/IB/7/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 04/0 : 13[5] -> 5[5] [receive] via NET/IB/1/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 01/0 : 15[7] -> 7[7] [receive] via NET/IB/3/GDRDMA | |
| worker-0:544574:544799 [3] NCCL INFO Channel 01/0 : 11[3] -> 3[3] [receive] via NET/IB/7/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 06/0 : 13[5] -> 5[5] [receive] via NET/IB/1/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 02/0 : 15[7] -> 7[7] [receive] via NET/IB/3/GDRDMA | |
| worker-0:544574:544799 [3] NCCL INFO Channel 02/0 : 11[3] -> 3[3] [receive] via NET/IB/7/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 07/0 : 13[5] -> 5[5] [receive] via NET/IB/1/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 03/0 : 15[7] -> 7[7] [receive] via NET/IB/3/GDRDMA | |
| worker-0:544574:544799 [3] NCCL INFO Channel 04/0 : 11[3] -> 3[3] [receive] via NET/IB/7/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 08/0 : 13[5] -> 5[5] [receive] via NET/IB/1/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 04/0 : 15[7] -> 7[7] [receive] via NET/IB/3/GDRDMA | |
| worker-0:544574:544799 [3] NCCL INFO Channel 05/0 : 11[3] -> 3[3] [receive] via NET/IB/7/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 09/0 : 13[5] -> 5[5] [receive] via NET/IB/1/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 05/0 : 15[7] -> 7[7] [receive] via NET/IB/3/GDRDMA | |
| worker-0:544574:544799 [3] NCCL INFO Channel 06/0 : 11[3] -> 3[3] [receive] via NET/IB/7/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 10/0 : 13[5] -> 5[5] [receive] via NET/IB/1/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 06/0 : 15[7] -> 7[7] [receive] via NET/IB/3/GDRDMA | |
| worker-0:544574:544799 [3] NCCL INFO Channel 07/0 : 11[3] -> 3[3] [receive] via NET/IB/7/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 11/0 : 13[5] -> 5[5] [receive] via NET/IB/1/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 08/0 : 15[7] -> 7[7] [receive] via NET/IB/3/GDRDMA | |
| worker-0:544574:544799 [3] NCCL INFO Channel 08/0 : 11[3] -> 3[3] [receive] via NET/IB/7/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 12/0 : 13[5] -> 5[5] [receive] via NET/IB/1/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 09/0 : 15[7] -> 7[7] [receive] via NET/IB/3/GDRDMA | |
| worker-0:544574:544799 [3] NCCL INFO Channel 09/0 : 11[3] -> 3[3] [receive] via NET/IB/7/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 10/0 : 15[7] -> 7[7] [receive] via NET/IB/3/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 14/0 : 13[5] -> 5[5] [receive] via NET/IB/1/GDRDMA | |
| worker-0:544574:544799 [3] NCCL INFO Channel 10/0 : 11[3] -> 3[3] [receive] via NET/IB/7/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 11/0 : 15[7] -> 7[7] [receive] via NET/IB/3/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 15/0 : 13[5] -> 5[5] [receive] via NET/IB/1/GDRDMA | |
| worker-0:544574:544799 [3] NCCL INFO Channel 12/0 : 11[3] -> 3[3] [receive] via NET/IB/7/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 00/0 : 5[5] -> 13[5] [send] via NET/IB/1/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 12/0 : 15[7] -> 7[7] [receive] via NET/IB/3/GDRDMA | |
| worker-0:544574:544799 [3] NCCL INFO Channel 13/0 : 11[3] -> 3[3] [receive] via NET/IB/7/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 01/0 : 5[5] -> 13[5] [send] via NET/IB/1/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 13/0 : 15[7] -> 7[7] [receive] via NET/IB/3/GDRDMA | |
| worker-0:544574:544799 [3] NCCL INFO Channel 14/0 : 11[3] -> 3[3] [receive] via NET/IB/7/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 02/0 : 5[5] -> 13[5] [send] via NET/IB/1/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 14/0 : 15[7] -> 7[7] [receive] via NET/IB/3/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 03/0 : 5[5] -> 13[5] [send] via NET/IB/1/GDRDMA | |
| worker-0:544574:544799 [3] NCCL INFO Channel 15/0 : 11[3] -> 3[3] [receive] via NET/IB/7/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 00/0 : 7[7] -> 15[7] [send] via NET/IB/3/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 06/0 : 5[5] -> 13[5] [send] via NET/IB/1/GDRDMA | |
| worker-0:544574:544799 [3] NCCL INFO Channel 00/0 : 3[3] -> 11[3] [send] via NET/IB/7/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 01/0 : 7[7] -> 15[7] [send] via NET/IB/3/GDRDMA | |
| worker-0:544574:544799 [3] NCCL INFO Channel 01/0 : 3[3] -> 11[3] [send] via NET/IB/7/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 07/0 : 5[5] -> 13[5] [send] via NET/IB/1/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 02/0 : 7[7] -> 15[7] [send] via NET/IB/3/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 08/0 : 5[5] -> 13[5] [send] via NET/IB/1/GDRDMA | |
| worker-0:544574:544799 [3] NCCL INFO Channel 04/0 : 3[3] -> 11[3] [send] via NET/IB/7/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 03/0 : 7[7] -> 15[7] [send] via NET/IB/3/GDRDMA | |
| worker-0:544574:544799 [3] NCCL INFO Channel 05/0 : 3[3] -> 11[3] [send] via NET/IB/7/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 09/0 : 5[5] -> 13[5] [send] via NET/IB/1/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 04/0 : 7[7] -> 15[7] [send] via NET/IB/3/GDRDMA | |
| worker-0:544574:544799 [3] NCCL INFO Channel 06/0 : 3[3] -> 11[3] [send] via NET/IB/7/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 10/0 : 5[5] -> 13[5] [send] via NET/IB/1/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 05/0 : 7[7] -> 15[7] [send] via NET/IB/3/GDRDMA | |
| worker-0:544574:544799 [3] NCCL INFO Channel 07/0 : 3[3] -> 11[3] [send] via NET/IB/7/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 11/0 : 5[5] -> 13[5] [send] via NET/IB/1/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 08/0 : 7[7] -> 15[7] [send] via NET/IB/3/GDRDMA | |
| worker-0:544574:544799 [3] NCCL INFO Channel 08/0 : 3[3] -> 11[3] [send] via NET/IB/7/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 14/0 : 5[5] -> 13[5] [send] via NET/IB/1/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 09/0 : 7[7] -> 15[7] [send] via NET/IB/3/GDRDMA | |
| worker-0:544574:544799 [3] NCCL INFO Channel 09/0 : 3[3] -> 11[3] [send] via NET/IB/7/GDRDMA | |
| worker-0:544576:544800 [5] NCCL INFO Channel 15/0 : 5[5] -> 13[5] [send] via NET/IB/1/GDRDMA | |
| worker-0:544574:544799 [3] NCCL INFO Channel 12/0 : 3[3] -> 11[3] [send] via NET/IB/7/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 10/0 : 7[7] -> 15[7] [send] via NET/IB/3/GDRDMA | |
| worker-0:544574:544799 [3] NCCL INFO Channel 13/0 : 3[3] -> 11[3] [send] via NET/IB/7/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 11/0 : 7[7] -> 15[7] [send] via NET/IB/3/GDRDMA | |
| worker-0:544574:544799 [3] NCCL INFO Channel 14/0 : 3[3] -> 11[3] [send] via NET/IB/7/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 12/0 : 7[7] -> 15[7] [send] via NET/IB/3/GDRDMA | |
| worker-0:544574:544799 [3] NCCL INFO Channel 15/0 : 3[3] -> 11[3] [send] via NET/IB/7/GDRDMA | |
| worker-0:544578:544804 [7] NCCL INFO Channel 13/0 : 7[7] -> 15[7] [send] via NET/IB/3/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 00/0 : 2[2] -> 10[2] [receive] via NET/IB/6/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 01/0 : 2[2] -> 10[2] [receive] via NET/IB/6/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 02/0 : 1[1] -> 9[1] [receive] via NET/IB/5/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 00/0 : 5[5] -> 13[5] [receive] via NET/IB/1/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 04/0 : 2[2] -> 10[2] [receive] via NET/IB/6/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 00/0 : 3[3] -> 11[3] [receive] via NET/IB/7/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 00/0 : 6[6] -> 14[6] [receive] via NET/IB/2/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 03/0 : 1[1] -> 9[1] [receive] via NET/IB/5/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 02/0 : 0[0] -> 8[0] [receive] via NET/IB/4/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 01/0 : 5[5] -> 13[5] [receive] via NET/IB/1/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 05/0 : 2[2] -> 10[2] [receive] via NET/IB/6/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 00/0 : 4[4] -> 12[4] [receive] via NET/IB/0/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 01/0 : 6[6] -> 14[6] [receive] via NET/IB/2/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 02/0 : 5[5] -> 13[5] [receive] via NET/IB/1/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 04/0 : 1[1] -> 9[1] [receive] via NET/IB/5/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 06/0 : 2[2] -> 10[2] [receive] via NET/IB/6/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 03/0 : 0[0] -> 8[0] [receive] via NET/IB/4/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 00/0 : 7[7] -> 15[7] [receive] via NET/IB/3/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 01/0 : 4[4] -> 12[4] [receive] via NET/IB/0/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 02/0 : 6[6] -> 14[6] [receive] via NET/IB/2/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 03/0 : 5[5] -> 13[5] [receive] via NET/IB/1/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 07/0 : 2[2] -> 10[2] [receive] via NET/IB/6/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 05/0 : 1[1] -> 9[1] [receive] via NET/IB/5/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 01/0 : 7[7] -> 15[7] [receive] via NET/IB/3/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 04/0 : 0[0] -> 8[0] [receive] via NET/IB/4/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 03/0 : 6[6] -> 14[6] [receive] via NET/IB/2/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 06/0 : 5[5] -> 13[5] [receive] via NET/IB/1/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 08/0 : 2[2] -> 10[2] [receive] via NET/IB/6/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 02/0 : 4[4] -> 12[4] [receive] via NET/IB/0/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 06/0 : 1[1] -> 9[1] [receive] via NET/IB/5/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 02/0 : 7[7] -> 15[7] [receive] via NET/IB/3/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 01/0 : 3[3] -> 11[3] [receive] via NET/IB/7/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 05/0 : 0[0] -> 8[0] [receive] via NET/IB/4/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 04/0 : 6[6] -> 14[6] [receive] via NET/IB/2/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 07/0 : 5[5] -> 13[5] [receive] via NET/IB/1/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 09/0 : 2[2] -> 10[2] [receive] via NET/IB/6/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 03/0 : 4[4] -> 12[4] [receive] via NET/IB/0/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 03/0 : 7[7] -> 15[7] [receive] via NET/IB/3/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 07/0 : 1[1] -> 9[1] [receive] via NET/IB/5/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 05/0 : 6[6] -> 14[6] [receive] via NET/IB/2/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 08/0 : 5[5] -> 13[5] [receive] via NET/IB/1/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 06/0 : 0[0] -> 8[0] [receive] via NET/IB/4/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 04/0 : 3[3] -> 11[3] [receive] via NET/IB/7/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 12/0 : 2[2] -> 10[2] [receive] via NET/IB/6/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 06/0 : 4[4] -> 12[4] [receive] via NET/IB/0/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 04/0 : 7[7] -> 15[7] [receive] via NET/IB/3/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 09/0 : 5[5] -> 13[5] [receive] via NET/IB/1/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 08/0 : 6[6] -> 14[6] [receive] via NET/IB/2/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 10/0 : 1[1] -> 9[1] [receive] via NET/IB/5/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 05/0 : 3[3] -> 11[3] [receive] via NET/IB/7/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 13/0 : 2[2] -> 10[2] [receive] via NET/IB/6/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 07/0 : 0[0] -> 8[0] [receive] via NET/IB/4/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 05/0 : 7[7] -> 15[7] [receive] via NET/IB/3/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 07/0 : 4[4] -> 12[4] [receive] via NET/IB/0/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 10/0 : 5[5] -> 13[5] [receive] via NET/IB/1/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 09/0 : 6[6] -> 14[6] [receive] via NET/IB/2/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 14/0 : 2[2] -> 10[2] [receive] via NET/IB/6/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 11/0 : 1[1] -> 9[1] [receive] via NET/IB/5/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 06/0 : 3[3] -> 11[3] [receive] via NET/IB/7/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 10/0 : 0[0] -> 8[0] [receive] via NET/IB/4/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 08/0 : 7[7] -> 15[7] [receive] via NET/IB/3/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 08/0 : 4[4] -> 12[4] [receive] via NET/IB/0/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 10/0 : 6[6] -> 14[6] [receive] via NET/IB/2/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 11/0 : 5[5] -> 13[5] [receive] via NET/IB/1/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 15/0 : 2[2] -> 10[2] [receive] via NET/IB/6/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 07/0 : 3[3] -> 11[3] [receive] via NET/IB/7/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 12/0 : 1[1] -> 9[1] [receive] via NET/IB/5/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 09/0 : 7[7] -> 15[7] [receive] via NET/IB/3/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 09/0 : 4[4] -> 12[4] [receive] via NET/IB/0/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 11/0 : 0[0] -> 8[0] [receive] via NET/IB/4/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 11/0 : 6[6] -> 14[6] [receive] via NET/IB/2/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 00/0 : 10[2] -> 2[2] [send] via NET/IB/6/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 14/0 : 5[5] -> 13[5] [receive] via NET/IB/1/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 08/0 : 3[3] -> 11[3] [receive] via NET/IB/7/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 10/0 : 7[7] -> 15[7] [receive] via NET/IB/3/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 13/0 : 1[1] -> 9[1] [receive] via NET/IB/5/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 10/0 : 4[4] -> 12[4] [receive] via NET/IB/0/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 01/0 : 10[2] -> 2[2] [send] via NET/IB/6/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 12/0 : 6[6] -> 14[6] [receive] via NET/IB/2/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 12/0 : 0[0] -> 8[0] [receive] via NET/IB/4/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 15/0 : 5[5] -> 13[5] [receive] via NET/IB/1/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 09/0 : 3[3] -> 11[3] [receive] via NET/IB/7/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 11/0 : 7[7] -> 15[7] [receive] via NET/IB/3/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 03/0 : 10[2] -> 2[2] [send] via NET/IB/6/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 14/0 : 1[1] -> 9[1] [receive] via NET/IB/5/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 11/0 : 4[4] -> 12[4] [receive] via NET/IB/0/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 00/0 : 13[5] -> 5[5] [send] via NET/IB/1/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 13/0 : 6[6] -> 14[6] [receive] via NET/IB/2/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 13/0 : 0[0] -> 8[0] [receive] via NET/IB/4/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 04/0 : 10[2] -> 2[2] [send] via NET/IB/6/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 12/0 : 3[3] -> 11[3] [receive] via NET/IB/7/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 12/0 : 7[7] -> 15[7] [receive] via NET/IB/3/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 14/0 : 4[4] -> 12[4] [receive] via NET/IB/0/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 01/0 : 13[5] -> 5[5] [send] via NET/IB/1/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 15/0 : 1[1] -> 9[1] [receive] via NET/IB/5/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 00/0 : 14[6] -> 6[6] [send] via NET/IB/2/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 05/0 : 10[2] -> 2[2] [send] via NET/IB/6/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 13/0 : 3[3] -> 11[3] [receive] via NET/IB/7/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 13/0 : 7[7] -> 15[7] [receive] via NET/IB/3/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 14/0 : 0[0] -> 8[0] [receive] via NET/IB/4/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 02/0 : 13[5] -> 5[5] [send] via NET/IB/1/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 15/0 : 4[4] -> 12[4] [receive] via NET/IB/0/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 01/0 : 14[6] -> 6[6] [send] via NET/IB/2/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 00/0 : 9[1] -> 1[1] [send] via NET/IB/5/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 06/0 : 10[2] -> 2[2] [send] via NET/IB/6/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 00/0 : 15[7] -> 7[7] [send] via NET/IB/3/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 14/0 : 3[3] -> 11[3] [receive] via NET/IB/7/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 03/0 : 13[5] -> 5[5] [send] via NET/IB/1/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 02/0 : 14[6] -> 6[6] [send] via NET/IB/2/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 15/0 : 0[0] -> 8[0] [receive] via NET/IB/4/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 07/0 : 10[2] -> 2[2] [send] via NET/IB/6/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 02/0 : 9[1] -> 1[1] [send] via NET/IB/5/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 00/0 : 12[4] -> 4[4] [send] via NET/IB/0/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 01/0 : 15[7] -> 7[7] [send] via NET/IB/3/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 15/0 : 3[3] -> 11[3] [receive] via NET/IB/7/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 04/0 : 13[5] -> 5[5] [send] via NET/IB/1/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 03/0 : 14[6] -> 6[6] [send] via NET/IB/2/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 08/0 : 10[2] -> 2[2] [send] via NET/IB/6/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 01/0 : 8[0] -> 0[0] [send] via NET/IB/4/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 01/0 : 12[4] -> 4[4] [send] via NET/IB/0/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 03/0 : 9[1] -> 1[1] [send] via NET/IB/5/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 02/0 : 15[7] -> 7[7] [send] via NET/IB/3/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 00/0 : 11[3] -> 3[3] [send] via NET/IB/7/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 04/0 : 14[6] -> 6[6] [send] via NET/IB/2/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 09/0 : 10[2] -> 2[2] [send] via NET/IB/6/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 06/0 : 13[5] -> 5[5] [send] via NET/IB/1/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 02/0 : 12[4] -> 4[4] [send] via NET/IB/0/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 02/0 : 8[0] -> 0[0] [send] via NET/IB/4/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 04/0 : 9[1] -> 1[1] [send] via NET/IB/5/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 03/0 : 15[7] -> 7[7] [send] via NET/IB/3/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 01/0 : 11[3] -> 3[3] [send] via NET/IB/7/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 11/0 : 10[2] -> 2[2] [send] via NET/IB/6/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 07/0 : 13[5] -> 5[5] [send] via NET/IB/1/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 05/0 : 14[6] -> 6[6] [send] via NET/IB/2/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 03/0 : 12[4] -> 4[4] [send] via NET/IB/0/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 03/0 : 8[0] -> 0[0] [send] via NET/IB/4/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 05/0 : 9[1] -> 1[1] [send] via NET/IB/5/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 04/0 : 15[7] -> 7[7] [send] via NET/IB/3/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 02/0 : 11[3] -> 3[3] [send] via NET/IB/7/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 12/0 : 10[2] -> 2[2] [send] via NET/IB/6/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 08/0 : 13[5] -> 5[5] [send] via NET/IB/1/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 07/0 : 14[6] -> 6[6] [send] via NET/IB/2/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 05/0 : 12[4] -> 4[4] [send] via NET/IB/0/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 04/0 : 8[0] -> 0[0] [send] via NET/IB/4/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 05/0 : 15[7] -> 7[7] [send] via NET/IB/3/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 06/0 : 9[1] -> 1[1] [send] via NET/IB/5/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 13/0 : 10[2] -> 2[2] [send] via NET/IB/6/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 04/0 : 11[3] -> 3[3] [send] via NET/IB/7/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 09/0 : 13[5] -> 5[5] [send] via NET/IB/1/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 08/0 : 14[6] -> 6[6] [send] via NET/IB/2/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 06/0 : 12[4] -> 4[4] [send] via NET/IB/0/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 06/0 : 15[7] -> 7[7] [send] via NET/IB/3/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 14/0 : 10[2] -> 2[2] [send] via NET/IB/6/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 05/0 : 8[0] -> 0[0] [send] via NET/IB/4/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 07/0 : 9[1] -> 1[1] [send] via NET/IB/5/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 05/0 : 11[3] -> 3[3] [send] via NET/IB/7/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 10/0 : 13[5] -> 5[5] [send] via NET/IB/1/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 09/0 : 14[6] -> 6[6] [send] via NET/IB/2/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 07/0 : 12[4] -> 4[4] [send] via NET/IB/0/GDRDMA | |
| worker-1:544441:544777 [2] NCCL INFO Channel 15/0 : 10[2] -> 2[2] [send] via NET/IB/6/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 08/0 : 15[7] -> 7[7] [send] via NET/IB/3/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 06/0 : 11[3] -> 3[3] [send] via NET/IB/7/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 08/0 : 9[1] -> 1[1] [send] via NET/IB/5/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 06/0 : 8[0] -> 0[0] [send] via NET/IB/4/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 11/0 : 13[5] -> 5[5] [send] via NET/IB/1/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 10/0 : 14[6] -> 6[6] [send] via NET/IB/2/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 08/0 : 12[4] -> 4[4] [send] via NET/IB/0/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 09/0 : 15[7] -> 7[7] [send] via NET/IB/3/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 07/0 : 11[3] -> 3[3] [send] via NET/IB/7/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 10/0 : 9[1] -> 1[1] [send] via NET/IB/5/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 07/0 : 8[0] -> 0[0] [send] via NET/IB/4/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 12/0 : 13[5] -> 5[5] [send] via NET/IB/1/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 11/0 : 14[6] -> 6[6] [send] via NET/IB/2/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 09/0 : 12[4] -> 4[4] [send] via NET/IB/0/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 10/0 : 15[7] -> 7[7] [send] via NET/IB/3/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 08/0 : 11[3] -> 3[3] [send] via NET/IB/7/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 11/0 : 9[1] -> 1[1] [send] via NET/IB/5/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 14/0 : 13[5] -> 5[5] [send] via NET/IB/1/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 12/0 : 14[6] -> 6[6] [send] via NET/IB/2/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 09/0 : 8[0] -> 0[0] [send] via NET/IB/4/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 10/0 : 12[4] -> 4[4] [send] via NET/IB/0/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 11/0 : 15[7] -> 7[7] [send] via NET/IB/3/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 09/0 : 11[3] -> 3[3] [send] via NET/IB/7/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 12/0 : 9[1] -> 1[1] [send] via NET/IB/5/GDRDMA | |
| worker-1:544444:544780 [5] NCCL INFO Channel 15/0 : 13[5] -> 5[5] [send] via NET/IB/1/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 11/0 : 12[4] -> 4[4] [send] via NET/IB/0/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 13/0 : 14[6] -> 6[6] [send] via NET/IB/2/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 12/0 : 15[7] -> 7[7] [send] via NET/IB/3/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 10/0 : 8[0] -> 0[0] [send] via NET/IB/4/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 10/0 : 11[3] -> 3[3] [send] via NET/IB/7/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 13/0 : 9[1] -> 1[1] [send] via NET/IB/5/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 13/0 : 12[4] -> 4[4] [send] via NET/IB/0/GDRDMA | |
| worker-1:544445:544781 [6] NCCL INFO Channel 15/0 : 14[6] -> 6[6] [send] via NET/IB/2/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 13/0 : 15[7] -> 7[7] [send] via NET/IB/3/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 12/0 : 11[3] -> 3[3] [send] via NET/IB/7/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 11/0 : 8[0] -> 0[0] [send] via NET/IB/4/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 14/0 : 9[1] -> 1[1] [send] via NET/IB/5/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 14/0 : 12[4] -> 4[4] [send] via NET/IB/0/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 13/0 : 11[3] -> 3[3] [send] via NET/IB/7/GDRDMA | |
| worker-1:544446:544783 [7] NCCL INFO Channel 14/0 : 15[7] -> 7[7] [send] via NET/IB/3/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 12/0 : 8[0] -> 0[0] [send] via NET/IB/4/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 14/0 : 11[3] -> 3[3] [send] via NET/IB/7/GDRDMA | |
| worker-1:544440:544779 [1] NCCL INFO Channel 15/0 : 9[1] -> 1[1] [send] via NET/IB/5/GDRDMA | |
| worker-1:544443:544782 [4] NCCL INFO Channel 15/0 : 12[4] -> 4[4] [send] via NET/IB/0/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 13/0 : 8[0] -> 0[0] [send] via NET/IB/4/GDRDMA | |
| worker-1:544442:544776 [3] NCCL INFO Channel 15/0 : 11[3] -> 3[3] [send] via NET/IB/7/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 14/0 : 8[0] -> 0[0] [send] via NET/IB/4/GDRDMA | |
| worker-1:544439:544778 [0] NCCL INFO Channel 15/0 : 8[0] -> 0[0] [send] via NET/IB/4/GDRDMA | |
| worker-0:544572:544805 [1] NCCL INFO Connected NVLS tree | |
| worker-0:544572:544805 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 | |
| worker-0:544572:544805 [1] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer | |
| worker-1:544440:544779 [1] NCCL INFO Connected NVLS tree | |
| worker-1:544440:544779 [1] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 | |
| worker-1:544440:544779 [1] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer | |
| worker-0:544575:544802 [4] NCCL INFO Connected NVLS tree | |
| worker-0:544575:544802 [4] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 | |
| worker-0:544575:544802 [4] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer | |
| worker-1:544443:544782 [4] NCCL INFO Connected NVLS tree | |
| worker-1:544443:544782 [4] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 | |
| worker-1:544443:544782 [4] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer | |
| worker-0:544576:544800 [5] NCCL INFO Connected NVLS tree | |
| worker-0:544576:544800 [5] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 | |
| worker-0:544576:544800 [5] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer | |
| worker-0:544574:544799 [3] NCCL INFO Connected NVLS tree | |
| worker-0:544574:544799 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 | |
| worker-0:544574:544799 [3] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer | |
| worker-1:544444:544780 [5] NCCL INFO Connected NVLS tree | |
| worker-1:544444:544780 [5] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 | |
| worker-1:544444:544780 [5] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer | |
| worker-1:544442:544776 [3] NCCL INFO Connected NVLS tree | |
| worker-1:544442:544776 [3] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 | |
| worker-1:544442:544776 [3] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer | |
| worker-0:544577:544803 [6] NCCL INFO Connected NVLS tree | |
| worker-0:544577:544803 [6] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 | |
| worker-0:544577:544803 [6] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer | |
| worker-1:544445:544781 [6] NCCL INFO Connected NVLS tree | |
| worker-1:544445:544781 [6] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 | |
| worker-1:544445:544781 [6] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer | |
| worker-0:544578:544804 [7] NCCL INFO Connected NVLS tree | |
| worker-0:544578:544804 [7] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 | |
| worker-0:544578:544804 [7] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer | |
| worker-1:544446:544783 [7] NCCL INFO Connected NVLS tree | |
| worker-1:544446:544783 [7] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 | |
| worker-1:544446:544783 [7] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer | |
| worker-0:544571:544795 [0] NCCL INFO Connected NVLS tree | |
| worker-0:544571:544795 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 | |
| worker-0:544571:544795 [0] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer | |
| worker-1:544439:544778 [0] NCCL INFO Connected NVLS tree | |
| worker-1:544439:544778 [0] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 | |
| worker-1:544439:544778 [0] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer | |
| worker-0:544573:544801 [2] NCCL INFO Connected NVLS tree | |
| worker-0:544573:544801 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 | |
| worker-0:544573:544801 [2] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer | |
| worker-1:544441:544777 [2] NCCL INFO Connected NVLS tree | |
| worker-1:544441:544777 [2] NCCL INFO threadThresholds 8/8/64 | 128/8/64 | 512 | 512 | |
| worker-1:544441:544777 [2] NCCL INFO 16 coll channels, 16 collnet channels, 16 nvls channels, 16 p2p channels, 2 p2p channels per peer | |
| worker-0:544578:544804 [7] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so | |
| worker-0:544578:544804 [7] NCCL INFO TUNER/Plugin: Using internal tuner plugin. | |
| worker-0:544578:544804 [7] NCCL INFO ncclCommInitRank comm 0x55a940ed73e0 rank 7 nranks 16 cudaDev 7 nvmlDev 7 busId b7000 commId 0x9d8e2a3242367e4a - Init COMPLETE | |
| worker-0:544571:544795 [0] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so | |
| worker-0:544571:544795 [0] NCCL INFO TUNER/Plugin: Using internal tuner plugin. | |
| worker-0:544571:544795 [0] NCCL INFO ncclCommInitRank comm 0x55ec0bcf2090 rank 0 nranks 16 cudaDev 0 nvmlDev 0 busId 8d000 commId 0x9d8e2a3242367e4a - Init COMPLETE | |
| worker-1:544441:544777 [2] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so | |
| worker-1:544441:544777 [2] NCCL INFO TUNER/Plugin: Using internal tuner plugin. | |
| worker-1:544441:544777 [2] NCCL INFO ncclCommInitRank comm 0x560fbf6ae730 rank 10 nranks 16 cudaDev 2 nvmlDev 2 busId 95000 commId 0x9d8e2a3242367e4a - Init COMPLETE | |
| worker-1:544444:544780 [5] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so | |
| worker-1:544444:544780 [5] NCCL INFO TUNER/Plugin: Using internal tuner plugin. | |
| worker-1:544444:544780 [5] NCCL INFO ncclCommInitRank comm 0x55ca04bc7280 rank 13 nranks 16 cudaDev 5 nvmlDev 5 busId af000 commId 0x9d8e2a3242367e4a - Init COMPLETE | |
| worker-1:544445:544781 [6] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so | |
| worker-1:544445:544781 [6] NCCL INFO TUNER/Plugin: Using internal tuner plugin. | |
| worker-1:544445:544781 [6] NCCL INFO ncclCommInitRank comm 0x558c0bcb8dd0 rank 14 nranks 16 cudaDev 6 nvmlDev 6 busId b3000 commId 0x9d8e2a3242367e4a - Init COMPLETE | |
| worker-1:544439:544778 [0] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so | |
| worker-1:544439:544778 [0] NCCL INFO TUNER/Plugin: Using internal tuner plugin. | |
| worker-1:544439:544778 [0] NCCL INFO ncclCommInitRank comm 0x55d1a2434c10 rank 8 nranks 16 cudaDev 0 nvmlDev 0 busId 8d000 commId 0x9d8e2a3242367e4a - Init COMPLETE | |
| worker-0:544575:544802 [4] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so | |
| worker-0:544575:544802 [4] NCCL INFO TUNER/Plugin: Using internal tuner plugin. | |
| worker-0:544575:544802 [4] NCCL INFO ncclCommInitRank comm 0x5571099efd80 rank 4 nranks 16 cudaDev 4 nvmlDev 4 busId ab000 commId 0x9d8e2a3242367e4a - Init COMPLETE | |
| worker-1:544442:544776 [3] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so | |
| worker-1:544442:544776 [3] NCCL INFO TUNER/Plugin: Using internal tuner plugin. | |
| worker-1:544442:544776 [3] NCCL INFO ncclCommInitRank comm 0x55c0f3540b00 rank 11 nranks 16 cudaDev 3 nvmlDev 3 busId 99000 commId 0x9d8e2a3242367e4a - Init COMPLETE | |
| worker-1:544440:544779 [1] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so | |
| worker-1:544440:544779 [1] NCCL INFO TUNER/Plugin: Using internal tuner plugin. | |
| worker-1:544440:544779 [1] NCCL INFO ncclCommInitRank comm 0x55a1402a7ca0 rank 9 nranks 16 cudaDev 1 nvmlDev 1 busId 91000 commId 0x9d8e2a3242367e4a - Init COMPLETE | |
| worker-0:544572:544805 [1] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so | |
| worker-0:544572:544805 [1] NCCL INFO TUNER/Plugin: Using internal tuner plugin. | |
| worker-0:544572:544805 [1] NCCL INFO ncclCommInitRank comm 0x55691b0c7fb0 rank 1 nranks 16 cudaDev 1 nvmlDev 1 busId 91000 commId 0x9d8e2a3242367e4a - Init COMPLETE | |
| worker-0:544577:544803 [6] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so | |
| worker-0:544577:544803 [6] NCCL INFO TUNER/Plugin: Using internal tuner plugin. | |
| worker-0:544577:544803 [6] NCCL INFO ncclCommInitRank comm 0x559a407dbca0 rank 6 nranks 16 cudaDev 6 nvmlDev 6 busId b3000 commId 0x9d8e2a3242367e4a - Init COMPLETE | |
| worker-1:544446:544783 [7] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so | |
| worker-1:544446:544783 [7] NCCL INFO TUNER/Plugin: Using internal tuner plugin. | |
| worker-1:544446:544783 [7] NCCL INFO ncclCommInitRank comm 0x561dd66d3a80 rank 15 nranks 16 cudaDev 7 nvmlDev 7 busId b7000 commId 0x9d8e2a3242367e4a - Init COMPLETE | |
| worker-0:544573:544801 [2] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so | |
| worker-0:544573:544801 [2] NCCL INFO TUNER/Plugin: Using internal tuner plugin. | |
| worker-0:544573:544801 [2] NCCL INFO ncclCommInitRank comm 0x56405f55b420 rank 2 nranks 16 cudaDev 2 nvmlDev 2 busId 95000 commId 0x9d8e2a3242367e4a - Init COMPLETE | |
| worker-1:544443:544782 [4] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so | |
| worker-1:544443:544782 [4] NCCL INFO TUNER/Plugin: Using internal tuner plugin. | |
| worker-1:544443:544782 [4] NCCL INFO ncclCommInitRank comm 0x5645e5c93110 rank 12 nranks 16 cudaDev 4 nvmlDev 4 busId ab000 commId 0x9d8e2a3242367e4a - Init COMPLETE | |
| worker-0:544574:544799 [3] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so | |
| worker-0:544574:544799 [3] NCCL INFO TUNER/Plugin: Using internal tuner plugin. | |
| worker-0:544574:544799 [3] NCCL INFO ncclCommInitRank comm 0x5558c9e593a0 rank 3 nranks 16 cudaDev 3 nvmlDev 3 busId 99000 commId 0x9d8e2a3242367e4a - Init COMPLETE | |
| worker-0:544576:544800 [5] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so | |
| worker-0:544576:544800 [5] NCCL INFO TUNER/Plugin: Using internal tuner plugin. | |
| worker-0:544576:544800 [5] NCCL INFO ncclCommInitRank comm 0x55c64f7ca4d0 rank 5 nranks 16 cudaDev 5 nvmlDev 5 busId af000 commId 0x9d8e2a3242367e4a - Init COMPLETE | |
| DEBUG:torchtune.utils._logging:Setting manual seed to local seed 4192817896. Local seed is seed + rank = 4192817896 + 0 | |
| Writing logs to /home/brian/model/Llama3.3-70B-fft-output/logs/log_1744286111.txt | |
| INFO:torchtune.utils._logging:Distributed training is enabled. Instantiating model and loading checkpoint on Rank 0 ... | |
| INFO:torchtune.utils._logging:Compiling model layers with torch.compile... | |
| NCCL version 2.21.5+cuda12.4 | |
| worker-0:544573:545727 [2] NCCL INFO Using non-device net plugin version 0 | |
| worker-0:544573:545727 [2] NCCL INFO Using network IB | |
| NCCL version 2.21.5+cuda12.4 | |
| worker-0:544572:545729 [1] NCCL INFO Using non-device net plugin version 0 | |
| worker-0:544572:545729 [1] NCCL INFO Using network IB | |
| NCCL version 2.21.5+cuda12.4 | |
| worker-0:544577:545731 [6] NCCL INFO Using non-device net plugin version 0 | |
| worker-0:544577:545731 [6] NCCL INFO Using network IB | |
| NCCL version 2.21.5+cuda12.4 | |
| worker-0:544575:545733 [4] NCCL INFO Using non-device net plugin version 0 | |
| worker-0:544575:545733 [4] NCCL INFO Using network IB | |
| NCCL version 2.21.5+cuda12.4 | |
| worker-0:544578:545735 [7] NCCL INFO Using non-device net plugin version 0 | |
| worker-0:544578:545735 [7] NCCL INFO Using network IB | |
| NCCL version 2.21.5+cuda12.4 | |
| worker-0:544574:545737 [3] NCCL INFO Using non-device net plugin version 0 | |
| worker-0:544574:545737 [3] NCCL INFO Using network IB | |
| NCCL version 2.21.5+cuda12.4 | |
| worker-0:544576:545740 [5] NCCL INFO Using non-device net plugin version 0 | |
| worker-0:544576:545740 [5] NCCL INFO Using network IB | |
| worker-0:544571:545741 [0] NCCL INFO Using non-device net plugin version 0 | |
| worker-0:544571:545741 [0] NCCL INFO Using network IB | |
| worker-1:544442:545617 [3] NCCL INFO Using non-device net plugin version 0 | |
| worker-1:544442:545617 [3] NCCL INFO Using network IB | |
| worker-1:544446:545618 [7] NCCL INFO Using non-device net plugin version 0 | |
| worker-1:544446:545618 [7] NCCL INFO Using network IB | |
| worker-1:544444:545620 [5] NCCL INFO Using non-device net plugin version 0 | |
| worker-1:544444:545620 [5] NCCL INFO Using network IB | |
| worker-1:544440:545619 [1] NCCL INFO Using non-device net plugin version 0 | |
| worker-1:544440:545619 [1] NCCL INFO Using network IB | |
| worker-1:544441:545622 [2] NCCL INFO Using non-device net plugin version 0 | |
| worker-1:544441:545622 [2] NCCL INFO Using network IB | |
| worker-1:544439:545621 [0] NCCL INFO Using non-device net plugin version 0 | |
| worker-1:544439:545621 [0] NCCL INFO Using network IB | |
| worker-1:544443:545623 [4] NCCL INFO Using non-device net plugin version 0 | |
| worker-1:544443:545623 [4] NCCL INFO Using network IB | |
| worker-1:544445:545624 [6] NCCL INFO Using non-device net plugin version 0 | |
| worker-1:544445:545624 [6] NCCL INFO Using network IB | |
| worker-0:544578:545735 [7] NCCL INFO ncclCommInitRank comm 0x55a99bcbe170 rank 0 nranks 2 cudaDev 7 nvmlDev 7 busId b7000 commId 0x19f08473bfc639b2 - Init START | |
| worker-0:544578:545735 [7] NCCL INFO MNNVL busId 0xb7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-1:544446:545618 [7] NCCL INFO ncclCommInitRank comm 0x561e314b9bd0 rank 1 nranks 2 cudaDev 7 nvmlDev 7 busId b7000 commId 0x19f08473bfc639b2 - Init START | |
| worker-1:544446:545618 [7] NCCL INFO MNNVL busId 0xb7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-1:544439:545621 [0] NCCL INFO ncclCommInitRank comm 0x55d1fd21b220 rank 1 nranks 2 cudaDev 0 nvmlDev 0 busId 8d000 commId 0x1b1296d0ed2bf674 - Init START | |
| worker-0:544571:545741 [0] NCCL INFO ncclCommInitRank comm 0x55ec66bde920 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId 8d000 commId 0x1b1296d0ed2bf674 - Init START | |
| worker-1:544439:545621 [0] NCCL INFO MNNVL busId 0x8d000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-1:544441:545622 [2] NCCL INFO ncclCommInitRank comm 0x56101a4956f0 rank 1 nranks 2 cudaDev 2 nvmlDev 2 busId 95000 commId 0x13df2009c28cb0a1 - Init START | |
| worker-1:544444:545620 [5] NCCL INFO ncclCommInitRank comm 0x55ca5f9cd020 rank 1 nranks 2 cudaDev 5 nvmlDev 5 busId af000 commId 0x48d25f3c2cff0fd6 - Init START | |
| worker-1:544440:545619 [1] NCCL INFO ncclCommInitRank comm 0x55a19b08e290 rank 1 nranks 2 cudaDev 1 nvmlDev 1 busId 91000 commId 0x4b0c98864598e9fe - Init START | |
| worker-1:544441:545622 [2] NCCL INFO MNNVL busId 0x95000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-1:544444:545620 [5] NCCL INFO MNNVL busId 0xaf000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-1:544440:545619 [1] NCCL INFO MNNVL busId 0x91000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-0:544573:545727 [2] NCCL INFO ncclCommInitRank comm 0x5640ba340ff0 rank 0 nranks 2 cudaDev 2 nvmlDev 2 busId 95000 commId 0x13df2009c28cb0a1 - Init START | |
| worker-0:544576:545740 [5] NCCL INFO ncclCommInitRank comm 0x55c6aa5b0230 rank 0 nranks 2 cudaDev 5 nvmlDev 5 busId af000 commId 0x48d25f3c2cff0fd6 - Init START | |
| worker-0:544571:545741 [0] NCCL INFO MNNVL busId 0x8d000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-0:544576:545740 [5] NCCL INFO MNNVL busId 0xaf000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-0:544573:545727 [2] NCCL INFO MNNVL busId 0x95000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-0:544577:545731 [6] NCCL INFO ncclCommInitRank comm 0x559a9b5c2d10 rank 0 nranks 2 cudaDev 6 nvmlDev 6 busId b3000 commId 0x2131b074d138773a - Init START | |
| worker-0:544572:545729 [1] NCCL INFO ncclCommInitRank comm 0x556975eae0b0 rank 0 nranks 2 cudaDev 1 nvmlDev 1 busId 91000 commId 0x4b0c98864598e9fe - Init START | |
| worker-0:544577:545731 [6] NCCL INFO MNNVL busId 0xb3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-1:544445:545624 [6] NCCL INFO ncclCommInitRank comm 0x558c66abf370 rank 1 nranks 2 cudaDev 6 nvmlDev 6 busId b3000 commId 0x2131b074d138773a - Init START | |
| worker-1:544445:545624 [6] NCCL INFO MNNVL busId 0xb3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-1:544443:545623 [4] NCCL INFO ncclCommInitRank comm 0x5645eba77300 rank 1 nranks 2 cudaDev 4 nvmlDev 4 busId ab000 commId 0xdacff0147e4123c1 - Init START | |
| worker-0:544572:545729 [1] NCCL INFO MNNVL busId 0x91000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-0:544575:545733 [4] NCCL INFO ncclCommInitRank comm 0x5571647d5ab0 rank 0 nranks 2 cudaDev 4 nvmlDev 4 busId ab000 commId 0xdacff0147e4123c1 - Init START | |
| worker-1:544443:545623 [4] NCCL INFO MNNVL busId 0xab000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-0:544575:545733 [4] NCCL INFO MNNVL busId 0xab000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-0:544574:545737 [3] NCCL INFO ncclCommInitRank comm 0x555924c3fb10 rank 0 nranks 2 cudaDev 3 nvmlDev 3 busId 99000 commId 0x14e7d11eb8db4eeb - Init START | |
| worker-1:544442:545617 [3] NCCL INFO ncclCommInitRank comm 0x55c0f43244b0 rank 1 nranks 2 cudaDev 3 nvmlDev 3 busId 99000 commId 0x14e7d11eb8db4eeb - Init START | |
| worker-0:544574:545737 [3] NCCL INFO MNNVL busId 0x99000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-1:544442:545617 [3] NCCL INFO MNNVL busId 0x99000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-0:544575:545733 [4] NCCL INFO Setting affinity for GPU 4 to ffff,ffffffff,00000000,00000000 | |
| worker-1:544440:545619 [1] NCCL INFO Setting affinity for GPU 1 to ffff,ffffffff | |
| worker-1:544442:545617 [3] NCCL INFO Setting affinity for GPU 3 to ffff,ffffffff | |
| worker-1:544445:545624 [6] NCCL INFO Setting affinity for GPU 6 to ffff,ffffffff,00000000,00000000 | |
| worker-0:544573:545727 [2] NCCL INFO Setting affinity for GPU 2 to ffff,ffffffff | |
| worker-1:544443:545623 [4] NCCL INFO Setting affinity for GPU 4 to ffff,ffffffff,00000000,00000000 | |
| worker-1:544446:545618 [7] NCCL INFO Setting affinity for GPU 7 to ffff,ffffffff,00000000,00000000 | |
| worker-0:544575:545733 [4] NCCL INFO comm 0x5571647d5ab0 rank 0 nRanks 2 nNodes 2 localRanks 1 localRank 0 MNNVL 0 | |
| worker-0:544575:545733 [4] NCCL INFO Channel 00/04 : 0 1 | |
| worker-0:544575:545733 [4] NCCL INFO Channel 01/04 : 0 1 | |
| worker-0:544575:545733 [4] NCCL INFO Channel 02/04 : 0 1 | |
| worker-0:544575:545733 [4] NCCL INFO Channel 03/04 : 0 1 | |
| worker-0:544575:545733 [4] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] -1/-1/-1->0->1 [3] -1/-1/-1->0->1 | |
| worker-0:544575:545733 [4] NCCL INFO P2P Chunksize set to 131072 | |
| worker-1:544443:545623 [4] NCCL INFO comm 0x5645eba77300 rank 1 nRanks 2 nNodes 2 localRanks 1 localRank 0 MNNVL 0 | |
| worker-1:544443:545623 [4] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0 [2] 0/-1/-1->1->-1 [3] 0/-1/-1->1->-1 | |
| worker-1:544443:545623 [4] NCCL INFO P2P Chunksize set to 131072 | |
| worker-1:544441:545622 [2] NCCL INFO Setting affinity for GPU 2 to ffff,ffffffff | |
| worker-1:544439:545621 [0] NCCL INFO Setting affinity for GPU 0 to ffff,ffffffff | |
| worker-1:544444:545620 [5] NCCL INFO Setting affinity for GPU 5 to ffff,ffffffff,00000000,00000000 | |
| worker-0:544573:545727 [2] NCCL INFO comm 0x5640ba340ff0 rank 0 nRanks 2 nNodes 2 localRanks 1 localRank 0 MNNVL 0 | |
| worker-0:544573:545727 [2] NCCL INFO Channel 00/04 : 0 1 | |
| worker-0:544573:545727 [2] NCCL INFO Channel 01/04 : 0 1 | |
| worker-0:544573:545727 [2] NCCL INFO Channel 02/04 : 0 1 | |
| worker-0:544573:545727 [2] NCCL INFO Channel 03/04 : 0 1 | |
| worker-0:544573:545727 [2] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] -1/-1/-1->0->1 [3] -1/-1/-1->0->1 | |
| worker-0:544573:545727 [2] NCCL INFO P2P Chunksize set to 131072 | |
| worker-1:544441:545622 [2] NCCL INFO comm 0x56101a4956f0 rank 1 nRanks 2 nNodes 2 localRanks 1 localRank 0 MNNVL 0 | |
| worker-1:544441:545622 [2] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0 [2] 0/-1/-1->1->-1 [3] 0/-1/-1->1->-1 | |
| worker-1:544441:545622 [2] NCCL INFO P2P Chunksize set to 131072 | |
| worker-0:544576:545740 [5] NCCL INFO Setting affinity for GPU 5 to ffff,ffffffff,00000000,00000000 | |
| worker-1:544443:545623 [4] NCCL INFO Channel 00/0 : 0[4] -> 1[4] [receive] via NET/IB/0/GDRDMA | |
| worker-1:544441:545622 [2] NCCL INFO Channel 00/0 : 0[2] -> 1[2] [receive] via NET/IB/6/GDRDMA | |
| worker-1:544441:545622 [2] NCCL INFO Channel 01/0 : 0[2] -> 1[2] [receive] via NET/IB/6/GDRDMA | |
| worker-1:544443:545623 [4] NCCL INFO Channel 01/0 : 0[4] -> 1[4] [receive] via NET/IB/0/GDRDMA | |
| worker-1:544443:545623 [4] NCCL INFO Channel 02/0 : 0[4] -> 1[4] [receive] via NET/IB/0/GDRDMA | |
| worker-1:544441:545622 [2] NCCL INFO Channel 02/0 : 0[2] -> 1[2] [receive] via NET/IB/6/GDRDMA | |
| worker-1:544443:545623 [4] NCCL INFO Channel 03/0 : 0[4] -> 1[4] [receive] via NET/IB/0/GDRDMA | |
| worker-1:544441:545622 [2] NCCL INFO Channel 03/0 : 0[2] -> 1[2] [receive] via NET/IB/6/GDRDMA | |
| worker-1:544443:545623 [4] NCCL INFO Channel 00/0 : 1[4] -> 0[4] [send] via NET/IB/0/GDRDMA | |
| worker-1:544441:545622 [2] NCCL INFO Channel 00/0 : 1[2] -> 0[2] [send] via NET/IB/6/GDRDMA | |
| worker-1:544443:545623 [4] NCCL INFO Channel 01/0 : 1[4] -> 0[4] [send] via NET/IB/0/GDRDMA | |
| worker-1:544441:545622 [2] NCCL INFO Channel 01/0 : 1[2] -> 0[2] [send] via NET/IB/6/GDRDMA | |
| worker-1:544443:545623 [4] NCCL INFO Channel 02/0 : 1[4] -> 0[4] [send] via NET/IB/0/GDRDMA | |
| worker-1:544441:545622 [2] NCCL INFO Channel 02/0 : 1[2] -> 0[2] [send] via NET/IB/6/GDRDMA | |
| worker-1:544443:545623 [4] NCCL INFO Channel 03/0 : 1[4] -> 0[4] [send] via NET/IB/0/GDRDMA | |
| worker-1:544441:545622 [2] NCCL INFO Channel 03/0 : 1[2] -> 0[2] [send] via NET/IB/6/GDRDMA | |
| worker-1:544444:545620 [5] NCCL INFO comm 0x55ca5f9cd020 rank 1 nRanks 2 nNodes 2 localRanks 1 localRank 0 MNNVL 0 | |
| worker-1:544444:545620 [5] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0 [2] 0/-1/-1->1->-1 [3] 0/-1/-1->1->-1 | |
| worker-1:544444:545620 [5] NCCL INFO P2P Chunksize set to 131072 | |
| worker-0:544576:545740 [5] NCCL INFO comm 0x55c6aa5b0230 rank 0 nRanks 2 nNodes 2 localRanks 1 localRank 0 MNNVL 0 | |
| worker-0:544576:545740 [5] NCCL INFO Channel 00/04 : 0 1 | |
| worker-0:544576:545740 [5] NCCL INFO Channel 01/04 : 0 1 | |
| worker-0:544576:545740 [5] NCCL INFO Channel 02/04 : 0 1 | |
| worker-0:544576:545740 [5] NCCL INFO Channel 03/04 : 0 1 | |
| worker-0:544576:545740 [5] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] -1/-1/-1->0->1 [3] -1/-1/-1->0->1 | |
| worker-0:544576:545740 [5] NCCL INFO P2P Chunksize set to 131072 | |
| worker-0:544571:545741 [0] NCCL INFO Setting affinity for GPU 0 to ffff,ffffffff | |
| worker-1:544444:545620 [5] NCCL INFO Channel 00/0 : 0[5] -> 1[5] [receive] via NET/IB/1/GDRDMA | |
| worker-1:544444:545620 [5] NCCL INFO Channel 01/0 : 0[5] -> 1[5] [receive] via NET/IB/1/GDRDMA | |
| worker-1:544444:545620 [5] NCCL INFO Channel 02/0 : 0[5] -> 1[5] [receive] via NET/IB/1/GDRDMA | |
| worker-1:544444:545620 [5] NCCL INFO Channel 03/0 : 0[5] -> 1[5] [receive] via NET/IB/1/GDRDMA | |
| worker-1:544444:545620 [5] NCCL INFO Channel 00/0 : 1[5] -> 0[5] [send] via NET/IB/1/GDRDMA | |
| worker-1:544444:545620 [5] NCCL INFO Channel 01/0 : 1[5] -> 0[5] [send] via NET/IB/1/GDRDMA | |
| worker-1:544444:545620 [5] NCCL INFO Channel 02/0 : 1[5] -> 0[5] [send] via NET/IB/1/GDRDMA | |
| worker-1:544444:545620 [5] NCCL INFO Channel 03/0 : 1[5] -> 0[5] [send] via NET/IB/1/GDRDMA | |
| worker-1:544439:545621 [0] NCCL INFO comm 0x55d1fd21b220 rank 1 nRanks 2 nNodes 2 localRanks 1 localRank 0 MNNVL 0 | |
| worker-1:544439:545621 [0] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0 [2] 0/-1/-1->1->-1 [3] 0/-1/-1->1->-1 | |
| worker-1:544439:545621 [0] NCCL INFO P2P Chunksize set to 131072 | |
| worker-0:544571:545741 [0] NCCL INFO comm 0x55ec66bde920 rank 0 nRanks 2 nNodes 2 localRanks 1 localRank 0 MNNVL 0 | |
| worker-0:544571:545741 [0] NCCL INFO Channel 00/04 : 0 1 | |
| worker-0:544571:545741 [0] NCCL INFO Channel 01/04 : 0 1 | |
| worker-0:544571:545741 [0] NCCL INFO Channel 02/04 : 0 1 | |
| worker-0:544571:545741 [0] NCCL INFO Channel 03/04 : 0 1 | |
| worker-0:544571:545741 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] -1/-1/-1->0->1 [3] -1/-1/-1->0->1 | |
| worker-0:544571:545741 [0] NCCL INFO P2P Chunksize set to 131072 | |
| worker-0:544574:545737 [3] NCCL INFO Setting affinity for GPU 3 to ffff,ffffffff | |
| worker-1:544442:545617 [3] NCCL INFO comm 0x55c0f43244b0 rank 1 nRanks 2 nNodes 2 localRanks 1 localRank 0 MNNVL 0 | |
| worker-1:544442:545617 [3] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0 [2] 0/-1/-1->1->-1 [3] 0/-1/-1->1->-1 | |
| worker-0:544574:545737 [3] NCCL INFO comm 0x555924c3fb10 rank 0 nRanks 2 nNodes 2 localRanks 1 localRank 0 MNNVL 0 | |
| worker-0:544574:545737 [3] NCCL INFO Channel 00/04 : 0 1 | |
| worker-0:544574:545737 [3] NCCL INFO Channel 01/04 : 0 1 | |
| worker-0:544574:545737 [3] NCCL INFO Channel 02/04 : 0 1 | |
| worker-0:544574:545737 [3] NCCL INFO Channel 03/04 : 0 1 | |
| worker-1:544442:545617 [3] NCCL INFO P2P Chunksize set to 131072 | |
| worker-0:544574:545737 [3] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] -1/-1/-1->0->1 [3] -1/-1/-1->0->1 | |
| worker-0:544574:545737 [3] NCCL INFO P2P Chunksize set to 131072 | |
| worker-1:544439:545621 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[0] [receive] via NET/IB/4/GDRDMA | |
| worker-1:544439:545621 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[0] [receive] via NET/IB/4/GDRDMA | |
| worker-1:544439:545621 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[0] [receive] via NET/IB/4/GDRDMA | |
| worker-1:544439:545621 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[0] [receive] via NET/IB/4/GDRDMA | |
| worker-1:544439:545621 [0] NCCL INFO Channel 00/0 : 1[0] -> 0[0] [send] via NET/IB/4/GDRDMA | |
| worker-1:544439:545621 [0] NCCL INFO Channel 01/0 : 1[0] -> 0[0] [send] via NET/IB/4/GDRDMA | |
| worker-1:544439:545621 [0] NCCL INFO Channel 02/0 : 1[0] -> 0[0] [send] via NET/IB/4/GDRDMA | |
| worker-1:544439:545621 [0] NCCL INFO Channel 03/0 : 1[0] -> 0[0] [send] via NET/IB/4/GDRDMA | |
| worker-1:544442:545617 [3] NCCL INFO Channel 00/0 : 0[3] -> 1[3] [receive] via NET/IB/7/GDRDMA | |
| worker-1:544442:545617 [3] NCCL INFO Channel 01/0 : 0[3] -> 1[3] [receive] via NET/IB/7/GDRDMA | |
| worker-1:544442:545617 [3] NCCL INFO Channel 02/0 : 0[3] -> 1[3] [receive] via NET/IB/7/GDRDMA | |
| worker-1:544442:545617 [3] NCCL INFO Channel 03/0 : 0[3] -> 1[3] [receive] via NET/IB/7/GDRDMA | |
| worker-1:544442:545617 [3] NCCL INFO Channel 00/0 : 1[3] -> 0[3] [send] via NET/IB/7/GDRDMA | |
| worker-1:544442:545617 [3] NCCL INFO Channel 01/0 : 1[3] -> 0[3] [send] via NET/IB/7/GDRDMA | |
| worker-1:544442:545617 [3] NCCL INFO Channel 02/0 : 1[3] -> 0[3] [send] via NET/IB/7/GDRDMA | |
| worker-1:544442:545617 [3] NCCL INFO Channel 03/0 : 1[3] -> 0[3] [send] via NET/IB/7/GDRDMA | |
| worker-0:544578:545735 [7] NCCL INFO Setting affinity for GPU 7 to ffff,ffffffff,00000000,00000000 | |
| worker-0:544577:545731 [6] NCCL INFO Setting affinity for GPU 6 to ffff,ffffffff,00000000,00000000 | |
| worker-0:544578:545735 [7] NCCL INFO comm 0x55a99bcbe170 rank 0 nRanks 2 nNodes 2 localRanks 1 localRank 0 MNNVL 0 | |
| worker-0:544578:545735 [7] NCCL INFO Channel 00/04 : 0 1 | |
| worker-0:544578:545735 [7] NCCL INFO Channel 01/04 : 0 1 | |
| worker-0:544578:545735 [7] NCCL INFO Channel 02/04 : 0 1 | |
| worker-0:544578:545735 [7] NCCL INFO Channel 03/04 : 0 1 | |
| worker-0:544578:545735 [7] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] -1/-1/-1->0->1 [3] -1/-1/-1->0->1 | |
| worker-0:544578:545735 [7] NCCL INFO P2P Chunksize set to 131072 | |
| worker-1:544446:545618 [7] NCCL INFO comm 0x561e314b9bd0 rank 1 nRanks 2 nNodes 2 localRanks 1 localRank 0 MNNVL 0 | |
| worker-1:544446:545618 [7] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0 [2] 0/-1/-1->1->-1 [3] 0/-1/-1->1->-1 | |
| worker-1:544446:545618 [7] NCCL INFO P2P Chunksize set to 131072 | |
| worker-1:544445:545624 [6] NCCL INFO comm 0x558c66abf370 rank 1 nRanks 2 nNodes 2 localRanks 1 localRank 0 MNNVL 0 | |
| worker-1:544445:545624 [6] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0 [2] 0/-1/-1->1->-1 [3] 0/-1/-1->1->-1 | |
| worker-1:544445:545624 [6] NCCL INFO P2P Chunksize set to 131072 | |
| worker-0:544577:545731 [6] NCCL INFO comm 0x559a9b5c2d10 rank 0 nRanks 2 nNodes 2 localRanks 1 localRank 0 MNNVL 0 | |
| worker-0:544577:545731 [6] NCCL INFO Channel 00/04 : 0 1 | |
| worker-0:544577:545731 [6] NCCL INFO Channel 01/04 : 0 1 | |
| worker-0:544577:545731 [6] NCCL INFO Channel 02/04 : 0 1 | |
| worker-0:544577:545731 [6] NCCL INFO Channel 03/04 : 0 1 | |
| worker-0:544577:545731 [6] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] -1/-1/-1->0->1 [3] -1/-1/-1->0->1 | |
| worker-0:544577:545731 [6] NCCL INFO P2P Chunksize set to 131072 | |
| worker-0:544573:545727 [2] NCCL INFO Channel 00/0 : 1[2] -> 0[2] [receive] via NET/IB/6/GDRDMA | |
| worker-0:544573:545727 [2] NCCL INFO Channel 01/0 : 1[2] -> 0[2] [receive] via NET/IB/6/GDRDMA | |
| worker-0:544573:545727 [2] NCCL INFO Channel 02/0 : 1[2] -> 0[2] [receive] via NET/IB/6/GDRDMA | |
| worker-0:544573:545727 [2] NCCL INFO Channel 03/0 : 1[2] -> 0[2] [receive] via NET/IB/6/GDRDMA | |
| worker-0:544573:545727 [2] NCCL INFO Channel 00/0 : 0[2] -> 1[2] [send] via NET/IB/6/GDRDMA | |
| worker-0:544573:545727 [2] NCCL INFO Channel 01/0 : 0[2] -> 1[2] [send] via NET/IB/6/GDRDMA | |
| worker-0:544573:545727 [2] NCCL INFO Channel 02/0 : 0[2] -> 1[2] [send] via NET/IB/6/GDRDMA | |
| worker-0:544573:545727 [2] NCCL INFO Channel 03/0 : 0[2] -> 1[2] [send] via NET/IB/6/GDRDMA | |
| worker-0:544571:545741 [0] NCCL INFO Channel 00/0 : 1[0] -> 0[0] [receive] via NET/IB/4/GDRDMA | |
| worker-0:544571:545741 [0] NCCL INFO Channel 01/0 : 1[0] -> 0[0] [receive] via NET/IB/4/GDRDMA | |
| worker-0:544571:545741 [0] NCCL INFO Channel 02/0 : 1[0] -> 0[0] [receive] via NET/IB/4/GDRDMA | |
| worker-0:544571:545741 [0] NCCL INFO Channel 03/0 : 1[0] -> 0[0] [receive] via NET/IB/4/GDRDMA | |
| worker-0:544571:545741 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[0] [send] via NET/IB/4/GDRDMA | |
| worker-0:544571:545741 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[0] [send] via NET/IB/4/GDRDMA | |
| worker-0:544571:545741 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[0] [send] via NET/IB/4/GDRDMA | |
| worker-0:544571:545741 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[0] [send] via NET/IB/4/GDRDMA | |
| worker-0:544575:545733 [4] NCCL INFO Channel 00/0 : 1[4] -> 0[4] [receive] via NET/IB/0/GDRDMA | |
| worker-0:544575:545733 [4] NCCL INFO Channel 01/0 : 1[4] -> 0[4] [receive] via NET/IB/0/GDRDMA | |
| worker-0:544575:545733 [4] NCCL INFO Channel 02/0 : 1[4] -> 0[4] [receive] via NET/IB/0/GDRDMA | |
| worker-0:544575:545733 [4] NCCL INFO Channel 03/0 : 1[4] -> 0[4] [receive] via NET/IB/0/GDRDMA | |
| worker-0:544575:545733 [4] NCCL INFO Channel 00/0 : 0[4] -> 1[4] [send] via NET/IB/0/GDRDMA | |
| worker-0:544575:545733 [4] NCCL INFO Channel 01/0 : 0[4] -> 1[4] [send] via NET/IB/0/GDRDMA | |
| worker-0:544575:545733 [4] NCCL INFO Channel 02/0 : 0[4] -> 1[4] [send] via NET/IB/0/GDRDMA | |
| worker-0:544575:545733 [4] NCCL INFO Channel 03/0 : 0[4] -> 1[4] [send] via NET/IB/0/GDRDMA | |
| worker-1:544446:545618 [7] NCCL INFO Channel 00/0 : 0[7] -> 1[7] [receive] via NET/IB/3/GDRDMA | |
| worker-1:544446:545618 [7] NCCL INFO Channel 01/0 : 0[7] -> 1[7] [receive] via NET/IB/3/GDRDMA | |
| worker-1:544446:545618 [7] NCCL INFO Channel 02/0 : 0[7] -> 1[7] [receive] via NET/IB/3/GDRDMA | |
| worker-1:544446:545618 [7] NCCL INFO Channel 03/0 : 0[7] -> 1[7] [receive] via NET/IB/3/GDRDMA | |
| worker-1:544446:545618 [7] NCCL INFO Channel 00/0 : 1[7] -> 0[7] [send] via NET/IB/3/GDRDMA | |
| worker-1:544446:545618 [7] NCCL INFO Channel 01/0 : 1[7] -> 0[7] [send] via NET/IB/3/GDRDMA | |
| worker-1:544446:545618 [7] NCCL INFO Channel 02/0 : 1[7] -> 0[7] [send] via NET/IB/3/GDRDMA | |
| worker-1:544446:545618 [7] NCCL INFO Channel 03/0 : 1[7] -> 0[7] [send] via NET/IB/3/GDRDMA | |
| worker-0:544574:545737 [3] NCCL INFO Channel 00/0 : 1[3] -> 0[3] [receive] via NET/IB/7/GDRDMA | |
| worker-0:544574:545737 [3] NCCL INFO Channel 01/0 : 1[3] -> 0[3] [receive] via NET/IB/7/GDRDMA | |
| worker-0:544574:545737 [3] NCCL INFO Channel 02/0 : 1[3] -> 0[3] [receive] via NET/IB/7/GDRDMA | |
| worker-0:544574:545737 [3] NCCL INFO Channel 03/0 : 1[3] -> 0[3] [receive] via NET/IB/7/GDRDMA | |
| worker-0:544574:545737 [3] NCCL INFO Channel 00/0 : 0[3] -> 1[3] [send] via NET/IB/7/GDRDMA | |
| worker-0:544574:545737 [3] NCCL INFO Channel 01/0 : 0[3] -> 1[3] [send] via NET/IB/7/GDRDMA | |
| worker-0:544574:545737 [3] NCCL INFO Channel 02/0 : 0[3] -> 1[3] [send] via NET/IB/7/GDRDMA | |
| worker-0:544574:545737 [3] NCCL INFO Channel 03/0 : 0[3] -> 1[3] [send] via NET/IB/7/GDRDMA | |
| worker-1:544445:545624 [6] NCCL INFO Channel 00/0 : 0[6] -> 1[6] [receive] via NET/IB/2/GDRDMA | |
| worker-1:544445:545624 [6] NCCL INFO Channel 01/0 : 0[6] -> 1[6] [receive] via NET/IB/2/GDRDMA | |
| worker-1:544445:545624 [6] NCCL INFO Channel 02/0 : 0[6] -> 1[6] [receive] via NET/IB/2/GDRDMA | |
| worker-1:544445:545624 [6] NCCL INFO Channel 03/0 : 0[6] -> 1[6] [receive] via NET/IB/2/GDRDMA | |
| worker-1:544445:545624 [6] NCCL INFO Channel 00/0 : 1[6] -> 0[6] [send] via NET/IB/2/GDRDMA | |
| worker-1:544445:545624 [6] NCCL INFO Channel 01/0 : 1[6] -> 0[6] [send] via NET/IB/2/GDRDMA | |
| worker-1:544445:545624 [6] NCCL INFO Channel 02/0 : 1[6] -> 0[6] [send] via NET/IB/2/GDRDMA | |
| worker-1:544445:545624 [6] NCCL INFO Channel 03/0 : 1[6] -> 0[6] [send] via NET/IB/2/GDRDMA | |
| worker-0:544572:545729 [1] NCCL INFO Setting affinity for GPU 1 to ffff,ffffffff | |
| worker-0:544572:545729 [1] NCCL INFO comm 0x556975eae0b0 rank 0 nRanks 2 nNodes 2 localRanks 1 localRank 0 MNNVL 0 | |
| worker-0:544572:545729 [1] NCCL INFO Channel 00/04 : 0 1 | |
| worker-0:544572:545729 [1] NCCL INFO Channel 01/04 : 0 1 | |
| worker-0:544572:545729 [1] NCCL INFO Channel 02/04 : 0 1 | |
| worker-0:544572:545729 [1] NCCL INFO Channel 03/04 : 0 1 | |
| worker-0:544572:545729 [1] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] -1/-1/-1->0->1 [3] -1/-1/-1->0->1 | |
| worker-0:544572:545729 [1] NCCL INFO P2P Chunksize set to 131072 | |
| worker-1:544440:545619 [1] NCCL INFO comm 0x55a19b08e290 rank 1 nRanks 2 nNodes 2 localRanks 1 localRank 0 MNNVL 0 | |
| worker-1:544440:545619 [1] NCCL INFO Trees [0] -1/-1/-1->1->0 [1] -1/-1/-1->1->0 [2] 0/-1/-1->1->-1 [3] 0/-1/-1->1->-1 | |
| worker-1:544440:545619 [1] NCCL INFO P2P Chunksize set to 131072 | |
| worker-0:544576:545740 [5] NCCL INFO Channel 00/0 : 1[5] -> 0[5] [receive] via NET/IB/1/GDRDMA | |
| worker-0:544576:545740 [5] NCCL INFO Channel 01/0 : 1[5] -> 0[5] [receive] via NET/IB/1/GDRDMA | |
| worker-0:544576:545740 [5] NCCL INFO Channel 02/0 : 1[5] -> 0[5] [receive] via NET/IB/1/GDRDMA | |
| worker-0:544576:545740 [5] NCCL INFO Channel 03/0 : 1[5] -> 0[5] [receive] via NET/IB/1/GDRDMA | |
| worker-0:544576:545740 [5] NCCL INFO Channel 00/0 : 0[5] -> 1[5] [send] via NET/IB/1/GDRDMA | |
| worker-0:544576:545740 [5] NCCL INFO Channel 01/0 : 0[5] -> 1[5] [send] via NET/IB/1/GDRDMA | |
| worker-0:544577:545731 [6] NCCL INFO Channel 00/0 : 1[6] -> 0[6] [receive] via NET/IB/2/GDRDMA | |
| worker-0:544578:545735 [7] NCCL INFO Channel 00/0 : 1[7] -> 0[7] [receive] via NET/IB/3/GDRDMA | |
| worker-0:544576:545740 [5] NCCL INFO Channel 02/0 : 0[5] -> 1[5] [send] via NET/IB/1/GDRDMA | |
| worker-0:544577:545731 [6] NCCL INFO Channel 01/0 : 1[6] -> 0[6] [receive] via NET/IB/2/GDRDMA | |
| worker-0:544576:545740 [5] NCCL INFO Channel 03/0 : 0[5] -> 1[5] [send] via NET/IB/1/GDRDMA | |
| worker-0:544578:545735 [7] NCCL INFO Channel 01/0 : 1[7] -> 0[7] [receive] via NET/IB/3/GDRDMA | |
| worker-0:544577:545731 [6] NCCL INFO Channel 02/0 : 1[6] -> 0[6] [receive] via NET/IB/2/GDRDMA | |
| worker-0:544578:545735 [7] NCCL INFO Channel 02/0 : 1[7] -> 0[7] [receive] via NET/IB/3/GDRDMA | |
| worker-0:544577:545731 [6] NCCL INFO Channel 03/0 : 1[6] -> 0[6] [receive] via NET/IB/2/GDRDMA | |
| worker-0:544578:545735 [7] NCCL INFO Channel 03/0 : 1[7] -> 0[7] [receive] via NET/IB/3/GDRDMA | |
| worker-0:544577:545731 [6] NCCL INFO Channel 00/0 : 0[6] -> 1[6] [send] via NET/IB/2/GDRDMA | |
| worker-0:544578:545735 [7] NCCL INFO Channel 00/0 : 0[7] -> 1[7] [send] via NET/IB/3/GDRDMA | |
| worker-0:544577:545731 [6] NCCL INFO Channel 01/0 : 0[6] -> 1[6] [send] via NET/IB/2/GDRDMA | |
| worker-0:544578:545735 [7] NCCL INFO Channel 01/0 : 0[7] -> 1[7] [send] via NET/IB/3/GDRDMA | |
| worker-0:544577:545731 [6] NCCL INFO Channel 02/0 : 0[6] -> 1[6] [send] via NET/IB/2/GDRDMA | |
| worker-0:544578:545735 [7] NCCL INFO Channel 02/0 : 0[7] -> 1[7] [send] via NET/IB/3/GDRDMA | |
| worker-0:544577:545731 [6] NCCL INFO Channel 03/0 : 0[6] -> 1[6] [send] via NET/IB/2/GDRDMA | |
| worker-0:544578:545735 [7] NCCL INFO Channel 03/0 : 0[7] -> 1[7] [send] via NET/IB/3/GDRDMA | |
| worker-1:544440:545619 [1] NCCL INFO Channel 00/0 : 0[1] -> 1[1] [receive] via NET/IB/5/GDRDMA | |
| worker-1:544440:545619 [1] NCCL INFO Channel 01/0 : 0[1] -> 1[1] [receive] via NET/IB/5/GDRDMA | |
| worker-1:544440:545619 [1] NCCL INFO Channel 02/0 : 0[1] -> 1[1] [receive] via NET/IB/5/GDRDMA | |
| worker-1:544440:545619 [1] NCCL INFO Channel 03/0 : 0[1] -> 1[1] [receive] via NET/IB/5/GDRDMA | |
| worker-1:544440:545619 [1] NCCL INFO Channel 00/0 : 1[1] -> 0[1] [send] via NET/IB/5/GDRDMA | |
| worker-1:544440:545619 [1] NCCL INFO Channel 01/0 : 1[1] -> 0[1] [send] via NET/IB/5/GDRDMA | |
| worker-1:544440:545619 [1] NCCL INFO Channel 02/0 : 1[1] -> 0[1] [send] via NET/IB/5/GDRDMA | |
| worker-1:544440:545619 [1] NCCL INFO Channel 03/0 : 1[1] -> 0[1] [send] via NET/IB/5/GDRDMA | |
| worker-0:544572:545729 [1] NCCL INFO Channel 00/0 : 1[1] -> 0[1] [receive] via NET/IB/5/GDRDMA | |
| worker-0:544572:545729 [1] NCCL INFO Channel 01/0 : 1[1] -> 0[1] [receive] via NET/IB/5/GDRDMA | |
| worker-0:544572:545729 [1] NCCL INFO Channel 02/0 : 1[1] -> 0[1] [receive] via NET/IB/5/GDRDMA | |
| worker-0:544572:545729 [1] NCCL INFO Channel 03/0 : 1[1] -> 0[1] [receive] via NET/IB/5/GDRDMA | |
| worker-0:544572:545729 [1] NCCL INFO Channel 00/0 : 0[1] -> 1[1] [send] via NET/IB/5/GDRDMA | |
| worker-0:544572:545729 [1] NCCL INFO Channel 01/0 : 0[1] -> 1[1] [send] via NET/IB/5/GDRDMA | |
| worker-0:544572:545729 [1] NCCL INFO Channel 02/0 : 0[1] -> 1[1] [send] via NET/IB/5/GDRDMA | |
| worker-0:544572:545729 [1] NCCL INFO Channel 03/0 : 0[1] -> 1[1] [send] via NET/IB/5/GDRDMA | |
| worker-0:544571:545741 [0] NCCL INFO Connected all rings | |
| worker-0:544571:545741 [0] NCCL INFO Connected all trees | |
| worker-0:544571:545741 [0] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512 | |
| worker-0:544571:545741 [0] NCCL INFO 4 coll channels, 4 collnet channels, 0 nvls channels, 4 p2p channels, 2 p2p channels per peer | |
| worker-1:544439:545621 [0] NCCL INFO Connected all rings | |
| worker-1:544439:545621 [0] NCCL INFO Connected all trees | |
| worker-1:544439:545621 [0] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512 | |
| worker-1:544439:545621 [0] NCCL INFO 4 coll channels, 4 collnet channels, 0 nvls channels, 4 p2p channels, 2 p2p channels per peer | |
| worker-0:544575:545733 [4] NCCL INFO Connected all rings | |
| worker-0:544575:545733 [4] NCCL INFO Connected all trees | |
| worker-0:544575:545733 [4] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512 | |
| worker-0:544575:545733 [4] NCCL INFO 4 coll channels, 4 collnet channels, 0 nvls channels, 4 p2p channels, 2 p2p channels per peer | |
| worker-1:544443:545623 [4] NCCL INFO Connected all rings | |
| worker-1:544443:545623 [4] NCCL INFO Connected all trees | |
| worker-1:544443:545623 [4] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512 | |
| worker-1:544443:545623 [4] NCCL INFO 4 coll channels, 4 collnet channels, 0 nvls channels, 4 p2p channels, 2 p2p channels per peer | |
| worker-0:544574:545737 [3] NCCL INFO Connected all rings | |
| worker-0:544574:545737 [3] NCCL INFO Connected all trees | |
| worker-0:544574:545737 [3] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512 | |
| worker-0:544574:545737 [3] NCCL INFO 4 coll channels, 4 collnet channels, 0 nvls channels, 4 p2p channels, 2 p2p channels per peer | |
| worker-1:544442:545617 [3] NCCL INFO Connected all rings | |
| worker-1:544442:545617 [3] NCCL INFO Connected all trees | |
| worker-1:544442:545617 [3] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512 | |
| worker-1:544442:545617 [3] NCCL INFO 4 coll channels, 4 collnet channels, 0 nvls channels, 4 p2p channels, 2 p2p channels per peer | |
| worker-0:544573:545727 [2] NCCL INFO Connected all rings | |
| worker-0:544573:545727 [2] NCCL INFO Connected all trees | |
| worker-0:544573:545727 [2] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512 | |
| worker-0:544573:545727 [2] NCCL INFO 4 coll channels, 4 collnet channels, 0 nvls channels, 4 p2p channels, 2 p2p channels per peer | |
| worker-1:544441:545622 [2] NCCL INFO Connected all rings | |
| worker-1:544441:545622 [2] NCCL INFO Connected all trees | |
| worker-1:544441:545622 [2] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512 | |
| worker-1:544441:545622 [2] NCCL INFO 4 coll channels, 4 collnet channels, 0 nvls channels, 4 p2p channels, 2 p2p channels per peer | |
| worker-0:544577:545731 [6] NCCL INFO Connected all rings | |
| worker-0:544577:545731 [6] NCCL INFO Connected all trees | |
| worker-0:544577:545731 [6] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512 | |
| worker-0:544577:545731 [6] NCCL INFO 4 coll channels, 4 collnet channels, 0 nvls channels, 4 p2p channels, 2 p2p channels per peer | |
| worker-1:544445:545624 [6] NCCL INFO Connected all rings | |
| worker-1:544445:545624 [6] NCCL INFO Connected all trees | |
| worker-1:544445:545624 [6] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512 | |
| worker-1:544445:545624 [6] NCCL INFO 4 coll channels, 4 collnet channels, 0 nvls channels, 4 p2p channels, 2 p2p channels per peer | |
| worker-0:544578:545735 [7] NCCL INFO Connected all rings | |
| worker-0:544578:545735 [7] NCCL INFO Connected all trees | |
| worker-0:544578:545735 [7] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512 | |
| worker-0:544578:545735 [7] NCCL INFO 4 coll channels, 4 collnet channels, 0 nvls channels, 4 p2p channels, 2 p2p channels per peer | |
| worker-1:544446:545618 [7] NCCL INFO Connected all rings | |
| worker-1:544446:545618 [7] NCCL INFO Connected all trees | |
| worker-1:544446:545618 [7] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512 | |
| worker-1:544446:545618 [7] NCCL INFO 4 coll channels, 4 collnet channels, 0 nvls channels, 4 p2p channels, 2 p2p channels per peer | |
| worker-0:544576:545740 [5] NCCL INFO Connected all rings | |
| worker-0:544576:545740 [5] NCCL INFO Connected all trees | |
| worker-0:544576:545740 [5] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512 | |
| worker-0:544576:545740 [5] NCCL INFO 4 coll channels, 4 collnet channels, 0 nvls channels, 4 p2p channels, 2 p2p channels per peer | |
| worker-1:544444:545620 [5] NCCL INFO Connected all rings | |
| worker-1:544444:545620 [5] NCCL INFO Connected all trees | |
| worker-1:544444:545620 [5] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512 | |
| worker-1:544444:545620 [5] NCCL INFO 4 coll channels, 4 collnet channels, 0 nvls channels, 4 p2p channels, 2 p2p channels per peer | |
| worker-0:544572:545729 [1] NCCL INFO Connected all rings | |
| worker-0:544572:545729 [1] NCCL INFO Connected all trees | |
| worker-0:544572:545729 [1] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512 | |
| worker-0:544572:545729 [1] NCCL INFO 4 coll channels, 4 collnet channels, 0 nvls channels, 4 p2p channels, 2 p2p channels per peer | |
| worker-1:544440:545619 [1] NCCL INFO Connected all rings | |
| worker-1:544440:545619 [1] NCCL INFO Connected all trees | |
| worker-1:544440:545619 [1] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512 | |
| worker-1:544440:545619 [1] NCCL INFO 4 coll channels, 4 collnet channels, 0 nvls channels, 4 p2p channels, 2 p2p channels per peer | |
| worker-0:544571:545741 [0] NCCL INFO ncclCommInitRank comm 0x55ec66bde920 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId 8d000 commId 0x1b1296d0ed2bf674 - Init COMPLETE | |
| worker-0:544571:545774 [0] NCCL INFO Channel 02/1 : 0[0] -> 1[0] [send] via NET/IB/4/GDRDMA/Shared | |
| worker-0:544571:545774 [0] NCCL INFO Channel 03/1 : 0[0] -> 1[0] [send] via NET/IB/4/GDRDMA/Shared | |
| worker-1:544439:545621 [0] NCCL INFO ncclCommInitRank comm 0x55d1fd21b220 rank 1 nranks 2 cudaDev 0 nvmlDev 0 busId 8d000 commId 0x1b1296d0ed2bf674 - Init COMPLETE | |
| worker-1:544439:545657 [0] NCCL INFO Channel 02/1 : 0[0] -> 1[0] [receive] via NET/IB/4/GDRDMA/Shared | |
| worker-1:544439:545657 [0] NCCL INFO Channel 03/1 : 0[0] -> 1[0] [receive] via NET/IB/4/GDRDMA/Shared | |
| worker-0:544575:545733 [4] NCCL INFO ncclCommInitRank comm 0x5571647d5ab0 rank 0 nranks 2 cudaDev 4 nvmlDev 4 busId ab000 commId 0xdacff0147e4123c1 - Init COMPLETE | |
| worker-0:544574:545737 [3] NCCL INFO ncclCommInitRank comm 0x555924c3fb10 rank 0 nranks 2 cudaDev 3 nvmlDev 3 busId 99000 commId 0x14e7d11eb8db4eeb - Init COMPLETE | |
| worker-0:544575:545775 [4] NCCL INFO Channel 02/1 : 0[4] -> 1[4] [send] via NET/IB/0/GDRDMA/Shared | |
| worker-0:544575:545775 [4] NCCL INFO Channel 03/1 : 0[4] -> 1[4] [send] via NET/IB/0/GDRDMA/Shared | |
| worker-0:544573:545727 [2] NCCL INFO ncclCommInitRank comm 0x5640ba340ff0 rank 0 nranks 2 cudaDev 2 nvmlDev 2 busId 95000 commId 0x13df2009c28cb0a1 - Init COMPLETE | |
| worker-0:544574:545776 [3] NCCL INFO Channel 02/1 : 0[3] -> 1[3] [send] via NET/IB/7/GDRDMA/Shared | |
| worker-0:544574:545776 [3] NCCL INFO Channel 03/1 : 0[3] -> 1[3] [send] via NET/IB/7/GDRDMA/Shared | |
| worker-1:544443:545623 [4] NCCL INFO ncclCommInitRank comm 0x5645eba77300 rank 1 nranks 2 cudaDev 4 nvmlDev 4 busId ab000 commId 0xdacff0147e4123c1 - Init COMPLETE | |
| worker-1:544441:545622 [2] NCCL INFO ncclCommInitRank comm 0x56101a4956f0 rank 1 nranks 2 cudaDev 2 nvmlDev 2 busId 95000 commId 0x13df2009c28cb0a1 - Init COMPLETE | |
| worker-0:544573:545777 [2] NCCL INFO Channel 02/1 : 0[2] -> 1[2] [send] via NET/IB/6/GDRDMA/Shared | |
| worker-0:544573:545777 [2] NCCL INFO Channel 03/1 : 0[2] -> 1[2] [send] via NET/IB/6/GDRDMA/Shared | |
| worker-1:544441:545659 [2] NCCL INFO Channel 02/1 : 0[2] -> 1[2] [receive] via NET/IB/6/GDRDMA/Shared | |
| worker-1:544441:545659 [2] NCCL INFO Channel 03/1 : 0[2] -> 1[2] [receive] via NET/IB/6/GDRDMA/Shared | |
| worker-1:544443:545658 [4] NCCL INFO Channel 02/1 : 0[4] -> 1[4] [receive] via NET/IB/0/GDRDMA/Shared | |
| worker-1:544443:545658 [4] NCCL INFO Channel 03/1 : 0[4] -> 1[4] [receive] via NET/IB/0/GDRDMA/Shared | |
| worker-0:544577:545731 [6] NCCL INFO ncclCommInitRank comm 0x559a9b5c2d10 rank 0 nranks 2 cudaDev 6 nvmlDev 6 busId b3000 commId 0x2131b074d138773a - Init COMPLETE | |
| worker-0:544577:545778 [6] NCCL INFO Channel 02/1 : 0[6] -> 1[6] [send] via NET/IB/2/GDRDMA/Shared | |
| worker-0:544577:545778 [6] NCCL INFO Channel 03/1 : 0[6] -> 1[6] [send] via NET/IB/2/GDRDMA/Shared | |
| worker-1:544446:545618 [7] NCCL INFO ncclCommInitRank comm 0x561e314b9bd0 rank 1 nranks 2 cudaDev 7 nvmlDev 7 busId b7000 commId 0x19f08473bfc639b2 - Init COMPLETE | |
| worker-1:544442:545617 [3] NCCL INFO ncclCommInitRank comm 0x55c0f43244b0 rank 1 nranks 2 cudaDev 3 nvmlDev 3 busId 99000 commId 0x14e7d11eb8db4eeb - Init COMPLETE | |
| worker-0:544578:545735 [7] NCCL INFO ncclCommInitRank comm 0x55a99bcbe170 rank 0 nranks 2 cudaDev 7 nvmlDev 7 busId b7000 commId 0x19f08473bfc639b2 - Init COMPLETE | |
| worker-1:544442:545661 [3] NCCL INFO Channel 02/1 : 0[3] -> 1[3] [receive] via NET/IB/7/GDRDMA/Shared | |
| worker-1:544442:545661 [3] NCCL INFO Channel 03/1 : 0[3] -> 1[3] [receive] via NET/IB/7/GDRDMA/Shared | |
| worker-0:544578:545779 [7] NCCL INFO Channel 02/1 : 0[7] -> 1[7] [send] via NET/IB/3/GDRDMA/Shared | |
| worker-0:544578:545779 [7] NCCL INFO Channel 03/1 : 0[7] -> 1[7] [send] via NET/IB/3/GDRDMA/Shared | |
| worker-0:544576:545740 [5] NCCL INFO ncclCommInitRank comm 0x55c6aa5b0230 rank 0 nranks 2 cudaDev 5 nvmlDev 5 busId af000 commId 0x48d25f3c2cff0fd6 - Init COMPLETE | |
| worker-1:544446:545660 [7] NCCL INFO Channel 02/1 : 0[7] -> 1[7] [receive] via NET/IB/3/GDRDMA/Shared | |
| worker-1:544446:545660 [7] NCCL INFO Channel 03/1 : 0[7] -> 1[7] [receive] via NET/IB/3/GDRDMA/Shared | |
| worker-0:544576:545780 [5] NCCL INFO Channel 02/1 : 0[5] -> 1[5] [send] via NET/IB/1/GDRDMA/Shared | |
| worker-0:544576:545780 [5] NCCL INFO Channel 03/1 : 0[5] -> 1[5] [send] via NET/IB/1/GDRDMA/Shared | |
| worker-1:544445:545624 [6] NCCL INFO ncclCommInitRank comm 0x558c66abf370 rank 1 nranks 2 cudaDev 6 nvmlDev 6 busId b3000 commId 0x2131b074d138773a - Init COMPLETE | |
| worker-1:544445:545662 [6] NCCL INFO Channel 02/1 : 0[6] -> 1[6] [receive] via NET/IB/2/GDRDMA/Shared | |
| worker-1:544445:545662 [6] NCCL INFO Channel 03/1 : 0[6] -> 1[6] [receive] via NET/IB/2/GDRDMA/Shared | |
| worker-0:544572:545729 [1] NCCL INFO ncclCommInitRank comm 0x556975eae0b0 rank 0 nranks 2 cudaDev 1 nvmlDev 1 busId 91000 commId 0x4b0c98864598e9fe - Init COMPLETE | |
| worker-1:544444:545620 [5] NCCL INFO ncclCommInitRank comm 0x55ca5f9cd020 rank 1 nranks 2 cudaDev 5 nvmlDev 5 busId af000 commId 0x48d25f3c2cff0fd6 - Init COMPLETE | |
| worker-0:544571:545782 [0] NCCL INFO Using non-device net plugin version 0 | |
| worker-0:544571:545782 [0] NCCL INFO Using network IB | |
| worker-0:544572:545783 [1] NCCL INFO Channel 02/1 : 0[1] -> 1[1] [send] via NET/IB/5/GDRDMA/Shared | |
| worker-0:544572:545783 [1] NCCL INFO Channel 03/1 : 0[1] -> 1[1] [send] via NET/IB/5/GDRDMA/Shared | |
| worker-1:544444:545663 [5] NCCL INFO Channel 02/1 : 0[5] -> 1[5] [receive] via NET/IB/1/GDRDMA/Shared | |
| worker-1:544444:545663 [5] NCCL INFO Channel 03/1 : 0[5] -> 1[5] [receive] via NET/IB/1/GDRDMA/Shared | |
| worker-1:544440:545619 [1] NCCL INFO ncclCommInitRank comm 0x55a19b08e290 rank 1 nranks 2 cudaDev 1 nvmlDev 1 busId 91000 commId 0x4b0c98864598e9fe - Init COMPLETE | |
| worker-0:544575:545784 [4] NCCL INFO Using non-device net plugin version 0 | |
| worker-0:544575:545784 [4] NCCL INFO Using network IB | |
| worker-1:544440:545664 [1] NCCL INFO Channel 02/1 : 0[1] -> 1[1] [receive] via NET/IB/5/GDRDMA/Shared | |
| worker-1:544440:545664 [1] NCCL INFO Channel 03/1 : 0[1] -> 1[1] [receive] via NET/IB/5/GDRDMA/Shared | |
| worker-0:544573:545785 [2] NCCL INFO Using non-device net plugin version 0 | |
| worker-0:544573:545785 [2] NCCL INFO Using network IB | |
| worker-0:544574:545786 [3] NCCL INFO Using non-device net plugin version 0 | |
| worker-0:544574:545786 [3] NCCL INFO Using network IB | |
| worker-0:544578:545787 [7] NCCL INFO Using non-device net plugin version 0 | |
| worker-0:544578:545787 [7] NCCL INFO Using network IB | |
| worker-0:544577:545788 [6] NCCL INFO Using non-device net plugin version 0 | |
| worker-0:544577:545788 [6] NCCL INFO Using network IB | |
| worker-0:544576:545789 [5] NCCL INFO Using non-device net plugin version 0 | |
| worker-0:544576:545789 [5] NCCL INFO Using network IB | |
| worker-0:544572:545790 [1] NCCL INFO Using non-device net plugin version 0 | |
| worker-0:544572:545790 [1] NCCL INFO Using network IB | |
| worker-0:544578:545787 [7] NCCL INFO ncclCommInitRank comm 0x55a99bcd4aa0 rank 7 nranks 8 cudaDev 7 nvmlDev 7 busId b7000 commId 0x778b3f5043bf8e36 - Init START | |
| worker-0:544577:545788 [6] NCCL INFO ncclCommInitRank comm 0x559a9b5d95f0 rank 6 nranks 8 cudaDev 6 nvmlDev 6 busId b3000 commId 0x778b3f5043bf8e36 - Init START | |
| worker-0:544571:545782 [0] NCCL INFO ncclCommInitRank comm 0x55ec66bf52e0 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 8d000 commId 0x778b3f5043bf8e36 - Init START | |
| worker-0:544576:545789 [5] NCCL INFO ncclCommInitRank comm 0x55c6aa5c6b00 rank 5 nranks 8 cudaDev 5 nvmlDev 5 busId af000 commId 0x778b3f5043bf8e36 - Init START | |
| worker-0:544572:545790 [1] NCCL INFO ncclCommInitRank comm 0x556975ec4a10 rank 1 nranks 8 cudaDev 1 nvmlDev 1 busId 91000 commId 0x778b3f5043bf8e36 - Init START | |
| worker-0:544573:545785 [2] NCCL INFO ncclCommInitRank comm 0x5640ba357a80 rank 2 nranks 8 cudaDev 2 nvmlDev 2 busId 95000 commId 0x778b3f5043bf8e36 - Init START | |
| worker-0:544575:545784 [4] NCCL INFO ncclCommInitRank comm 0x5571647ec3b0 rank 4 nranks 8 cudaDev 4 nvmlDev 4 busId ab000 commId 0x778b3f5043bf8e36 - Init START | |
| worker-0:544574:545786 [3] NCCL INFO ncclCommInitRank comm 0x555924c564e0 rank 3 nranks 8 cudaDev 3 nvmlDev 3 busId 99000 commId 0x778b3f5043bf8e36 - Init START | |
| worker-0:544571:545782 [0] NCCL INFO MNNVL busId 0x8d000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-0:544578:545787 [7] NCCL INFO MNNVL busId 0xb7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-0:544576:545789 [5] NCCL INFO MNNVL busId 0xaf000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-0:544577:545788 [6] NCCL INFO MNNVL busId 0xb3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-0:544572:545790 [1] NCCL INFO MNNVL busId 0x91000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-0:544573:545785 [2] NCCL INFO MNNVL busId 0x95000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-0:544574:545786 [3] NCCL INFO MNNVL busId 0x99000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-0:544575:545784 [4] NCCL INFO MNNVL busId 0xab000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| NCCL version 2.21.5+cuda12.4 | |
| worker-1:544439:545666 [0] NCCL INFO Using non-device net plugin version 0 | |
| worker-1:544439:545666 [0] NCCL INFO Using network IB | |
| worker-1:544440:545667 [1] NCCL INFO Using non-device net plugin version 0 | |
| worker-1:544440:545667 [1] NCCL INFO Using network IB | |
| worker-1:544445:545669 [6] NCCL INFO Using non-device net plugin version 0 | |
| worker-1:544446:545668 [7] NCCL INFO Using non-device net plugin version 0 | |
| worker-1:544445:545669 [6] NCCL INFO Using network IB | |
| worker-1:544446:545668 [7] NCCL INFO Using network IB | |
| worker-1:544441:545671 [2] NCCL INFO Using non-device net plugin version 0 | |
| worker-1:544441:545671 [2] NCCL INFO Using network IB | |
| worker-1:544442:545670 [3] NCCL INFO Using non-device net plugin version 0 | |
| worker-1:544442:545670 [3] NCCL INFO Using network IB | |
| worker-1:544444:545673 [5] NCCL INFO Using non-device net plugin version 0 | |
| worker-1:544444:545673 [5] NCCL INFO Using network IB | |
| worker-1:544443:545672 [4] NCCL INFO Using non-device net plugin version 0 | |
| worker-1:544443:545672 [4] NCCL INFO Using network IB | |
| worker-1:544444:545673 [5] NCCL INFO ncclCommInitRank comm 0x55ca5f9e3ac0 rank 5 nranks 8 cudaDev 5 nvmlDev 5 busId af000 commId 0x580fa346b03991d9 - Init START | |
| worker-1:544442:545670 [3] NCCL INFO ncclCommInitRank comm 0x55c0f433aef0 rank 3 nranks 8 cudaDev 3 nvmlDev 3 busId 99000 commId 0x580fa346b03991d9 - Init START | |
| worker-1:544443:545672 [4] NCCL INFO ncclCommInitRank comm 0x5645eba8dd80 rank 4 nranks 8 cudaDev 4 nvmlDev 4 busId ab000 commId 0x580fa346b03991d9 - Init START | |
| worker-1:544441:545671 [2] NCCL INFO ncclCommInitRank comm 0x56101a4ac2a0 rank 2 nranks 8 cudaDev 2 nvmlDev 2 busId 95000 commId 0x580fa346b03991d9 - Init START | |
| worker-1:544440:545667 [1] NCCL INFO ncclCommInitRank comm 0x55a19b0a4e30 rank 1 nranks 8 cudaDev 1 nvmlDev 1 busId 91000 commId 0x580fa346b03991d9 - Init START | |
| worker-1:544439:545666 [0] NCCL INFO ncclCommInitRank comm 0x55d1fd231e80 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 8d000 commId 0x580fa346b03991d9 - Init START | |
| worker-1:544446:545668 [7] NCCL INFO ncclCommInitRank comm 0x561e314d06d0 rank 7 nranks 8 cudaDev 7 nvmlDev 7 busId b7000 commId 0x580fa346b03991d9 - Init START | |
| worker-1:544445:545669 [6] NCCL INFO ncclCommInitRank comm 0x558c66ad5e60 rank 6 nranks 8 cudaDev 6 nvmlDev 6 busId b3000 commId 0x580fa346b03991d9 - Init START | |
| worker-1:544444:545673 [5] NCCL INFO MNNVL busId 0xaf000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-1:544442:545670 [3] NCCL INFO MNNVL busId 0x99000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-1:544443:545672 [4] NCCL INFO MNNVL busId 0xab000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-1:544439:545666 [0] NCCL INFO MNNVL busId 0x8d000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-1:544441:545671 [2] NCCL INFO MNNVL busId 0x95000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-1:544440:545667 [1] NCCL INFO MNNVL busId 0x91000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-1:544446:545668 [7] NCCL INFO MNNVL busId 0xb7000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-1:544445:545669 [6] NCCL INFO MNNVL busId 0xb3000 fabric UUID 0.0 cliqueId 0x0 state 3 healthMask 0x0 | |
| worker-1:544445:545669 [6] NCCL INFO Setting affinity for GPU 6 to ffff,ffffffff,00000000,00000000 | |
| worker-1:544445:545669 [6] NCCL INFO NVLS multicast support is available on dev 6 | |
| worker-0:544575:545784 [4] NCCL INFO Setting affinity for GPU 4 to ffff,ffffffff,00000000,00000000 | |
| worker-0:544575:545784 [4] NCCL INFO NVLS multicast support is available on dev 4 | |
| worker-0:544571:545782 [0] NCCL INFO Setting affinity for GPU 0 to ffff,ffffffff | |
| worker-0:544571:545782 [0] NCCL INFO NVLS multicast support is available on dev 0 | |
| worker-0:544574:545786 [3] NCCL INFO Setting affinity for GPU 3 to ffff,ffffffff | |
| worker-0:544574:545786 [3] NCCL INFO NVLS multicast support is available on dev 3 | |
| worker-1:544444:545673 [5] NCCL INFO Setting affinity for GPU 5 to ffff,ffffffff,00000000,00000000 | |
| worker-1:544444:545673 [5] NCCL INFO NVLS multicast support is available on dev 5 | |
| worker-1:544439:545666 [0] NCCL INFO Setting affinity for GPU 0 to ffff,ffffffff | |
| worker-1:544439:545666 [0] NCCL INFO NVLS multicast support is available on dev 0 | |
| worker-0:544572:545790 [1] NCCL INFO Setting affinity for GPU 1 to ffff,ffffffff | |
| worker-0:544572:545790 [1] NCCL INFO NVLS multicast support is available on dev 1 | |
| worker-0:544577:545788 [6] NCCL INFO Setting affinity for GPU 6 to ffff,ffffffff,00000000,00000000 | |
| worker-0:544577:545788 [6] NCCL INFO NVLS multicast support is available on dev 6 | |
| worker-0:544576:545789 [5] NCCL INFO Setting affinity for GPU 5 to ffff,ffffffff,00000000,00000000 | |
| worker-0:544576:545789 [5] NCCL INFO NVLS multicast support is available on dev 5 | |
| worker-0:544573:545785 [2] NCCL INFO Setting affinity for GPU 2 to ffff,ffffffff | |
| worker-0:544573:545785 [2] NCCL INFO NVLS multicast support is available on dev 2 | |
| worker-0:544578:545787 [7] NCCL INFO Setting affinity for GPU 7 to ffff,ffffffff,00000000,00000000 | |
| worker-0:544578:545787 [7] NCCL INFO NVLS multicast support is available on dev 7 | |
| worker-1:544442:545670 [3] NCCL INFO Setting affinity for GPU 3 to ffff,ffffffff | |
| worker-1:544442:545670 [3] NCCL INFO NVLS multicast support is available on dev 3 | |
| worker-0:544578:545787 [7] NCCL INFO comm 0x55a99bcd4aa0 rank 7 nRanks 8 nNodes 1 localRanks 8 localRank 7 MNNVL 0 | |
| worker-0:544573:545785 [2] NCCL INFO comm 0x5640ba357a80 rank 2 nRanks 8 nNodes 1 localRanks 8 localRank 2 MNNVL 0 | |
| worker-0:544577:545788 [6] NCCL INFO comm 0x559a9b5d95f0 rank 6 nRanks 8 nNodes 1 localRanks 8 localRank 6 MNNVL 0 | |
| worker-0:544574:545786 [3] NCCL INFO comm 0x555924c564e0 rank 3 nRanks 8 nNodes 1 localRanks 8 localRank 3 MNNVL 0 | |
| worker-0:544576:545789 [5] NCCL INFO comm 0x55c6aa5c6b00 rank 5 nRanks 8 nNodes 1 localRanks 8 localRank 5 MNNVL 0 | |
| worker-0:544572:545790 [1] NCCL INFO comm 0x556975ec4a10 rank 1 nRanks 8 nNodes 1 localRanks 8 localRank 1 MNNVL 0 | |
| worker-0:544578:545787 [7] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] -1/-1/-1->7->6 [2] -1/-1/-1->7->6 [3] -1/-1/-1->7->6 [4] -1/-1/-1->7->6 [5] -1/-1/-1->7->6 [6] -1/-1/-1->7->6 [7] -1/-1/-1->7->6 [8] -1/-1/-1->7->6 [9] -1/-1/-1->7->6 [10] -1/-1/-1->7->6 [11] -1/-1/-1->7->6 [12] -1/-1/-1->7->6 [13] -1/-1/-1->7->6 [14] -1/-1/-1->7->6 [15] -1/-1/-1->7->6 [16] -1/-1/-1->7->6 [17] -1/-1/-1->7->6 [18] -1/-1/-1->7->6 [19] -1/-1/-1->7->6 [20] -1/-1/-1->7->6 [21] -1/-1/-1->7->6 [22] -1/-1/-1->7->6 [23] -1/-1/-1->7->6 | |
| worker-0:544573:545785 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1 | |
| worker-0:544575:545784 [4] NCCL INFO comm 0x5571647ec3b0 rank 4 nRanks 8 nNodes 1 localRanks 8 localRank 4 MNNVL 0 | |
| worker-0:544577:545788 [6] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 [2] 7/-1/-1->6->5 [3] 7/-1/-1->6->5 [4] 7/-1/-1->6->5 [5] 7/-1/-1->6->5 [6] 7/-1/-1->6->5 [7] 7/-1/-1->6->5 [8] 7/-1/-1->6->5 [9] 7/-1/-1->6->5 [10] 7/-1/-1->6->5 [11] 7/-1/-1->6->5 [12] 7/-1/-1->6->5 [13] 7/-1/-1->6->5 [14] 7/-1/-1->6->5 [15] 7/-1/-1->6->5 [16] 7/-1/-1->6->5 [17] 7/-1/-1->6->5 [18] 7/-1/-1->6->5 [19] 7/-1/-1->6->5 [20] 7/-1/-1->6->5 [21] 7/-1/-1->6->5 [22] 7/-1/-1->6->5 [23] 7/-1/-1->6->5 | |
| worker-0:544578:545787 [7] NCCL INFO P2P Chunksize set to 524288 | |
| worker-0:544571:545782 [0] NCCL INFO comm 0x55ec66bf52e0 rank 0 nRanks 8 nNodes 1 localRanks 8 localRank 0 MNNVL 0 | |
| worker-0:544573:545785 [2] NCCL INFO P2P Chunksize set to 524288 | |
| worker-0:544572:545790 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0 | |
| worker-0:544574:545786 [3] NCCL INFO Trees [0] 4/-1/-1->3->2 [1] 4/-1/-1->3->2 [2] 4/-1/-1->3->2 [3] 4/-1/-1->3->2 [4] 4/-1/-1->3->2 [5] 4/-1/-1->3->2 [6] 4/-1/-1->3->2 [7] 4/-1/-1->3->2 [8] 4/-1/-1->3->2 [9] 4/-1/-1->3->2 [10] 4/-1/-1->3->2 [11] 4/-1/-1->3->2 [12] 4/-1/-1->3->2 [13] 4/-1/-1->3->2 [14] 4/-1/-1->3->2 [15] 4/-1/-1->3->2 [16] 4/-1/-1->3->2 [17] 4/-1/-1->3->2 [18] 4/-1/-1->3->2 [19] 4/-1/-1->3->2 [20] 4/-1/-1->3->2 [21] 4/-1/-1->3->2 [22] 4/-1/-1->3->2 [23] 4/-1/-1->3->2 | |
| worker-0:544576:545789 [5] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/-1/-1->5->4 [2] 6/-1/-1->5->4 [3] 6/-1/-1->5->4 [4] 6/-1/-1->5->4 [5] 6/-1/-1->5->4 [6] 6/-1/-1->5->4 [7] 6/-1/-1->5->4 [8] 6/-1/-1->5->4 [9] 6/-1/-1->5->4 [10] 6/-1/-1->5->4 [11] 6/-1/-1->5->4 [12] 6/-1/-1->5->4 [13] 6/-1/-1->5->4 [14] 6/-1/-1->5->4 [15] 6/-1/-1->5->4 [16] 6/-1/-1->5->4 [17] 6/-1/-1->5->4 [18] 6/-1/-1->5->4 [19] 6/-1/-1->5->4 [20] 6/-1/-1->5->4 [21] 6/-1/-1->5->4 [22] 6/-1/-1->5->4 [23] 6/-1/-1->5->4 | |
| worker-0:544577:545788 [6] NCCL INFO P2P Chunksize set to 524288 | |
| worker-0:544572:545790 [1] NCCL INFO P2P Chunksize set to 524288 | |
| worker-0:544574:545786 [3] NCCL INFO P2P Chunksize set to 524288 | |
| worker-0:544576:545789 [5] NCCL INFO P2P Chunksize set to 524288 | |
| worker-0:544571:545782 [0] NCCL INFO Channel 00/24 : 0 1 2 3 4 5 6 7 | |
| worker-0:544575:545784 [4] NCCL INFO Trees [0] 5/-1/-1->4->3 [1] 5/-1/-1->4->3 [2] 5/-1/-1->4->3 [3] 5/-1/-1->4->3 [4] 5/-1/-1->4->3 [5] 5/-1/-1->4->3 [6] 5/-1/-1->4->3 [7] 5/-1/-1->4->3 [8] 5/-1/-1->4->3 [9] 5/-1/-1->4->3 [10] 5/-1/-1->4->3 [11] 5/-1/-1->4->3 [12] 5/-1/-1->4->3 [13] 5/-1/-1->4->3 [14] 5/-1/-1->4->3 [15] 5/-1/-1->4->3 [16] 5/-1/-1->4->3 [17] 5/-1/-1->4->3 [18] 5/-1/-1->4->3 [19] 5/-1/-1->4->3 [20] 5/-1/-1->4->3 [21] 5/-1/-1->4->3 [22] 5/-1/-1->4->3 [23] 5/-1/-1->4->3 | |
| worker-0:544571:545782 [0] NCCL INFO Channel 01/24 : 0 1 2 3 4 5 6 7 | |
| worker-0:544571:545782 [0] NCCL INFO Channel 02/24 : 0 1 2 3 4 5 6 7 | |
| worker-0:544575:545784 [4] NCCL INFO P2P Chunksize set to 524288 | |
| worker-0:544571:545782 [0] NCCL INFO Channel 03/24 : 0 1 2 3 4 5 6 7 | |
| worker-0:544571:545782 [0] NCCL INFO Channel 04/24 : 0 1 2 3 4 5 6 7 | |
| worker-0:544571:545782 [0] NCCL INFO Channel 05/24 : 0 1 2 3 4 5 6 7 | |
| worker-0:544571:545782 [0] NCCL INFO Channel 06/24 : 0 1 2 3 4 5 6 7 | |
| worker-0:544571:545782 [0] NCCL INFO Channel 07/24 : 0 1 2 3 4 5 6 7 | |
| worker-0:544571:545782 [0] NCCL INFO Channel 08/24 : 0 1 2 3 4 5 6 7 | |
| worker-0:544571:545782 [0] NCCL INFO Channel 09/24 : 0 1 2 3 4 5 6 7 | |
| worker-0:544571:545782 [0] NCCL INFO Channel 10/24 : 0 1 2 3 4 5 6 7 | |
| worker-0:544571:545782 [0] NCCL INFO Channel 11/24 : 0 1 2 3 4 5 6 7 | |
| worker-0:544571:545782 [0] NCCL INFO Channel 12/24 : 0 1 2 3 4 5 6 7 | |
| worker-0:544571:545782 [0] NCCL INFO Channel 13/24 : 0 1 2 3 4 5 6 7 | |
| worker-0:544571:545782 [0] NCCL INFO Channel 14/24 : 0 1 2 3 4 5 6 7 | |
| worker-0:544571:545782 [0] NCCL INFO Channel 15/24 : 0 1 2 3 4 5 6 7 | |
| worker-0:544571:545782 [0] NCCL INFO Channel 16/24 : 0 1 2 3 4 5 6 7 | |
| worker-0:544571:545782 [0] NCCL INFO Channel 17/24 : 0 1 2 3 4 5 6 7 | |
| worker-0:544571:545782 [0] NCCL INFO Channel 18/24 : 0 1 2 3 4 5 6 7 | |
| worker-0:544571:545782 [0] NCCL INFO Channel 19/24 : 0 1 2 3 4 5 6 7 | |
| worker-0:544571:545782 [0] NCCL INFO Channel 20/24 : 0 1 2 3 4 5 6 7 | |
| worker-0:544571:545782 [0] NCCL INFO Channel 21/24 : 0 1 2 3 4 5 6 7 | |
| worker-0:544571:545782 [0] NCCL INFO Channel 22/24 : 0 1 2 3 4 5 6 7 | |
| worker-0:544571:545782 [0] NCCL INFO Channel 23/24 : 0 1 2 3 4 5 6 7 | |
| worker-0:544571:545782 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1 | |
| worker-0:544571:545782 [0] NCCL INFO P2P Chunksize set to 524288 | |
| worker-1:544443:545672 [4] NCCL INFO Setting affinity for GPU 4 to ffff,ffffffff,00000000,00000000 | |
| worker-1:544443:545672 [4] NCCL INFO NVLS multicast support is available on dev 4 | |
| worker-1:544446:545668 [7] NCCL INFO Setting affinity for GPU 7 to ffff,ffffffff,00000000,00000000 | |
| worker-1:544446:545668 [7] NCCL INFO NVLS multicast support is available on dev 7 | |
| worker-1:544441:545671 [2] NCCL INFO Setting affinity for GPU 2 to ffff,ffffffff | |
| worker-1:544441:545671 [2] NCCL INFO NVLS multicast support is available on dev 2 | |
| worker-1:544440:545667 [1] NCCL INFO Setting affinity for GPU 1 to ffff,ffffffff | |
| worker-1:544440:545667 [1] NCCL INFO NVLS multicast support is available on dev 1 | |
| worker-1:544445:545669 [6] NCCL INFO comm 0x558c66ad5e60 rank 6 nRanks 8 nNodes 1 localRanks 8 localRank 6 MNNVL 0 | |
| worker-1:544440:545667 [1] NCCL INFO comm 0x55a19b0a4e30 rank 1 nRanks 8 nNodes 1 localRanks 8 localRank 1 MNNVL 0 | |
| worker-1:544446:545668 [7] NCCL INFO comm 0x561e314d06d0 rank 7 nRanks 8 nNodes 1 localRanks 8 localRank 7 MNNVL 0 | |
| worker-1:544443:545672 [4] NCCL INFO comm 0x5645eba8dd80 rank 4 nRanks 8 nNodes 1 localRanks 8 localRank 4 MNNVL 0 | |
| worker-1:544444:545673 [5] NCCL INFO comm 0x55ca5f9e3ac0 rank 5 nRanks 8 nNodes 1 localRanks 8 localRank 5 MNNVL 0 | |
| worker-1:544441:545671 [2] NCCL INFO comm 0x56101a4ac2a0 rank 2 nRanks 8 nNodes 1 localRanks 8 localRank 2 MNNVL 0 | |
| worker-1:544442:545670 [3] NCCL INFO comm 0x55c0f433aef0 rank 3 nRanks 8 nNodes 1 localRanks 8 localRank 3 MNNVL 0 | |
| worker-1:544439:545666 [0] NCCL INFO comm 0x55d1fd231e80 rank 0 nRanks 8 nNodes 1 localRanks 8 localRank 0 MNNVL 0 | |
| worker-1:544440:545667 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0 | |
| worker-1:544440:545667 [1] NCCL INFO P2P Chunksize set to 524288 | |
| worker-1:544439:545666 [0] NCCL INFO Channel 00/24 : 0 1 2 3 4 5 6 7 | |
| worker-1:544445:545669 [6] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 [2] 7/-1/-1->6->5 [3] 7/-1/-1->6->5 [4] 7/-1/-1->6->5 [5] 7/-1/-1->6->5 [6] 7/-1/-1->6->5 [7] 7/-1/-1->6->5 [8] 7/-1/-1->6->5 [9] 7/-1/-1->6->5 [10] 7/-1/-1->6->5 [11] 7/-1/-1->6->5 [12] 7/-1/-1->6->5 [13] 7/-1/-1->6->5 [14] 7/-1/-1->6->5 [15] 7/-1/-1->6->5 [16] 7/-1/-1->6->5 [17] 7/-1/-1->6->5 [18] 7/-1/-1->6->5 [19] 7/-1/-1->6->5 [20] 7/-1/-1->6->5 [21] 7/-1/-1->6->5 [22] 7/-1/-1->6->5 [23] 7/-1/-1->6->5 | |
| worker-1:544441:545671 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1 | |
| worker-1:544442:545670 [3] NCCL INFO Trees [0] 4/-1/-1->3->2 [1] 4/-1/-1->3->2 [2] 4/-1/-1->3->2 [3] 4/-1/-1->3->2 [4] 4/-1/-1->3->2 [5] 4/-1/-1->3->2 [6] 4/-1/-1->3->2 [7] 4/-1/-1->3->2 [8] 4/-1/-1->3->2 [9] 4/-1/-1->3->2 [10] 4/-1/-1->3->2 [11] 4/-1/-1->3->2 [12] 4/-1/-1->3->2 [13] 4/-1/-1->3->2 [14] 4/-1/-1->3->2 [15] 4/-1/-1->3->2 [16] 4/-1/-1->3->2 [17] 4/-1/-1->3->2 [18] 4/-1/-1->3->2 [19] 4/-1/-1->3->2 [20] 4/-1/-1->3->2 [21] 4/-1/-1->3->2 [22] 4/-1/-1->3->2 [23] 4/-1/-1->3->2 | |
| worker-1:544446:545668 [7] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] -1/-1/-1->7->6 [2] -1/-1/-1->7->6 [3] -1/-1/-1->7->6 [4] -1/-1/-1->7->6 [5] -1/-1/-1->7->6 [6] -1/-1/-1->7->6 [7] -1/-1/-1->7->6 [8] -1/-1/-1->7->6 [9] -1/-1/-1->7->6 [10] -1/-1/-1->7->6 [11] -1/-1/-1->7->6 [12] -1/-1/-1->7->6 [13] -1/-1/-1->7->6 [14] -1/-1/-1->7->6 [15] -1/-1/-1->7->6 [16] -1/-1/-1->7->6 [17] -1/-1/-1->7->6 [18] -1/-1/-1->7->6 [19] -1/-1/-1->7->6 [20] -1/-1/-1->7->6 [21] -1/-1/-1->7->6 [22] -1/-1/-1->7->6 [23] -1/-1/-1->7->6 | |
| worker-1:544439:545666 [0] NCCL INFO Channel 01/24 : 0 1 2 3 4 5 6 7 | |
| worker-1:544445:545669 [6] NCCL INFO P2P Chunksize set to 524288 | |
| worker-1:544441:545671 [2] NCCL INFO P2P Chunksize set to 524288 | |
| worker-1:544439:545666 [0] NCCL INFO Channel 02/24 : 0 1 2 3 4 5 6 7 | |
| worker-1:544442:545670 [3] NCCL INFO P2P Chunksize set to 524288 | |
| worker-1:544446:545668 [7] NCCL INFO P2P Chunksize set to 524288 | |
| worker-1:544439:545666 [0] NCCL INFO Channel 03/24 : 0 1 2 3 4 5 6 7 | |
| worker-1:544444:545673 [5] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/-1/-1->5->4 [2] 6/-1/-1->5->4 [3] 6/-1/-1->5->4 [4] 6/-1/-1->5->4 [5] 6/-1/-1->5->4 [6] 6/-1/-1->5->4 [7] 6/-1/-1->5->4 [8] 6/-1/-1->5->4 [9] 6/-1/-1->5->4 [10] 6/-1/-1->5->4 [11] 6/-1/-1->5->4 [12] 6/-1/-1->5->4 [13] 6/-1/-1->5->4 [14] 6/-1/-1->5->4 [15] 6/-1/-1->5->4 [16] 6/-1/-1->5->4 [17] 6/-1/-1->5->4 [18] 6/-1/-1->5->4 [19] 6/-1/-1->5->4 [20] 6/-1/-1->5->4 [21] 6/-1/-1->5->4 [22] 6/-1/-1->5->4 [23] 6/-1/-1->5->4 | |
| worker-1:544439:545666 [0] NCCL INFO Channel 04/24 : 0 1 2 3 4 5 6 7 | |
| worker-1:544443:545672 [4] NCCL INFO Trees [0] 5/-1/-1->4->3 [1] 5/-1/-1->4->3 [2] 5/-1/-1->4->3 [3] 5/-1/-1->4->3 [4] 5/-1/-1->4->3 [5] 5/-1/-1->4->3 [6] 5/-1/-1->4->3 [7] 5/-1/-1->4->3 [8] 5/-1/-1->4->3 [9] 5/-1/-1->4->3 [10] 5/-1/-1->4->3 [11] 5/-1/-1->4->3 [12] 5/-1/-1->4->3 [13] 5/-1/-1->4->3 [14] 5/-1/-1->4->3 [15] 5/-1/-1->4->3 [16] 5/-1/-1->4->3 [17] 5/-1/-1->4->3 [18] 5/-1/-1->4->3 [19] 5/-1/-1->4->3 [20] 5/-1/-1->4->3 [21] 5/-1/-1->4->3 [22] 5/-1/-1->4->3 [23] 5/-1/-1->4->3 | |
| worker-1:544439:545666 [0] NCCL INFO Channel 05/24 : 0 1 2 3 4 5 6 7 | |
| worker-1:544439:545666 [0] NCCL INFO Channel 06/24 : 0 1 2 3 4 5 6 7 | |
| worker-1:544444:545673 [5] NCCL INFO P2P Chunksize set to 524288 | |
| worker-1:544439:545666 [0] NCCL INFO Channel 07/24 : 0 1 2 3 4 5 6 7 | |
| worker-1:544443:545672 [4] NCCL INFO P2P Chunksize set to 524288 | |
| worker-1:544439:545666 [0] NCCL INFO Channel 08/24 : 0 1 2 3 4 5 6 7 | |
| worker-1:544439:545666 [0] NCCL INFO Channel 09/24 : 0 1 2 3 4 5 6 7 | |
| worker-1:544439:545666 [0] NCCL INFO Channel 10/24 : 0 1 2 3 4 5 6 7 | |
| worker-1:544439:545666 [0] NCCL INFO Channel 11/24 : 0 1 2 3 4 5 6 7 | |
| worker-1:544439:545666 [0] NCCL INFO Channel 12/24 : 0 1 2 3 4 5 6 7 | |
| worker-1:544439:545666 [0] NCCL INFO Channel 13/24 : 0 1 2 3 4 5 6 7 | |
| worker-1:544439:545666 [0] NCCL INFO Channel 14/24 : 0 1 2 3 4 5 6 7 | |
| worker-1:544439:545666 [0] NCCL INFO Channel 15/24 : 0 1 2 3 4 5 6 7 | |
| worker-1:544439:545666 [0] NCCL INFO Channel 16/24 : 0 1 2 3 4 5 6 7 | |
| worker-1:544439:545666 [0] NCCL INFO Channel 17/24 : 0 1 2 3 4 5 6 7 | |
| worker-1:544439:545666 [0] NCCL INFO Channel 18/24 : 0 1 2 3 4 5 6 7 | |
| worker-1:544439:545666 [0] NCCL INFO Channel 19/24 : 0 1 2 3 4 5 6 7 | |
| worker-1:544439:545666 [0] NCCL INFO Channel 20/24 : 0 1 2 3 4 5 6 7 | |
| worker-1:544439:545666 [0] NCCL INFO Channel 21/24 : 0 1 2 3 4 5 6 7 | |
| worker-1:544439:545666 [0] NCCL INFO Channel 22/24 : 0 1 2 3 4 5 6 7 | |
| worker-1:544439:545666 [0] NCCL INFO Channel 23/24 : 0 1 2 3 4 5 6 7 | |
| worker-1:544439:545666 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1 | |
| worker-1:544439:545666 [0] NCCL INFO P2P Chunksize set to 524288 | |
| worker-0:544574:545786 [3] NCCL INFO Channel 00/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 01/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 02/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 03/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 04/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 05/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 06/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 07/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 08/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 09/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 10/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 11/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 12/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545782 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 13/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545782 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 14/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545782 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 15/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545782 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 16/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545782 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 17/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545782 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 18/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545782 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 19/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545782 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 20/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545782 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 21/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545782 [0] NCCL INFO Channel 09/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 22/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545782 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 23/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545782 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544571:545782 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544571:545782 [0] NCCL INFO Channel 13/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 00/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 01/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 02/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:545782 [0] NCCL INFO Channel 14/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 03/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:545782 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 04/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:545782 [0] NCCL INFO Channel 16/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 05/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:545782 [0] NCCL INFO Channel 17/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 06/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:545782 [0] NCCL INFO Channel 18/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 07/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:545782 [0] NCCL INFO Channel 19/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 08/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:545782 [0] NCCL INFO Channel 20/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 09/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:545782 [0] NCCL INFO Channel 21/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 10/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:545782 [0] NCCL INFO Channel 22/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544571:545782 [0] NCCL INFO Channel 23/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 00/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 01/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 02/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 03/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 00/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 04/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 01/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 05/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 02/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 11/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 03/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 12/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 04/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 13/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 05/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 14/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 06/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 15/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 03/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 10/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 07/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 16/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 08/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 17/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 09/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 18/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 13/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 10/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 19/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 14/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 06/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 07/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 16/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 08/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 11/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 17/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 09/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 18/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 10/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 13/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 19/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 11/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 14/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 20/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 12/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 21/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 20/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 11/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 13/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 22/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 21/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 14/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 23/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 15/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 16/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 16/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 00/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 12/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 17/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 17/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 01/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 13/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 18/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 18/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 02/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 14/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 19/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 19/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 20/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 03/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 22/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 21/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 15/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 04/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 23/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 22/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 16/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 05/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 23/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 17/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 06/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 18/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 07/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 20/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 21/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 19/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 08/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 22/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 09/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 23/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 20/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 10/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 21/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 11/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 22/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 12/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 23/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 13/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 14/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 15/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 16/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 17/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 18/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 19/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 20/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 21/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 00/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 22/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 01/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 23/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 02/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 03/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 04/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 05/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 06/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 07/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 08/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 09/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 10/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 11/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 12/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 13/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 14/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 00/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 15/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 01/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 16/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 17/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 02/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 18/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 03/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 19/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 04/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 20/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 05/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 21/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 06/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 22/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 07/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 23/0 : 3[3] -> 4[4] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 08/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 09/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 10/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 11/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 12/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 00/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545666 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 01/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545666 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 02/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 10/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 03/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 04/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 13/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545666 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 05/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 14/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545666 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 06/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 15/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545666 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 07/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 16/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 13/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545666 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 08/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 17/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 14/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545666 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 09/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 18/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 15/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545666 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 10/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 19/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 16/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545666 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 17/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545666 [0] NCCL INFO Channel 09/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 18/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545666 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 19/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545666 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 20/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545666 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 21/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545666 [0] NCCL INFO Channel 13/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 22/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545666 [0] NCCL INFO Channel 14/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 23/0 : 1[1] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545666 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 00/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-1:544439:545666 [0] NCCL INFO Channel 16/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 11/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 01/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-1:544439:545666 [0] NCCL INFO Channel 17/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 12/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 02/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-1:544439:545666 [0] NCCL INFO Channel 18/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 13/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 03/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-1:544439:545666 [0] NCCL INFO Channel 19/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 14/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 00/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 04/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 20/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545666 [0] NCCL INFO Channel 20/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 01/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 05/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-1:544439:545666 [0] NCCL INFO Channel 21/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 06/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-1:544439:545666 [0] NCCL INFO Channel 22/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 07/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 15/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 21/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 02/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 08/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 16/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 22/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545666 [0] NCCL INFO Channel 23/0 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 09/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 03/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 17/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 23/0 : 6[6] -> 7[7] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 10/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 04/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 18/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 11/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 05/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 19/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 12/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 06/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 13/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 20/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 14/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 21/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 07/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 15/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 22/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 16/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 08/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 23/0 : 5[5] -> 6[6] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 17/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 18/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 19/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 09/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 10/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 20/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 11/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 21/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 12/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 22/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 13/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 23/0 : 7[7] -> 0[0] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 14/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 15/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 16/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 17/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 18/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 19/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 20/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 21/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 22/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 23/0 : 2[2] -> 3[3] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 00/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 01/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 02/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 03/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 04/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 05/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 06/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 07/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 08/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 09/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 10/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 11/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 12/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 13/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 14/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 15/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 16/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 17/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 18/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 19/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 20/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 21/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 22/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 23/0 : 4[4] -> 5[5] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Connected all rings | |
| worker-0:544574:545786 [3] NCCL INFO Connected all rings | |
| worker-0:544572:545790 [1] NCCL INFO Connected all rings | |
| worker-0:544571:545782 [0] NCCL INFO Connected all rings | |
| worker-0:544573:545785 [2] NCCL INFO Channel 00/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Connected all rings | |
| worker-1:544440:545667 [1] NCCL INFO Connected all rings | |
| worker-1:544439:545666 [0] NCCL INFO Connected all rings | |
| worker-0:544573:545785 [2] NCCL INFO Channel 01/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 02/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 03/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 04/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 05/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 06/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 07/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 08/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 09/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 10/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 11/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 12/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 13/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 14/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 15/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 16/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 17/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 18/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 19/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 20/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 21/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 22/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO Channel 23/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 00/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 01/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 00/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 02/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 01/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 03/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 02/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 04/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 00/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 03/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 05/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 01/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 02/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 04/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 06/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 03/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 04/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 05/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 05/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 07/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 06/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 08/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 06/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Connected all rings | |
| worker-0:544578:545787 [7] NCCL INFO Channel 00/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Connected all rings | |
| worker-0:544577:545788 [6] NCCL INFO Connected all rings | |
| worker-0:544575:545784 [4] NCCL INFO Channel 07/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 09/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 07/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 08/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 08/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 09/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 09/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 10/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 10/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 11/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 10/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 11/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 12/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 11/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 12/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 13/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 12/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 13/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 14/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 13/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 14/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 15/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 14/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 15/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 16/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 15/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 16/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 17/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 16/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 17/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 18/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 17/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 18/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 19/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 18/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 19/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 20/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 19/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 01/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 20/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 20/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 21/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 21/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 22/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 22/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 02/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544575:545784 [4] NCCL INFO Channel 23/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-0:544574:545786 [3] NCCL INFO Channel 23/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 03/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 04/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 05/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 21/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 06/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 22/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO Channel 23/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 07/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 08/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 09/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 10/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 11/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 12/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 13/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 14/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 15/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 16/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 17/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 00/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 18/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 19/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 01/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 20/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 21/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 22/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 02/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 00/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 03/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544578:545787 [7] NCCL INFO Channel 23/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 01/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 04/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 02/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 05/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 06/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 03/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 00/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 04/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 07/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 01/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 08/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 09/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 02/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 05/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 06/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 03/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 07/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 04/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 10/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 05/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 08/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 09/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 10/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 06/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 07/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 11/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Connected all rings | |
| worker-1:544442:545670 [3] NCCL INFO Connected all rings | |
| worker-1:544443:545672 [4] NCCL INFO Connected all rings | |
| worker-1:544440:545667 [1] NCCL INFO Channel 12/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 11/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 08/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 13/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 12/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 09/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 13/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 10/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 14/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 15/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 14/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 16/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 17/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 11/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 12/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 15/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 13/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 16/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 18/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 14/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 19/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 20/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 15/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 21/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 17/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 16/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 22/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-1:544440:545667 [1] NCCL INFO Channel 23/0 : 1[1] -> 0[0] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 18/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 17/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 19/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 18/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 19/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 00/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 20/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 01/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 00/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 02/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 21/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 20/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 01/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 03/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 04/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 21/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 22/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 22/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 00/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 05/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 01/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 02/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 06/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 02/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 03/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 07/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Connected all rings | |
| worker-1:544442:545670 [3] NCCL INFO Channel 03/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 00/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Connected all rings | |
| worker-1:544444:545673 [5] NCCL INFO Connected all rings | |
| worker-1:544443:545672 [4] NCCL INFO Channel 04/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 05/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 08/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 04/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 06/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 09/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 05/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 07/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 10/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 06/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO Channel 23/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO Channel 23/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 08/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 01/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 09/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 10/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 11/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 07/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 11/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 08/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 12/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 12/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 09/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 13/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 13/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 10/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 14/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 14/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 11/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 15/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 15/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 12/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 16/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 16/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 13/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 17/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 17/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 14/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 18/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 18/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 19/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 02/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 15/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 20/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 03/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 16/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 21/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 04/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 17/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 22/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 05/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 18/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-1:544443:545672 [4] NCCL INFO Channel 23/0 : 4[4] -> 3[3] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 19/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 06/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 19/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 20/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 21/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 22/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-1:544441:545671 [2] NCCL INFO Channel 23/0 : 2[2] -> 1[1] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 20/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 21/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 07/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 22/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-1:544442:545670 [3] NCCL INFO Channel 23/0 : 3[3] -> 2[2] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 08/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 09/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 10/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 11/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 12/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 00/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 01/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 13/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 02/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 03/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 14/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 04/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 05/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 00/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 06/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 01/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 15/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 07/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 02/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 16/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 03/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 08/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 04/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 09/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 05/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 10/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 17/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 06/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 11/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 18/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 07/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 12/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 19/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 20/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 08/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 21/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 09/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 13/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 10/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 22/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 14/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544446:545668 [7] NCCL INFO Channel 23/0 : 7[7] -> 6[6] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 11/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 15/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 12/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 16/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 13/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 17/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 14/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 18/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 15/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 19/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 16/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 20/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 17/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 21/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 18/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 22/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 19/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544444:545673 [5] NCCL INFO Channel 23/0 : 5[5] -> 4[4] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 20/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 21/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 22/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544445:545669 [6] NCCL INFO Channel 23/0 : 6[6] -> 5[5] via P2P/CUMEM | |
| worker-1:544439:545666 [0] NCCL INFO Connected all trees | |
| worker-0:544571:545782 [0] NCCL INFO Connected all trees | |
| worker-1:544439:545666 [0] NCCL INFO NVLS comm 0x55d1fd231e80 headRank 0 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 150994944 nvlsTotalSize 1207959552 | |
| worker-1:544440:545667 [1] NCCL INFO Connected all trees | |
| worker-0:544571:545782 [0] NCCL INFO NVLS comm 0x55ec66bf52e0 headRank 0 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 150994944 nvlsTotalSize 1207959552 | |
| worker-1:544440:545667 [1] NCCL INFO NVLS comm 0x55a19b0a4e30 headRank 1 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 150994944 nvlsTotalSize 1207959552 | |
| worker-0:544572:545790 [1] NCCL INFO Connected all trees | |
| worker-0:544573:545785 [2] NCCL INFO Connected all trees | |
| worker-0:544572:545790 [1] NCCL INFO NVLS comm 0x556975ec4a10 headRank 1 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 150994944 nvlsTotalSize 1207959552 | |
| worker-1:544442:545670 [3] NCCL INFO Connected all trees | |
| worker-1:544441:545671 [2] NCCL INFO Connected all trees | |
| worker-0:544573:545785 [2] NCCL INFO NVLS comm 0x5640ba357a80 headRank 2 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 150994944 nvlsTotalSize 1207959552 | |
| worker-1:544446:545668 [7] NCCL INFO Connected all trees | |
| worker-1:544441:545671 [2] NCCL INFO NVLS comm 0x56101a4ac2a0 headRank 2 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 150994944 nvlsTotalSize 1207959552 | |
| worker-1:544442:545670 [3] NCCL INFO NVLS comm 0x55c0f433aef0 headRank 3 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 150994944 nvlsTotalSize 1207959552 | |
| worker-1:544444:545673 [5] NCCL INFO Connected all trees | |
| worker-1:544443:545672 [4] NCCL INFO Connected all trees | |
| worker-1:544445:545669 [6] NCCL INFO Connected all trees | |
| worker-1:544446:545668 [7] NCCL INFO NVLS comm 0x561e314d06d0 headRank 7 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 150994944 nvlsTotalSize 1207959552 | |
| worker-1:544444:545673 [5] NCCL INFO NVLS comm 0x55ca5f9e3ac0 headRank 5 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 150994944 nvlsTotalSize 1207959552 | |
| worker-1:544443:545672 [4] NCCL INFO NVLS comm 0x5645eba8dd80 headRank 4 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 150994944 nvlsTotalSize 1207959552 | |
| worker-0:544574:545786 [3] NCCL INFO Connected all trees | |
| worker-1:544445:545669 [6] NCCL INFO NVLS comm 0x558c66ad5e60 headRank 6 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 150994944 nvlsTotalSize 1207959552 | |
| worker-0:544578:545787 [7] NCCL INFO Connected all trees | |
| worker-0:544575:545784 [4] NCCL INFO Connected all trees | |
| worker-0:544577:545788 [6] NCCL INFO Connected all trees | |
| worker-0:544576:545789 [5] NCCL INFO Connected all trees | |
| worker-0:544574:545786 [3] NCCL INFO NVLS comm 0x555924c564e0 headRank 3 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 150994944 nvlsTotalSize 1207959552 | |
| worker-1:544440:545667 [1] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
| worker-1:544440:545667 [1] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
| worker-1:544441:545671 [2] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
| worker-1:544441:545671 [2] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
| worker-1:544444:545673 [5] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
| worker-1:544444:545673 [5] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
| worker-1:544445:545669 [6] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
| worker-1:544445:545669 [6] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
| worker-1:544443:545672 [4] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
| worker-1:544443:545672 [4] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
| worker-1:544439:545666 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
| worker-1:544446:545668 [7] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
| worker-1:544439:545666 [0] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
| worker-1:544446:545668 [7] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
| worker-1:544442:545670 [3] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
| worker-1:544442:545670 [3] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
| worker-0:544578:545787 [7] NCCL INFO NVLS comm 0x55a99bcd4aa0 headRank 7 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 150994944 nvlsTotalSize 1207959552 | |
| worker-1:544444:545673 [5] NCCL INFO ncclCommInitRank comm 0x55ca5f9e3ac0 rank 5 nranks 8 cudaDev 5 nvmlDev 5 busId af000 commId 0x580fa346b03991d9 - Init COMPLETE | |
| worker-1:544440:545667 [1] NCCL INFO ncclCommInitRank comm 0x55a19b0a4e30 rank 1 nranks 8 cudaDev 1 nvmlDev 1 busId 91000 commId 0x580fa346b03991d9 - Init COMPLETE | |
| worker-1:544442:545670 [3] NCCL INFO ncclCommInitRank comm 0x55c0f433aef0 rank 3 nranks 8 cudaDev 3 nvmlDev 3 busId 99000 commId 0x580fa346b03991d9 - Init COMPLETE | |
| worker-1:544446:545668 [7] NCCL INFO ncclCommInitRank comm 0x561e314d06d0 rank 7 nranks 8 cudaDev 7 nvmlDev 7 busId b7000 commId 0x580fa346b03991d9 - Init COMPLETE | |
| worker-1:544445:545669 [6] NCCL INFO ncclCommInitRank comm 0x558c66ad5e60 rank 6 nranks 8 cudaDev 6 nvmlDev 6 busId b3000 commId 0x580fa346b03991d9 - Init COMPLETE | |
| worker-1:544441:545671 [2] NCCL INFO ncclCommInitRank comm 0x56101a4ac2a0 rank 2 nranks 8 cudaDev 2 nvmlDev 2 busId 95000 commId 0x580fa346b03991d9 - Init COMPLETE | |
| worker-1:544439:545666 [0] NCCL INFO ncclCommInitRank comm 0x55d1fd231e80 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 8d000 commId 0x580fa346b03991d9 - Init COMPLETE | |
| worker-1:544443:545672 [4] NCCL INFO ncclCommInitRank comm 0x5645eba8dd80 rank 4 nranks 8 cudaDev 4 nvmlDev 4 busId ab000 commId 0x580fa346b03991d9 - Init COMPLETE | |
| worker-1:544439:545829 [0] NCCL INFO Channel 00/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 01/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 02/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 03/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 04/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544577:545788 [6] NCCL INFO NVLS comm 0x559a9b5d95f0 headRank 6 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 150994944 nvlsTotalSize 1207959552 | |
| worker-0:544575:545784 [4] NCCL INFO NVLS comm 0x5571647ec3b0 headRank 4 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 150994944 nvlsTotalSize 1207959552 | |
| worker-1:544439:545829 [0] NCCL INFO Channel 05/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 06/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 07/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 08/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544576:545789 [5] NCCL INFO NVLS comm 0x55c6aa5c6b00 headRank 5 nHeads 8 buffSize 1048576 memSize 2097152 nvlsPerRankSize 150994944 nvlsTotalSize 1207959552 | |
| worker-1:544439:545829 [0] NCCL INFO Channel 09/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 10/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 11/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 12/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 13/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 14/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 15/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544573:545785 [2] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
| worker-0:544573:545785 [2] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
| worker-0:544574:545786 [3] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
| worker-0:544574:545786 [3] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
| worker-0:544572:545790 [1] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
| worker-0:544572:545790 [1] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
| worker-0:544575:545784 [4] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
| worker-0:544575:545784 [4] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
| worker-0:544576:545789 [5] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
| worker-0:544576:545789 [5] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
| worker-0:544577:545788 [6] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
| worker-0:544577:545788 [6] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
| worker-0:544578:545787 [7] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
| worker-0:544578:545787 [7] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
| worker-0:544571:545782 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
| worker-0:544571:545782 [0] NCCL INFO 24 coll channels, 24 collnet channels, 16 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
| worker-1:544439:545829 [0] NCCL INFO Channel 16/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 17/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 18/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 19/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 20/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 21/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 22/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 23/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 24/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 25/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 26/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 27/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 28/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544572:545790 [1] NCCL INFO ncclCommInitRank comm 0x556975ec4a10 rank 1 nranks 8 cudaDev 1 nvmlDev 1 busId 91000 commId 0x778b3f5043bf8e36 - Init COMPLETE | |
| worker-0:544574:545786 [3] NCCL INFO ncclCommInitRank comm 0x555924c564e0 rank 3 nranks 8 cudaDev 3 nvmlDev 3 busId 99000 commId 0x778b3f5043bf8e36 - Init COMPLETE | |
| worker-0:544578:545787 [7] NCCL INFO ncclCommInitRank comm 0x55a99bcd4aa0 rank 7 nranks 8 cudaDev 7 nvmlDev 7 busId b7000 commId 0x778b3f5043bf8e36 - Init COMPLETE | |
| worker-0:544576:545789 [5] NCCL INFO ncclCommInitRank comm 0x55c6aa5c6b00 rank 5 nranks 8 cudaDev 5 nvmlDev 5 busId af000 commId 0x778b3f5043bf8e36 - Init COMPLETE | |
| worker-0:544575:545784 [4] NCCL INFO ncclCommInitRank comm 0x5571647ec3b0 rank 4 nranks 8 cudaDev 4 nvmlDev 4 busId ab000 commId 0x778b3f5043bf8e36 - Init COMPLETE | |
| worker-0:544573:545785 [2] NCCL INFO ncclCommInitRank comm 0x5640ba357a80 rank 2 nranks 8 cudaDev 2 nvmlDev 2 busId 95000 commId 0x778b3f5043bf8e36 - Init COMPLETE | |
| worker-0:544577:545788 [6] NCCL INFO ncclCommInitRank comm 0x559a9b5d95f0 rank 6 nranks 8 cudaDev 6 nvmlDev 6 busId b3000 commId 0x778b3f5043bf8e36 - Init COMPLETE | |
| worker-0:544571:545782 [0] NCCL INFO ncclCommInitRank comm 0x55ec66bf52e0 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 8d000 commId 0x778b3f5043bf8e36 - Init COMPLETE | |
| worker-0:544571:545965 [0] NCCL INFO Channel 00/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 29/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 30/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 31/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 00/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 01/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 01/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 02/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 02/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 03/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 04/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 05/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 06/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 07/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 08/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 09/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 10/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 11/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 12/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 13/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 14/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 15/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 16/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 03/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 17/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 18/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 19/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 20/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 21/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 04/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 22/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 05/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 23/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 24/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 25/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 26/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 27/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 06/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 28/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 29/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 30/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 31/1 : 0[0] -> 1[1] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 07/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 08/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 09/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 00/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 10/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 11/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 12/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 01/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 02/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 13/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 14/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 15/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 03/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 04/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 16/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 05/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 17/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 18/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 19/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 06/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 07/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 20/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 21/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 08/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 09/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 22/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 10/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 11/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 23/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 12/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 24/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 13/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 25/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 26/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 14/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 15/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 27/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 28/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 16/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 29/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 30/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 17/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 31/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 00/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 18/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 01/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 19/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 20/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 21/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 22/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 02/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 23/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 24/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 25/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 03/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 26/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 04/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 27/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 05/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 28/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 29/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 30/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 31/1 : 0[0] -> 2[2] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 00/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 06/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 07/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 08/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 01/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 02/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 03/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 09/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 04/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 05/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 10/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 06/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 11/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 07/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 12/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 13/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 08/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 14/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 15/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 09/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 10/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 16/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 11/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 12/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 17/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 18/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 13/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 19/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 14/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 15/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 16/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 20/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 17/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 21/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 18/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 22/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 23/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 24/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 19/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 20/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 25/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 26/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 21/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 22/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 23/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 27/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 24/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 25/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 28/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 29/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 26/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 27/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 30/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 31/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 28/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 00/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 29/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 30/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 01/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 02/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 31/1 : 0[0] -> 3[3] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 03/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 00/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 01/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 04/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 02/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 05/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 03/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 06/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 07/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 04/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 05/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 06/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 08/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 09/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 07/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 08/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 10/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 11/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 09/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 12/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 10/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 13/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 14/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 11/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 12/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 15/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 13/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 16/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 17/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 14/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 15/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 18/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 16/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 19/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 17/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 20/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 18/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 21/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 22/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 23/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 19/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 20/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 24/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 21/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 22/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 25/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 26/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 23/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 27/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 24/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 28/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 25/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 29/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 26/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 30/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 27/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 28/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 31/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 29/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 30/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 00/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 31/1 : 0[0] -> 4[4] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 01/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 02/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 00/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 01/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 03/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 02/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 04/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 03/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 04/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 05/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 06/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 05/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 06/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 07/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 07/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 08/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 08/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 09/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 09/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 10/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 10/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 11/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 11/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 12/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 13/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 14/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 12/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 15/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 13/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 16/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 17/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 18/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 14/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 19/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 15/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 16/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 20/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 21/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 17/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 22/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 23/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 18/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 24/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 25/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 26/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 19/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 27/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 20/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 28/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 29/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 30/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 21/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 22/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 23/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 24/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 31/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 25/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 26/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 00/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 27/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 01/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 28/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 02/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 03/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 29/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 04/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 30/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 05/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 06/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 31/1 : 0[0] -> 5[5] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 07/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 00/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 08/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 09/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 01/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 10/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 11/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 02/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 12/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 03/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 13/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 14/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 15/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 04/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 05/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 16/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 06/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 17/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 18/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 07/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 08/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 19/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 20/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 09/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 10/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 21/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 22/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 11/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 12/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 23/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 13/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 24/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 25/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 26/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 27/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 14/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 15/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 28/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 29/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 30/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 31/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 00/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 16/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 01/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 17/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 02/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 03/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 18/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 04/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 05/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 06/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 19/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 07/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 08/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 20/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 21/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 22/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 23/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 09/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 10/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 24/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 11/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 25/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 26/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 27/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 12/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 13/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 14/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 28/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 29/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 15/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 16/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 30/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 17/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 31/1 : 0[0] -> 6[6] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 18/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 00/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 01/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 19/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 20/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 02/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 21/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 03/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 22/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 04/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 23/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 05/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 24/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 06/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 25/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 26/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 27/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 07/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 28/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 08/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 09/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 29/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 30/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 10/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 11/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 12/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-0:544571:545965 [0] NCCL INFO Channel 31/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 13/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 14/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 15/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 16/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 17/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 18/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 19/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 20/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 21/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 22/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 23/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 24/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 25/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 26/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 27/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 28/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 29/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 30/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| worker-1:544439:545829 [0] NCCL INFO Channel 31/1 : 0[0] -> 7[7] via P2P/CUMEM | |
| INFO:torchtune.utils._logging:Instantiating model and loading checkpoint took 68.42 secs | |
| INFO:torchtune.utils._logging:Memory stats after model init: | |
| GPU peak memory allocation: 13.05 GiB | |
| GPU peak memory reserved: 14.69 GiB | |
| GPU peak memory active: 13.05 GiB | |
| INFO:torchtune.utils._logging:Optimizer is initialized. | |
| INFO:torchtune.utils._logging:Compiling loss with torch.compile... | |
| INFO:torchtune.utils._logging:Loss is initialized. | |
| Packing dataset: 0%| | 0/52002 [00:00<?, ?it/s] | |
| Packing dataset: 1%| | 438/52002 [00:00<00:11, 4376.62it/s] | |
| Packing dataset: 2%|▏ | 885/52002 [00:00<00:11, 4430.50it/s] | |
| Packing dataset: 3%|▎ | 1329/52002 [00:00<00:11, 4360.23it/s] | |
| Packing dataset: 3%|▎ | 1786/52002 [00:00<00:11, 4436.33it/s] | |
| Packing dataset: 4%|▍ | 2257/52002 [00:00<00:10, 4532.74it/s] | |
| Packing dataset: 5%|▌ | 2712/52002 [00:00<00:11, 4476.40it/s] | |
| Packing dataset: 6%|▌ | 3166/52002 [00:00<00:10, 4494.99it/s] | |
| Packing dataset: 7%|▋ | 3616/52002 [00:00<00:10, 4457.18it/s] | |
| Packing dataset: 8%|▊ | 4062/52002 [00:00<00:11, 4283.55it/s] | |
| Packing dataset: 9%|▊ | 4492/52002 [00:01<00:11, 4149.56it/s] | |
| Packing dataset: 10%|▉ | 4958/52002 [00:01<00:10, 4296.10it/s] | |
| Packing dataset: 10%|█ | 5420/52002 [00:01<00:10, 4390.69it/s] | |
| Packing dataset: 11%|█▏ | 5899/52002 [00:01<00:10, 4508.08it/s] | |
| Packing dataset: 12%|█▏ | 6352/52002 [00:01<00:10, 4446.29it/s] | |
| Packing dataset: 13%|█▎ | 6804/52002 [00:01<00:10, 4463.21it/s] | |
| Packing dataset: 14%|█▍ | 7252/52002 [00:01<00:10, 4324.87it/s] | |
| Packing dataset: 15%|█▍ | 7711/52002 [00:01<00:10, 4400.52it/s] | |
| Packing dataset: 16%|█▌ | 8153/52002 [00:01<00:10, 4341.57it/s] | |
| Packing dataset: 17%|█▋ | 8589/52002 [00:01<00:10, 4117.89it/s] | |
| Packing dataset: 17%|█▋ | 9022/52002 [00:02<00:10, 4177.26it/s] | |
| Packing dataset: 18%|█▊ | 9473/52002 [00:02<00:09, 4271.34it/s] | |
| Packing dataset: 19%|█▉ | 9903/52002 [00:02<00:09, 4265.11it/s] | |
| Packing dataset: 20%|█▉ | 10331/52002 [00:02<00:10, 4109.86it/s] | |
| Packing dataset: 21%|██ | 10776/52002 [00:02<00:09, 4206.32it/s] | |
| Packing dataset: 22%|██▏ | 11199/52002 [00:02<00:09, 4189.42it/s] | |
| Packing dataset: 22%|██▏ | 11620/52002 [00:02<00:09, 4148.48it/s] | |
| Packing dataset: 23%|██▎ | 12036/52002 [00:02<00:09, 4138.91it/s] | |
| Packing dataset: 24%|██▍ | 12451/52002 [00:02<00:09, 4077.80it/s] | |
| Packing dataset: 25%|██▍ | 12860/52002 [00:03<00:09, 3953.13it/s] | |
| Packing dataset: 26%|██▌ | 13309/52002 [00:03<00:09, 4106.63it/s] | |
| Packing dataset: 26%|██▋ | 13721/52002 [00:03<00:09, 4110.48it/s] | |
| Packing dataset: 27%|██▋ | 14167/52002 [00:03<00:08, 4212.66it/s] | |
| Packing dataset: 28%|██▊ | 14631/52002 [00:03<00:08, 4338.79it/s] | |
| Packing dataset: 29%|██▉ | 15077/52002 [00:03<00:08, 4368.77it/s] | |
| Packing dataset: 30%|██▉ | 15515/52002 [00:03<00:08, 4200.82it/s] | |
| Packing dataset: 31%|███ | 15943/52002 [00:03<00:08, 4222.77it/s] | |
| Packing dataset: 32%|███▏ | 16406/52002 [00:03<00:08, 4336.92it/s] | |
| Packing dataset: 32%|███▏ | 16841/52002 [00:03<00:08, 4259.56it/s] | |
| Packing dataset: 33%|███▎ | 17268/52002 [00:04<00:08, 4240.45it/s] | |
| Packing dataset: 34%|███▍ | 17693/52002 [00:04<00:08, 4229.51it/s] | |
| Packing dataset: 35%|███▍ | 18126/52002 [00:04<00:07, 4257.97it/s] | |
| Packing dataset: 36%|███▌ | 18553/52002 [00:04<00:08, 4080.84it/s] | |
| Packing dataset: 37%|███▋ | 19009/52002 [00:04<00:07, 4216.36it/s] | |
| Packing dataset: 37%|███▋ | 19471/52002 [00:04<00:07, 4333.19it/s] | |
| Packing dataset: 38%|███▊ | 19906/52002 [00:04<00:07, 4264.25it/s] | |
| Packing dataset: 39%|███▉ | 20360/52002 [00:04<00:07, 4342.65it/s] | |
| Packing dataset: 40%|███▉ | 20796/52002 [00:04<00:07, 4165.62it/s] | |
| Packing dataset: 41%|████ | 21215/52002 [00:04<00:07, 4111.71it/s] | |
| Packing dataset: 42%|████▏ | 21628/52002 [00:05<00:07, 4026.30it/s] | |
| Packing dataset: 42%|████▏ | 22055/52002 [00:05<00:07, 4092.39it/s] | |
| Packing dataset: 43%|████▎ | 22466/52002 [00:05<00:07, 4093.23it/s] | |
| Packing dataset: 44%|████▍ | 22880/52002 [00:05<00:07, 4105.63it/s] | |
| Packing dataset: 45%|████▍ | 23343/52002 [00:05<00:06, 4258.62it/s] | |
| Packing dataset: 46%|████▌ | 23783/52002 [00:05<00:06, 4299.89it/s] | |
| Packing dataset: 47%|████▋ | 24224/52002 [00:05<00:06, 4327.59it/s] | |
| Packing dataset: 47%|████▋ | 24694/52002 [00:05<00:06, 4392.98it/s] | |
| Packing dataset: 48%|████▊ | 25134/52002 [00:05<00:06, 4316.24it/s] | |
| Packing dataset: 49%|████▉ | 25577/52002 [00:05<00:06, 4348.14it/s] | |
| Packing dataset: 50%|█████ | 26013/52002 [00:06<00:06, 4301.88it/s] | |
| Packing dataset: 51%|█████ | 26473/52002 [00:06<00:05, 4385.34it/s] | |
| Packing dataset: 52%|█████▏ | 26912/52002 [00:06<00:05, 4355.90it/s] | |
| Packing dataset: 53%|█████▎ | 27372/52002 [00:06<00:05, 4427.56it/s] | |
| Packing dataset: 54%|█████▎ | 27825/52002 [00:06<00:05, 4456.27it/s] | |
| Packing dataset: 54%|█████▍ | 28271/52002 [00:06<00:05, 4368.97it/s] | |
| Packing dataset: 55%|█████▌ | 28717/52002 [00:06<00:05, 4393.68it/s] | |
| Packing dataset: 56%|█████▌ | 29157/52002 [00:06<00:05, 4280.79it/s] | |
| Packing dataset: 57%|█████▋ | 29586/52002 [00:06<00:05, 4227.99it/s] | |
| Packing dataset: 58%|█████▊ | 30010/52002 [00:07<00:05, 4191.93it/s] | |
| Packing dataset: 59%|█████▊ | 30465/52002 [00:07<00:05, 4289.68it/s] | |
| Packing dataset: 59%|█████▉ | 30922/52002 [00:07<00:04, 4368.34it/s] | |
| Packing dataset: 60%|██████ | 31360/52002 [00:07<00:04, 4343.03it/s] | |
| Packing dataset: 61%|██████ | 31839/52002 [00:07<00:04, 4474.16it/s] | |
| Packing dataset: 62%|██████▏ | 32296/52002 [00:07<00:04, 4500.48it/s] | |
| Packing dataset: 63%|██████▎ | 32751/52002 [00:07<00:04, 4514.46it/s] | |
| Packing dataset: 64%|██████▍ | 33216/52002 [00:07<00:04, 4554.60it/s] | |
| Packing dataset: 65%|██████▍ | 33672/52002 [00:07<00:04, 4445.20it/s] | |
| Packing dataset: 66%|██████▌ | 34118/52002 [00:07<00:04, 4418.29it/s] | |
| Packing dataset: 66%|██████▋ | 34561/52002 [00:08<00:03, 4381.24it/s] | |
| Packing dataset: 67%|██████▋ | 35020/52002 [00:08<00:03, 4442.36it/s] | |
| Packing dataset: 68%|██████▊ | 35465/52002 [00:08<00:03, 4442.75it/s] | |
| Packing dataset: 69%|██████▉ | 35914/52002 [00:08<00:03, 4456.21it/s] | |
| Packing dataset: 70%|██████▉ | 36360/52002 [00:08<00:03, 4445.09it/s] | |
| Packing dataset: 71%|███████ | 36809/52002 [00:08<00:03, 4457.70it/s] | |
| Packing dataset: 72%|███████▏ | 37297/52002 [00:08<00:03, 4582.56it/s] | |
| Packing dataset: 73%|███████▎ | 37756/52002 [00:08<00:03, 4506.44it/s] | |
| Packing dataset: 73%|███████▎ | 38207/52002 [00:08<00:03, 4493.07it/s] | |
| Packing dataset: 74%|███████▍ | 38657/52002 [00:08<00:02, 4484.35it/s] | |
| Packing dataset: 75%|███████▌ | 39106/52002 [00:09<00:02, 4423.73it/s] | |
| Packing dataset: 76%|███████▌ | 39565/52002 [00:09<00:02, 4468.30it/s] | |
| Packing dataset: 77%|███████▋ | 40013/52002 [00:09<00:02, 4439.85it/s] | |
| Packing dataset: 78%|███████▊ | 40466/52002 [00:09<00:02, 4465.84it/s] | |
| Packing dataset: 79%|███████▊ | 40947/52002 [00:09<00:02, 4495.82it/s] | |
| Packing dataset: 80%|███████▉ | 41417/52002 [00:09<00:02, 4551.60it/s] | |
| Packing dataset: 81%|████████ | 41873/52002 [00:09<00:02, 4505.16it/s] | |
| Packing dataset: 81%|████████▏ | 42324/52002 [00:09<00:02, 4442.81it/s] | |
| Packing dataset: 82%|████████▏ | 42769/52002 [00:09<00:02, 4388.59it/s] | |
| Packing dataset: 83%|████████▎ | 43209/52002 [00:09<00:02, 4302.78it/s] | |
| Packing dataset: 84%|████████▍ | 43640/52002 [00:10<00:01, 4294.41it/s] | |
| Packing dataset: 85%|████████▍ | 44096/52002 [00:10<00:01, 4370.40it/s] | |
| Packing dataset: 86%|████████▌ | 44536/52002 [00:10<00:01, 4378.41it/s] | |
| Packing dataset: 87%|████████▋ | 45002/52002 [00:10<00:01, 4461.04it/s] | |
| Packing dataset: 87%|████████▋ | 45449/52002 [00:10<00:01, 4431.07it/s] | |
| Packing dataset: 88%|████████▊ | 45921/52002 [00:10<00:01, 4514.67it/s] | |
| Packing dataset: 89%|████████▉ | 46373/52002 [00:10<00:01, 4427.82it/s] | |
| Packing dataset: 90%|█████████ | 46817/52002 [00:10<00:01, 4396.99it/s] | |
| Packing dataset: 91%|█████████ | 47258/52002 [00:10<00:01, 4330.43it/s] | |
| Packing dataset: 92%|█████████▏| 47692/52002 [00:11<00:00, 4322.76it/s] | |
| Packing dataset: 93%|█████████▎| 48135/52002 [00:11<00:00, 4353.90it/s] | |
| Packing dataset: 93%|█████████▎| 48571/52002 [00:11<00:00, 4337.52it/s] | |
| Packing dataset: 94%|█████████▍| 49011/52002 [00:11<00:00, 4355.50it/s] | |
| Packing dataset: 95%|█████████▌| 49468/52002 [00:11<00:00, 4416.72it/s] | |
| Packing dataset: 96%|█████████▌| 49910/52002 [00:11<00:00, 4360.43it/s] | |
| Packing dataset: 97%|█████████▋| 50356/52002 [00:11<00:00, 4386.79it/s] | |
| Packing dataset: 98%|█████████▊| 50795/52002 [00:11<00:00, 4335.26it/s] | |
| Packing dataset: 99%|█████████▊| 51256/52002 [00:11<00:00, 4414.52it/s] | |
| Packing dataset: 99%|█████████▉| 51698/52002 [00:11<00:00, 4276.50it/s] | |
| Packing dataset: 100%|██████████| 52002/52002 [00:11<00:00, 4333.55it/s] | |
| INFO:torchtune.utils._logging:No learning rate scheduler configured. Using constant learning rate. | |
| WARNING:torchtune.utils._logging: Profiling disabled. | |
| INFO:torchtune.utils._logging: Profiler config after instantiation: {'enabled': False} | |
| 0%| | 0/776 [00:00<?, ?it/s]DEBUG:torchtune.utils._logging:Using flex attention for attention computation since a BlockMask was passed in. | |
| 0%| | 1/776 [00:55<11:56:10, 55.45s/it] | |
| 1|1|Loss: 2.938946008682251: 0%| | 1/776 [00:55<11:56:10, 55.45s/it] | |
| 1|1|Loss: 2.938946008682251: 0%| | 2/776 [00:57<5:06:56, 23.79s/it] | |
| 1|2|Loss: 1.8505816459655762: 0%| | 2/776 [00:57<5:06:56, 23.79s/it] | |
| 1|2|Loss: 1.8505816459655762: 0%| | 3/776 [00:58<2:53:17, 13.45s/it] | |
| 1|3|Loss: 1.5443191528320312: 0%| | 3/776 [00:58<2:53:17, 13.45s/it] | |
| 1|3|Loss: 1.5443191528320312: 1%| | 4/776 [00:59<1:52:11, 8.72s/it] | |
| 1|4|Loss: 1.5330866575241089: 1%| | 4/776 [00:59<1:52:11, 8.72s/it] | |
| 1|4|Loss: 1.5330866575241089: 1%| | 5/776 [01:01<1:18:02, 6.07s/it] | |
| 1|5|Loss: 1.4468822479248047: 1%| | 5/776 [01:01<1:18:02, 6.07s/it] | |
| 1|5|Loss: 1.4468822479248047: 1%| | 6/776 [01:02<56:26, 4.40s/it] | |
| 1|6|Loss: 1.2722903490066528: 1%| | 6/776 [01:02<56:26, 4.40s/it] | |
| 1|6|Loss: 1.2722903490066528: 1%| | 7/776 [01:03<42:42, 3.33s/it] | |
| 1|7|Loss: 1.2810163497924805: 1%| | 7/776 [01:03<42:42, 3.33s/it] | |
| 1|7|Loss: 1.2810163497924805: 1%| | 8/776 [01:04<33:44, 2.64s/it] | |
| 1|8|Loss: 1.2979081869125366: 1%| | 8/776 [01:04<33:44, 2.64s/it] | |
| 1|8|Loss: 1.2979081869125366: 1%| | 9/776 [01:05<27:39, 2.16s/it] | |
| 1|9|Loss: 1.3873618841171265: 1%| | 9/776 [01:05<27:39, 2.16s/it] | |
| 1|9|Loss: 1.3873618841171265: 1%|▏ | 10/776 [01:06<23:32, 1.84s/it] | |
| 1|10|Loss: 1.2918756008148193: 1%|▏ | 10/776 [01:06<23:32, 1.84s/it] | |
| 1|10|Loss: 1.2918756008148193: 1%|▏ | 11/776 [01:07<20:44, 1.63s/it] | |
| 1|11|Loss: 1.2397658824920654: 1%|▏ | 11/776 [01:07<20:44, 1.63s/it] | |
| 1|11|Loss: 1.2397658824920654: 2%|▏ | 12/776 [01:09<18:44, 1.47s/it] | |
| 1|12|Loss: 1.173169493675232: 2%|▏ | 12/776 [01:09<18:44, 1.47s/it] | |
| 1|12|Loss: 1.173169493675232: 2%|▏ | 13/776 [01:10<17:21, 1.36s/it] | |
| 1|13|Loss: 1.2359483242034912: 2%|▏ | 13/776 [01:10<17:21, 1.36s/it] | |
| 1|13|Loss: 1.2359483242034912: 2%|▏ | 14/776 [01:11<16:26, 1.30s/it] | |
| 1|14|Loss: 1.214673399925232: 2%|▏ | 14/776 [01:11<16:26, 1.30s/it] | |
| 1|14|Loss: 1.214673399925232: 2%|▏ | 15/776 [01:12<15:51, 1.25s/it] | |
| 1|15|Loss: 1.180527687072754: 2%|▏ | 15/776 [01:12<15:51, 1.25s/it] | |
| 1|15|Loss: 1.180527687072754: 2%|▏ | 16/776 [01:13<16:34, 1.31s/it] | |
| 1|16|Loss: 1.17176353931427: 2%|▏ | 16/776 [01:13<16:34, 1.31s/it] | |
| 1|16|Loss: 1.17176353931427: 2%|▏ | 17/776 [01:15<16:47, 1.33s/it] | |
| 1|17|Loss: 1.1649839878082275: 2%|▏ | 17/776 [01:15<16:47, 1.33s/it] | |
| 1|17|Loss: 1.1649839878082275: 2%|▏ | 18/776 [01:16<16:02, 1.27s/it] | |
| 1|18|Loss: 1.120618462562561: 2%|▏ | 18/776 [01:16<16:02, 1.27s/it] | |
| 1|18|Loss: 1.120618462562561: 2%|▏ | 19/776 [01:17<15:35, 1.24s/it] | |
| 1|19|Loss: 1.165447473526001: 2%|▏ | 19/776 [01:17<15:35, 1.24s/it] | |
| 1|19|Loss: 1.165447473526001: 3%|▎ | 20/776 [01:18<15:11, 1.21s/it] | |
| 1|20|Loss: 1.1638232469558716: 3%|▎ | 20/776 [01:18<15:11, 1.21s/it] | |
| 1|20|Loss: 1.1638232469558716: 3%|▎ | 21/776 [01:19<14:58, 1.19s/it] | |
| 1|21|Loss: 1.1374095678329468: 3%|▎ | 21/776 [01:19<14:58, 1.19s/it] | |
| 1|21|Loss: 1.1374095678329468: 3%|▎ | 22/776 [01:20<14:44, 1.17s/it] | |
| 1|22|Loss: 1.1553051471710205: 3%|▎ | 22/776 [01:20<14:44, 1.17s/it] | |
| 1|22|Loss: 1.1553051471710205: 3%|▎ | 23/776 [01:22<14:32, 1.16s/it] | |
| 1|23|Loss: 1.1035996675491333: 3%|▎ | 23/776 [01:22<14:32, 1.16s/it] | |
| 1|23|Loss: 1.1035996675491333: 3%|▎ | 24/776 [01:23<14:25, 1.15s/it] | |
| 1|24|Loss: 1.142500400543213: 3%|▎ | 24/776 [01:23<14:25, 1.15s/it] | |
| 1|24|Loss: 1.142500400543213: 3%|▎ | 25/776 [01:24<14:21, 1.15s/it] | |
| 1|25|Loss: 1.075368881225586: 3%|▎ | 25/776 [01:24<14:21, 1.15s/it] | |
| 1|25|Loss: 1.075368881225586: 3%|▎ | 26/776 [01:25<14:24, 1.15s/it] | |
| 1|26|Loss: 1.0792189836502075: 3%|▎ | 26/776 [01:25<14:24, 1.15s/it] | |
| 1|26|Loss: 1.0792189836502075: 3%|▎ | 27/776 [01:26<14:20, 1.15s/it] | |
| 1|27|Loss: 1.1476564407348633: 3%|▎ | 27/776 [01:26<14:20, 1.15s/it] | |
| 1|27|Loss: 1.1476564407348633: 4%|▎ | 28/776 [01:28<16:14, 1.30s/it] | |
| 1|28|Loss: 1.0897356271743774: 4%|▎ | 28/776 [01:28<16:14, 1.30s/it] | |
| 1|28|Loss: 1.0897356271743774: 4%|▎ | 29/776 [01:29<16:28, 1.32s/it] | |
| 1|29|Loss: 1.0512956380844116: 4%|▎ | 29/776 [01:29<16:28, 1.32s/it] | |
| 1|29|Loss: 1.0512956380844116: 4%|▍ | 30/776 [01:30<15:43, 1.27s/it] | |
| 1|30|Loss: 1.0918381214141846: 4%|▍ | 30/776 [01:30<15:43, 1.27s/it] | |
| 1|30|Loss: 1.0918381214141846: 4%|▍ | 31/776 [01:31<15:09, 1.22s/it] | |
| 1|31|Loss: 1.1017324924468994: 4%|▍ | 31/776 [01:31<15:09, 1.22s/it] | |
| 1|31|Loss: 1.1017324924468994: 4%|▍ | 32/776 [01:33<14:45, 1.19s/it] | |
| 1|32|Loss: 1.1101791858673096: 4%|▍ | 32/776 [01:33<14:45, 1.19s/it] | |
| 1|32|Loss: 1.1101791858673096: 4%|▍ | 33/776 [01:34<14:30, 1.17s/it] | |
| 1|33|Loss: 1.1335989236831665: 4%|▍ | 33/776 [01:34<14:30, 1.17s/it] | |
| 1|33|Loss: 1.1335989236831665: 4%|▍ | 34/776 [01:35<14:18, 1.16s/it] | |
| 1|34|Loss: 1.125473976135254: 4%|▍ | 34/776 [01:35<14:18, 1.16s/it] | |
| 1|34|Loss: 1.125473976135254: 5%|▍ | 35/776 [01:36<14:16, 1.16s/it] | |
| 1|35|Loss: 1.228766918182373: 5%|▍ | 35/776 [01:36<14:16, 1.16s/it] | |
| 1|35|Loss: 1.228766918182373: 5%|▍ | 36/776 [01:37<14:08, 1.15s/it] | |
| 1|36|Loss: 1.1193197965621948: 5%|▍ | 36/776 [01:37<14:08, 1.15s/it] | |
| 1|36|Loss: 1.1193197965621948: 5%|▍ | 37/776 [01:38<14:01, 1.14s/it] | |
| 1|37|Loss: 1.123386025428772: 5%|▍ | 37/776 [01:38<14:01, 1.14s/it] | |
| 1|37|Loss: 1.123386025428772: 5%|▍ | 38/776 [01:39<13:54, 1.13s/it] | |
| 1|38|Loss: 1.1215287446975708: 5%|▍ | 38/776 [01:39<13:54, 1.13s/it] | |
| 1|38|Loss: 1.1215287446975708: 5%|▌ | 39/776 [01:41<14:45, 1.20s/it] | |
| 1|39|Loss: 1.0890690088272095: 5%|▌ | 39/776 [01:41<14:45, 1.20s/it] | |
| 1|39|Loss: 1.0890690088272095: 5%|▌ | 40/776 [01:42<15:37, 1.27s/it] | |
| 1|40|Loss: 1.0658645629882812: 5%|▌ | 40/776 [01:42<15:37, 1.27s/it] | |
| 1|40|Loss: 1.0658645629882812: 5%|▌ | 41/776 [01:43<15:59, 1.31s/it] | |
| 1|41|Loss: 1.1289267539978027: 5%|▌ | 41/776 [01:43<15:59, 1.31s/it] | |
| 1|41|Loss: 1.1289267539978027: 5%|▌ | 42/776 [01:45<15:18, 1.25s/it] | |
| 1|42|Loss: 1.1276999711990356: 5%|▌ | 42/776 [01:45<15:18, 1.25s/it] | |
| 1|42|Loss: 1.1276999711990356: 6%|▌ | 43/776 [01:46<14:49, 1.21s/it] | |
| 1|43|Loss: 1.0734614133834839: 6%|▌ | 43/776 [01:46<14:49, 1.21s/it] | |
| 1|43|Loss: 1.0734614133834839: 6%|▌ | 44/776 [01:47<14:32, 1.19s/it] | |
| 1|44|Loss: 1.1249150037765503: 6%|▌ | 44/776 [01:47<14:32, 1.19s/it] | |
| 1|44|Loss: 1.1249150037765503: 6%|▌ | 45/776 [01:48<14:29, 1.19s/it] | |
| 1|45|Loss: 1.1202725172042847: 6%|▌ | 45/776 [01:48<14:29, 1.19s/it] | |
| 1|45|Loss: 1.1202725172042847: 6%|▌ | 46/776 [01:49<14:16, 1.17s/it] | |
| 1|46|Loss: 1.0269125699996948: 6%|▌ | 46/776 [01:49<14:16, 1.17s/it] | |
| 1|46|Loss: 1.0269125699996948: 6%|▌ | 47/776 [01:50<14:02, 1.16s/it] | |
| 1|47|Loss: 1.0340683460235596: 6%|▌ | 47/776 [01:50<14:02, 1.16s/it] | |
| 1|47|Loss: 1.0340683460235596: 6%|▌ | 48/776 [01:51<13:55, 1.15s/it] | |
| 1|48|Loss: 1.097231388092041: 6%|▌ | 48/776 [01:51<13:55, 1.15s/it] | |
| 1|48|Loss: 1.097231388092041: 6%|▋ | 49/776 [01:53<13:51, 1.14s/it] | |
| 1|49|Loss: 1.1261508464813232: 6%|▋ | 49/776 [01:53<13:51, 1.14s/it] | |
| 1|49|Loss: 1.1261508464813232: 6%|▋ | 50/776 [01:54<13:44, 1.14s/it] | |
| 1|50|Loss: 1.0019255876541138: 6%|▋ | 50/776 [01:54<13:44, 1.14s/it] | |
| 1|50|Loss: 1.0019255876541138: 7%|▋ | 51/776 [01:55<14:34, 1.21s/it] | |
| 1|51|Loss: 1.2170408964157104: 7%|▋ | 51/776 [01:55<14:34, 1.21s/it] | |
| 1|51|Loss: 1.2170408964157104: 7%|▋ | 52/776 [01:56<15:20, 1.27s/it] | |
| 1|52|Loss: 1.0806734561920166: 7%|▋ | 52/776 [01:56<15:20, 1.27s/it] | |
| 1|52|Loss: 1.0806734561920166: 7%|▋ | 53/776 [01:58<14:46, 1.23s/it] | |
| 1|53|Loss: 1.0367075204849243: 7%|▋ | 53/776 [01:58<14:46, 1.23s/it] | |
| 1|53|Loss: 1.0367075204849243: 7%|▋ | 54/776 [01:59<15:21, 1.28s/it] | |
| 1|54|Loss: 1.1906617879867554: 7%|▋ | 54/776 [01:59<15:21, 1.28s/it] | |
| 1|54|Loss: 1.1906617879867554: 7%|▋ | 55/776 [02:00<14:46, 1.23s/it] | |
| 1|55|Loss: 1.0795036554336548: 7%|▋ | 55/776 [02:00<14:46, 1.23s/it] | |
| 1|55|Loss: 1.0795036554336548: 7%|▋ | 56/776 [02:01<14:23, 1.20s/it] | |
| 1|56|Loss: 1.1053745746612549: 7%|▋ | 56/776 [02:01<14:23, 1.20s/it] | |
| 1|56|Loss: 1.1053745746612549: 7%|▋ | 57/776 [02:02<14:04, 1.17s/it] | |
| 1|57|Loss: 1.0304906368255615: 7%|▋ | 57/776 [02:02<14:04, 1.17s/it] | |
| 1|57|Loss: 1.0304906368255615: 7%|▋ | 58/776 [02:03<13:51, 1.16s/it] | |
| 1|58|Loss: 1.1180912256240845: 7%|▋ | 58/776 [02:03<13:51, 1.16s/it] | |
| 1|58|Loss: 1.1180912256240845: 8%|▊ | 59/776 [02:05<13:43, 1.15s/it] | |
| 1|59|Loss: 1.0865293741226196: 8%|▊ | 59/776 [02:05<13:43, 1.15s/it] | |
| 1|59|Loss: 1.0865293741226196: 8%|▊ | 60/776 [02:06<13:35, 1.14s/it] | |
| 1|60|Loss: 1.1785703897476196: 8%|▊ | 60/776 [02:06<13:35, 1.14s/it] | |
| 1|60|Loss: 1.1785703897476196: 8%|▊ | 61/776 [02:07<13:35, 1.14s/it] | |
| 1|61|Loss: 1.080942988395691: 8%|▊ | 61/776 [02:07<13:35, 1.14s/it] | |
| 1|61|Loss: 1.080942988395691: 8%|▊ | 62/776 [02:08<13:35, 1.14s/it] | |
| 1|62|Loss: 1.1973055601119995: 8%|▊ | 62/776 [02:08<13:35, 1.14s/it] | |
| 1|62|Loss: 1.1973055601119995: 8%|▊ | 63/776 [02:09<14:18, 1.20s/it] | |
| 1|63|Loss: 1.1062465906143188: 8%|▊ | 63/776 [02:09<14:18, 1.20s/it] | |
| 1|63|Loss: 1.1062465906143188: 8%|▊ | 64/776 [02:11<15:02, 1.27s/it] | |
| 1|64|Loss: 1.1948652267456055: 8%|▊ | 64/776 [02:11<15:02, 1.27s/it] | |
| 1|64|Loss: 1.1948652267456055: 8%|▊ | 65/776 [02:12<14:33, 1.23s/it] | |
| 1|65|Loss: 1.1732417345046997: 8%|▊ | 65/776 [02:12<14:33, 1.23s/it] | |
| 1|65|Loss: 1.1732417345046997: 9%|▊ | 66/776 [02:13<14:58, 1.27s/it] | |
| 1|66|Loss: 1.3428436517715454: 9%|▊ | 66/776 [02:13<14:58, 1.27s/it] | |
| 1|66|Loss: 1.3428436517715454: 9%|▊ | 67/776 [02:14<14:26, 1.22s/it] | |
| 1|67|Loss: 1.3502891063690186: 9%|▊ | 67/776 [02:14<14:26, 1.22s/it] | |
| 1|67|Loss: 1.3502891063690186: 9%|▉ | 68/776 [02:16<14:04, 1.19s/it] | |
| 1|68|Loss: 1.626853108406067: 9%|▉ | 68/776 [02:16<14:04, 1.19s/it] | |
| 1|68|Loss: 1.626853108406067: 9%|▉ | 69/776 [02:17<13:47, 1.17s/it] | |
| 1|69|Loss: 2.102133274078369: 9%|▉ | 69/776 [02:17<13:47, 1.17s/it] | |
| 1|69|Loss: 2.102133274078369: 9%|▉ | 70/776 [02:18<13:41, 1.16s/it] | |
| 1|70|Loss: 2.4904701709747314: 9%|▉ | 70/776 [02:18<13:41, 1.16s/it] | |
| 1|70|Loss: 2.4904701709747314: 9%|▉ | 71/776 [02:19<13:35, 1.16s/it] | |
| 1|71|Loss: 4.394655704498291: 9%|▉ | 71/776 [02:19<13:35, 1.16s/it] | |
| 1|71|Loss: 4.394655704498291: 9%|▉ | 72/776 [02:20<13:31, 1.15s/it] | |
| 1|72|Loss: 5.533819675445557: 9%|▉ | 72/776 [02:20<13:31, 1.15s/it] | |
| 1|72|Loss: 5.533819675445557: 9%|▉ | 73/776 [02:21<13:23, 1.14s/it] | |
| 1|73|Loss: 5.987132549285889: 9%|▉ | 73/776 [02:21<13:23, 1.14s/it] | |
| 1|73|Loss: 5.987132549285889: 10%|▉ | 74/776 [02:22<13:18, 1.14s/it] | |
| 1|74|Loss: 4.513265132904053: 10%|▉ | 74/776 [02:22<13:18, 1.14s/it] | |
| 1|74|Loss: 4.513265132904053: 10%|▉ | 75/776 [02:24<14:59, 1.28s/it] | |
| 1|75|Loss: 3.6513607501983643: 10%|▉ | 75/776 [02:24<14:59, 1.28s/it] | |
| 1|75|Loss: 3.6513607501983643: 10%|▉ | 76/776 [02:25<14:30, 1.24s/it] | |
| 1|76|Loss: 2.9064791202545166: 10%|▉ | 76/776 [02:25<14:30, 1.24s/it] | |
| 1|76|Loss: 2.9064791202545166: 10%|▉ | 77/776 [02:26<14:04, 1.21s/it] | |
| 1|77|Loss: 7.2656707763671875: 10%|▉ | 77/776 [02:26<14:04, 1.21s/it] | |
| 1|77|Loss: 7.2656707763671875: 10%|█ | 78/776 [02:28<14:39, 1.26s/it] | |
| 1|78|Loss: 6.634099960327148: 10%|█ | 78/776 [02:28<14:39, 1.26s/it] | |
| 1|78|Loss: 6.634099960327148: 10%|█ | 79/776 [02:29<14:12, 1.22s/it] | |
| 1|79|Loss: 7.0447282791137695: 10%|█ | 79/776 [02:29<14:12, 1.22s/it] | |
| 1|79|Loss: 7.0447282791137695: 10%|█ | 80/776 [02:30<13:52, 1.20s/it] | |
| 1|80|Loss: 7.235594272613525: 10%|█ | 80/776 [02:30<13:52, 1.20s/it] | |
| 1|80|Loss: 7.235594272613525: 10%|█ | 81/776 [02:31<13:43, 1.18s/it] | |
| 1|81|Loss: 7.014163494110107: 10%|█ | 81/776 [02:31<13:43, 1.18s/it] | |
| 1|81|Loss: 7.014163494110107: 11%|█ | 82/776 [02:32<13:30, 1.17s/it] | |
| 1|82|Loss: 6.741518497467041: 11%|█ | 82/776 [02:32<13:30, 1.17s/it] | |
| 1|82|Loss: 6.741518497467041: 11%|█ | 83/776 [02:33<13:20, 1.15s/it] | |
| 1|83|Loss: 6.598292350769043: 11%|█ | 83/776 [02:33<13:20, 1.15s/it] | |
| 1|83|Loss: 6.598292350769043: 11%|█ | 84/776 [02:34<13:11, 1.14s/it] | |
| 1|84|Loss: 6.2466721534729: 11%|█ | 84/776 [02:34<13:11, 1.14s/it] | |
| 1|84|Loss: 6.2466721534729: 11%|█ | 85/776 [02:36<13:05, 1.14s/it] | |
| 1|85|Loss: 6.618438720703125: 11%|█ | 85/776 [02:36<13:05, 1.14s/it] | |
| 1|85|Loss: 6.618438720703125: 11%|█ | 86/776 [02:37<14:45, 1.28s/it] | |
| 1|86|Loss: 6.089427471160889: 11%|█ | 86/776 [02:37<14:45, 1.28s/it] | |
| 1|86|Loss: 6.089427471160889: 11%|█ | 87/776 [02:38<14:11, 1.24s/it] | |
| 1|87|Loss: 5.876938343048096: 11%|█ | 87/776 [02:38<14:11, 1.24s/it] | |
| 1|87|Loss: 5.876938343048096: 11%|█▏ | 88/776 [02:39<13:45, 1.20s/it] | |
| 1|88|Loss: 5.7449774742126465: 11%|█▏ | 88/776 [02:39<13:45, 1.20s/it] | |
| 1|88|Loss: 5.7449774742126465: 11%|█▏ | 89/776 [02:40<13:26, 1.17s/it] | |
| 1|89|Loss: 5.749597549438477: 11%|█▏ | 89/776 [02:40<13:26, 1.17s/it] | |
| 1|89|Loss: 5.749597549438477: 12%|█▏ | 90/776 [02:42<13:55, 1.22s/it] | |
| 1|90|Loss: 6.018191337585449: 12%|█▏ | 90/776 [02:42<13:55, 1.22s/it] | |
| 1|90|Loss: 6.018191337585449: 12%|█▏ | 91/776 [02:43<13:41, 1.20s/it] | |
| 1|91|Loss: 5.914539337158203: 12%|█▏ | 91/776 [02:43<13:41, 1.20s/it] | |
| 1|91|Loss: 5.914539337158203: 12%|█▏ | 92/776 [02:44<13:29, 1.18s/it] | |
| 1|92|Loss: 5.904043197631836: 12%|█▏ | 92/776 [02:44<13:29, 1.18s/it] | |
| 1|92|Loss: 5.904043197631836: 12%|█▏ | 93/776 [02:45<13:15, 1.16s/it] | |
| 1|93|Loss: 5.878889560699463: 12%|█▏ | 93/776 [02:45<13:15, 1.16s/it] | |
| 1|93|Loss: 5.878889560699463: 12%|█▏ | 94/776 [02:46<13:05, 1.15s/it] | |
| 1|94|Loss: 5.524048328399658: 12%|█▏ | 94/776 [02:46<13:05, 1.15s/it] | |
| 1|94|Loss: 5.524048328399658: 12%|█▏ | 95/776 [02:48<13:08, 1.16s/it] | |
| 1|95|Loss: 5.764723777770996: 12%|█▏ | 95/776 [02:48<13:08, 1.16s/it] | |
| 1|95|Loss: 5.764723777770996: 12%|█▏ | 96/776 [02:49<13:05, 1.16s/it] | |
| 1|96|Loss: 5.779175281524658: 12%|█▏ | 96/776 [02:49<13:05, 1.16s/it] | |
| 1|96|Loss: 5.779175281524658: 12%|█▎ | 97/776 [02:50<12:58, 1.15s/it] | |
| 1|97|Loss: 5.583903789520264: 12%|█▎ | 97/776 [02:50<12:58, 1.15s/it] | |
| 1|97|Loss: 5.583903789520264: 13%|█▎ | 98/776 [02:51<14:37, 1.29s/it] | |
| 1|98|Loss: 5.714621067047119: 13%|█▎ | 98/776 [02:51<14:37, 1.29s/it] | |
| 1|98|Loss: 5.714621067047119: 13%|█▎ | 99/776 [02:53<14:00, 1.24s/it] | |
| 1|99|Loss: 5.697048664093018: 13%|█▎ | 99/776 [02:53<14:00, 1.24s/it] | |
| 1|99|Loss: 5.697048664093018: 13%|█▎ | 100/776 [02:54<13:33, 1.20s/it] | |
| 1|100|Loss: 5.328735828399658: 13%|█▎ | 100/776 [02:54<13:33, 1.20s/it] | |
| 1|100|Loss: 5.328735828399658: 13%|█▎ | 101/776 [02:55<13:16, 1.18s/it] | |
| 1|101|Loss: 5.292338848114014: 13%|█▎ | 101/776 [02:55<13:16, 1.18s/it] | |
| 1|101|Loss: 5.292338848114014: 13%|█▎ | 102/776 [02:56<13:56, 1.24s/it] | |
| 1|102|Loss: 5.358601093292236: 13%|█▎ | 102/776 [02:56<13:56, 1.24s/it] | |
| 1|102|Loss: 5.358601093292236: 13%|█▎ | 103/776 [02:57<13:30, 1.21s/it] | |
| 1|103|Loss: 5.254549026489258: 13%|█▎ | 103/776 [02:57<13:30, 1.21s/it] | |
| 1|103|Loss: 5.254549026489258: 13%|█▎ | 104/776 [02:58<13:13, 1.18s/it] | |
| 1|104|Loss: 5.329051971435547: 13%|█▎ | 104/776 [02:58<13:13, 1.18s/it] | |
| 1|104|Loss: 5.329051971435547: 14%|█▎ | 105/776 [03:00<12:59, 1.16s/it] | |
| 1|105|Loss: 5.139285087585449: 14%|█▎ | 105/776 [03:00<12:59, 1.16s/it] | |
| 1|105|Loss: 5.139285087585449: 14%|█▎ | 106/776 [03:01<12:50, 1.15s/it] | |
| 1|106|Loss: 5.336142063140869: 14%|█▎ | 106/776 [03:01<12:50, 1.15s/it] | |
| 1|106|Loss: 5.336142063140869: 14%|█▍ | 107/776 [03:02<12:48, 1.15s/it] | |
| 1|107|Loss: 5.02528190612793: 14%|█▍ | 107/776 [03:02<12:48, 1.15s/it] | |
| 1|107|Loss: 5.02528190612793: 14%|█▍ | 108/776 [03:03<12:46, 1.15s/it] | |
| 1|108|Loss: 4.972185134887695: 14%|█▍ | 108/776 [03:03<12:46, 1.15s/it] | |
| 1|108|Loss: 4.972185134887695: 14%|█▍ | 109/776 [03:04<12:40, 1.14s/it] | |
| 1|109|Loss: 4.981472969055176: 14%|█▍ | 109/776 [03:04<12:40, 1.14s/it] | |
| 1|109|Loss: 4.981472969055176: 14%|█▍ | 110/776 [03:06<14:11, 1.28s/it] | |
| 1|110|Loss: 5.03405237197876: 14%|█▍ | 110/776 [03:06<14:11, 1.28s/it] | |
| 1|110|Loss: 5.03405237197876: 14%|█▍ | 111/776 [03:07<13:40, 1.23s/it] | |
| 1|111|Loss: 4.988402366638184: 14%|█▍ | 111/776 [03:07<13:40, 1.23s/it] | |
| 1|111|Loss: 4.988402366638184: 14%|█▍ | 112/776 [03:08<13:18, 1.20s/it] | |
| 1|112|Loss: 4.5170674324035645: 14%|█▍ | 112/776 [03:08<13:18, 1.20s/it] | |
| 1|112|Loss: 4.5170674324035645: 15%|█▍ | 113/776 [03:09<13:05, 1.18s/it] | |
| 1|113|Loss: 4.134557247161865: 15%|█▍ | 113/776 [03:09<13:05, 1.18s/it] | |
| 1|113|Loss: 4.134557247161865: 15%|█▍ | 114/776 [03:10<12:51, 1.17s/it] | |
| 1|114|Loss: 4.452454090118408: 15%|█▍ | 114/776 [03:10<12:51, 1.17s/it] | |
| 1|114|Loss: 4.452454090118408: 15%|█▍ | 115/776 [03:12<13:23, 1.22s/it] | |
| 1|115|Loss: 4.223154067993164: 15%|█▍ | 115/776 [03:12<13:23, 1.22s/it] | |
| 1|115|Loss: 4.223154067993164: 15%|█▍ | 116/776 [03:13<13:05, 1.19s/it] | |
| 1|116|Loss: 4.145360946655273: 15%|█▍ | 116/776 [03:13<13:05, 1.19s/it] | |
| 1|116|Loss: 4.145360946655273: 15%|█▌ | 117/776 [03:14<12:51, 1.17s/it] | |
| 1|117|Loss: 3.8110954761505127: 15%|█▌ | 117/776 [03:14<12:51, 1.17s/it] | |
| 1|117|Loss: 3.8110954761505127: 15%|█▌ | 118/776 [03:15<12:44, 1.16s/it] | |
| 1|118|Loss: 4.103744983673096: 15%|█▌ | 118/776 [03:15<12:44, 1.16s/it] | |
| 1|118|Loss: 4.103744983673096: 15%|█▌ | 119/776 [03:16<12:40, 1.16s/it] | |
| 1|119|Loss: 3.7936387062072754: 15%|█▌ | 119/776 [03:16<12:40, 1.16s/it] | |
| 1|119|Loss: 3.7936387062072754: 15%|█▌ | 120/776 [03:17<12:32, 1.15s/it] | |
| 1|120|Loss: 3.647224187850952: 15%|█▌ | 120/776 [03:17<12:32, 1.15s/it] | |
| 1|120|Loss: 3.647224187850952: 16%|█▌ | 121/776 [03:18<12:28, 1.14s/it] | |
| 1|121|Loss: 3.6670963764190674: 16%|█▌ | 121/776 [03:18<12:28, 1.14s/it] | |
| 1|121|Loss: 3.6670963764190674: 16%|█▌ | 122/776 [03:20<14:02, 1.29s/it] | |
| 1|122|Loss: 3.841413974761963: 16%|█▌ | 122/776 [03:20<14:02, 1.29s/it] | |
| 1|122|Loss: 3.841413974761963: 16%|█▌ | 123/776 [03:21<13:31, 1.24s/it] | |
| 1|123|Loss: 3.6820356845855713: 16%|█▌ | 123/776 [03:21<13:31, 1.24s/it] | |
| 1|123|Loss: 3.6820356845855713: 16%|█▌ | 124/776 [03:22<13:05, 1.20s/it] | |
| 1|124|Loss: 3.491396427154541: 16%|█▌ | 124/776 [03:22<13:05, 1.20s/it] | |
| 1|124|Loss: 3.491396427154541: 16%|█▌ | 125/776 [03:23<12:56, 1.19s/it] | |
| 1|125|Loss: 5.381327152252197: 16%|█▌ | 125/776 [03:23<12:56, 1.19s/it] | |
| 1|125|Loss: 5.381327152252197: 16%|█▌ | 126/776 [03:25<12:41, 1.17s/it] | |
| 1|126|Loss: 6.426223278045654: 16%|█▌ | 126/776 [03:25<12:41, 1.17s/it] | |
| 1|126|Loss: 6.426223278045654: 16%|█▋ | 127/776 [03:26<13:10, 1.22s/it] | |
| 1|127|Loss: 8.679311752319336: 16%|█▋ | 127/776 [03:26<13:10, 1.22s/it] | |
| 1|127|Loss: 8.679311752319336: 16%|█▋ | 128/776 [03:27<12:57, 1.20s/it] | |
| 1|128|Loss: 7.835454940795898: 16%|█▋ | 128/776 [03:27<12:57, 1.20s/it] | |
| 1|128|Loss: 7.835454940795898: 17%|█▋ | 129/776 [03:28<12:48, 1.19s/it] | |
| 1|129|Loss: 7.560299873352051: 17%|█▋ | 129/776 [03:28<12:48, 1.19s/it] | |
| 1|129|Loss: 7.560299873352051: 17%|█▋ | 130/776 [03:29<12:36, 1.17s/it] | |
| 1|130|Loss: 7.334409713745117: 17%|█▋ | 130/776 [03:29<12:36, 1.17s/it] | |
| 1|130|Loss: 7.334409713745117: 17%|█▋ | 131/776 [03:30<12:26, 1.16s/it] | |
| 1|131|Loss: 7.023469924926758: 17%|█▋ | 131/776 [03:30<12:26, 1.16s/it] | |
| 1|131|Loss: 7.023469924926758: 17%|█▋ | 132/776 [03:32<12:17, 1.14s/it] | |
| 1|132|Loss: 6.6159844398498535: 17%|█▋ | 132/776 [03:32<12:17, 1.14s/it] | |
| 1|132|Loss: 6.6159844398498535: 17%|█▋ | 133/776 [03:33<13:05, 1.22s/it] | |
| 1|133|Loss: 6.576676845550537: 17%|█▋ | 133/776 [03:33<13:05, 1.22s/it] | |
| 1|133|Loss: 6.576676845550537: 17%|█▋ | 134/776 [03:34<13:25, 1.25s/it] | |
| 1|134|Loss: 6.503146171569824: 17%|█▋ | 134/776 [03:34<13:25, 1.25s/it] | |
| 1|134|Loss: 6.503146171569824: 17%|█▋ | 135/776 [03:35<12:59, 1.22s/it] | |
| 1|135|Loss: 6.327778339385986: 17%|█▋ | 135/776 [03:35<12:59, 1.22s/it] | |
| 1|135|Loss: 6.327778339385986: 18%|█▊ | 136/776 [03:37<12:39, 1.19s/it] | |
| 1|136|Loss: 6.4561872482299805: 18%|█▊ | 136/776 [03:37<12:39, 1.19s/it] | |
| 1|136|Loss: 6.4561872482299805: 18%|█▊ | 137/776 [03:38<12:28, 1.17s/it] | |
| 1|137|Loss: 6.349610805511475: 18%|█▊ | 137/776 [03:38<12:28, 1.17s/it] | |
| 1|137|Loss: 6.349610805511475: 18%|█▊ | 138/776 [03:39<12:17, 1.16s/it] | |
| 1|138|Loss: 6.320553302764893: 18%|█▊ | 138/776 [03:39<12:17, 1.16s/it] | |
| 1|138|Loss: 6.320553302764893: 18%|█▊ | 139/776 [03:40<12:57, 1.22s/it] | |
| 1|139|Loss: 6.182732582092285: 18%|█▊ | 139/776 [03:40<12:57, 1.22s/it] | |
| 1|139|Loss: 6.182732582092285: 18%|█▊ | 140/776 [03:41<12:35, 1.19s/it] | |
| 1|140|Loss: 6.304178714752197: 18%|█▊ | 140/776 [03:41<12:35, 1.19s/it] | |
| 1|140|Loss: 6.304178714752197: 18%|█▊ | 141/776 [03:42<12:22, 1.17s/it] | |
| 1|141|Loss: 6.048110485076904: 18%|█▊ | 141/776 [03:42<12:22, 1.17s/it] | |
| 1|141|Loss: 6.048110485076904: 18%|█▊ | 142/776 [03:44<12:14, 1.16s/it] | |
| 1|142|Loss: 6.094357490539551: 18%|█▊ | 142/776 [03:44<12:14, 1.16s/it] | |
| 1|142|Loss: 6.094357490539551: 18%|█▊ | 143/776 [03:45<12:05, 1.15s/it] | |
| 1|143|Loss: 6.225237846374512: 18%|█▊ | 143/776 [03:45<12:05, 1.15s/it] | |
| 1|143|Loss: 6.225237846374512: 19%|█▊ | 144/776 [03:46<12:52, 1.22s/it] | |
| 1|144|Loss: 6.356908321380615: 19%|█▊ | 144/776 [03:46<12:52, 1.22s/it] | |
| 1|144|Loss: 6.356908321380615: 19%|█▊ | 145/776 [03:47<12:37, 1.20s/it] | |
| 1|145|Loss: 5.899023532867432: 19%|█▊ | 145/776 [03:47<12:37, 1.20s/it] | |
| 1|145|Loss: 5.899023532867432: 19%|█▉ | 146/776 [03:49<13:01, 1.24s/it] | |
| 1|146|Loss: 6.32288932800293: 19%|█▉ | 146/776 [03:49<13:01, 1.24s/it] | |
| 1|146|Loss: 6.32288932800293: 19%|█▉ | 147/776 [03:50<12:37, 1.20s/it] | |
| 1|147|Loss: 5.884298324584961: 19%|█▉ | 147/776 [03:50<12:37, 1.20s/it] | |
| 1|147|Loss: 5.884298324584961: 19%|█▉ | 148/776 [03:51<12:19, 1.18s/it] | |
| 1|148|Loss: 5.782046318054199: 19%|█▉ | 148/776 [03:51<12:19, 1.18s/it] | |
| 1|148|Loss: 5.782046318054199: 19%|█▉ | 149/776 [03:52<12:08, 1.16s/it] | |
| 1|149|Loss: 5.989625930786133: 19%|█▉ | 149/776 [03:52<12:08, 1.16s/it] | |
| 1|149|Loss: 5.989625930786133: 19%|█▉ | 150/776 [03:53<12:04, 1.16s/it] | |
| 1|150|Loss: 5.948478698730469: 19%|█▉ | 150/776 [03:53<12:04, 1.16s/it] | |
| 1|150|Loss: 5.948478698730469: 19%|█▉ | 151/776 [03:54<12:32, 1.20s/it] | |
| 1|151|Loss: 5.9121832847595215: 19%|█▉ | 151/776 [03:54<12:32, 1.20s/it] | |
| 1|151|Loss: 5.9121832847595215: 20%|█▉ | 152/776 [03:55<12:16, 1.18s/it] | |
| 1|152|Loss: 5.9345316886901855: 20%|█▉ | 152/776 [03:55<12:16, 1.18s/it] | |
| 1|152|Loss: 5.9345316886901855: 20%|█▉ | 153/776 [03:57<12:04, 1.16s/it] | |
| 1|153|Loss: 5.817429065704346: 20%|█▉ | 153/776 [03:57<12:04, 1.16s/it] | |
| 1|153|Loss: 5.817429065704346: 20%|█▉ | 154/776 [03:58<12:00, 1.16s/it] | |
| 1|154|Loss: 5.905948638916016: 20%|█▉ | 154/776 [03:58<12:00, 1.16s/it] | |
| 1|154|Loss: 5.905948638916016: 20%|█▉ | 155/776 [03:59<11:52, 1.15s/it] | |
| 1|155|Loss: 5.813040256500244: 20%|█▉ | 155/776 [03:59<11:52, 1.15s/it] | |
| 1|155|Loss: 5.813040256500244: 20%|██ | 156/776 [04:00<12:41, 1.23s/it] | |
| 1|156|Loss: 5.897429466247559: 20%|██ | 156/776 [04:00<12:41, 1.23s/it] | |
| 1|156|Loss: 5.897429466247559: 20%|██ | 157/776 [04:01<12:20, 1.20s/it] | |
| 1|157|Loss: 5.88229513168335: 20%|██ | 157/776 [04:01<12:20, 1.20s/it] | |
| 1|157|Loss: 5.88229513168335: 20%|██ | 158/776 [04:03<12:42, 1.23s/it] | |
| 1|158|Loss: 5.781468391418457: 20%|██ | 158/776 [04:03<12:42, 1.23s/it] | |
| 1|158|Loss: 5.781468391418457: 20%|██ | 159/776 [04:04<12:21, 1.20s/it] | |
| 1|159|Loss: 5.82930326461792: 20%|██ | 159/776 [04:04<12:21, 1.20s/it] | |
| 1|159|Loss: 5.82930326461792: 21%|██ | 160/776 [04:05<12:14, 1.19s/it] | |
| 1|160|Loss: 5.699976444244385: 21%|██ | 160/776 [04:05<12:14, 1.19s/it] | |
| 1|160|Loss: 5.699976444244385: 21%|██ | 161/776 [04:06<12:03, 1.18s/it] | |
| 1|161|Loss: 5.823160648345947: 21%|██ | 161/776 [04:06<12:03, 1.18s/it] | |
| 1|161|Loss: 5.823160648345947: 21%|██ | 162/776 [04:07<11:51, 1.16s/it] | |
| 1|162|Loss: 5.91910457611084: 21%|██ | 162/776 [04:07<11:51, 1.16s/it] | |
| 1|162|Loss: 5.91910457611084: 21%|██ | 163/776 [04:09<12:19, 1.21s/it] | |
| 1|163|Loss: 5.713568210601807: 21%|██ | 163/776 [04:09<12:19, 1.21s/it] | |
| 1|163|Loss: 5.713568210601807: 21%|██ | 164/776 [04:10<12:03, 1.18s/it] | |
| 1|164|Loss: 5.675170421600342: 21%|██ | 164/776 [04:10<12:03, 1.18s/it] | |
| 1|164|Loss: 5.675170421600342: 21%|██▏ | 165/776 [04:11<11:53, 1.17s/it] | |
| 1|165|Loss: 5.863519668579102: 21%|██▏ | 165/776 [04:11<11:53, 1.17s/it] | |
| 1|165|Loss: 5.863519668579102: 21%|██▏ | 166/776 [04:12<11:48, 1.16s/it] | |
| 1|166|Loss: 6.067221641540527: 21%|██▏ | 166/776 [04:12<11:48, 1.16s/it] | |
| 1|166|Loss: 6.067221641540527: 22%|██▏ | 167/776 [04:13<11:44, 1.16s/it] | |
| 1|167|Loss: 5.760990619659424: 22%|██▏ | 167/776 [04:13<11:44, 1.16s/it] | |
| 1|167|Loss: 5.760990619659424: 22%|██▏ | 168/776 [04:15<12:24, 1.22s/it] | |
| 1|168|Loss: 5.737936496734619: 22%|██▏ | 168/776 [04:15<12:24, 1.22s/it] | |
| 1|168|Loss: 5.737936496734619: 22%|██▏ | 169/776 [04:16<12:03, 1.19s/it] | |
| 1|169|Loss: 5.911172389984131: 22%|██▏ | 169/776 [04:16<12:03, 1.19s/it] | |
| 1|169|Loss: 5.911172389984131: 22%|██▏ | 170/776 [04:17<12:30, 1.24s/it] | |
| 1|170|Loss: 5.701608657836914: 22%|██▏ | 170/776 [04:17<12:30, 1.24s/it] | |
| 1|170|Loss: 5.701608657836914: 22%|██▏ | 171/776 [04:18<12:11, 1.21s/it] | |
| 1|171|Loss: 5.702700138092041: 22%|██▏ | 171/776 [04:18<12:11, 1.21s/it] | |
| 1|171|Loss: 5.702700138092041: 22%|██▏ | 172/776 [04:19<11:53, 1.18s/it] | |
| 1|172|Loss: 5.842147350311279: 22%|██▏ | 172/776 [04:19<11:53, 1.18s/it] | |
| 1|172|Loss: 5.842147350311279: 22%|██▏ | 173/776 [04:20<11:43, 1.17s/it] | |
| 1|173|Loss: 5.8724365234375: 22%|██▏ | 173/776 [04:20<11:43, 1.17s/it] | |
| 1|173|Loss: 5.8724365234375: 22%|██▏ | 174/776 [04:21<11:36, 1.16s/it] | |
| 1|174|Loss: 5.491985321044922: 22%|██▏ | 174/776 [04:21<11:36, 1.16s/it] | |
| 1|174|Loss: 5.491985321044922: 23%|██▎ | 175/776 [04:23<11:27, 1.14s/it] | |
| 1|175|Loss: 6.015341758728027: 23%|██▎ | 175/776 [04:23<11:27, 1.14s/it] | |
| 1|175|Loss: 6.015341758728027: 23%|██▎ | 176/776 [04:24<12:05, 1.21s/it] | |
| 1|176|Loss: 5.533262729644775: 23%|██▎ | 176/776 [04:24<12:05, 1.21s/it] | |
| 1|176|Loss: 5.533262729644775: 23%|██▎ | 177/776 [04:25<11:51, 1.19s/it] | |
| 1|177|Loss: 5.775820255279541: 23%|██▎ | 177/776 [04:25<11:51, 1.19s/it] | |
| 1|177|Loss: 5.775820255279541: 23%|██▎ | 178/776 [04:26<11:40, 1.17s/it] | |
| 1|178|Loss: 5.716368198394775: 23%|██▎ | 178/776 [04:26<11:40, 1.17s/it] | |
| 1|178|Loss: 5.716368198394775: 23%|██▎ | 179/776 [04:27<11:31, 1.16s/it] | |
| 1|179|Loss: 5.717504024505615: 23%|██▎ | 179/776 [04:27<11:31, 1.16s/it] | |
| 1|179|Loss: 5.717504024505615: 23%|██▎ | 180/776 [04:29<12:19, 1.24s/it] | |
| 1|180|Loss: 5.6729888916015625: 23%|██▎ | 180/776 [04:29<12:19, 1.24s/it] | |
| 1|180|Loss: 5.6729888916015625: 23%|██▎ | 181/776 [04:30<11:59, 1.21s/it] | |
| 1|181|Loss: 5.57545804977417: 23%|██▎ | 181/776 [04:30<11:59, 1.21s/it] | |
| 1|181|Loss: 5.57545804977417: 23%|██▎ | 182/776 [04:31<12:19, 1.24s/it] | |
| 1|182|Loss: 5.413262844085693: 23%|██▎ | 182/776 [04:31<12:19, 1.24s/it] | |
| 1|182|Loss: 5.413262844085693: 24%|██▎ | 183/776 [04:32<11:54, 1.21s/it] | |
| 1|183|Loss: 5.956679344177246: 24%|██▎ | 183/776 [04:32<11:54, 1.21s/it] | |
| 1|183|Loss: 5.956679344177246: 24%|██▎ | 184/776 [04:33<11:38, 1.18s/it] | |
| 1|184|Loss: 5.800989627838135: 24%|██▎ | 184/776 [04:33<11:38, 1.18s/it] | |
| 1|184|Loss: 5.800989627838135: 24%|██▍ | 185/776 [04:35<11:27, 1.16s/it] | |
| 1|185|Loss: 5.738701343536377: 24%|██▍ | 185/776 [04:35<11:27, 1.16s/it] | |
| 1|185|Loss: 5.738701343536377: 24%|██▍ | 186/776 [04:36<11:16, 1.15s/it] | |
| 1|186|Loss: 5.6218366622924805: 24%|██▍ | 186/776 [04:36<11:16, 1.15s/it] | |
| 1|186|Loss: 5.6218366622924805: 24%|██▍ | 187/776 [04:37<11:12, 1.14s/it] | |
| 1|187|Loss: 5.900515079498291: 24%|██▍ | 187/776 [04:37<11:12, 1.14s/it] | |
| 1|187|Loss: 5.900515079498291: 24%|██▍ | 188/776 [04:38<11:45, 1.20s/it] | |
| 1|188|Loss: 5.775814533233643: 24%|██▍ | 188/776 [04:38<11:45, 1.20s/it] | |
| 1|188|Loss: 5.775814533233643: 24%|██▍ | 189/776 [04:39<11:30, 1.18s/it] | |
| 1|189|Loss: 5.590970039367676: 24%|██▍ | 189/776 [04:39<11:30, 1.18s/it] | |
| 1|189|Loss: 5.590970039367676: 24%|██▍ | 190/776 [04:40<11:18, 1.16s/it] | |
| 1|190|Loss: 5.632714748382568: 24%|██▍ | 190/776 [04:40<11:18, 1.16s/it] | |
| 1|190|Loss: 5.632714748382568: 25%|██▍ | 191/776 [04:42<11:58, 1.23s/it] | |
| 1|191|Loss: 5.668724536895752: 25%|██▍ | 191/776 [04:42<11:58, 1.23s/it] | |
| 1|191|Loss: 5.668724536895752: 25%|██▍ | 192/776 [04:43<11:43, 1.20s/it] | |
| 1|192|Loss: 5.706427574157715: 25%|██▍ | 192/776 [04:43<11:43, 1.20s/it] | |
| 1|192|Loss: 5.706427574157715: 25%|██▍ | 193/776 [04:44<11:30, 1.18s/it] | |
| 1|193|Loss: 5.94058084487915: 25%|██▍ | 193/776 [04:44<11:30, 1.18s/it] | |
| 1|193|Loss: 5.94058084487915: 25%|██▌ | 194/776 [04:45<11:54, 1.23s/it] | |
| 1|194|Loss: 5.7771782875061035: 25%|██▌ | 194/776 [04:45<11:54, 1.23s/it] | |
| 1|194|Loss: 5.7771782875061035: 25%|██▌ | 195/776 [04:47<11:34, 1.20s/it] | |
| 1|195|Loss: 5.720219612121582: 25%|██▌ | 195/776 [04:47<11:34, 1.20s/it] | |
| 1|195|Loss: 5.720219612121582: 25%|██▌ | 196/776 [04:48<11:21, 1.18s/it] | |
| 1|196|Loss: 5.516477108001709: 25%|██▌ | 196/776 [04:48<11:21, 1.18s/it] | |
| 1|196|Loss: 5.516477108001709: 25%|██▌ | 197/776 [04:49<11:15, 1.17s/it] | |
| 1|197|Loss: 5.775252342224121: 25%|██▌ | 197/776 [04:49<11:15, 1.17s/it] | |
| 1|197|Loss: 5.775252342224121: 26%|██▌ | 198/776 [04:50<11:10, 1.16s/it] | |
| 1|198|Loss: 5.5500712394714355: 26%|██▌ | 198/776 [04:50<11:10, 1.16s/it] | |
| 1|198|Loss: 5.5500712394714355: 26%|██▌ | 199/776 [04:51<11:06, 1.16s/it] | |
| 1|199|Loss: 5.731786251068115: 26%|██▌ | 199/776 [04:51<11:06, 1.16s/it] | |
| 1|199|Loss: 5.731786251068115: 26%|██▌ | 200/776 [04:52<11:32, 1.20s/it] | |
| 1|200|Loss: 5.771639347076416: 26%|██▌ | 200/776 [04:52<11:32, 1.20s/it] | |
| 1|200|Loss: 5.771639347076416: 26%|██▌ | 201/776 [04:54<11:16, 1.18s/it] | |
| 1|201|Loss: 5.535252094268799: 26%|██▌ | 201/776 [04:54<11:16, 1.18s/it] | |
| 1|201|Loss: 5.535252094268799: 26%|██▌ | 202/776 [04:55<11:52, 1.24s/it] | |
| 1|202|Loss: 5.112711429595947: 26%|██▌ | 202/776 [04:55<11:52, 1.24s/it] | |
| 1|202|Loss: 5.112711429595947: 26%|██▌ | 203/776 [04:56<11:35, 1.21s/it] | |
| 1|203|Loss: 5.389400959014893: 26%|██▌ | 203/776 [04:56<11:35, 1.21s/it] | |
| 1|203|Loss: 5.389400959014893: 26%|██▋ | 204/776 [04:57<11:19, 1.19s/it] | |
| 1|204|Loss: 5.315254211425781: 26%|██▋ | 204/776 [04:57<11:19, 1.19s/it] | |
| 1|204|Loss: 5.315254211425781: 26%|██▋ | 205/776 [04:58<11:06, 1.17s/it] | |
| 1|205|Loss: 5.26814603805542: 26%|██▋ | 205/776 [04:58<11:06, 1.17s/it] | |
| 1|205|Loss: 5.26814603805542: 27%|██▋ | 206/776 [05:00<11:35, 1.22s/it] | |
| 1|206|Loss: 5.536235332489014: 27%|██▋ | 206/776 [05:00<11:35, 1.22s/it] | |
| 1|206|Loss: 5.536235332489014: 27%|██▋ | 207/776 [05:01<11:19, 1.19s/it] | |
| 1|207|Loss: 5.568277359008789: 27%|██▋ | 207/776 [05:01<11:19, 1.19s/it] | |
| 1|207|Loss: 5.568277359008789: 27%|██▋ | 208/776 [05:02<11:11, 1.18s/it] | |
| 1|208|Loss: 5.6019792556762695: 27%|██▋ | 208/776 [05:02<11:11, 1.18s/it] | |
| 1|208|Loss: 5.6019792556762695: 27%|██▋ | 209/776 [05:03<11:01, 1.17s/it] | |
| 1|209|Loss: 5.4087419509887695: 27%|██▋ | 209/776 [05:03<11:01, 1.17s/it] | |
| 1|209|Loss: 5.4087419509887695: 27%|██▋ | 210/776 [05:04<10:51, 1.15s/it] | |
| 1|210|Loss: 5.335548400878906: 27%|██▋ | 210/776 [05:04<10:51, 1.15s/it] | |
| 1|210|Loss: 5.335548400878906: 27%|██▋ | 211/776 [05:05<10:46, 1.14s/it] | |
| 1|211|Loss: 5.52059268951416: 27%|██▋ | 211/776 [05:05<10:46, 1.14s/it] | |
| 1|211|Loss: 5.52059268951416: 27%|██▋ | 212/776 [05:07<11:17, 1.20s/it] | |
| 1|212|Loss: 5.313406944274902: 27%|██▋ | 212/776 [05:07<11:17, 1.20s/it] | |
| 1|212|Loss: 5.313406944274902: 27%|██▋ | 213/776 [05:08<11:05, 1.18s/it] | |
| 1|213|Loss: 5.316163539886475: 27%|██▋ | 213/776 [05:08<11:05, 1.18s/it] | |
| 1|213|Loss: 5.316163539886475: 28%|██▊ | 214/776 [05:09<11:40, 1.25s/it] | |
| 1|214|Loss: 5.56427526473999: 28%|██▊ | 214/776 [05:09<11:40, 1.25s/it] | |
| 1|214|Loss: 5.56427526473999: 28%|██▊ | 215/776 [05:10<11:18, 1.21s/it] | |
| 1|215|Loss: 5.539584636688232: 28%|██▊ | 215/776 [05:10<11:18, 1.21s/it] | |
| 1|215|Loss: 5.539584636688232: 28%|██▊ | 216/776 [05:11<11:01, 1.18s/it] | |
| 1|216|Loss: 5.240893840789795: 28%|██▊ | 216/776 [05:11<11:01, 1.18s/it] | |
| 1|216|Loss: 5.240893840789795: 28%|██▊ | 217/776 [05:13<10:54, 1.17s/it] | |
| 1|217|Loss: 5.240677356719971: 28%|██▊ | 217/776 [05:13<10:54, 1.17s/it] | |
| 1|217|Loss: 5.240677356719971: 28%|██▊ | 218/776 [05:14<11:20, 1.22s/it] | |
| 1|218|Loss: 5.350729465484619: 28%|██▊ | 218/776 [05:14<11:20, 1.22s/it] | |
| 1|218|Loss: 5.350729465484619: 28%|██▊ | 219/776 [05:15<11:08, 1.20s/it] | |
| 1|219|Loss: 5.26855993270874: 28%|██▊ | 219/776 [05:15<11:08, 1.20s/it] | |
| 1|219|Loss: 5.26855993270874: 28%|██▊ | 220/776 [05:16<10:55, 1.18s/it] | |
| 1|220|Loss: 5.379322052001953: 28%|██▊ | 220/776 [05:16<10:55, 1.18s/it] | |
| 1|220|Loss: 5.379322052001953: 28%|██▊ | 221/776 [05:17<10:43, 1.16s/it] | |
| 1|221|Loss: 5.309174537658691: 28%|██▊ | 221/776 [05:17<10:43, 1.16s/it] | |
| 1|221|Loss: 5.309174537658691: 29%|██▊ | 222/776 [05:18<10:37, 1.15s/it] | |
| 1|222|Loss: 5.274999141693115: 29%|██▊ | 222/776 [05:18<10:37, 1.15s/it] | |
| 1|222|Loss: 5.274999141693115: 29%|██▊ | 223/776 [05:20<10:31, 1.14s/it] | |
| 1|223|Loss: 5.278564453125: 29%|██▊ | 223/776 [05:20<10:31, 1.14s/it] | |
| 1|223|Loss: 5.278564453125: 29%|██▉ | 224/776 [05:21<11:05, 1.21s/it] | |
| 1|224|Loss: 5.072393417358398: 29%|██▉ | 224/776 [05:21<11:05, 1.21s/it] | |
| 1|224|Loss: 5.072393417358398: 29%|██▉ | 225/776 [05:22<10:55, 1.19s/it] | |
| 1|225|Loss: 5.496215343475342: 29%|██▉ | 225/776 [05:22<10:55, 1.19s/it] | |
| 1|225|Loss: 5.496215343475342: 29%|██▉ | 226/776 [05:23<11:26, 1.25s/it] | |
| 1|226|Loss: 5.229061603546143: 29%|██▉ | 226/776 [05:23<11:26, 1.25s/it] | |
| 1|226|Loss: 5.229061603546143: 29%|██▉ | 227/776 [05:25<11:03, 1.21s/it] | |
| 1|227|Loss: 5.511023044586182: 29%|██▉ | 227/776 [05:25<11:03, 1.21s/it] | |
| 1|227|Loss: 5.511023044586182: 29%|██▉ | 228/776 [05:26<10:48, 1.18s/it] | |
| 1|228|Loss: 5.266863822937012: 29%|██▉ | 228/776 [05:26<10:48, 1.18s/it] | |
| 1|228|Loss: 5.266863822937012: 30%|██▉ | 229/776 [05:27<10:36, 1.16s/it] | |
| 1|229|Loss: 5.2136335372924805: 30%|██▉ | 229/776 [05:27<10:36, 1.16s/it] | |
| 1|229|Loss: 5.2136335372924805: 30%|██▉ | 230/776 [05:28<11:06, 1.22s/it] | |
| 1|230|Loss: 5.553706169128418: 30%|██▉ | 230/776 [05:28<11:06, 1.22s/it] | |
| 1|230|Loss: 5.553706169128418: 30%|██▉ | 231/776 [05:29<10:55, 1.20s/it] | |
| 1|231|Loss: 5.047750473022461: 30%|██▉ | 231/776 [05:29<10:55, 1.20s/it] | |
| 1|231|Loss: 5.047750473022461: 30%|██▉ | 232/776 [05:31<10:46, 1.19s/it] | |
| 1|232|Loss: 4.983281135559082: 30%|██▉ | 232/776 [05:31<10:46, 1.19s/it] | |
| 1|232|Loss: 4.983281135559082: 30%|███ | 233/776 [05:32<10:34, 1.17s/it] | |
| 1|233|Loss: 4.8648576736450195: 30%|███ | 233/776 [05:32<10:34, 1.17s/it] | |
| 1|233|Loss: 4.8648576736450195: 30%|███ | 234/776 [05:33<10:24, 1.15s/it] | |
| 1|234|Loss: 5.688746452331543: 30%|███ | 234/776 [05:33<10:24, 1.15s/it] | |
| 1|234|Loss: 5.688746452331543: 30%|███ | 235/776 [05:34<10:20, 1.15s/it] | |
| 1|235|Loss: 5.284543514251709: 30%|███ | 235/776 [05:34<10:20, 1.15s/it] | |
| 1|235|Loss: 5.284543514251709: 30%|███ | 236/776 [05:35<10:21, 1.15s/it] | |
| 1|236|Loss: 5.2191386222839355: 30%|███ | 236/776 [05:35<10:21, 1.15s/it] | |
| 1|236|Loss: 5.2191386222839355: 31%|███ | 237/776 [05:36<10:44, 1.20s/it] | |
| 1|237|Loss: 5.406431674957275: 31%|███ | 237/776 [05:36<10:44, 1.20s/it] | |
| 1|237|Loss: 5.406431674957275: 31%|███ | 238/776 [05:38<11:14, 1.25s/it] | |
| 1|238|Loss: 5.577589988708496: 31%|███ | 238/776 [05:38<11:14, 1.25s/it] | |
| 1|238|Loss: 5.577589988708496: 31%|███ | 239/776 [05:39<10:54, 1.22s/it] | |
| 1|239|Loss: 5.272293567657471: 31%|███ | 239/776 [05:39<10:54, 1.22s/it] | |
| 1|239|Loss: 5.272293567657471: 31%|███ | 240/776 [05:40<10:40, 1.19s/it] | |
| 1|240|Loss: 5.2062668800354: 31%|███ | 240/776 [05:40<10:40, 1.19s/it] | |
| 1|240|Loss: 5.2062668800354: 31%|███ | 241/776 [05:41<10:26, 1.17s/it] | |
| 1|241|Loss: 5.3405842781066895: 31%|███ | 241/776 [05:41<10:26, 1.17s/it] | |
| 1|241|Loss: 5.3405842781066895: 31%|███ | 242/776 [05:42<10:47, 1.21s/it] | |
| 1|242|Loss: 5.433411598205566: 31%|███ | 242/776 [05:42<10:47, 1.21s/it] | |
| 1|242|Loss: 5.433411598205566: 31%|███▏ | 243/776 [05:44<10:32, 1.19s/it] | |
| 1|243|Loss: 5.1978349685668945: 31%|███▏ | 243/776 [05:44<10:32, 1.19s/it] | |
| 1|243|Loss: 5.1978349685668945: 31%|███▏ | 244/776 [05:45<10:22, 1.17s/it] | |
| 1|244|Loss: 5.063118934631348: 31%|███▏ | 244/776 [05:45<10:22, 1.17s/it] | |
| 1|244|Loss: 5.063118934631348: 32%|███▏ | 245/776 [05:46<10:12, 1.15s/it] | |
| 1|245|Loss: 5.289031505584717: 32%|███▏ | 245/776 [05:46<10:12, 1.15s/it] | |
| 1|245|Loss: 5.289031505584717: 32%|███▏ | 246/776 [05:47<10:11, 1.15s/it] | |
| 1|246|Loss: 5.195354461669922: 32%|███▏ | 246/776 [05:47<10:11, 1.15s/it] | |
| 1|246|Loss: 5.195354461669922: 32%|███▏ | 247/776 [05:48<10:09, 1.15s/it] | |
| 1|247|Loss: 5.34556245803833: 32%|███▏ | 247/776 [05:48<10:09, 1.15s/it] | |
| 1|247|Loss: 5.34556245803833: 32%|███▏ | 248/776 [05:49<10:06, 1.15s/it] | |
| 1|248|Loss: 4.999575614929199: 32%|███▏ | 248/776 [05:49<10:06, 1.15s/it] | |
| 1|248|Loss: 4.999575614929199: 32%|███▏ | 249/776 [05:51<11:12, 1.28s/it] | |
| 1|249|Loss: 5.239809036254883: 32%|███▏ | 249/776 [05:51<11:12, 1.28s/it] | |
| 1|249|Loss: 5.239809036254883: 32%|███▏ | 250/776 [05:52<10:49, 1.23s/it] | |
| 1|250|Loss: 5.258659839630127: 32%|███▏ | 250/776 [05:52<10:49, 1.23s/it] | |
| 1|250|Loss: 5.258659839630127: 32%|███▏ | 251/776 [05:53<10:34, 1.21s/it] | |
| 1|251|Loss: 5.434571743011475: 32%|███▏ | 251/776 [05:53<10:34, 1.21s/it] | |
| 1|251|Loss: 5.434571743011475: 32%|███▏ | 252/776 [05:54<10:20, 1.18s/it] | |
| 1|252|Loss: 5.419220924377441: 32%|███▏ | 252/776 [05:54<10:20, 1.18s/it] | |
| 1|252|Loss: 5.419220924377441: 33%|███▎ | 253/776 [05:55<10:08, 1.16s/it] | |
| 1|253|Loss: 5.190994739532471: 33%|███▎ | 253/776 [05:55<10:08, 1.16s/it] | |
| 1|253|Loss: 5.190994739532471: 33%|███▎ | 254/776 [05:57<10:34, 1.22s/it] | |
| 1|254|Loss: 5.445740699768066: 33%|███▎ | 254/776 [05:57<10:34, 1.22s/it] | |
| 1|254|Loss: 5.445740699768066: 33%|███▎ | 255/776 [05:58<10:20, 1.19s/it] | |
| 1|255|Loss: 5.226301670074463: 33%|███▎ | 255/776 [05:58<10:20, 1.19s/it] | |
| 1|255|Loss: 5.226301670074463: 33%|███▎ | 256/776 [05:59<10:11, 1.18s/it] | |
| 1|256|Loss: 5.257811069488525: 33%|███▎ | 256/776 [05:59<10:11, 1.18s/it] | |
| 1|256|Loss: 5.257811069488525: 33%|███▎ | 257/776 [06:00<10:06, 1.17s/it] | |
| 1|257|Loss: 5.44108247756958: 33%|███▎ | 257/776 [06:00<10:06, 1.17s/it] | |
| 1|257|Loss: 5.44108247756958: 33%|███▎ | 258/776 [06:01<09:57, 1.15s/it] | |
| 1|258|Loss: 5.14818000793457: 33%|███▎ | 258/776 [06:01<09:57, 1.15s/it] | |
| 1|258|Loss: 5.14818000793457: 33%|███▎ | 259/776 [06:02<09:51, 1.14s/it] | |
| 1|259|Loss: 5.128236293792725: 33%|███▎ | 259/776 [06:02<09:51, 1.14s/it] | |
| 1|259|Loss: 5.128236293792725: 34%|███▎ | 260/776 [06:04<10:26, 1.21s/it] | |
| 1|260|Loss: 5.298697471618652: 34%|███▎ | 260/776 [06:04<10:26, 1.21s/it] | |
| 1|260|Loss: 5.298697471618652: 34%|███▎ | 261/776 [06:05<10:49, 1.26s/it] | |
| 1|261|Loss: 5.339964389801025: 34%|███▎ | 261/776 [06:05<10:49, 1.26s/it] | |
| 1|261|Loss: 5.339964389801025: 34%|███▍ | 262/776 [06:06<10:27, 1.22s/it] | |
| 1|262|Loss: 5.459559917449951: 34%|███▍ | 262/776 [06:06<10:27, 1.22s/it] | |
| 1|262|Loss: 5.459559917449951: 34%|███▍ | 263/776 [06:07<10:10, 1.19s/it] | |
| 1|263|Loss: 5.173248291015625: 34%|███▍ | 263/776 [06:07<10:10, 1.19s/it] | |
| 1|263|Loss: 5.173248291015625: 34%|███▍ | 264/776 [06:08<09:58, 1.17s/it] | |
| 1|264|Loss: 5.265397548675537: 34%|███▍ | 264/776 [06:08<09:58, 1.17s/it] | |
| 1|264|Loss: 5.265397548675537: 34%|███▍ | 265/776 [06:10<09:50, 1.15s/it] | |
| 1|265|Loss: 5.3298821449279785: 34%|███▍ | 265/776 [06:10<09:50, 1.15s/it] | |
| 1|265|Loss: 5.3298821449279785: 34%|███▍ | 266/776 [06:11<10:20, 1.22s/it] | |
| 1|266|Loss: 5.216048240661621: 34%|███▍ | 266/776 [06:11<10:20, 1.22s/it] | |
| 1|266|Loss: 5.216048240661621: 34%|███▍ | 267/776 [06:12<10:08, 1.20s/it] | |
| 1|267|Loss: 5.549046516418457: 34%|███▍ | 267/776 [06:12<10:08, 1.20s/it] | |
| 1|267|Loss: 5.549046516418457: 35%|███▍ | 268/776 [06:13<09:56, 1.17s/it] | |
| 1|268|Loss: 5.427663803100586: 35%|███▍ | 268/776 [06:13<09:56, 1.17s/it] | |
| 1|268|Loss: 5.427663803100586: 35%|███▍ | 269/776 [06:14<09:48, 1.16s/it] | |
| 1|269|Loss: 5.264499187469482: 35%|███▍ | 269/776 [06:14<09:48, 1.16s/it] | |
| 1|269|Loss: 5.264499187469482: 35%|███▍ | 270/776 [06:15<09:42, 1.15s/it] | |
| 1|270|Loss: 5.116490364074707: 35%|███▍ | 270/776 [06:15<09:42, 1.15s/it] | |
| 1|270|Loss: 5.116490364074707: 35%|███▍ | 271/776 [06:17<09:36, 1.14s/it] | |
| 1|271|Loss: 5.419467449188232: 35%|███▍ | 271/776 [06:17<09:36, 1.14s/it] | |
| 1|271|Loss: 5.419467449188232: 35%|███▌ | 272/776 [06:18<10:11, 1.21s/it] | |
| 1|272|Loss: 5.345190525054932: 35%|███▌ | 272/776 [06:18<10:11, 1.21s/it] | |
| 1|272|Loss: 5.345190525054932: 35%|███▌ | 273/776 [06:19<10:30, 1.25s/it] | |
| 1|273|Loss: 5.220262050628662: 35%|███▌ | 273/776 [06:19<10:30, 1.25s/it] | |
| 1|273|Loss: 5.220262050628662: 35%|███▌ | 274/776 [06:20<10:09, 1.21s/it] | |
| 1|274|Loss: 5.17307710647583: 35%|███▌ | 274/776 [06:20<10:09, 1.21s/it] | |
| 1|274|Loss: 5.17307710647583: 35%|███▌ | 275/776 [06:22<09:54, 1.19s/it] | |
| 1|275|Loss: 5.324012756347656: 35%|███▌ | 275/776 [06:22<09:54, 1.19s/it] | |
| 1|275|Loss: 5.324012756347656: 36%|███▌ | 276/776 [06:23<09:45, 1.17s/it] | |
| 1|276|Loss: 5.410219192504883: 36%|███▌ | 276/776 [06:23<09:45, 1.17s/it] | |
| 1|276|Loss: 5.410219192504883: 36%|███▌ | 277/776 [06:24<09:38, 1.16s/it] | |
| 1|277|Loss: 5.514828681945801: 36%|███▌ | 277/776 [06:24<09:38, 1.16s/it] | |
| 1|277|Loss: 5.514828681945801: 36%|███▌ | 278/776 [06:25<10:03, 1.21s/it] | |
| 1|278|Loss: 5.312533855438232: 36%|███▌ | 278/776 [06:25<10:03, 1.21s/it] | |
| 1|278|Loss: 5.312533855438232: 36%|███▌ | 279/776 [06:26<09:56, 1.20s/it] | |
| 1|279|Loss: 5.534008026123047: 36%|███▌ | 279/776 [06:26<09:56, 1.20s/it] | |
| 1|279|Loss: 5.534008026123047: 36%|███▌ | 280/776 [06:27<09:43, 1.18s/it] | |
| 1|280|Loss: 5.2112135887146: 36%|███▌ | 280/776 [06:27<09:43, 1.18s/it] | |
| 1|280|Loss: 5.2112135887146: 36%|███▌ | 281/776 [06:29<09:33, 1.16s/it] | |
| 1|281|Loss: 5.400199890136719: 36%|███▌ | 281/776 [06:29<09:33, 1.16s/it] | |
| 1|281|Loss: 5.400199890136719: 36%|███▋ | 282/776 [06:30<09:26, 1.15s/it] | |
| 1|282|Loss: 5.158200263977051: 36%|███▋ | 282/776 [06:30<09:26, 1.15s/it] | |
| 1|282|Loss: 5.158200263977051: 36%|███▋ | 283/776 [06:31<09:22, 1.14s/it] | |
| 1|283|Loss: 5.4410576820373535: 36%|███▋ | 283/776 [06:31<09:22, 1.14s/it] | |
| 1|283|Loss: 5.4410576820373535: 37%|███▋ | 284/776 [06:32<10:01, 1.22s/it] | |
| 1|284|Loss: 5.32828426361084: 37%|███▋ | 284/776 [06:32<10:01, 1.22s/it] | |
| 1|284|Loss: 5.32828426361084: 37%|███▋ | 285/776 [06:34<10:14, 1.25s/it] | |
| 1|285|Loss: 5.308468341827393: 37%|███▋ | 285/776 [06:34<10:14, 1.25s/it] | |
| 1|285|Loss: 5.308468341827393: 37%|███▋ | 286/776 [06:35<09:53, 1.21s/it] | |
| 1|286|Loss: 5.345756530761719: 37%|███▋ | 286/776 [06:35<09:53, 1.21s/it] | |
| 1|286|Loss: 5.345756530761719: 37%|███▋ | 287/776 [06:36<09:39, 1.19s/it] | |
| 1|287|Loss: 5.381749153137207: 37%|███▋ | 287/776 [06:36<09:39, 1.19s/it] | |
| 1|287|Loss: 5.381749153137207: 37%|███▋ | 288/776 [06:37<09:31, 1.17s/it] | |
| 1|288|Loss: 4.856078624725342: 37%|███▋ | 288/776 [06:37<09:31, 1.17s/it] | |
| 1|288|Loss: 4.856078624725342: 37%|███▋ | 289/776 [06:38<09:27, 1.16s/it] | |
| 1|289|Loss: 5.005937576293945: 37%|███▋ | 289/776 [06:38<09:27, 1.16s/it] | |
| 1|289|Loss: 5.005937576293945: 37%|███▋ | 290/776 [06:39<09:46, 1.21s/it] | |
| 1|290|Loss: 5.373960494995117: 37%|███▋ | 290/776 [06:39<09:46, 1.21s/it] | |
| 1|290|Loss: 5.373960494995117: 38%|███▊ | 291/776 [06:41<09:33, 1.18s/it] | |
| 1|291|Loss: 5.39868688583374: 38%|███▊ | 291/776 [06:41<09:33, 1.18s/it] | |
| 1|291|Loss: 5.39868688583374: 38%|███▊ | 292/776 [06:42<09:21, 1.16s/it] | |
| 1|292|Loss: 5.196873188018799: 38%|███▊ | 292/776 [06:42<09:21, 1.16s/it] | |
| 1|292|Loss: 5.196873188018799: 38%|███▊ | 293/776 [06:43<09:20, 1.16s/it] | |
| 1|293|Loss: 5.145110607147217: 38%|███▊ | 293/776 [06:43<09:20, 1.16s/it] | |
| 1|293|Loss: 5.145110607147217: 38%|███▊ | 294/776 [06:44<09:15, 1.15s/it] | |
| 1|294|Loss: 5.085460186004639: 38%|███▊ | 294/776 [06:44<09:15, 1.15s/it] | |
| 1|294|Loss: 5.085460186004639: 38%|███▊ | 295/776 [06:45<09:14, 1.15s/it] | |
| 1|295|Loss: 4.837353229522705: 38%|███▊ | 295/776 [06:45<09:14, 1.15s/it] | |
| 1|295|Loss: 4.837353229522705: 38%|███▊ | 296/776 [06:46<09:48, 1.23s/it] | |
| 1|296|Loss: 5.443464279174805: 38%|███▊ | 296/776 [06:46<09:48, 1.23s/it] | |
| 1|296|Loss: 5.443464279174805: 38%|███▊ | 297/776 [06:48<09:34, 1.20s/it] | |
| 1|297|Loss: 4.979115962982178: 38%|███▊ | 297/776 [06:48<09:34, 1.20s/it] | |
| 1|297|Loss: 4.979115962982178: 38%|███▊ | 298/776 [06:49<09:55, 1.25s/it] | |
| 1|298|Loss: 5.46377420425415: 38%|███▊ | 298/776 [06:49<09:55, 1.25s/it] | |
| 1|298|Loss: 5.46377420425415: 39%|███▊ | 299/776 [06:50<09:39, 1.21s/it] | |
| 1|299|Loss: 5.030261516571045: 39%|███▊ | 299/776 [06:50<09:39, 1.21s/it] | |
| 1|299|Loss: 5.030261516571045: 39%|███▊ | 300/776 [06:51<09:26, 1.19s/it] | |
| 1|300|Loss: 5.510290622711182: 39%|███▊ | 300/776 [06:51<09:26, 1.19s/it] | |
| 1|300|Loss: 5.510290622711182: 39%|███▉ | 301/776 [06:52<09:15, 1.17s/it] | |
| 1|301|Loss: 5.4347243309021: 39%|███▉ | 301/776 [06:52<09:15, 1.17s/it] | |
| 1|301|Loss: 5.4347243309021: 39%|███▉ | 302/776 [06:54<09:37, 1.22s/it] | |
| 1|302|Loss: 5.515040397644043: 39%|███▉ | 302/776 [06:54<09:37, 1.22s/it] | |
| 1|302|Loss: 5.515040397644043: 39%|███▉ | 303/776 [06:55<09:24, 1.19s/it] | |
| 1|303|Loss: 5.288409233093262: 39%|███▉ | 303/776 [06:55<09:24, 1.19s/it] | |
| 1|303|Loss: 5.288409233093262: 39%|███▉ | 304/776 [06:56<09:14, 1.18s/it] | |
| 1|304|Loss: 5.213053226470947: 39%|███▉ | 304/776 [06:56<09:14, 1.18s/it] | |
| 1|304|Loss: 5.213053226470947: 39%|███▉ | 305/776 [06:57<09:09, 1.17s/it] | |
| 1|305|Loss: 4.953383922576904: 39%|███▉ | 305/776 [06:57<09:09, 1.17s/it] | |
| 1|305|Loss: 4.953383922576904: 39%|███▉ | 306/776 [06:58<09:03, 1.16s/it] | |
| 1|306|Loss: 5.371671199798584: 39%|███▉ | 306/776 [06:58<09:03, 1.16s/it] | |
| 1|306|Loss: 5.371671199798584: 40%|███▉ | 307/776 [07:00<09:35, 1.23s/it] | |
| 1|307|Loss: 5.213160991668701: 40%|███▉ | 307/776 [07:00<09:35, 1.23s/it] | |
| 1|307|Loss: 5.213160991668701: 40%|███▉ | 308/776 [07:01<09:19, 1.20s/it] | |
| 1|308|Loss: 5.178910255432129: 40%|███▉ | 308/776 [07:01<09:19, 1.20s/it] | |
| 1|308|Loss: 5.178910255432129: 40%|███▉ | 309/776 [07:02<09:11, 1.18s/it] | |
| 1|309|Loss: 5.518357276916504: 40%|███▉ | 309/776 [07:02<09:11, 1.18s/it] | |
| 1|309|Loss: 5.518357276916504: 40%|███▉ | 310/776 [07:03<09:37, 1.24s/it] | |
| 1|310|Loss: 4.870225429534912: 40%|███▉ | 310/776 [07:03<09:37, 1.24s/it] | |
| 1|310|Loss: 4.870225429534912: 40%|████ | 311/776 [07:04<09:19, 1.20s/it] | |
| 1|311|Loss: 5.287115573883057: 40%|████ | 311/776 [07:04<09:19, 1.20s/it] | |
| 1|311|Loss: 5.287115573883057: 40%|████ | 312/776 [07:06<09:06, 1.18s/it] | |
| 1|312|Loss: 5.405235767364502: 40%|████ | 312/776 [07:06<09:06, 1.18s/it] | |
| 1|312|Loss: 5.405235767364502: 40%|████ | 313/776 [07:07<08:56, 1.16s/it] | |
| 1|313|Loss: 5.439174652099609: 40%|████ | 313/776 [07:07<08:56, 1.16s/it] | |
| 1|313|Loss: 5.439174652099609: 40%|████ | 314/776 [07:08<09:18, 1.21s/it] | |
| 1|314|Loss: 5.42974853515625: 40%|████ | 314/776 [07:08<09:18, 1.21s/it] | |
| 1|314|Loss: 5.42974853515625: 41%|████ | 315/776 [07:09<09:07, 1.19s/it] | |
| 1|315|Loss: 5.185024738311768: 41%|████ | 315/776 [07:09<09:07, 1.19s/it] | |
| 1|315|Loss: 5.185024738311768: 41%|████ | 316/776 [07:10<08:57, 1.17s/it] | |
| 1|316|Loss: 5.29945182800293: 41%|████ | 316/776 [07:10<08:57, 1.17s/it] | |
| 1|316|Loss: 5.29945182800293: 41%|████ | 317/776 [07:11<08:49, 1.15s/it] | |
| 1|317|Loss: 5.284602165222168: 41%|████ | 317/776 [07:11<08:49, 1.15s/it] | |
| 1|317|Loss: 5.284602165222168: 41%|████ | 318/776 [07:13<09:22, 1.23s/it] | |
| 1|318|Loss: 5.524261474609375: 41%|████ | 318/776 [07:13<09:22, 1.23s/it] | |
| 1|318|Loss: 5.524261474609375: 41%|████ | 319/776 [07:14<09:07, 1.20s/it] | |
| 1|319|Loss: 5.2213544845581055: 41%|████ | 319/776 [07:14<09:07, 1.20s/it] | |
| 1|319|Loss: 5.2213544845581055: 41%|████ | 320/776 [07:15<09:00, 1.19s/it] | |
| 1|320|Loss: 5.149913311004639: 41%|████ | 320/776 [07:15<09:00, 1.19s/it] | |
| 1|320|Loss: 5.149913311004639: 41%|████▏ | 321/776 [07:16<08:52, 1.17s/it] | |
| 1|321|Loss: 5.552460670471191: 41%|████▏ | 321/776 [07:16<08:52, 1.17s/it] | |
| 1|321|Loss: 5.552460670471191: 41%|████▏ | 322/776 [07:17<09:12, 1.22s/it] | |
| 1|322|Loss: 4.982635974884033: 41%|████▏ | 322/776 [07:17<09:12, 1.22s/it] | |
| 1|322|Loss: 4.982635974884033: 42%|████▏ | 323/776 [07:19<08:58, 1.19s/it] | |
| 1|323|Loss: 5.131198406219482: 42%|████▏ | 323/776 [07:19<08:58, 1.19s/it] | |
| 1|323|Loss: 5.131198406219482: 42%|████▏ | 324/776 [07:20<08:48, 1.17s/it] | |
| 1|324|Loss: 5.096321105957031: 42%|████▏ | 324/776 [07:20<08:48, 1.17s/it] | |
| 1|324|Loss: 5.096321105957031: 42%|████▏ | 325/776 [07:21<08:41, 1.16s/it] | |
| 1|325|Loss: 5.391838073730469: 42%|████▏ | 325/776 [07:21<08:41, 1.16s/it] | |
| 1|325|Loss: 5.391838073730469: 42%|████▏ | 326/776 [07:22<09:01, 1.20s/it] | |
| 1|326|Loss: 5.058282852172852: 42%|████▏ | 326/776 [07:22<09:01, 1.20s/it] | |
| 1|326|Loss: 5.058282852172852: 42%|████▏ | 327/776 [07:23<08:50, 1.18s/it] | |
| 1|327|Loss: 5.052055358886719: 42%|████▏ | 327/776 [07:23<08:50, 1.18s/it] | |
| 1|327|Loss: 5.052055358886719: 42%|████▏ | 328/776 [07:24<08:41, 1.16s/it] | |
| 1|328|Loss: 5.285645484924316: 42%|████▏ | 328/776 [07:24<08:41, 1.16s/it] | |
| 1|328|Loss: 5.285645484924316: 42%|████▏ | 329/776 [07:26<08:33, 1.15s/it] | |
| 1|329|Loss: 5.325260639190674: 42%|████▏ | 329/776 [07:26<08:33, 1.15s/it] | |
| 1|329|Loss: 5.325260639190674: 43%|████▎ | 330/776 [07:27<09:09, 1.23s/it] | |
| 1|330|Loss: 5.224215507507324: 43%|████▎ | 330/776 [07:27<09:09, 1.23s/it] | |
| 1|330|Loss: 5.224215507507324: 43%|████▎ | 331/776 [07:28<08:57, 1.21s/it] | |
| 1|331|Loss: 5.1997504234313965: 43%|████▎ | 331/776 [07:28<08:57, 1.21s/it] | |
| 1|331|Loss: 5.1997504234313965: 43%|████▎ | 332/776 [07:29<08:46, 1.19s/it] | |
| 1|332|Loss: 5.101251125335693: 43%|████▎ | 332/776 [07:29<08:46, 1.19s/it] | |
| 1|332|Loss: 5.101251125335693: 43%|████▎ | 333/776 [07:30<08:38, 1.17s/it] | |
| 1|333|Loss: 5.171118259429932: 43%|████▎ | 333/776 [07:30<08:38, 1.17s/it] | |
| 1|333|Loss: 5.171118259429932: 43%|████▎ | 334/776 [07:32<08:55, 1.21s/it] | |
| 1|334|Loss: 4.960958957672119: 43%|████▎ | 334/776 [07:32<08:55, 1.21s/it] | |
| 1|334|Loss: 4.960958957672119: 43%|████▎ | 335/776 [07:33<08:44, 1.19s/it] | |
| 1|335|Loss: 5.615512847900391: 43%|████▎ | 335/776 [07:33<08:44, 1.19s/it] | |
| 1|335|Loss: 5.615512847900391: 43%|████▎ | 336/776 [07:34<08:35, 1.17s/it] | |
| 1|336|Loss: 5.183916091918945: 43%|████▎ | 336/776 [07:34<08:35, 1.17s/it] | |
| 1|336|Loss: 5.183916091918945: 43%|████▎ | 337/776 [07:35<08:32, 1.17s/it] | |
| 1|337|Loss: 5.204352378845215: 43%|████▎ | 337/776 [07:35<08:32, 1.17s/it] | |
| 1|337|Loss: 5.204352378845215: 44%|████▎ | 338/776 [07:36<08:52, 1.22s/it] | |
| 1|338|Loss: 5.353341579437256: 44%|████▎ | 338/776 [07:36<08:52, 1.22s/it] | |
| 1|338|Loss: 5.353341579437256: 44%|████▎ | 339/776 [07:38<08:40, 1.19s/it] | |
| 1|339|Loss: 5.146745204925537: 44%|████▎ | 339/776 [07:38<08:40, 1.19s/it] | |
| 1|339|Loss: 5.146745204925537: 44%|████▍ | 340/776 [07:39<08:30, 1.17s/it] | |
| 1|340|Loss: 5.056707382202148: 44%|████▍ | 340/776 [07:39<08:30, 1.17s/it] | |
| 1|340|Loss: 5.056707382202148: 44%|████▍ | 341/776 [07:40<08:23, 1.16s/it] | |
| 1|341|Loss: 5.263746738433838: 44%|████▍ | 341/776 [07:40<08:23, 1.16s/it] | |
| 1|341|Loss: 5.263746738433838: 44%|████▍ | 342/776 [07:41<08:53, 1.23s/it] | |
| 1|342|Loss: 5.06340217590332: 44%|████▍ | 342/776 [07:41<08:53, 1.23s/it] | |
| 1|342|Loss: 5.06340217590332: 44%|████▍ | 343/776 [07:42<08:37, 1.20s/it] | |
| 1|343|Loss: 5.31987190246582: 44%|████▍ | 343/776 [07:42<08:37, 1.20s/it] | |
| 1|343|Loss: 5.31987190246582: 44%|████▍ | 344/776 [07:43<08:28, 1.18s/it] | |
| 1|344|Loss: 5.075689315795898: 44%|████▍ | 344/776 [07:43<08:28, 1.18s/it] | |
| 1|344|Loss: 5.075689315795898: 44%|████▍ | 345/776 [07:45<08:21, 1.16s/it] | |
| 1|345|Loss: 5.238735198974609: 44%|████▍ | 345/776 [07:45<08:21, 1.16s/it] | |
| 1|345|Loss: 5.238735198974609: 45%|████▍ | 346/776 [07:46<08:40, 1.21s/it] | |
| 1|346|Loss: 5.2498555183410645: 45%|████▍ | 346/776 [07:46<08:40, 1.21s/it] | |
| 1|346|Loss: 5.2498555183410645: 45%|████▍ | 347/776 [07:47<08:32, 1.19s/it] | |
| 1|347|Loss: 5.2396321296691895: 45%|████▍ | 347/776 [07:47<08:32, 1.19s/it] | |
| 1|347|Loss: 5.2396321296691895: 45%|████▍ | 348/776 [07:48<08:24, 1.18s/it] | |
| 1|348|Loss: 5.644989490509033: 45%|████▍ | 348/776 [07:48<08:24, 1.18s/it] | |
| 1|348|Loss: 5.644989490509033: 45%|████▍ | 349/776 [07:49<08:15, 1.16s/it] | |
| 1|349|Loss: 5.1132683753967285: 45%|████▍ | 349/776 [07:49<08:15, 1.16s/it] | |
| 1|349|Loss: 5.1132683753967285: 45%|████▌ | 350/776 [07:51<08:38, 1.22s/it] | |
| 1|350|Loss: 5.012974739074707: 45%|████▌ | 350/776 [07:51<08:38, 1.22s/it] | |
| 1|350|Loss: 5.012974739074707: 45%|████▌ | 351/776 [07:52<08:26, 1.19s/it] | |
| 1|351|Loss: 5.127974510192871: 45%|████▌ | 351/776 [07:52<08:26, 1.19s/it] | |
| 1|351|Loss: 5.127974510192871: 45%|████▌ | 352/776 [07:53<08:18, 1.18s/it] | |
| 1|352|Loss: 5.349217891693115: 45%|████▌ | 352/776 [07:53<08:18, 1.18s/it] | |
| 1|352|Loss: 5.349217891693115: 45%|████▌ | 353/776 [07:54<08:13, 1.17s/it] | |
| 1|353|Loss: 4.668721675872803: 45%|████▌ | 353/776 [07:54<08:13, 1.17s/it] | |
| 1|353|Loss: 4.668721675872803: 46%|████▌ | 354/776 [07:56<08:40, 1.23s/it] | |
| 1|354|Loss: 5.261246681213379: 46%|████▌ | 354/776 [07:56<08:40, 1.23s/it] | |
| 1|354|Loss: 5.261246681213379: 46%|████▌ | 355/776 [07:57<08:24, 1.20s/it] | |
| 1|355|Loss: 5.340883731842041: 46%|████▌ | 355/776 [07:57<08:24, 1.20s/it] | |
| 1|355|Loss: 5.340883731842041: 46%|████▌ | 356/776 [07:58<08:14, 1.18s/it] | |
| 1|356|Loss: 5.286664962768555: 46%|████▌ | 356/776 [07:58<08:14, 1.18s/it] | |
| 1|356|Loss: 5.286664962768555: 46%|████▌ | 357/776 [07:59<08:06, 1.16s/it] | |
| 1|357|Loss: 5.3656325340271: 46%|████▌ | 357/776 [07:59<08:06, 1.16s/it] | |
| 1|357|Loss: 5.3656325340271: 46%|████▌ | 358/776 [08:00<08:03, 1.16s/it] | |
| 1|358|Loss: 5.315868854522705: 46%|████▌ | 358/776 [08:00<08:03, 1.16s/it] | |
| 1|358|Loss: 5.315868854522705: 46%|████▋ | 359/776 [08:01<08:26, 1.21s/it] | |
| 1|359|Loss: 5.2990498542785645: 46%|████▋ | 359/776 [08:01<08:26, 1.21s/it] | |
| 1|359|Loss: 5.2990498542785645: 46%|████▋ | 360/776 [08:02<08:12, 1.18s/it] | |
| 1|360|Loss: 5.305074691772461: 46%|████▋ | 360/776 [08:02<08:12, 1.18s/it] | |
| 1|360|Loss: 5.305074691772461: 47%|████▋ | 361/776 [08:04<08:04, 1.17s/it] | |
| 1|361|Loss: 5.241093635559082: 47%|████▋ | 361/776 [08:04<08:04, 1.17s/it] | |
| 1|361|Loss: 5.241093635559082: 47%|████▋ | 362/776 [08:05<08:18, 1.20s/it] | |
| 1|362|Loss: 5.064663887023926: 47%|████▋ | 362/776 [08:05<08:18, 1.20s/it] | |
| 1|362|Loss: 5.064663887023926: 47%|████▋ | 363/776 [08:06<08:11, 1.19s/it] | |
| 1|363|Loss: 5.174687385559082: 47%|████▋ | 363/776 [08:06<08:11, 1.19s/it] | |
| 1|363|Loss: 5.174687385559082: 47%|████▋ | 364/776 [08:07<08:01, 1.17s/it] | |
| 1|364|Loss: 5.148858070373535: 47%|████▋ | 364/776 [08:07<08:01, 1.17s/it] | |
| 1|364|Loss: 5.148858070373535: 47%|████▋ | 365/776 [08:09<08:27, 1.23s/it] | |
| 1|365|Loss: 5.029953479766846: 47%|████▋ | 365/776 [08:09<08:27, 1.23s/it] | |
| 1|365|Loss: 5.029953479766846: 47%|████▋ | 366/776 [08:10<08:12, 1.20s/it] | |
| 1|366|Loss: 5.404118061065674: 47%|████▋ | 366/776 [08:10<08:12, 1.20s/it] | |
| 1|366|Loss: 5.404118061065674: 47%|████▋ | 367/776 [08:11<08:01, 1.18s/it] | |
| 1|367|Loss: 5.190275192260742: 47%|████▋ | 367/776 [08:11<08:01, 1.18s/it] | |
| 1|367|Loss: 5.190275192260742: 47%|████▋ | 368/776 [08:12<07:54, 1.16s/it] | |
| 1|368|Loss: 5.162330150604248: 47%|████▋ | 368/776 [08:12<07:54, 1.16s/it] | |
| 1|368|Loss: 5.162330150604248: 48%|████▊ | 369/776 [08:13<07:51, 1.16s/it] | |
| 1|369|Loss: 5.094810485839844: 48%|████▊ | 369/776 [08:13<07:51, 1.16s/it] | |
| 1|369|Loss: 5.094810485839844: 48%|████▊ | 370/776 [08:14<07:46, 1.15s/it] | |
| 1|370|Loss: 5.105665683746338: 48%|████▊ | 370/776 [08:14<07:46, 1.15s/it] | |
| 1|370|Loss: 5.105665683746338: 48%|████▊ | 371/776 [08:16<08:04, 1.20s/it] | |
| 1|371|Loss: 4.948776721954346: 48%|████▊ | 371/776 [08:16<08:04, 1.20s/it] | |
| 1|371|Loss: 4.948776721954346: 48%|████▊ | 372/776 [08:17<07:54, 1.17s/it] | |
| 1|372|Loss: 5.263126373291016: 48%|████▊ | 372/776 [08:17<07:54, 1.17s/it] | |
| 1|372|Loss: 5.263126373291016: 48%|████▊ | 373/776 [08:18<07:46, 1.16s/it] | |
| 1|373|Loss: 5.340749740600586: 48%|████▊ | 373/776 [08:18<07:46, 1.16s/it] | |
| 1|373|Loss: 5.340749740600586: 48%|████▊ | 374/776 [08:19<08:09, 1.22s/it] | |
| 1|374|Loss: 5.155880451202393: 48%|████▊ | 374/776 [08:19<08:09, 1.22s/it] | |
| 1|374|Loss: 5.155880451202393: 48%|████▊ | 375/776 [08:20<07:57, 1.19s/it] | |
| 1|375|Loss: 5.015707969665527: 48%|████▊ | 375/776 [08:20<07:57, 1.19s/it] | |
| 1|375|Loss: 5.015707969665527: 48%|████▊ | 376/776 [08:22<08:20, 1.25s/it] | |
| 1|376|Loss: 5.007016181945801: 48%|████▊ | 376/776 [08:22<08:20, 1.25s/it] | |
| 1|376|Loss: 5.007016181945801: 49%|████▊ | 377/776 [08:23<08:03, 1.21s/it] | |
| 1|377|Loss: 4.866985321044922: 49%|████▊ | 377/776 [08:23<08:03, 1.21s/it] | |
| 1|377|Loss: 4.866985321044922: 49%|████▊ | 378/776 [08:24<07:52, 1.19s/it] | |
| 1|378|Loss: 5.120359420776367: 49%|████▊ | 378/776 [08:24<07:52, 1.19s/it] | |
| 1|378|Loss: 5.120359420776367: 49%|████▉ | 379/776 [08:25<07:44, 1.17s/it] | |
| 1|379|Loss: 5.306872844696045: 49%|████▉ | 379/776 [08:25<07:44, 1.17s/it] | |
| 1|379|Loss: 5.306872844696045: 49%|████▉ | 380/776 [08:26<07:44, 1.17s/it] | |
| 1|380|Loss: 5.128311634063721: 49%|████▉ | 380/776 [08:26<07:44, 1.17s/it] | |
| 1|380|Loss: 5.128311634063721: 49%|████▉ | 381/776 [08:27<07:38, 1.16s/it] | |
| 1|381|Loss: 4.980410575866699: 49%|████▉ | 381/776 [08:27<07:38, 1.16s/it] | |
| 1|381|Loss: 4.980410575866699: 49%|████▉ | 382/776 [08:28<07:31, 1.15s/it] | |
| 1|382|Loss: 5.114475727081299: 49%|████▉ | 382/776 [08:28<07:31, 1.15s/it] | |
| 1|382|Loss: 5.114475727081299: 49%|████▉ | 383/776 [08:30<07:50, 1.20s/it] | |
| 1|383|Loss: 5.215021133422852: 49%|████▉ | 383/776 [08:30<07:50, 1.20s/it] | |
| 1|383|Loss: 5.215021133422852: 49%|████▉ | 384/776 [08:31<07:41, 1.18s/it] | |
| 1|384|Loss: 5.157972812652588: 49%|████▉ | 384/776 [08:31<07:41, 1.18s/it] | |
| 1|384|Loss: 5.157972812652588: 50%|████▉ | 385/776 [08:32<07:37, 1.17s/it] | |
| 1|385|Loss: 5.275562286376953: 50%|████▉ | 385/776 [08:32<07:37, 1.17s/it] | |
| 1|385|Loss: 5.275562286376953: 50%|████▉ | 386/776 [08:33<07:53, 1.21s/it] | |
| 1|386|Loss: 5.347341537475586: 50%|████▉ | 386/776 [08:33<07:53, 1.21s/it] | |
| 1|386|Loss: 5.347341537475586: 50%|████▉ | 387/776 [08:34<07:41, 1.19s/it] | |
| 1|387|Loss: 5.265030860900879: 50%|████▉ | 387/776 [08:34<07:41, 1.19s/it] | |
| 1|387|Loss: 5.265030860900879: 50%|█████ | 388/776 [08:36<08:06, 1.25s/it] | |
| 1|388|Loss: 5.118323802947998: 50%|█████ | 388/776 [08:36<08:06, 1.25s/it] | |
| 1|388|Loss: 5.118323802947998: 50%|█████ | 389/776 [08:37<07:50, 1.22s/it] | |
| 1|389|Loss: 5.187166213989258: 50%|█████ | 389/776 [08:37<07:50, 1.22s/it] | |
| 1|389|Loss: 5.187166213989258: 50%|█████ | 390/776 [08:38<07:40, 1.19s/it] | |
| 1|390|Loss: 5.54276704788208: 50%|█████ | 390/776 [08:38<07:40, 1.19s/it] | |
| 1|390|Loss: 5.54276704788208: 50%|█████ | 391/776 [08:39<07:32, 1.17s/it] | |
| 1|391|Loss: 5.194374084472656: 50%|█████ | 391/776 [08:39<07:32, 1.17s/it] | |
| 1|391|Loss: 5.194374084472656: 51%|█████ | 392/776 [08:40<07:24, 1.16s/it] | |
| 1|392|Loss: 5.070514678955078: 51%|█████ | 392/776 [08:40<07:24, 1.16s/it] | |
| 1|392|Loss: 5.070514678955078: 51%|█████ | 393/776 [08:42<07:19, 1.15s/it] | |
| 1|393|Loss: 5.059937953948975: 51%|█████ | 393/776 [08:42<07:19, 1.15s/it] | |
| 1|393|Loss: 5.059937953948975: 51%|█████ | 394/776 [08:43<07:16, 1.14s/it] | |
| 1|394|Loss: 5.2571635246276855: 51%|█████ | 394/776 [08:43<07:16, 1.14s/it] | |
| 1|394|Loss: 5.2571635246276855: 51%|█████ | 395/776 [08:44<07:41, 1.21s/it] | |
| 1|395|Loss: 5.13892936706543: 51%|█████ | 395/776 [08:44<07:41, 1.21s/it] | |
| 1|395|Loss: 5.13892936706543: 51%|█████ | 396/776 [08:45<07:29, 1.18s/it] | |
| 1|396|Loss: 4.920266151428223: 51%|█████ | 396/776 [08:45<07:29, 1.18s/it] | |
| 1|396|Loss: 4.920266151428223: 51%|█████ | 397/776 [08:46<07:23, 1.17s/it] | |
| 1|397|Loss: 5.343251705169678: 51%|█████ | 397/776 [08:46<07:23, 1.17s/it] | |
| 1|397|Loss: 5.343251705169678: 51%|█████▏ | 398/776 [08:48<07:37, 1.21s/it] | |
| 1|398|Loss: 4.985454082489014: 51%|█████▏ | 398/776 [08:48<07:37, 1.21s/it] | |
| 1|398|Loss: 4.985454082489014: 51%|█████▏ | 399/776 [08:49<07:29, 1.19s/it] | |
| 1|399|Loss: 5.482123851776123: 51%|█████▏ | 399/776 [08:49<07:29, 1.19s/it] | |
| 1|399|Loss: 5.482123851776123: 52%|█████▏ | 400/776 [08:50<07:55, 1.27s/it] | |
| 1|400|Loss: 4.657905578613281: 52%|█████▏ | 400/776 [08:50<07:55, 1.27s/it] | |
| 1|400|Loss: 4.657905578613281: 52%|█████▏ | 401/776 [08:51<07:38, 1.22s/it] | |
| 1|401|Loss: 5.21909236907959: 52%|█████▏ | 401/776 [08:51<07:38, 1.22s/it] | |
| 1|401|Loss: 5.21909236907959: 52%|█████▏ | 402/776 [08:52<07:26, 1.19s/it] | |
| 1|402|Loss: 5.398275375366211: 52%|█████▏ | 402/776 [08:52<07:26, 1.19s/it] | |
| 1|402|Loss: 5.398275375366211: 52%|█████▏ | 403/776 [08:54<07:17, 1.17s/it] | |
| 1|403|Loss: 5.341225624084473: 52%|█████▏ | 403/776 [08:54<07:17, 1.17s/it] | |
| 1|403|Loss: 5.341225624084473: 52%|█████▏ | 404/776 [08:55<07:10, 1.16s/it] | |
| 1|404|Loss: 5.161622047424316: 52%|█████▏ | 404/776 [08:55<07:10, 1.16s/it] | |
| 1|404|Loss: 5.161622047424316: 52%|█████▏ | 405/776 [08:56<07:05, 1.15s/it] | |
| 1|405|Loss: 5.096367835998535: 52%|█████▏ | 405/776 [08:56<07:05, 1.15s/it] | |
| 1|405|Loss: 5.096367835998535: 52%|█████▏ | 406/776 [08:57<07:02, 1.14s/it] | |
| 1|406|Loss: 4.878658771514893: 52%|█████▏ | 406/776 [08:57<07:02, 1.14s/it] | |
| 1|406|Loss: 4.878658771514893: 52%|█████▏ | 407/776 [08:58<07:23, 1.20s/it] | |
| 1|407|Loss: 5.031930923461914: 52%|█████▏ | 407/776 [08:58<07:23, 1.20s/it] | |
| 1|407|Loss: 5.031930923461914: 53%|█████▎ | 408/776 [08:59<07:13, 1.18s/it] | |
| 1|408|Loss: 5.054767608642578: 53%|█████▎ | 408/776 [08:59<07:13, 1.18s/it] | |
| 1|408|Loss: 5.054767608642578: 53%|█████▎ | 409/776 [09:01<07:05, 1.16s/it] | |
| 1|409|Loss: 5.270864009857178: 53%|█████▎ | 409/776 [09:01<07:05, 1.16s/it] | |
| 1|409|Loss: 5.270864009857178: 53%|█████▎ | 410/776 [09:02<07:21, 1.21s/it] | |
| 1|410|Loss: 5.399866104125977: 53%|█████▎ | 410/776 [09:02<07:21, 1.21s/it] | |
| 1|410|Loss: 5.399866104125977: 53%|█████▎ | 411/776 [09:03<07:15, 1.19s/it] | |
| 1|411|Loss: 5.142045021057129: 53%|█████▎ | 411/776 [09:03<07:15, 1.19s/it] | |
| 1|411|Loss: 5.142045021057129: 53%|█████▎ | 412/776 [09:04<07:35, 1.25s/it] | |
| 1|412|Loss: 5.491016864776611: 53%|█████▎ | 412/776 [09:04<07:35, 1.25s/it] | |
| 1|412|Loss: 5.491016864776611: 53%|█████▎ | 413/776 [09:05<07:19, 1.21s/it] | |
| 1|413|Loss: 4.906639099121094: 53%|█████▎ | 413/776 [09:05<07:19, 1.21s/it] | |
| 1|413|Loss: 4.906639099121094: 53%|█████▎ | 414/776 [09:07<07:07, 1.18s/it] | |
| 1|414|Loss: 5.244810581207275: 53%|█████▎ | 414/776 [09:07<07:07, 1.18s/it] | |
| 1|414|Loss: 5.244810581207275: 53%|█████▎ | 415/776 [09:08<07:00, 1.16s/it] | |
| 1|415|Loss: 5.008533954620361: 53%|█████▎ | 415/776 [09:08<07:00, 1.16s/it] | |
| 1|415|Loss: 5.008533954620361: 54%|█████▎ | 416/776 [09:09<06:53, 1.15s/it] | |
| 1|416|Loss: 5.457255840301514: 54%|█████▎ | 416/776 [09:09<06:53, 1.15s/it] | |
| 1|416|Loss: 5.457255840301514: 54%|█████▎ | 417/776 [09:10<06:51, 1.15s/it] | |
| 1|417|Loss: 5.411457538604736: 54%|█████▎ | 417/776 [09:10<06:51, 1.15s/it] | |
| 1|417|Loss: 5.411457538604736: 54%|█████▍ | 418/776 [09:11<06:49, 1.14s/it] | |
| 1|418|Loss: 5.286209583282471: 54%|█████▍ | 418/776 [09:11<06:49, 1.14s/it] | |
| 1|418|Loss: 5.286209583282471: 54%|█████▍ | 419/776 [09:12<06:45, 1.14s/it] | |
| 1|419|Loss: 5.029977321624756: 54%|█████▍ | 419/776 [09:12<06:45, 1.14s/it] | |
| 1|419|Loss: 5.029977321624756: 54%|█████▍ | 420/776 [09:14<07:04, 1.19s/it] | |
| 1|420|Loss: 5.224172115325928: 54%|█████▍ | 420/776 [09:14<07:04, 1.19s/it] | |
| 1|420|Loss: 5.224172115325928: 54%|█████▍ | 421/776 [09:15<06:55, 1.17s/it] | |
| 1|421|Loss: 4.978454113006592: 54%|█████▍ | 421/776 [09:15<06:55, 1.17s/it] | |
| 1|421|Loss: 4.978454113006592: 54%|█████▍ | 422/776 [09:16<07:17, 1.24s/it] | |
| 1|422|Loss: 5.067971706390381: 54%|█████▍ | 422/776 [09:16<07:17, 1.24s/it] | |
| 1|422|Loss: 5.067971706390381: 55%|█████▍ | 423/776 [09:17<07:32, 1.28s/it] | |
| 1|423|Loss: 5.177703380584717: 55%|█████▍ | 423/776 [09:17<07:32, 1.28s/it] | |
| 1|423|Loss: 5.177703380584717: 55%|█████▍ | 424/776 [09:19<07:15, 1.24s/it] | |
| 1|424|Loss: 5.513968467712402: 55%|█████▍ | 424/776 [09:19<07:15, 1.24s/it] | |
| 1|424|Loss: 5.513968467712402: 55%|█████▍ | 425/776 [09:20<07:01, 1.20s/it] | |
| 1|425|Loss: 5.208603858947754: 55%|█████▍ | 425/776 [09:20<07:01, 1.20s/it] | |
| 1|425|Loss: 5.208603858947754: 55%|█████▍ | 426/776 [09:21<06:53, 1.18s/it] | |
| 1|426|Loss: 5.093612194061279: 55%|█████▍ | 426/776 [09:21<06:53, 1.18s/it] | |
| 1|426|Loss: 5.093612194061279: 55%|█████▌ | 427/776 [09:22<06:48, 1.17s/it] | |
| 1|427|Loss: 5.273745536804199: 55%|█████▌ | 427/776 [09:22<06:48, 1.17s/it] | |
| 1|427|Loss: 5.273745536804199: 55%|█████▌ | 428/776 [09:23<06:42, 1.16s/it] | |
| 1|428|Loss: 5.154370307922363: 55%|█████▌ | 428/776 [09:23<06:42, 1.16s/it] | |
| 1|428|Loss: 5.154370307922363: 55%|█████▌ | 429/776 [09:24<06:36, 1.14s/it] | |
| 1|429|Loss: 5.166691303253174: 55%|█████▌ | 429/776 [09:24<06:36, 1.14s/it] | |
| 1|429|Loss: 5.166691303253174: 55%|█████▌ | 430/776 [09:25<06:33, 1.14s/it] | |
| 1|430|Loss: 5.109349250793457: 55%|█████▌ | 430/776 [09:25<06:33, 1.14s/it] | |
| 1|430|Loss: 5.109349250793457: 56%|█████▌ | 431/776 [09:26<06:31, 1.13s/it] | |
| 1|431|Loss: 4.990408897399902: 56%|█████▌ | 431/776 [09:26<06:31, 1.13s/it] | |
| 1|431|Loss: 4.990408897399902: 56%|█████▌ | 432/776 [09:28<06:50, 1.19s/it] | |
| 1|432|Loss: 5.17957067489624: 56%|█████▌ | 432/776 [09:28<06:50, 1.19s/it] | |
| 1|432|Loss: 5.17957067489624: 56%|█████▌ | 433/776 [09:29<06:43, 1.18s/it] | |
| 1|433|Loss: 5.305391788482666: 56%|█████▌ | 433/776 [09:29<06:43, 1.18s/it] | |
| 1|433|Loss: 5.305391788482666: 56%|█████▌ | 434/776 [09:31<07:27, 1.31s/it] | |
| 1|434|Loss: 5.144301891326904: 56%|█████▌ | 434/776 [09:31<07:27, 1.31s/it] | |
| 1|434|Loss: 5.144301891326904: 56%|█████▌ | 435/776 [09:32<07:09, 1.26s/it] | |
| 1|435|Loss: 5.1769280433654785: 56%|█████▌ | 435/776 [09:32<07:09, 1.26s/it] | |
| 1|435|Loss: 5.1769280433654785: 56%|█████▌ | 436/776 [09:33<06:54, 1.22s/it] | |
| 1|436|Loss: 5.019280910491943: 56%|█████▌ | 436/776 [09:33<06:54, 1.22s/it] | |
| 1|436|Loss: 5.019280910491943: 56%|█████▋ | 437/776 [09:34<06:43, 1.19s/it] | |
| 1|437|Loss: 4.903015613555908: 56%|█████▋ | 437/776 [09:34<06:43, 1.19s/it] | |
| 1|437|Loss: 4.903015613555908: 56%|█████▋ | 438/776 [09:35<06:37, 1.18s/it] | |
| 1|438|Loss: 5.201685905456543: 56%|█████▋ | 438/776 [09:35<06:37, 1.18s/it] | |
| 1|438|Loss: 5.201685905456543: 57%|█████▋ | 439/776 [09:36<06:31, 1.16s/it] | |
| 1|439|Loss: 5.172906875610352: 57%|█████▋ | 439/776 [09:36<06:31, 1.16s/it] | |
| 1|439|Loss: 5.172906875610352: 57%|█████▋ | 440/776 [09:37<06:26, 1.15s/it] | |
| 1|440|Loss: 5.196547985076904: 57%|█████▋ | 440/776 [09:37<06:26, 1.15s/it] | |
| 1|440|Loss: 5.196547985076904: 57%|█████▋ | 441/776 [09:38<06:22, 1.14s/it] | |
| 1|441|Loss: 4.936736583709717: 57%|█████▋ | 441/776 [09:38<06:22, 1.14s/it] | |
| 1|441|Loss: 4.936736583709717: 57%|█████▋ | 442/776 [09:40<06:19, 1.14s/it] | |
| 1|442|Loss: 5.591087818145752: 57%|█████▋ | 442/776 [09:40<06:19, 1.14s/it] | |
| 1|442|Loss: 5.591087818145752: 57%|█████▋ | 443/776 [09:41<06:16, 1.13s/it] | |
| 1|443|Loss: 5.120035171508789: 57%|█████▋ | 443/776 [09:41<06:16, 1.13s/it] | |
| 1|443|Loss: 5.120035171508789: 57%|█████▋ | 444/776 [09:42<06:36, 1.20s/it] | |
| 1|444|Loss: 5.3612895011901855: 57%|█████▋ | 444/776 [09:42<06:36, 1.20s/it] | |
| 1|444|Loss: 5.3612895011901855: 57%|█████▋ | 445/776 [09:43<06:28, 1.17s/it] | |
| 1|445|Loss: 5.297667980194092: 57%|█████▋ | 445/776 [09:43<06:28, 1.17s/it] | |
| 1|445|Loss: 5.297667980194092: 57%|█████▋ | 446/776 [09:45<07:07, 1.30s/it] | |
| 1|446|Loss: 4.688755989074707: 57%|█████▋ | 446/776 [09:45<07:07, 1.30s/it] | |
| 1|446|Loss: 4.688755989074707: 58%|█████▊ | 447/776 [09:46<06:49, 1.24s/it] | |
| 1|447|Loss: 4.889859199523926: 58%|█████▊ | 447/776 [09:46<06:49, 1.24s/it] | |
| 1|447|Loss: 4.889859199523926: 58%|█████▊ | 448/776 [09:47<06:40, 1.22s/it] | |
| 1|448|Loss: 5.192228317260742: 58%|█████▊ | 448/776 [09:47<06:40, 1.22s/it] | |
| 1|448|Loss: 5.192228317260742: 58%|█████▊ | 449/776 [09:48<06:30, 1.19s/it] | |
| 1|449|Loss: 5.042709827423096: 58%|█████▊ | 449/776 [09:48<06:30, 1.19s/it] | |
| 1|449|Loss: 5.042709827423096: 58%|█████▊ | 450/776 [09:49<06:24, 1.18s/it] | |
| 1|450|Loss: 5.052964210510254: 58%|█████▊ | 450/776 [09:49<06:24, 1.18s/it] | |
| 1|450|Loss: 5.052964210510254: 58%|█████▊ | 451/776 [09:50<06:17, 1.16s/it] | |
| 1|451|Loss: 5.413520812988281: 58%|█████▊ | 451/776 [09:50<06:17, 1.16s/it] | |
| 1|451|Loss: 5.413520812988281: 58%|█████▊ | 452/776 [09:52<06:12, 1.15s/it] | |
| 1|452|Loss: 4.890717506408691: 58%|█████▊ | 452/776 [09:52<06:12, 1.15s/it] | |
| 1|452|Loss: 4.890717506408691: 58%|█████▊ | 453/776 [09:53<06:08, 1.14s/it] | |
| 1|453|Loss: 4.736657619476318: 58%|█████▊ | 453/776 [09:53<06:08, 1.14s/it] | |
| 1|453|Loss: 4.736657619476318: 59%|█████▊ | 454/776 [09:54<06:05, 1.14s/it] | |
| 1|454|Loss: 5.113840103149414: 59%|█████▊ | 454/776 [09:54<06:05, 1.14s/it] | |
| 1|454|Loss: 5.113840103149414: 59%|█████▊ | 455/776 [09:55<06:04, 1.14s/it] | |
| 1|455|Loss: 5.090460300445557: 59%|█████▊ | 455/776 [09:55<06:04, 1.14s/it] | |
| 1|455|Loss: 5.090460300445557: 59%|█████▉ | 456/776 [09:56<06:22, 1.20s/it] | |
| 1|456|Loss: 4.926825046539307: 59%|█████▉ | 456/776 [09:56<06:22, 1.20s/it] | |
| 1|456|Loss: 4.926825046539307: 59%|█████▉ | 457/776 [09:57<06:14, 1.17s/it] | |
| 1|457|Loss: 5.013434410095215: 59%|█████▉ | 457/776 [09:57<06:14, 1.17s/it] | |
| 1|457|Loss: 5.013434410095215: 59%|█████▉ | 458/776 [09:59<06:58, 1.32s/it] | |
| 1|458|Loss: 5.242410659790039: 59%|█████▉ | 458/776 [09:59<06:58, 1.32s/it] | |
| 1|458|Loss: 5.242410659790039: 59%|█████▉ | 459/776 [10:00<06:39, 1.26s/it] | |
| 1|459|Loss: 5.1807708740234375: 59%|█████▉ | 459/776 [10:00<06:39, 1.26s/it] | |
| 1|459|Loss: 5.1807708740234375: 59%|█████▉ | 460/776 [10:01<06:24, 1.22s/it] | |
| 1|460|Loss: 5.245110988616943: 59%|█████▉ | 460/776 [10:01<06:24, 1.22s/it] | |
| 1|460|Loss: 5.245110988616943: 59%|█████▉ | 461/776 [10:02<06:14, 1.19s/it] | |
| 1|461|Loss: 5.549051761627197: 59%|█████▉ | 461/776 [10:02<06:14, 1.19s/it] | |
| 1|461|Loss: 5.549051761627197: 60%|█████▉ | 462/776 [10:04<06:05, 1.17s/it] | |
| 1|462|Loss: 5.235245704650879: 60%|█████▉ | 462/776 [10:04<06:05, 1.17s/it] | |
| 1|462|Loss: 5.235245704650879: 60%|█████▉ | 463/776 [10:05<06:00, 1.15s/it] | |
| 1|463|Loss: 4.995019435882568: 60%|█████▉ | 463/776 [10:05<06:00, 1.15s/it] | |
| 1|463|Loss: 4.995019435882568: 60%|█████▉ | 464/776 [10:06<05:56, 1.14s/it] | |
| 1|464|Loss: 5.278835773468018: 60%|█████▉ | 464/776 [10:06<05:56, 1.14s/it] | |
| 1|464|Loss: 5.278835773468018: 60%|█████▉ | 465/776 [10:07<05:53, 1.14s/it] | |
| 1|465|Loss: 5.192996025085449: 60%|█████▉ | 465/776 [10:07<05:53, 1.14s/it] | |
| 1|465|Loss: 5.192996025085449: 60%|██████ | 466/776 [10:08<05:53, 1.14s/it] | |
| 1|466|Loss: 5.062240123748779: 60%|██████ | 466/776 [10:08<05:53, 1.14s/it] | |
| 1|466|Loss: 5.062240123748779: 60%|██████ | 467/776 [10:09<05:51, 1.14s/it] | |
| 1|467|Loss: 5.110442638397217: 60%|██████ | 467/776 [10:09<05:51, 1.14s/it] | |
| 1|467|Loss: 5.110442638397217: 60%|██████ | 468/776 [10:10<06:05, 1.19s/it] | |
| 1|468|Loss: 5.351837158203125: 60%|██████ | 468/776 [10:10<06:05, 1.19s/it] | |
| 1|468|Loss: 5.351837158203125: 60%|██████ | 469/776 [10:12<05:57, 1.17s/it] | |
| 1|469|Loss: 5.283453464508057: 60%|██████ | 469/776 [10:12<05:57, 1.17s/it] | |
| 1|469|Loss: 5.283453464508057: 61%|██████ | 470/776 [10:13<06:35, 1.29s/it] | |
| 1|470|Loss: 5.1789326667785645: 61%|██████ | 470/776 [10:13<06:35, 1.29s/it] | |
| 1|470|Loss: 5.1789326667785645: 61%|██████ | 471/776 [10:14<06:18, 1.24s/it] | |
| 1|471|Loss: 5.24965238571167: 61%|██████ | 471/776 [10:14<06:18, 1.24s/it] | |
| 1|471|Loss: 5.24965238571167: 61%|██████ | 472/776 [10:15<06:06, 1.21s/it] | |
| 1|472|Loss: 5.097524166107178: 61%|██████ | 472/776 [10:15<06:06, 1.21s/it] | |
| 1|472|Loss: 5.097524166107178: 61%|██████ | 473/776 [10:17<05:57, 1.18s/it] | |
| 1|473|Loss: 5.578862190246582: 61%|██████ | 473/776 [10:17<05:57, 1.18s/it] | |
| 1|473|Loss: 5.578862190246582: 61%|██████ | 474/776 [10:18<05:52, 1.17s/it] | |
| 1|474|Loss: 5.332281112670898: 61%|██████ | 474/776 [10:18<05:52, 1.17s/it] | |
| 1|474|Loss: 5.332281112670898: 61%|██████ | 475/776 [10:19<05:47, 1.15s/it] | |
| 1|475|Loss: 5.289259433746338: 61%|██████ | 475/776 [10:19<05:47, 1.15s/it] | |
| 1|475|Loss: 5.289259433746338: 61%|██████▏ | 476/776 [10:20<05:44, 1.15s/it] | |
| 1|476|Loss: 4.735734939575195: 61%|██████▏ | 476/776 [10:20<05:44, 1.15s/it] | |
| 1|476|Loss: 4.735734939575195: 61%|██████▏ | 477/776 [10:21<05:43, 1.15s/it] | |
| 1|477|Loss: 5.401426792144775: 61%|██████▏ | 477/776 [10:21<05:43, 1.15s/it] | |
| 1|477|Loss: 5.401426792144775: 62%|██████▏ | 478/776 [10:22<05:41, 1.14s/it] | |
| 1|478|Loss: 5.19476842880249: 62%|██████▏ | 478/776 [10:22<05:41, 1.14s/it] | |
| 1|478|Loss: 5.19476842880249: 62%|██████▏ | 479/776 [10:23<05:37, 1.14s/it] | |
| 1|479|Loss: 5.001743316650391: 62%|██████▏ | 479/776 [10:23<05:37, 1.14s/it] | |
| 1|479|Loss: 5.001743316650391: 62%|██████▏ | 480/776 [10:24<05:35, 1.13s/it] | |
| 1|480|Loss: 5.37288236618042: 62%|██████▏ | 480/776 [10:24<05:35, 1.13s/it] | |
| 1|480|Loss: 5.37288236618042: 62%|██████▏ | 481/776 [10:26<06:16, 1.27s/it] | |
| 1|481|Loss: 4.761322498321533: 62%|██████▏ | 481/776 [10:26<06:16, 1.27s/it] | |
| 1|481|Loss: 4.761322498321533: 62%|██████▏ | 482/776 [10:27<06:17, 1.28s/it] | |
| 1|482|Loss: 5.134120464324951: 62%|██████▏ | 482/776 [10:27<06:17, 1.28s/it] | |
| 1|482|Loss: 5.134120464324951: 62%|██████▏ | 483/776 [10:29<06:01, 1.23s/it] | |
| 1|483|Loss: 5.098062515258789: 62%|██████▏ | 483/776 [10:29<06:01, 1.23s/it] | |
| 1|483|Loss: 5.098062515258789: 62%|██████▏ | 484/776 [10:30<05:50, 1.20s/it] | |
| 1|484|Loss: 5.355414867401123: 62%|██████▏ | 484/776 [10:30<05:50, 1.20s/it] | |
| 1|484|Loss: 5.355414867401123: 62%|██████▎ | 485/776 [10:31<05:42, 1.18s/it] | |
| 1|485|Loss: 5.0559821128845215: 62%|██████▎ | 485/776 [10:31<05:42, 1.18s/it] | |
| 1|485|Loss: 5.0559821128845215: 63%|██████▎ | 486/776 [10:32<05:38, 1.17s/it] | |
| 1|486|Loss: 5.159172058105469: 63%|██████▎ | 486/776 [10:32<05:38, 1.17s/it] | |
| 1|486|Loss: 5.159172058105469: 63%|██████▎ | 487/776 [10:33<05:35, 1.16s/it] | |
| 1|487|Loss: 5.630231857299805: 63%|██████▎ | 487/776 [10:33<05:35, 1.16s/it] | |
| 1|487|Loss: 5.630231857299805: 63%|██████▎ | 488/776 [10:34<05:30, 1.15s/it] | |
| 1|488|Loss: 5.390827655792236: 63%|██████▎ | 488/776 [10:34<05:30, 1.15s/it] | |
| 1|488|Loss: 5.390827655792236: 63%|██████▎ | 489/776 [10:35<05:28, 1.14s/it] | |
| 1|489|Loss: 5.1869378089904785: 63%|██████▎ | 489/776 [10:35<05:28, 1.14s/it] | |
| 1|489|Loss: 5.1869378089904785: 63%|██████▎ | 490/776 [10:36<05:25, 1.14s/it] | |
| 1|490|Loss: 5.137551784515381: 63%|██████▎ | 490/776 [10:36<05:25, 1.14s/it] | |
| 1|490|Loss: 5.137551784515381: 63%|██████▎ | 491/776 [10:38<05:23, 1.13s/it] | |
| 1|491|Loss: 5.358400344848633: 63%|██████▎ | 491/776 [10:38<05:23, 1.13s/it] | |
| 1|491|Loss: 5.358400344848633: 63%|██████▎ | 492/776 [10:39<05:44, 1.21s/it] | |
| 1|492|Loss: 5.547699928283691: 63%|██████▎ | 492/776 [10:39<05:44, 1.21s/it] | |
| 1|492|Loss: 5.547699928283691: 64%|██████▎ | 493/776 [10:40<05:54, 1.25s/it] | |
| 1|493|Loss: 4.992790222167969: 64%|██████▎ | 493/776 [10:40<05:54, 1.25s/it] | |
| 1|493|Loss: 4.992790222167969: 64%|██████▎ | 494/776 [10:42<06:01, 1.28s/it] | |
| 1|494|Loss: 5.357957363128662: 64%|██████▎ | 494/776 [10:42<06:01, 1.28s/it] | |
| 1|494|Loss: 5.357957363128662: 64%|██████▍ | 495/776 [10:43<05:49, 1.24s/it] | |
| 1|495|Loss: 5.302463531494141: 64%|██████▍ | 495/776 [10:43<05:49, 1.24s/it] | |
| 1|495|Loss: 5.302463531494141: 64%|██████▍ | 496/776 [10:44<05:38, 1.21s/it] | |
| 1|496|Loss: 5.190265655517578: 64%|██████▍ | 496/776 [10:44<05:38, 1.21s/it] | |
| 1|496|Loss: 5.190265655517578: 64%|██████▍ | 497/776 [10:45<05:32, 1.19s/it] | |
| 1|497|Loss: 5.385220527648926: 64%|██████▍ | 497/776 [10:45<05:32, 1.19s/it] | |
| 1|497|Loss: 5.385220527648926: 64%|██████▍ | 498/776 [10:46<05:27, 1.18s/it] | |
| 1|498|Loss: 5.129104137420654: 64%|██████▍ | 498/776 [10:46<05:27, 1.18s/it] | |
| 1|498|Loss: 5.129104137420654: 64%|██████▍ | 499/776 [10:47<05:21, 1.16s/it] | |
| 1|499|Loss: 5.109299659729004: 64%|██████▍ | 499/776 [10:47<05:21, 1.16s/it] | |
| 1|499|Loss: 5.109299659729004: 64%|██████▍ | 500/776 [10:48<05:17, 1.15s/it] | |
| 1|500|Loss: 5.095767021179199: 64%|██████▍ | 500/776 [10:48<05:17, 1.15s/it] | |
| 1|500|Loss: 5.095767021179199: 65%|██████▍ | 501/776 [10:50<05:14, 1.14s/it] | |
| 1|501|Loss: 4.856930255889893: 65%|██████▍ | 501/776 [10:50<05:14, 1.14s/it] | |
| 1|501|Loss: 4.856930255889893: 65%|██████▍ | 502/776 [10:51<05:13, 1.14s/it] | |
| 1|502|Loss: 4.654025077819824: 65%|██████▍ | 502/776 [10:51<05:13, 1.14s/it] | |
| 1|502|Loss: 4.654025077819824: 65%|██████▍ | 503/776 [10:52<05:10, 1.14s/it] | |
| 1|503|Loss: 5.153345108032227: 65%|██████▍ | 503/776 [10:52<05:10, 1.14s/it] | |
| 1|503|Loss: 5.153345108032227: 65%|██████▍ | 504/776 [10:53<05:30, 1.22s/it] | |
| 1|504|Loss: 5.154435157775879: 65%|██████▍ | 504/776 [10:53<05:30, 1.22s/it] | |
| 1|504|Loss: 5.154435157775879: 65%|██████▌ | 505/776 [10:55<05:37, 1.25s/it] | |
| 1|505|Loss: 5.079214096069336: 65%|██████▌ | 505/776 [10:55<05:37, 1.25s/it] | |
| 1|505|Loss: 5.079214096069336: 65%|██████▌ | 506/776 [10:56<05:43, 1.27s/it] | |
| 1|506|Loss: 5.0171918869018555: 65%|██████▌ | 506/776 [10:56<05:43, 1.27s/it] | |
| 1|506|Loss: 5.0171918869018555: 65%|██████▌ | 507/776 [10:57<05:32, 1.24s/it] | |
| 1|507|Loss: 5.288158893585205: 65%|██████▌ | 507/776 [10:57<05:32, 1.24s/it] | |
| 1|507|Loss: 5.288158893585205: 65%|██████▌ | 508/776 [10:58<05:22, 1.20s/it] | |
| 1|508|Loss: 5.115662574768066: 65%|██████▌ | 508/776 [10:58<05:22, 1.20s/it] | |
| 1|508|Loss: 5.115662574768066: 66%|██████▌ | 509/776 [10:59<05:16, 1.18s/it] | |
| 1|509|Loss: 5.207104206085205: 66%|██████▌ | 509/776 [10:59<05:16, 1.18s/it] | |
| 1|509|Loss: 5.207104206085205: 66%|██████▌ | 510/776 [11:00<05:13, 1.18s/it] | |
| 1|510|Loss: 4.99180269241333: 66%|██████▌ | 510/776 [11:00<05:13, 1.18s/it] | |
| 1|510|Loss: 4.99180269241333: 66%|██████▌ | 511/776 [11:02<05:07, 1.16s/it] | |
| 1|511|Loss: 5.203444004058838: 66%|██████▌ | 511/776 [11:02<05:07, 1.16s/it] | |
| 1|511|Loss: 5.203444004058838: 66%|██████▌ | 512/776 [11:03<05:02, 1.14s/it] | |
| 1|512|Loss: 5.010587215423584: 66%|██████▌ | 512/776 [11:03<05:02, 1.14s/it] | |
| 1|512|Loss: 5.010587215423584: 66%|██████▌ | 513/776 [11:04<04:59, 1.14s/it] | |
| 1|513|Loss: 5.185308456420898: 66%|██████▌ | 513/776 [11:04<04:59, 1.14s/it] | |
| 1|513|Loss: 5.185308456420898: 66%|██████▌ | 514/776 [11:05<04:58, 1.14s/it] | |
| 1|514|Loss: 5.239168167114258: 66%|██████▌ | 514/776 [11:05<04:58, 1.14s/it] | |
| 1|514|Loss: 5.239168167114258: 66%|██████▋ | 515/776 [11:06<04:57, 1.14s/it] | |
| 1|515|Loss: 5.371617317199707: 66%|██████▋ | 515/776 [11:06<04:57, 1.14s/it] | |
| 1|515|Loss: 5.371617317199707: 66%|██████▋ | 516/776 [11:07<05:14, 1.21s/it] | |
| 1|516|Loss: 5.868690013885498: 66%|██████▋ | 516/776 [11:07<05:14, 1.21s/it] | |
| 1|516|Loss: 5.868690013885498: 67%|██████▋ | 517/776 [11:09<05:21, 1.24s/it] | |
| 1|517|Loss: 5.550715446472168: 67%|██████▋ | 517/776 [11:09<05:21, 1.24s/it] | |
| 1|517|Loss: 5.550715446472168: 67%|██████▋ | 518/776 [11:10<05:29, 1.28s/it] | |
| 1|518|Loss: 5.544602870941162: 67%|██████▋ | 518/776 [11:10<05:29, 1.28s/it] | |
| 1|518|Loss: 5.544602870941162: 67%|██████▋ | 519/776 [11:11<05:16, 1.23s/it] | |
| 1|519|Loss: 5.563851356506348: 67%|██████▋ | 519/776 [11:11<05:16, 1.23s/it] | |
| 1|519|Loss: 5.563851356506348: 67%|██████▋ | 520/776 [11:12<05:08, 1.21s/it] | |
| 1|520|Loss: 5.38159704208374: 67%|██████▋ | 520/776 [11:12<05:08, 1.21s/it] | |
| 1|520|Loss: 5.38159704208374: 67%|██████▋ | 521/776 [11:14<05:01, 1.18s/it] | |
| 1|521|Loss: 5.429404258728027: 67%|██████▋ | 521/776 [11:14<05:01, 1.18s/it] | |
| 1|521|Loss: 5.429404258728027: 67%|██████▋ | 522/776 [11:15<04:55, 1.16s/it] | |
| 1|522|Loss: 5.376346111297607: 67%|██████▋ | 522/776 [11:15<04:55, 1.16s/it] | |
| 1|522|Loss: 5.376346111297607: 67%|██████▋ | 523/776 [11:16<04:50, 1.15s/it] | |
| 1|523|Loss: 5.401008129119873: 67%|██████▋ | 523/776 [11:16<04:50, 1.15s/it] | |
| 1|523|Loss: 5.401008129119873: 68%|██████▊ | 524/776 [11:17<04:49, 1.15s/it] | |
| 1|524|Loss: 5.219633102416992: 68%|██████▊ | 524/776 [11:17<04:49, 1.15s/it] | |
| 1|524|Loss: 5.219633102416992: 68%|██████▊ | 525/776 [11:18<04:48, 1.15s/it] | |
| 1|525|Loss: 4.826515197753906: 68%|██████▊ | 525/776 [11:18<04:48, 1.15s/it] | |
| 1|525|Loss: 4.826515197753906: 68%|██████▊ | 526/776 [11:19<04:45, 1.14s/it] | |
| 1|526|Loss: 5.2579827308654785: 68%|██████▊ | 526/776 [11:19<04:45, 1.14s/it] | |
| 1|526|Loss: 5.2579827308654785: 68%|██████▊ | 527/776 [11:20<04:43, 1.14s/it] | |
| 1|527|Loss: 4.985126495361328: 68%|██████▊ | 527/776 [11:20<04:43, 1.14s/it] | |
| 1|527|Loss: 4.985126495361328: 68%|██████▊ | 528/776 [11:22<05:04, 1.23s/it] | |
| 1|528|Loss: 5.282034873962402: 68%|██████▊ | 528/776 [11:22<05:04, 1.23s/it] | |
| 1|528|Loss: 5.282034873962402: 68%|██████▊ | 529/776 [11:23<05:14, 1.27s/it] | |
| 1|529|Loss: 5.462903022766113: 68%|██████▊ | 529/776 [11:23<05:14, 1.27s/it] | |
| 1|529|Loss: 5.462903022766113: 68%|██████▊ | 530/776 [11:24<05:16, 1.29s/it] | |
| 1|530|Loss: 5.058537483215332: 68%|██████▊ | 530/776 [11:24<05:16, 1.29s/it] | |
| 1|530|Loss: 5.058537483215332: 68%|██████▊ | 531/776 [11:26<05:03, 1.24s/it] | |
| 1|531|Loss: 5.373104095458984: 68%|██████▊ | 531/776 [11:26<05:03, 1.24s/it] | |
| 1|531|Loss: 5.373104095458984: 69%|██████▊ | 532/776 [11:27<04:53, 1.20s/it] | |
| 1|532|Loss: 5.43235445022583: 69%|██████▊ | 532/776 [11:27<04:53, 1.20s/it] | |
| 1|532|Loss: 5.43235445022583: 69%|██████▊ | 533/776 [11:28<04:49, 1.19s/it] | |
| 1|533|Loss: 5.166787624359131: 69%|██████▊ | 533/776 [11:28<04:49, 1.19s/it] | |
| 1|533|Loss: 5.166787624359131: 69%|██████▉ | 534/776 [11:29<04:44, 1.18s/it] | |
| 1|534|Loss: 5.012847900390625: 69%|██████▉ | 534/776 [11:29<04:44, 1.18s/it] | |
| 1|534|Loss: 5.012847900390625: 69%|██████▉ | 535/776 [11:30<04:41, 1.17s/it] | |
| 1|535|Loss: 5.019012928009033: 69%|██████▉ | 535/776 [11:30<04:41, 1.17s/it] | |
| 1|535|Loss: 5.019012928009033: 69%|██████▉ | 536/776 [11:31<04:36, 1.15s/it] | |
| 1|536|Loss: 4.762948513031006: 69%|██████▉ | 536/776 [11:31<04:36, 1.15s/it] | |
| 1|536|Loss: 4.762948513031006: 69%|██████▉ | 537/776 [11:32<04:34, 1.15s/it] | |
| 1|537|Loss: 5.184920310974121: 69%|██████▉ | 537/776 [11:32<04:34, 1.15s/it] | |
| 1|537|Loss: 5.184920310974121: 69%|██████▉ | 538/776 [11:34<04:31, 1.14s/it] | |
| 1|538|Loss: 5.1850690841674805: 69%|██████▉ | 538/776 [11:34<04:31, 1.14s/it] | |
| 1|538|Loss: 5.1850690841674805: 69%|██████▉ | 539/776 [11:35<04:48, 1.22s/it] | |
| 1|539|Loss: 5.21252965927124: 69%|██████▉ | 539/776 [11:35<04:48, 1.22s/it] | |
| 1|539|Loss: 5.21252965927124: 70%|██████▉ | 540/776 [11:36<04:41, 1.19s/it] | |
| 1|540|Loss: 5.066741943359375: 70%|██████▉ | 540/776 [11:36<04:41, 1.19s/it] | |
| 1|540|Loss: 5.066741943359375: 70%|██████▉ | 541/776 [11:37<04:35, 1.17s/it] | |
| 1|541|Loss: 5.102386474609375: 70%|██████▉ | 541/776 [11:37<04:35, 1.17s/it] | |
| 1|541|Loss: 5.102386474609375: 70%|██████▉ | 542/776 [11:39<04:45, 1.22s/it] | |
| 1|542|Loss: 4.998660564422607: 70%|██████▉ | 542/776 [11:39<04:45, 1.22s/it] | |
| 1|542|Loss: 4.998660564422607: 70%|██████▉ | 543/776 [11:40<04:37, 1.19s/it] | |
| 1|543|Loss: 5.22159481048584: 70%|██████▉ | 543/776 [11:40<04:37, 1.19s/it] | |
| 1|543|Loss: 5.22159481048584: 70%|███████ | 544/776 [11:41<04:31, 1.17s/it] | |
| 1|544|Loss: 5.426517009735107: 70%|███████ | 544/776 [11:41<04:31, 1.17s/it] | |
| 1|544|Loss: 5.426517009735107: 70%|███████ | 545/776 [11:42<04:27, 1.16s/it] | |
| 1|545|Loss: 5.2794108390808105: 70%|███████ | 545/776 [11:42<04:27, 1.16s/it] | |
| 1|545|Loss: 5.2794108390808105: 70%|███████ | 546/776 [11:43<04:25, 1.15s/it] | |
| 1|546|Loss: 5.588244915008545: 70%|███████ | 546/776 [11:43<04:25, 1.15s/it] | |
| 1|546|Loss: 5.588244915008545: 70%|███████ | 547/776 [11:44<04:21, 1.14s/it] | |
| 1|547|Loss: 5.373724937438965: 70%|███████ | 547/776 [11:44<04:21, 1.14s/it] | |
| 1|547|Loss: 5.373724937438965: 71%|███████ | 548/776 [11:45<04:19, 1.14s/it] | |
| 1|548|Loss: 5.38264274597168: 71%|███████ | 548/776 [11:45<04:19, 1.14s/it] | |
| 1|548|Loss: 5.38264274597168: 71%|███████ | 549/776 [11:46<04:18, 1.14s/it] | |
| 1|549|Loss: 5.220317363739014: 71%|███████ | 549/776 [11:46<04:18, 1.14s/it] | |
| 1|549|Loss: 5.220317363739014: 71%|███████ | 550/776 [11:48<04:34, 1.21s/it] | |
| 1|550|Loss: 5.479121685028076: 71%|███████ | 550/776 [11:48<04:34, 1.21s/it] | |
| 1|550|Loss: 5.479121685028076: 71%|███████ | 551/776 [11:49<04:29, 1.20s/it] | |
| 1|551|Loss: 4.846341133117676: 71%|███████ | 551/776 [11:49<04:29, 1.20s/it] | |
| 1|551|Loss: 4.846341133117676: 71%|███████ | 552/776 [11:50<04:26, 1.19s/it] | |
| 1|552|Loss: 5.55071496963501: 71%|███████ | 552/776 [11:50<04:26, 1.19s/it] | |
| 1|552|Loss: 5.55071496963501: 71%|███████▏ | 553/776 [11:51<04:20, 1.17s/it] | |
| 1|553|Loss: 4.949970245361328: 71%|███████▏ | 553/776 [11:51<04:20, 1.17s/it] | |
| 1|553|Loss: 4.949970245361328: 71%|███████▏ | 554/776 [11:53<04:29, 1.21s/it] | |
| 1|554|Loss: 4.840105056762695: 71%|███████▏ | 554/776 [11:53<04:29, 1.21s/it] | |
| 1|554|Loss: 4.840105056762695: 72%|███████▏ | 555/776 [11:54<04:22, 1.19s/it] | |
| 1|555|Loss: 5.286818027496338: 72%|███████▏ | 555/776 [11:54<04:22, 1.19s/it] | |
| 1|555|Loss: 5.286818027496338: 72%|███████▏ | 556/776 [11:55<04:17, 1.17s/it] | |
| 1|556|Loss: 4.880880832672119: 72%|███████▏ | 556/776 [11:55<04:17, 1.17s/it] | |
| 1|556|Loss: 4.880880832672119: 72%|███████▏ | 557/776 [11:56<04:16, 1.17s/it] | |
| 1|557|Loss: 4.9855217933654785: 72%|███████▏ | 557/776 [11:56<04:16, 1.17s/it] | |
| 1|557|Loss: 4.9855217933654785: 72%|███████▏ | 558/776 [11:57<04:11, 1.15s/it] | |
| 1|558|Loss: 5.092526435852051: 72%|███████▏ | 558/776 [11:57<04:11, 1.15s/it] | |
| 1|558|Loss: 5.092526435852051: 72%|███████▏ | 559/776 [11:58<04:08, 1.14s/it] | |
| 1|559|Loss: 4.918326377868652: 72%|███████▏ | 559/776 [11:58<04:08, 1.14s/it] | |
| 1|559|Loss: 4.918326377868652: 72%|███████▏ | 560/776 [11:59<04:05, 1.14s/it] | |
| 1|560|Loss: 5.391762733459473: 72%|███████▏ | 560/776 [11:59<04:05, 1.14s/it] | |
| 1|560|Loss: 5.391762733459473: 72%|███████▏ | 561/776 [12:01<04:04, 1.14s/it] | |
| 1|561|Loss: 5.117834568023682: 72%|███████▏ | 561/776 [12:01<04:04, 1.14s/it] | |
| 1|561|Loss: 5.117834568023682: 72%|███████▏ | 562/776 [12:02<04:20, 1.22s/it] | |
| 1|562|Loss: 5.029565334320068: 72%|███████▏ | 562/776 [12:02<04:20, 1.22s/it] | |
| 1|562|Loss: 5.029565334320068: 73%|███████▎ | 563/776 [12:03<04:15, 1.20s/it] | |
| 1|563|Loss: 5.036685466766357: 73%|███████▎ | 563/776 [12:03<04:15, 1.20s/it] | |
| 1|563|Loss: 5.036685466766357: 73%|███████▎ | 564/776 [12:04<04:09, 1.18s/it] | |
| 1|564|Loss: 5.112013816833496: 73%|███████▎ | 564/776 [12:04<04:09, 1.18s/it] | |
| 1|564|Loss: 5.112013816833496: 73%|███████▎ | 565/776 [12:05<04:04, 1.16s/it] | |
| 1|565|Loss: 5.521562576293945: 73%|███████▎ | 565/776 [12:05<04:04, 1.16s/it] | |
| 1|565|Loss: 5.521562576293945: 73%|███████▎ | 566/776 [12:07<04:14, 1.21s/it] | |
| 1|566|Loss: 5.155182361602783: 73%|███████▎ | 566/776 [12:07<04:14, 1.21s/it] | |
| 1|566|Loss: 5.155182361602783: 73%|███████▎ | 567/776 [12:08<04:08, 1.19s/it] | |
| 1|567|Loss: 5.664989471435547: 73%|███████▎ | 567/776 [12:08<04:08, 1.19s/it] | |
| 1|567|Loss: 5.664989471435547: 73%|███████▎ | 568/776 [12:09<04:03, 1.17s/it] | |
| 1|568|Loss: 5.081148147583008: 73%|███████▎ | 568/776 [12:09<04:03, 1.17s/it] | |
| 1|568|Loss: 5.081148147583008: 73%|███████▎ | 569/776 [12:10<04:02, 1.17s/it] | |
| 1|569|Loss: 5.098517894744873: 73%|███████▎ | 569/776 [12:10<04:02, 1.17s/it] | |
| 1|569|Loss: 5.098517894744873: 73%|███████▎ | 570/776 [12:11<03:58, 1.16s/it] | |
| 1|570|Loss: 4.990883827209473: 73%|███████▎ | 570/776 [12:11<03:58, 1.16s/it] | |
| 1|570|Loss: 4.990883827209473: 74%|███████▎ | 571/776 [12:12<03:55, 1.15s/it] | |
| 1|571|Loss: 5.356252193450928: 74%|███████▎ | 571/776 [12:12<03:55, 1.15s/it] | |
| 1|571|Loss: 5.356252193450928: 74%|███████▎ | 572/776 [12:13<03:53, 1.14s/it] | |
| 1|572|Loss: 4.908764362335205: 74%|███████▎ | 572/776 [12:14<03:53, 1.14s/it] | |
| 1|572|Loss: 4.908764362335205: 74%|███████▍ | 573/776 [12:15<03:50, 1.14s/it] | |
| 1|573|Loss: 4.8660569190979: 74%|███████▍ | 573/776 [12:15<03:50, 1.14s/it] | |
| 1|573|Loss: 4.8660569190979: 74%|███████▍ | 574/776 [12:16<04:06, 1.22s/it] | |
| 1|574|Loss: 4.903832912445068: 74%|███████▍ | 574/776 [12:16<04:06, 1.22s/it] | |
| 1|574|Loss: 4.903832912445068: 74%|███████▍ | 575/776 [12:17<03:59, 1.19s/it] | |
| 1|575|Loss: 5.26066255569458: 74%|███████▍ | 575/776 [12:17<03:59, 1.19s/it] | |
| 1|575|Loss: 5.26066255569458: 74%|███████▍ | 576/776 [12:18<03:54, 1.17s/it] | |
| 1|576|Loss: 5.230278015136719: 74%|███████▍ | 576/776 [12:18<03:54, 1.17s/it] | |
| 1|576|Loss: 5.230278015136719: 74%|███████▍ | 577/776 [12:19<03:49, 1.15s/it] | |
| 1|577|Loss: 5.010391712188721: 74%|███████▍ | 577/776 [12:19<03:49, 1.15s/it] | |
| 1|577|Loss: 5.010391712188721: 74%|███████▍ | 578/776 [12:21<03:59, 1.21s/it] | |
| 1|578|Loss: 5.036949634552002: 74%|███████▍ | 578/776 [12:21<03:59, 1.21s/it] | |
| 1|578|Loss: 5.036949634552002: 75%|███████▍ | 579/776 [12:22<03:54, 1.19s/it] | |
| 1|579|Loss: 5.090181350708008: 75%|███████▍ | 579/776 [12:22<03:54, 1.19s/it] | |
| 1|579|Loss: 5.090181350708008: 75%|███████▍ | 580/776 [12:23<03:50, 1.18s/it] | |
| 1|580|Loss: 5.1319403648376465: 75%|███████▍ | 580/776 [12:23<03:50, 1.18s/it] | |
| 1|580|Loss: 5.1319403648376465: 75%|███████▍ | 581/776 [12:24<03:46, 1.16s/it] | |
| 1|581|Loss: 5.13538932800293: 75%|███████▍ | 581/776 [12:24<03:46, 1.16s/it] | |
| 1|581|Loss: 5.13538932800293: 75%|███████▌ | 582/776 [12:25<03:42, 1.15s/it] | |
| 1|582|Loss: 5.034479141235352: 75%|███████▌ | 582/776 [12:25<03:42, 1.15s/it] | |
| 1|582|Loss: 5.034479141235352: 75%|███████▌ | 583/776 [12:26<03:39, 1.14s/it] | |
| 1|583|Loss: 5.1129231452941895: 75%|███████▌ | 583/776 [12:26<03:39, 1.14s/it] | |
| 1|583|Loss: 5.1129231452941895: 75%|███████▌ | 584/776 [12:28<03:37, 1.13s/it] | |
| 1|584|Loss: 5.063475608825684: 75%|███████▌ | 584/776 [12:28<03:37, 1.13s/it] | |
| 1|584|Loss: 5.063475608825684: 75%|███████▌ | 585/776 [12:29<03:36, 1.13s/it] | |
| 1|585|Loss: 5.389310836791992: 75%|███████▌ | 585/776 [12:29<03:36, 1.13s/it] | |
| 1|585|Loss: 5.389310836791992: 76%|███████▌ | 586/776 [12:30<03:50, 1.21s/it] | |
| 1|586|Loss: 5.0814714431762695: 76%|███████▌ | 586/776 [12:30<03:50, 1.21s/it] | |
| 1|586|Loss: 5.0814714431762695: 76%|███████▌ | 587/776 [12:31<03:44, 1.19s/it] | |
| 1|587|Loss: 4.861522197723389: 76%|███████▌ | 587/776 [12:31<03:44, 1.19s/it] | |
| 1|587|Loss: 4.861522197723389: 76%|███████▌ | 588/776 [12:32<03:39, 1.17s/it] | |
| 1|588|Loss: 5.236099720001221: 76%|███████▌ | 588/776 [12:32<03:39, 1.17s/it] | |
| 1|588|Loss: 5.236099720001221: 76%|███████▌ | 589/776 [12:33<03:35, 1.15s/it] | |
| 1|589|Loss: 5.584919452667236: 76%|███████▌ | 589/776 [12:33<03:35, 1.15s/it] | |
| 1|589|Loss: 5.584919452667236: 76%|███████▌ | 590/776 [12:35<03:43, 1.20s/it] | |
| 1|590|Loss: 5.312676429748535: 76%|███████▌ | 590/776 [12:35<03:43, 1.20s/it] | |
| 1|590|Loss: 5.312676429748535: 76%|███████▌ | 591/776 [12:36<03:38, 1.18s/it] | |
| 1|591|Loss: 6.036230564117432: 76%|███████▌ | 591/776 [12:36<03:38, 1.18s/it] | |
| 1|591|Loss: 6.036230564117432: 76%|███████▋ | 592/776 [12:37<03:35, 1.17s/it] | |
| 1|592|Loss: 5.9947829246521: 76%|███████▋ | 592/776 [12:37<03:35, 1.17s/it] | |
| 1|592|Loss: 5.9947829246521: 76%|███████▋ | 593/776 [12:38<03:32, 1.16s/it] | |
| 1|593|Loss: 5.926812171936035: 76%|███████▋ | 593/776 [12:38<03:32, 1.16s/it] | |
| 1|593|Loss: 5.926812171936035: 77%|███████▋ | 594/776 [12:39<03:30, 1.15s/it] | |
| 1|594|Loss: 5.64717435836792: 77%|███████▋ | 594/776 [12:39<03:30, 1.15s/it] | |
| 1|594|Loss: 5.64717435836792: 77%|███████▋ | 595/776 [12:40<03:27, 1.15s/it] | |
| 1|595|Loss: 5.621748447418213: 77%|███████▋ | 595/776 [12:40<03:27, 1.15s/it] | |
| 1|595|Loss: 5.621748447418213: 77%|███████▋ | 596/776 [12:42<03:25, 1.14s/it] | |
| 1|596|Loss: 5.586417198181152: 77%|███████▋ | 596/776 [12:42<03:25, 1.14s/it] | |
| 1|596|Loss: 5.586417198181152: 77%|███████▋ | 597/776 [12:43<03:39, 1.23s/it] | |
| 1|597|Loss: 5.880124568939209: 77%|███████▋ | 597/776 [12:43<03:39, 1.23s/it] | |
| 1|597|Loss: 5.880124568939209: 77%|███████▋ | 598/776 [12:44<03:34, 1.20s/it] | |
| 1|598|Loss: 5.838248252868652: 77%|███████▋ | 598/776 [12:44<03:34, 1.20s/it] | |
| 1|598|Loss: 5.838248252868652: 77%|███████▋ | 599/776 [12:45<03:29, 1.19s/it] | |
| 1|599|Loss: 5.335123062133789: 77%|███████▋ | 599/776 [12:45<03:29, 1.19s/it] | |
| 1|599|Loss: 5.335123062133789: 77%|███████▋ | 600/776 [12:46<03:25, 1.17s/it] | |
| 1|600|Loss: 5.34372091293335: 77%|███████▋ | 600/776 [12:46<03:25, 1.17s/it] | |
| 1|600|Loss: 5.34372091293335: 77%|███████▋ | 601/776 [12:47<03:21, 1.15s/it] | |
| 1|601|Loss: 5.1826629638671875: 77%|███████▋ | 601/776 [12:47<03:21, 1.15s/it] | |
| 1|601|Loss: 5.1826629638671875: 78%|███████▊ | 602/776 [12:49<03:30, 1.21s/it] | |
| 1|602|Loss: 5.165835857391357: 78%|███████▊ | 602/776 [12:49<03:30, 1.21s/it] | |
| 1|602|Loss: 5.165835857391357: 78%|███████▊ | 603/776 [12:50<03:36, 1.25s/it] | |
| 1|603|Loss: 5.092622756958008: 78%|███████▊ | 603/776 [12:50<03:36, 1.25s/it] | |
| 1|603|Loss: 5.092622756958008: 78%|███████▊ | 604/776 [12:51<03:30, 1.22s/it] | |
| 1|604|Loss: 5.172447681427002: 78%|███████▊ | 604/776 [12:51<03:30, 1.22s/it] | |
| 1|604|Loss: 5.172447681427002: 78%|███████▊ | 605/776 [12:52<03:23, 1.19s/it] | |
| 1|605|Loss: 5.175489902496338: 78%|███████▊ | 605/776 [12:52<03:23, 1.19s/it] | |
| 1|605|Loss: 5.175489902496338: 78%|███████▊ | 606/776 [12:54<03:19, 1.17s/it] | |
| 1|606|Loss: 5.398399353027344: 78%|███████▊ | 606/776 [12:54<03:19, 1.17s/it] | |
| 1|606|Loss: 5.398399353027344: 78%|███████▊ | 607/776 [12:55<03:15, 1.16s/it] | |
| 1|607|Loss: 5.0227952003479: 78%|███████▊ | 607/776 [12:55<03:15, 1.16s/it] | |
| 1|607|Loss: 5.0227952003479: 78%|███████▊ | 608/776 [12:56<03:27, 1.24s/it] | |
| 1|608|Loss: 4.7826762199401855: 78%|███████▊ | 608/776 [12:56<03:27, 1.24s/it] | |
| 1|608|Loss: 4.7826762199401855: 78%|███████▊ | 609/776 [12:57<03:24, 1.22s/it] | |
| 1|609|Loss: 5.473695755004883: 78%|███████▊ | 609/776 [12:57<03:24, 1.22s/it] | |
| 1|609|Loss: 5.473695755004883: 79%|███████▊ | 610/776 [12:58<03:18, 1.19s/it] | |
| 1|610|Loss: 5.342655181884766: 79%|███████▊ | 610/776 [12:58<03:18, 1.19s/it] | |
| 1|610|Loss: 5.342655181884766: 79%|███████▊ | 611/776 [13:00<03:13, 1.18s/it] | |
| 1|611|Loss: 5.087533950805664: 79%|███████▊ | 611/776 [13:00<03:13, 1.18s/it] | |
| 1|611|Loss: 5.087533950805664: 79%|███████▉ | 612/776 [13:01<03:10, 1.16s/it] | |
| 1|612|Loss: 4.967813968658447: 79%|███████▉ | 612/776 [13:01<03:10, 1.16s/it] | |
| 1|612|Loss: 4.967813968658447: 79%|███████▉ | 613/776 [13:02<03:07, 1.15s/it] | |
| 1|613|Loss: 5.2761712074279785: 79%|███████▉ | 613/776 [13:02<03:07, 1.15s/it] | |
| 1|613|Loss: 5.2761712074279785: 79%|███████▉ | 614/776 [13:03<03:16, 1.21s/it] | |
| 1|614|Loss: 4.855869770050049: 79%|███████▉ | 614/776 [13:03<03:16, 1.21s/it] | |
| 1|614|Loss: 4.855869770050049: 79%|███████▉ | 615/776 [13:05<03:21, 1.25s/it] | |
| 1|615|Loss: 5.247289180755615: 79%|███████▉ | 615/776 [13:05<03:21, 1.25s/it] | |
| 1|615|Loss: 5.247289180755615: 79%|███████▉ | 616/776 [13:06<03:14, 1.22s/it] | |
| 1|616|Loss: 5.020791053771973: 79%|███████▉ | 616/776 [13:06<03:14, 1.22s/it] | |
| 1|616|Loss: 5.020791053771973: 80%|███████▉ | 617/776 [13:07<03:09, 1.19s/it] | |
| 1|617|Loss: 5.123473167419434: 80%|███████▉ | 617/776 [13:07<03:09, 1.19s/it] | |
| 1|617|Loss: 5.123473167419434: 80%|███████▉ | 618/776 [13:08<03:04, 1.17s/it] | |
| 1|618|Loss: 5.781302452087402: 80%|███████▉ | 618/776 [13:08<03:04, 1.17s/it] | |
| 1|618|Loss: 5.781302452087402: 80%|███████▉ | 619/776 [13:09<03:02, 1.16s/it] | |
| 1|619|Loss: 5.100361347198486: 80%|███████▉ | 619/776 [13:09<03:02, 1.16s/it] | |
| 1|619|Loss: 5.100361347198486: 80%|███████▉ | 620/776 [13:10<03:11, 1.23s/it] | |
| 1|620|Loss: 5.404480457305908: 80%|███████▉ | 620/776 [13:10<03:11, 1.23s/it] | |
| 1|620|Loss: 5.404480457305908: 80%|████████ | 621/776 [13:12<03:05, 1.20s/it] | |
| 1|621|Loss: 4.858031272888184: 80%|████████ | 621/776 [13:12<03:05, 1.20s/it] | |
| 1|621|Loss: 4.858031272888184: 80%|████████ | 622/776 [13:13<03:01, 1.18s/it] | |
| 1|622|Loss: 5.269679546356201: 80%|████████ | 622/776 [13:13<03:01, 1.18s/it] | |
| 1|622|Loss: 5.269679546356201: 80%|████████ | 623/776 [13:14<02:58, 1.17s/it] | |
| 1|623|Loss: 4.914243698120117: 80%|████████ | 623/776 [13:14<02:58, 1.17s/it] | |
| 1|623|Loss: 4.914243698120117: 80%|████████ | 624/776 [13:15<02:56, 1.16s/it] | |
| 1|624|Loss: 5.375041961669922: 80%|████████ | 624/776 [13:15<02:56, 1.16s/it] | |
| 1|624|Loss: 5.375041961669922: 81%|████████ | 625/776 [13:16<02:53, 1.15s/it] | |
| 1|625|Loss: 5.516076564788818: 81%|████████ | 625/776 [13:16<02:53, 1.15s/it] | |
| 1|625|Loss: 5.516076564788818: 81%|████████ | 626/776 [13:17<02:59, 1.20s/it] | |
| 1|626|Loss: 5.264232635498047: 81%|████████ | 626/776 [13:17<02:59, 1.20s/it] | |
| 1|626|Loss: 5.264232635498047: 81%|████████ | 627/776 [13:19<03:04, 1.24s/it] | |
| 1|627|Loss: 5.057028770446777: 81%|████████ | 627/776 [13:19<03:04, 1.24s/it] | |
| 1|627|Loss: 5.057028770446777: 81%|████████ | 628/776 [13:20<02:58, 1.20s/it] | |
| 1|628|Loss: 5.207719802856445: 81%|████████ | 628/776 [13:20<02:58, 1.20s/it] | |
| 1|628|Loss: 5.207719802856445: 81%|████████ | 629/776 [13:21<02:54, 1.19s/it] | |
| 1|629|Loss: 5.533774375915527: 81%|████████ | 629/776 [13:21<02:54, 1.19s/it] | |
| 1|629|Loss: 5.533774375915527: 81%|████████ | 630/776 [13:22<02:50, 1.16s/it] | |
| 1|630|Loss: 5.165842056274414: 81%|████████ | 630/776 [13:22<02:50, 1.16s/it] | |
| 1|630|Loss: 5.165842056274414: 81%|████████▏ | 631/776 [13:23<02:47, 1.15s/it] | |
| 1|631|Loss: 5.275313377380371: 81%|████████▏ | 631/776 [13:23<02:47, 1.15s/it] | |
| 1|631|Loss: 5.275313377380371: 81%|████████▏ | 632/776 [13:25<02:56, 1.23s/it] | |
| 1|632|Loss: 4.985404968261719: 81%|████████▏ | 632/776 [13:25<02:56, 1.23s/it] | |
| 1|632|Loss: 4.985404968261719: 82%|████████▏ | 633/776 [13:26<02:50, 1.19s/it] | |
| 1|633|Loss: 5.050546169281006: 82%|████████▏ | 633/776 [13:26<02:50, 1.19s/it] | |
| 1|633|Loss: 5.050546169281006: 82%|████████▏ | 634/776 [13:27<02:46, 1.18s/it] | |
| 1|634|Loss: 5.07232141494751: 82%|████████▏ | 634/776 [13:27<02:46, 1.18s/it] | |
| 1|634|Loss: 5.07232141494751: 82%|████████▏ | 635/776 [13:28<02:45, 1.17s/it] | |
| 1|635|Loss: 4.905629634857178: 82%|████████▏ | 635/776 [13:28<02:45, 1.17s/it] | |
| 1|635|Loss: 4.905629634857178: 82%|████████▏ | 636/776 [13:29<02:42, 1.16s/it] | |
| 1|636|Loss: 5.028162002563477: 82%|████████▏ | 636/776 [13:29<02:42, 1.16s/it] | |
| 1|636|Loss: 5.028162002563477: 82%|████████▏ | 637/776 [13:30<02:40, 1.15s/it] | |
| 1|637|Loss: 5.008671760559082: 82%|████████▏ | 637/776 [13:30<02:40, 1.15s/it] | |
| 1|637|Loss: 5.008671760559082: 82%|████████▏ | 638/776 [13:32<02:45, 1.20s/it] | |
| 1|638|Loss: 5.066326141357422: 82%|████████▏ | 638/776 [13:32<02:45, 1.20s/it] | |
| 1|638|Loss: 5.066326141357422: 82%|████████▏ | 639/776 [13:33<02:50, 1.24s/it] | |
| 1|639|Loss: 5.2565436363220215: 82%|████████▏ | 639/776 [13:33<02:50, 1.24s/it] | |
| 1|639|Loss: 5.2565436363220215: 82%|████████▏ | 640/776 [13:34<02:43, 1.20s/it] | |
| 1|640|Loss: 5.0616135597229: 82%|████████▏ | 640/776 [13:34<02:43, 1.20s/it] | |
| 1|640|Loss: 5.0616135597229: 83%|████████▎ | 641/776 [13:35<02:40, 1.19s/it] | |
| 1|641|Loss: 5.05810022354126: 83%|████████▎ | 641/776 [13:35<02:40, 1.19s/it] | |
| 1|641|Loss: 5.05810022354126: 83%|████████▎ | 642/776 [13:36<02:36, 1.17s/it] | |
| 1|642|Loss: 5.352077007293701: 83%|████████▎ | 642/776 [13:36<02:36, 1.17s/it] | |
| 1|642|Loss: 5.352077007293701: 83%|████████▎ | 643/776 [13:38<02:33, 1.15s/it] | |
| 1|643|Loss: 4.985960960388184: 83%|████████▎ | 643/776 [13:38<02:33, 1.15s/it] | |
| 1|643|Loss: 4.985960960388184: 83%|████████▎ | 644/776 [13:39<02:42, 1.23s/it] | |
| 1|644|Loss: 4.8950605392456055: 83%|████████▎ | 644/776 [13:39<02:42, 1.23s/it] | |
| 1|644|Loss: 4.8950605392456055: 83%|████████▎ | 645/776 [13:40<02:37, 1.20s/it] | |
| 1|645|Loss: 5.130889892578125: 83%|████████▎ | 645/776 [13:40<02:37, 1.20s/it] | |
| 1|645|Loss: 5.130889892578125: 83%|████████▎ | 646/776 [13:41<02:33, 1.18s/it] | |
| 1|646|Loss: 5.218526363372803: 83%|████████▎ | 646/776 [13:41<02:33, 1.18s/it] | |
| 1|646|Loss: 5.218526363372803: 83%|████████▎ | 647/776 [13:42<02:30, 1.16s/it] | |
| 1|647|Loss: 5.028182506561279: 83%|████████▎ | 647/776 [13:42<02:30, 1.16s/it] | |
| 1|647|Loss: 5.028182506561279: 84%|████████▎ | 648/776 [13:43<02:27, 1.15s/it] | |
| 1|648|Loss: 5.019755840301514: 84%|████████▎ | 648/776 [13:43<02:27, 1.15s/it] | |
| 1|648|Loss: 5.019755840301514: 84%|████████▎ | 649/776 [13:45<02:25, 1.14s/it] | |
| 1|649|Loss: 4.889018535614014: 84%|████████▎ | 649/776 [13:45<02:25, 1.14s/it] | |
| 1|649|Loss: 4.889018535614014: 84%|████████▍ | 650/776 [13:46<02:31, 1.21s/it] | |
| 1|650|Loss: 4.872690200805664: 84%|████████▍ | 650/776 [13:46<02:31, 1.21s/it] | |
| 1|650|Loss: 4.872690200805664: 84%|████████▍ | 651/776 [13:47<02:36, 1.25s/it] | |
| 1|651|Loss: 4.69959020614624: 84%|████████▍ | 651/776 [13:47<02:36, 1.25s/it] | |
| 1|651|Loss: 4.69959020614624: 84%|████████▍ | 652/776 [13:48<02:30, 1.21s/it] | |
| 1|652|Loss: 4.9436163902282715: 84%|████████▍ | 652/776 [13:48<02:30, 1.21s/it] | |
| 1|652|Loss: 4.9436163902282715: 84%|████████▍ | 653/776 [13:50<02:26, 1.19s/it] | |
| 1|653|Loss: 5.219525337219238: 84%|████████▍ | 653/776 [13:50<02:26, 1.19s/it] | |
| 1|653|Loss: 5.219525337219238: 84%|████████▍ | 654/776 [13:51<02:22, 1.17s/it] | |
| 1|654|Loss: 4.953904628753662: 84%|████████▍ | 654/776 [13:51<02:22, 1.17s/it] | |
| 1|654|Loss: 4.953904628753662: 84%|████████▍ | 655/776 [13:52<02:29, 1.24s/it] | |
| 1|655|Loss: 4.752069473266602: 84%|████████▍ | 655/776 [13:52<02:29, 1.24s/it] | |
| 1|655|Loss: 4.752069473266602: 85%|████████▍ | 656/776 [13:53<02:24, 1.20s/it] | |
| 1|656|Loss: 4.998683452606201: 85%|████████▍ | 656/776 [13:53<02:24, 1.20s/it] | |
| 1|656|Loss: 4.998683452606201: 85%|████████▍ | 657/776 [13:54<02:19, 1.18s/it] | |
| 1|657|Loss: 5.241946220397949: 85%|████████▍ | 657/776 [13:54<02:19, 1.18s/it] | |
| 1|657|Loss: 5.241946220397949: 85%|████████▍ | 658/776 [13:55<02:16, 1.16s/it] | |
| 1|658|Loss: 4.997658729553223: 85%|████████▍ | 658/776 [13:55<02:16, 1.16s/it] | |
| 1|658|Loss: 4.997658729553223: 85%|████████▍ | 659/776 [13:57<02:14, 1.15s/it] | |
| 1|659|Loss: 5.1118645668029785: 85%|████████▍ | 659/776 [13:57<02:14, 1.15s/it] | |
| 1|659|Loss: 5.1118645668029785: 85%|████████▌ | 660/776 [13:58<02:12, 1.15s/it] | |
| 1|660|Loss: 5.095217227935791: 85%|████████▌ | 660/776 [13:58<02:12, 1.15s/it] | |
| 1|660|Loss: 5.095217227935791: 85%|████████▌ | 661/776 [13:59<02:11, 1.14s/it] | |
| 1|661|Loss: 5.092507839202881: 85%|████████▌ | 661/776 [13:59<02:11, 1.14s/it] | |
| 1|661|Loss: 5.092507839202881: 85%|████████▌ | 662/776 [14:00<02:16, 1.20s/it] | |
| 1|662|Loss: 4.969951629638672: 85%|████████▌ | 662/776 [14:00<02:16, 1.20s/it] | |
| 1|662|Loss: 4.969951629638672: 85%|████████▌ | 663/776 [14:01<02:13, 1.18s/it] | |
| 1|663|Loss: 4.70338773727417: 85%|████████▌ | 663/776 [14:01<02:13, 1.18s/it] | |
| 1|663|Loss: 4.70338773727417: 86%|████████▌ | 664/776 [14:03<02:17, 1.23s/it] | |
| 1|664|Loss: 5.134364128112793: 86%|████████▌ | 664/776 [14:03<02:17, 1.23s/it] | |
| 1|664|Loss: 5.134364128112793: 86%|████████▌ | 665/776 [14:04<02:12, 1.20s/it] | |
| 1|665|Loss: 4.950214862823486: 86%|████████▌ | 665/776 [14:04<02:12, 1.20s/it] | |
| 1|665|Loss: 4.950214862823486: 86%|████████▌ | 666/776 [14:05<02:18, 1.26s/it] | |
| 1|666|Loss: 4.973013877868652: 86%|████████▌ | 666/776 [14:05<02:18, 1.26s/it] | |
| 1|666|Loss: 4.973013877868652: 86%|████████▌ | 667/776 [14:06<02:13, 1.23s/it] | |
| 1|667|Loss: 4.501241207122803: 86%|████████▌ | 667/776 [14:06<02:13, 1.23s/it] | |
| 1|667|Loss: 4.501241207122803: 86%|████████▌ | 668/776 [14:07<02:08, 1.19s/it] | |
| 1|668|Loss: 4.666746139526367: 86%|████████▌ | 668/776 [14:07<02:08, 1.19s/it] | |
| 1|668|Loss: 4.666746139526367: 86%|████████▌ | 669/776 [14:09<02:05, 1.17s/it] | |
| 1|669|Loss: 4.681262969970703: 86%|████████▌ | 669/776 [14:09<02:05, 1.17s/it] | |
| 1|669|Loss: 4.681262969970703: 86%|████████▋ | 670/776 [14:10<02:02, 1.16s/it] | |
| 1|670|Loss: 4.763436794281006: 86%|████████▋ | 670/776 [14:10<02:02, 1.16s/it] | |
| 1|670|Loss: 4.763436794281006: 86%|████████▋ | 671/776 [14:11<02:00, 1.15s/it] | |
| 1|671|Loss: 4.7781829833984375: 86%|████████▋ | 671/776 [14:11<02:00, 1.15s/it] | |
| 1|671|Loss: 4.7781829833984375: 87%|████████▋ | 672/776 [14:12<01:58, 1.14s/it] | |
| 1|672|Loss: 4.801881313323975: 87%|████████▋ | 672/776 [14:12<01:58, 1.14s/it] | |
| 1|672|Loss: 4.801881313323975: 87%|████████▋ | 673/776 [14:13<01:58, 1.15s/it] | |
| 1|673|Loss: 4.4668731689453125: 87%|████████▋ | 673/776 [14:13<01:58, 1.15s/it] | |
| 1|673|Loss: 4.4668731689453125: 87%|████████▋ | 674/776 [14:14<02:02, 1.20s/it] | |
| 1|674|Loss: 5.120551586151123: 87%|████████▋ | 674/776 [14:14<02:02, 1.20s/it] | |
| 1|674|Loss: 5.120551586151123: 87%|████████▋ | 675/776 [14:15<01:58, 1.17s/it] | |
| 1|675|Loss: 4.864243984222412: 87%|████████▋ | 675/776 [14:15<01:58, 1.17s/it] | |
| 1|675|Loss: 4.864243984222412: 87%|████████▋ | 676/776 [14:17<02:01, 1.22s/it] | |
| 1|676|Loss: 4.625429153442383: 87%|████████▋ | 676/776 [14:17<02:01, 1.22s/it] | |
| 1|676|Loss: 4.625429153442383: 87%|████████▋ | 677/776 [14:18<01:58, 1.20s/it] | |
| 1|677|Loss: 5.217968940734863: 87%|████████▋ | 677/776 [14:18<01:58, 1.20s/it] | |
| 1|677|Loss: 5.217968940734863: 87%|████████▋ | 678/776 [14:19<02:03, 1.26s/it] | |
| 1|678|Loss: 5.141139984130859: 87%|████████▋ | 678/776 [14:19<02:03, 1.26s/it] | |
| 1|678|Loss: 5.141139984130859: 88%|████████▊ | 679/776 [14:20<01:58, 1.22s/it] | |
| 1|679|Loss: 5.116586685180664: 88%|████████▊ | 679/776 [14:20<01:58, 1.22s/it] | |
| 1|679|Loss: 5.116586685180664: 88%|████████▊ | 680/776 [14:22<01:54, 1.19s/it] | |
| 1|680|Loss: 4.858044624328613: 88%|████████▊ | 680/776 [14:22<01:54, 1.19s/it] | |
| 1|680|Loss: 4.858044624328613: 88%|████████▊ | 681/776 [14:23<01:51, 1.17s/it] | |
| 1|681|Loss: 4.608528137207031: 88%|████████▊ | 681/776 [14:23<01:51, 1.17s/it] | |
| 1|681|Loss: 4.608528137207031: 88%|████████▊ | 682/776 [14:24<01:48, 1.16s/it] | |
| 1|682|Loss: 4.544140815734863: 88%|████████▊ | 682/776 [14:24<01:48, 1.16s/it] | |
| 1|682|Loss: 4.544140815734863: 88%|████████▊ | 683/776 [14:25<01:47, 1.15s/it] | |
| 1|683|Loss: 4.986246109008789: 88%|████████▊ | 683/776 [14:25<01:47, 1.15s/it] | |
| 1|683|Loss: 4.986246109008789: 88%|████████▊ | 684/776 [14:26<01:45, 1.15s/it] | |
| 1|684|Loss: 4.996842384338379: 88%|████████▊ | 684/776 [14:26<01:45, 1.15s/it] | |
| 1|684|Loss: 4.996842384338379: 88%|████████▊ | 685/776 [14:27<01:43, 1.14s/it] | |
| 1|685|Loss: 4.718258857727051: 88%|████████▊ | 685/776 [14:27<01:43, 1.14s/it] | |
| 1|685|Loss: 4.718258857727051: 88%|████████▊ | 686/776 [14:29<01:47, 1.20s/it] | |
| 1|686|Loss: 4.742488861083984: 88%|████████▊ | 686/776 [14:29<01:47, 1.20s/it] | |
| 1|686|Loss: 4.742488861083984: 89%|████████▊ | 687/776 [14:30<01:44, 1.18s/it] | |
| 1|687|Loss: 4.75994873046875: 89%|████████▊ | 687/776 [14:30<01:44, 1.18s/it] | |
| 1|687|Loss: 4.75994873046875: 89%|████████▊ | 688/776 [14:31<01:48, 1.23s/it] | |
| 1|688|Loss: 4.997797966003418: 89%|████████▊ | 688/776 [14:31<01:48, 1.23s/it] | |
| 1|688|Loss: 4.997797966003418: 89%|████████▉ | 689/776 [14:32<01:44, 1.20s/it] | |
| 1|689|Loss: 5.128139495849609: 89%|████████▉ | 689/776 [14:32<01:44, 1.20s/it] | |
| 1|689|Loss: 5.128139495849609: 89%|████████▉ | 690/776 [14:34<01:48, 1.26s/it] | |
| 1|690|Loss: 6.020885944366455: 89%|████████▉ | 690/776 [14:34<01:48, 1.26s/it] | |
| 1|690|Loss: 6.020885944366455: 89%|████████▉ | 691/776 [14:35<01:43, 1.22s/it] | |
| 1|691|Loss: 5.789050102233887: 89%|████████▉ | 691/776 [14:35<01:43, 1.22s/it] | |
| 1|691|Loss: 5.789050102233887: 89%|████████▉ | 692/776 [14:36<01:39, 1.19s/it] | |
| 1|692|Loss: 5.668978691101074: 89%|████████▉ | 692/776 [14:36<01:39, 1.19s/it] | |
| 1|692|Loss: 5.668978691101074: 89%|████████▉ | 693/776 [14:37<01:37, 1.17s/it] | |
| 1|693|Loss: 5.595397472381592: 89%|████████▉ | 693/776 [14:37<01:37, 1.17s/it] | |
| 1|693|Loss: 5.595397472381592: 89%|████████▉ | 694/776 [14:38<01:35, 1.16s/it] | |
| 1|694|Loss: 5.6424174308776855: 89%|████████▉ | 694/776 [14:38<01:35, 1.16s/it] | |
| 1|694|Loss: 5.6424174308776855: 90%|████████▉ | 695/776 [14:39<01:33, 1.15s/it] | |
| 1|695|Loss: 5.519465923309326: 90%|████████▉ | 695/776 [14:39<01:33, 1.15s/it] | |
| 1|695|Loss: 5.519465923309326: 90%|████████▉ | 696/776 [14:40<01:31, 1.14s/it] | |
| 1|696|Loss: 5.174715042114258: 90%|████████▉ | 696/776 [14:40<01:31, 1.14s/it] | |
| 1|696|Loss: 5.174715042114258: 90%|████████▉ | 697/776 [14:41<01:29, 1.14s/it] | |
| 1|697|Loss: 5.638344764709473: 90%|████████▉ | 697/776 [14:41<01:29, 1.14s/it] | |
| 1|697|Loss: 5.638344764709473: 90%|████████▉ | 698/776 [14:43<01:33, 1.20s/it] | |
| 1|698|Loss: 5.839041233062744: 90%|████████▉ | 698/776 [14:43<01:33, 1.20s/it] | |
| 1|698|Loss: 5.839041233062744: 90%|█████████ | 699/776 [14:44<01:31, 1.19s/it] | |
| 1|699|Loss: 5.1978888511657715: 90%|█████████ | 699/776 [14:44<01:31, 1.19s/it] | |
| 1|699|Loss: 5.1978888511657715: 90%|█████████ | 700/776 [14:45<01:33, 1.23s/it] | |
| 1|700|Loss: 5.4108357429504395: 90%|█████████ | 700/776 [14:45<01:33, 1.23s/it] | |
| 1|700|Loss: 5.4108357429504395: 90%|█████████ | 701/776 [14:46<01:29, 1.20s/it] | |
| 1|701|Loss: 5.345335483551025: 90%|█████████ | 701/776 [14:46<01:29, 1.20s/it] | |
| 1|701|Loss: 5.345335483551025: 90%|█████████ | 702/776 [14:48<01:33, 1.26s/it] | |
| 1|702|Loss: 5.484263896942139: 90%|█████████ | 702/776 [14:48<01:33, 1.26s/it] | |
| 1|702|Loss: 5.484263896942139: 91%|█████████ | 703/776 [14:49<01:29, 1.23s/it] | |
| 1|703|Loss: 5.281320095062256: 91%|█████████ | 703/776 [14:49<01:29, 1.23s/it] | |
| 1|703|Loss: 5.281320095062256: 91%|█████████ | 704/776 [14:50<01:26, 1.21s/it] | |
| 1|704|Loss: 5.076435089111328: 91%|█████████ | 704/776 [14:50<01:26, 1.21s/it] | |
| 1|704|Loss: 5.076435089111328: 91%|█████████ | 705/776 [14:51<01:23, 1.18s/it] | |
| 1|705|Loss: 5.031891822814941: 91%|█████████ | 705/776 [14:51<01:23, 1.18s/it] | |
| 1|705|Loss: 5.031891822814941: 91%|█████████ | 706/776 [14:52<01:21, 1.16s/it] | |
| 1|706|Loss: 5.24143648147583: 91%|█████████ | 706/776 [14:52<01:21, 1.16s/it] | |
| 1|706|Loss: 5.24143648147583: 91%|█████████ | 707/776 [14:54<01:19, 1.15s/it] | |
| 1|707|Loss: 4.917424201965332: 91%|█████████ | 707/776 [14:54<01:19, 1.15s/it] | |
| 1|707|Loss: 4.917424201965332: 91%|█████████ | 708/776 [14:55<01:17, 1.14s/it] | |
| 1|708|Loss: 5.264927387237549: 91%|█████████ | 708/776 [14:55<01:17, 1.14s/it] | |
| 1|708|Loss: 5.264927387237549: 91%|█████████▏| 709/776 [14:56<01:16, 1.14s/it] | |
| 1|709|Loss: 5.086368083953857: 91%|█████████▏| 709/776 [14:56<01:16, 1.14s/it] | |
| 1|709|Loss: 5.086368083953857: 91%|█████████▏| 710/776 [14:57<01:18, 1.19s/it] | |
| 1|710|Loss: 5.181517601013184: 91%|█████████▏| 710/776 [14:57<01:18, 1.19s/it] | |
| 1|710|Loss: 5.181517601013184: 92%|█████████▏| 711/776 [14:58<01:16, 1.17s/it] | |
| 1|711|Loss: 5.327457904815674: 92%|█████████▏| 711/776 [14:58<01:16, 1.17s/it] | |
| 1|711|Loss: 5.327457904815674: 92%|█████████▏| 712/776 [15:00<01:17, 1.22s/it] | |
| 1|712|Loss: 5.32895565032959: 92%|█████████▏| 712/776 [15:00<01:17, 1.22s/it] | |
| 1|712|Loss: 5.32895565032959: 92%|█████████▏| 713/776 [15:01<01:19, 1.27s/it] | |
| 1|713|Loss: 5.450242042541504: 92%|█████████▏| 713/776 [15:01<01:19, 1.27s/it] | |
| 1|713|Loss: 5.450242042541504: 92%|█████████▏| 714/776 [15:02<01:16, 1.23s/it] | |
| 1|714|Loss: 5.255544662475586: 92%|█████████▏| 714/776 [15:02<01:16, 1.23s/it] | |
| 1|714|Loss: 5.255544662475586: 92%|█████████▏| 715/776 [15:03<01:13, 1.20s/it] | |
| 1|715|Loss: 5.304224491119385: 92%|█████████▏| 715/776 [15:03<01:13, 1.20s/it] | |
| 1|715|Loss: 5.304224491119385: 92%|█████████▏| 716/776 [15:04<01:10, 1.18s/it] | |
| 1|716|Loss: 5.135974407196045: 92%|█████████▏| 716/776 [15:04<01:10, 1.18s/it] | |
| 1|716|Loss: 5.135974407196045: 92%|█████████▏| 717/776 [15:05<01:08, 1.17s/it] | |
| 1|717|Loss: 4.933016300201416: 92%|█████████▏| 717/776 [15:05<01:08, 1.17s/it] | |
| 1|717|Loss: 4.933016300201416: 93%|█████████▎| 718/776 [15:07<01:07, 1.16s/it] | |
| 1|718|Loss: 5.101485252380371: 93%|█████████▎| 718/776 [15:07<01:07, 1.16s/it] | |
| 1|718|Loss: 5.101485252380371: 93%|█████████▎| 719/776 [15:08<01:05, 1.15s/it] | |
| 1|719|Loss: 5.349795341491699: 93%|█████████▎| 719/776 [15:08<01:05, 1.15s/it] | |
| 1|719|Loss: 5.349795341491699: 93%|█████████▎| 720/776 [15:09<01:03, 1.14s/it] | |
| 1|720|Loss: 5.366320610046387: 93%|█████████▎| 720/776 [15:09<01:03, 1.14s/it] | |
| 1|720|Loss: 5.366320610046387: 93%|█████████▎| 721/776 [15:10<01:02, 1.14s/it] | |
| 1|721|Loss: 5.076739311218262: 93%|█████████▎| 721/776 [15:10<01:02, 1.14s/it] | |
| 1|721|Loss: 5.076739311218262: 93%|█████████▎| 722/776 [15:11<01:04, 1.20s/it] | |
| 1|722|Loss: 5.383163928985596: 93%|█████████▎| 722/776 [15:11<01:04, 1.20s/it] | |
| 1|722|Loss: 5.383163928985596: 93%|█████████▎| 723/776 [15:12<01:02, 1.18s/it] | |
| 1|723|Loss: 5.072611331939697: 93%|█████████▎| 723/776 [15:12<01:02, 1.18s/it] | |
| 1|723|Loss: 5.072611331939697: 93%|█████████▎| 724/776 [15:14<01:04, 1.25s/it] | |
| 1|724|Loss: 5.349552154541016: 93%|█████████▎| 724/776 [15:14<01:04, 1.25s/it] | |
| 1|724|Loss: 5.349552154541016: 93%|█████████▎| 725/776 [15:15<01:04, 1.27s/it] | |
| 1|725|Loss: 5.131572723388672: 93%|█████████▎| 725/776 [15:15<01:04, 1.27s/it] | |
| 1|725|Loss: 5.131572723388672: 94%|█████████▎| 726/776 [15:16<01:01, 1.22s/it] | |
| 1|726|Loss: 5.3264384269714355: 94%|█████████▎| 726/776 [15:16<01:01, 1.22s/it] | |
| 1|726|Loss: 5.3264384269714355: 94%|█████████▎| 727/776 [15:17<00:59, 1.20s/it] | |
| 1|727|Loss: 5.233364582061768: 94%|█████████▎| 727/776 [15:17<00:59, 1.20s/it] | |
| 1|727|Loss: 5.233364582061768: 94%|█████████▍| 728/776 [15:19<00:56, 1.18s/it] | |
| 1|728|Loss: 5.161427974700928: 94%|█████████▍| 728/776 [15:19<00:56, 1.18s/it] | |
| 1|728|Loss: 5.161427974700928: 94%|█████████▍| 729/776 [15:20<00:54, 1.16s/it] | |
| 1|729|Loss: 5.0542802810668945: 94%|█████████▍| 729/776 [15:20<00:54, 1.16s/it] | |
| 1|729|Loss: 5.0542802810668945: 94%|█████████▍| 730/776 [15:21<00:53, 1.16s/it] | |
| 1|730|Loss: 5.18071174621582: 94%|█████████▍| 730/776 [15:21<00:53, 1.16s/it] | |
| 1|730|Loss: 5.18071174621582: 94%|█████████▍| 731/776 [15:22<00:52, 1.16s/it] | |
| 1|731|Loss: 5.486654758453369: 94%|█████████▍| 731/776 [15:22<00:52, 1.16s/it] | |
| 1|731|Loss: 5.486654758453369: 94%|█████████▍| 732/776 [15:23<00:50, 1.15s/it] | |
| 1|732|Loss: 4.902393817901611: 94%|█████████▍| 732/776 [15:23<00:50, 1.15s/it] | |
| 1|732|Loss: 4.902393817901611: 94%|█████████▍| 733/776 [15:24<00:49, 1.14s/it] | |
| 1|733|Loss: 5.065219402313232: 94%|█████████▍| 733/776 [15:24<00:49, 1.14s/it] | |
| 1|733|Loss: 5.065219402313232: 95%|█████████▍| 734/776 [15:26<00:50, 1.20s/it] | |
| 1|734|Loss: 4.990045547485352: 95%|█████████▍| 734/776 [15:26<00:50, 1.20s/it] | |
| 1|734|Loss: 4.990045547485352: 95%|█████████▍| 735/776 [15:27<00:48, 1.18s/it] | |
| 1|735|Loss: 5.437990665435791: 95%|█████████▍| 735/776 [15:27<00:48, 1.18s/it] | |
| 1|735|Loss: 5.437990665435791: 95%|█████████▍| 736/776 [15:28<00:49, 1.24s/it] | |
| 1|736|Loss: 5.233785629272461: 95%|█████████▍| 736/776 [15:28<00:49, 1.24s/it] | |
| 1|736|Loss: 5.233785629272461: 95%|█████████▍| 737/776 [15:29<00:49, 1.27s/it] | |
| 1|737|Loss: 5.1625895500183105: 95%|█████████▍| 737/776 [15:29<00:49, 1.27s/it] | |
| 1|737|Loss: 5.1625895500183105: 95%|█████████▌| 738/776 [15:31<00:46, 1.23s/it] | |
| 1|738|Loss: 5.198235034942627: 95%|█████████▌| 738/776 [15:31<00:46, 1.23s/it] | |
| 1|738|Loss: 5.198235034942627: 95%|█████████▌| 739/776 [15:32<00:44, 1.20s/it] | |
| 1|739|Loss: 5.055983066558838: 95%|█████████▌| 739/776 [15:32<00:44, 1.20s/it] | |
| 1|739|Loss: 5.055983066558838: 95%|█████████▌| 740/776 [15:33<00:42, 1.18s/it] | |
| 1|740|Loss: 5.411418437957764: 95%|█████████▌| 740/776 [15:33<00:42, 1.18s/it] | |
| 1|740|Loss: 5.411418437957764: 95%|█████████▌| 741/776 [15:34<00:40, 1.16s/it] | |
| 1|741|Loss: 5.240295886993408: 95%|█████████▌| 741/776 [15:34<00:40, 1.16s/it] | |
| 1|741|Loss: 5.240295886993408: 96%|█████████▌| 742/776 [15:35<00:39, 1.15s/it] | |
| 1|742|Loss: 5.47500467300415: 96%|█████████▌| 742/776 [15:35<00:39, 1.15s/it] | |
| 1|742|Loss: 5.47500467300415: 96%|█████████▌| 743/776 [15:36<00:37, 1.15s/it] | |
| 1|743|Loss: 4.882148265838623: 96%|█████████▌| 743/776 [15:36<00:37, 1.15s/it] | |
| 1|743|Loss: 4.882148265838623: 96%|█████████▌| 744/776 [15:37<00:36, 1.14s/it] | |
| 1|744|Loss: 5.022932529449463: 96%|█████████▌| 744/776 [15:37<00:36, 1.14s/it] | |
| 1|744|Loss: 5.022932529449463: 96%|█████████▌| 745/776 [15:38<00:35, 1.13s/it] | |
| 1|745|Loss: 5.034208297729492: 96%|█████████▌| 745/776 [15:38<00:35, 1.13s/it] | |
| 1|745|Loss: 5.034208297729492: 96%|█████████▌| 746/776 [15:40<00:35, 1.19s/it] | |
| 1|746|Loss: 5.130537986755371: 96%|█████████▌| 746/776 [15:40<00:35, 1.19s/it] | |
| 1|746|Loss: 5.130537986755371: 96%|█████████▋| 747/776 [15:41<00:34, 1.18s/it] | |
| 1|747|Loss: 5.43413782119751: 96%|█████████▋| 747/776 [15:41<00:34, 1.18s/it] | |
| 1|747|Loss: 5.43413782119751: 96%|█████████▋| 748/776 [15:42<00:34, 1.24s/it] | |
| 1|748|Loss: 5.069612979888916: 96%|█████████▋| 748/776 [15:42<00:34, 1.24s/it] | |
| 1|748|Loss: 5.069612979888916: 97%|█████████▋| 749/776 [15:44<00:34, 1.27s/it] | |
| 1|749|Loss: 5.045153617858887: 97%|█████████▋| 749/776 [15:44<00:34, 1.27s/it] | |
| 1|749|Loss: 5.045153617858887: 97%|█████████▋| 750/776 [15:45<00:31, 1.23s/it] | |
| 1|750|Loss: 5.187655925750732: 97%|█████████▋| 750/776 [15:45<00:31, 1.23s/it] | |
| 1|750|Loss: 5.187655925750732: 97%|█████████▋| 751/776 [15:46<00:30, 1.20s/it] | |
| 1|751|Loss: 5.17631721496582: 97%|█████████▋| 751/776 [15:46<00:30, 1.20s/it] | |
| 1|751|Loss: 5.17631721496582: 97%|█████████▋| 752/776 [15:47<00:28, 1.20s/it] | |
| 1|752|Loss: 5.991663455963135: 97%|█████████▋| 752/776 [15:47<00:28, 1.20s/it] | |
| 1|752|Loss: 5.991663455963135: 97%|█████████▋| 753/776 [15:48<00:27, 1.18s/it] | |
| 1|753|Loss: 5.2432966232299805: 97%|█████████▋| 753/776 [15:48<00:27, 1.18s/it] | |
| 1|753|Loss: 5.2432966232299805: 97%|█████████▋| 754/776 [15:49<00:25, 1.16s/it] | |
| 1|754|Loss: 5.230746269226074: 97%|█████████▋| 754/776 [15:49<00:25, 1.16s/it] | |
| 1|754|Loss: 5.230746269226074: 97%|█████████▋| 755/776 [15:51<00:24, 1.15s/it] | |
| 1|755|Loss: 5.189696311950684: 97%|█████████▋| 755/776 [15:51<00:24, 1.15s/it] | |
| 1|755|Loss: 5.189696311950684: 97%|█████████▋| 756/776 [15:52<00:22, 1.14s/it] | |
| 1|756|Loss: 5.238321781158447: 97%|█████████▋| 756/776 [15:52<00:22, 1.14s/it] | |
| 1|756|Loss: 5.238321781158447: 98%|█████████▊| 757/776 [15:53<00:21, 1.14s/it] | |
| 1|757|Loss: 5.0649943351745605: 98%|█████████▊| 757/776 [15:53<00:21, 1.14s/it] | |
| 1|757|Loss: 5.0649943351745605: 98%|█████████▊| 758/776 [15:54<00:21, 1.20s/it] | |
| 1|758|Loss: 5.163542747497559: 98%|█████████▊| 758/776 [15:54<00:21, 1.20s/it] | |
| 1|758|Loss: 5.163542747497559: 98%|█████████▊| 759/776 [15:55<00:19, 1.17s/it] | |
| 1|759|Loss: 5.369495391845703: 98%|█████████▊| 759/776 [15:55<00:19, 1.17s/it] | |
| 1|759|Loss: 5.369495391845703: 98%|█████████▊| 760/776 [15:57<00:19, 1.24s/it] | |
| 1|760|Loss: 5.2620744705200195: 98%|█████████▊| 760/776 [15:57<00:19, 1.24s/it] | |
| 1|760|Loss: 5.2620744705200195: 98%|█████████▊| 761/776 [15:58<00:18, 1.26s/it] | |
| 1|761|Loss: 5.13279390335083: 98%|█████████▊| 761/776 [15:58<00:18, 1.26s/it] | |
| 1|761|Loss: 5.13279390335083: 98%|█████████▊| 762/776 [15:59<00:17, 1.23s/it] | |
| 1|762|Loss: 5.231086254119873: 98%|█████████▊| 762/776 [15:59<00:17, 1.23s/it] | |
| 1|762|Loss: 5.231086254119873: 98%|█████████▊| 763/776 [16:00<00:15, 1.20s/it] | |
| 1|763|Loss: 5.282261371612549: 98%|█████████▊| 763/776 [16:00<00:15, 1.20s/it] | |
| 1|763|Loss: 5.282261371612549: 98%|█████████▊| 764/776 [16:01<00:14, 1.18s/it] | |
| 1|764|Loss: 5.3079047203063965: 98%|█████████▊| 764/776 [16:01<00:14, 1.18s/it] | |
| 1|764|Loss: 5.3079047203063965: 99%|█████████▊| 765/776 [16:02<00:12, 1.17s/it] | |
| 1|765|Loss: 5.212764739990234: 99%|█████████▊| 765/776 [16:02<00:12, 1.17s/it] | |
| 1|765|Loss: 5.212764739990234: 99%|█████████▊| 766/776 [16:04<00:11, 1.15s/it] | |
| 1|766|Loss: 5.329611301422119: 99%|█████████▊| 766/776 [16:04<00:11, 1.15s/it] | |
| 1|766|Loss: 5.329611301422119: 99%|█████████▉| 767/776 [16:05<00:10, 1.14s/it] | |
| 1|767|Loss: 5.385195732116699: 99%|█████████▉| 767/776 [16:05<00:10, 1.14s/it] | |
| 1|767|Loss: 5.385195732116699: 99%|█████████▉| 768/776 [16:06<00:09, 1.14s/it] | |
| 1|768|Loss: 5.156403064727783: 99%|█████████▉| 768/776 [16:06<00:09, 1.14s/it] | |
| 1|768|Loss: 5.156403064727783: 99%|█████████▉| 769/776 [16:07<00:07, 1.14s/it] | |
| 1|769|Loss: 4.608053684234619: 99%|█████████▉| 769/776 [16:07<00:07, 1.14s/it] | |
| 1|769|Loss: 4.608053684234619: 99%|█████████▉| 770/776 [16:08<00:07, 1.21s/it] | |
| 1|770|Loss: 5.120639324188232: 99%|█████████▉| 770/776 [16:08<00:07, 1.21s/it] | |
| 1|770|Loss: 5.120639324188232: 99%|█████████▉| 771/776 [16:10<00:06, 1.26s/it] | |
| 1|771|Loss: 4.95815896987915: 99%|█████████▉| 771/776 [16:10<00:06, 1.26s/it] | |
| 1|771|Loss: 4.95815896987915: 99%|█████████▉| 772/776 [16:11<00:04, 1.22s/it] | |
| 1|772|Loss: 5.099244117736816: 99%|█████████▉| 772/776 [16:11<00:04, 1.22s/it] | |
| 1|772|Loss: 5.099244117736816: 100%|█████████▉| 773/776 [16:12<00:03, 1.26s/it] | |
| 1|773|Loss: 5.313309192657471: 100%|█████████▉| 773/776 [16:12<00:03, 1.26s/it] | |
| 1|773|Loss: 5.313309192657471: 100%|█████████▉| 774/776 [16:13<00:02, 1.22s/it] | |
| 1|774|Loss: 5.418424606323242: 100%|█████████▉| 774/776 [16:13<00:02, 1.22s/it] | |
| 1|774|Loss: 5.418424606323242: 100%|█████████▉| 775/776 [16:14<00:01, 1.19s/it] | |
| 1|775|Loss: 5.0594892501831055: 100%|█████████▉| 775/776 [16:14<00:01, 1.19s/it] | |
| 1|775|Loss: 5.0594892501831055: 100%|██████████| 776/776 [16:16<00:00, 1.17s/it] | |
| 1|776|Loss: 5.039679527282715: 100%|██████████| 776/776 [16:16<00:00, 1.17s/it] INFO:torchtune.utils._logging:Saving checkpoint. This may take some time. Retrieving full model state dict... | |
| INFO:torchtune.utils._logging:Getting full model state dict took 72.64 secs | |
| INFO:torchtune.utils._logging:Model checkpoint of size 4.27 GiB saved to /home/brian/model/Llama3.3-70B-fft-output/epoch_0/model-00001-of-00030.safetensors | |
| INFO:torchtune.utils._logging:Model checkpoint of size 4.34 GiB saved to /home/brian/model/Llama3.3-70B-fft-output/epoch_0/model-00002-of-00030.safetensors | |
| INFO:torchtune.utils._logging:Model checkpoint of size 4.66 GiB saved to /home/brian/model/Llama3.3-70B-fft-output/epoch_0/model-00003-of-00030.safetensors | |
| INFO:torchtune.utils._logging:Model checkpoint of size 4.63 GiB saved to /home/brian/model/Llama3.3-70B-fft-output/epoch_0/model-00004-of-00030.safetensors | |
| INFO:torchtune.utils._logging:Model checkpoint of size 4.34 GiB saved to /home/brian/model/Llama3.3-70B-fft-output/epoch_0/model-00005-of-00030.safetensors | |
| INFO:torchtune.utils._logging:Model checkpoint of size 4.34 GiB saved to /home/brian/model/Llama3.3-70B-fft-output/epoch_0/model-00006-of-00030.safetensors | |
| INFO:torchtune.utils._logging:Model checkpoint of size 4.34 GiB saved to /home/brian/model/Llama3.3-70B-fft-output/epoch_0/model-00007-of-00030.safetensors | |
| INFO:torchtune.utils._logging:Model checkpoint of size 4.66 GiB saved to /home/brian/model/Llama3.3-70B-fft-output/epoch_0/model-00008-of-00030.safetensors | |
| INFO:torchtune.utils._logging:Model checkpoint of size 4.63 GiB saved to /home/brian/model/Llama3.3-70B-fft-output/epoch_0/model-00009-of-00030.safetensors | |
| INFO:torchtune.utils._logging:Model checkpoint of size 4.34 GiB saved to /home/brian/model/Llama3.3-70B-fft-output/epoch_0/model-00010-of-00030.safetensors | |
| INFO:torchtune.utils._logging:Model checkpoint of size 4.34 GiB saved to /home/brian/model/Llama3.3-70B-fft-output/epoch_0/model-00011-of-00030.safetensors | |
| INFO:torchtune.utils._logging:Model checkpoint of size 4.34 GiB saved to /home/brian/model/Llama3.3-70B-fft-output/epoch_0/model-00012-of-00030.safetensors | |
| INFO:torchtune.utils._logging:Model checkpoint of size 4.66 GiB saved to /home/brian/model/Llama3.3-70B-fft-output/epoch_0/model-00013-of-00030.safetensors | |
| INFO:torchtune.utils._logging:Model checkpoint of size 4.63 GiB saved to /home/brian/model/Llama3.3-70B-fft-output/epoch_0/model-00014-of-00030.safetensors | |
| INFO:torchtune.utils._logging:Model checkpoint of size 4.34 GiB saved to /home/brian/model/Llama3.3-70B-fft-output/epoch_0/model-00015-of-00030.safetensors | |
| INFO:torchtune.utils._logging:Model checkpoint of size 4.34 GiB saved to /home/brian/model/Llama3.3-70B-fft-output/epoch_0/model-00016-of-00030.safetensors | |
| INFO:torchtune.utils._logging:Model checkpoint of size 4.34 GiB saved to /home/brian/model/Llama3.3-70B-fft-output/epoch_0/model-00017-of-00030.safetensors | |
| INFO:torchtune.utils._logging:Model checkpoint of size 4.66 GiB saved to /home/brian/model/Llama3.3-70B-fft-output/epoch_0/model-00018-of-00030.safetensors | |
| INFO:torchtune.utils._logging:Model checkpoint of size 4.63 GiB saved to /home/brian/model/Llama3.3-70B-fft-output/epoch_0/model-00019-of-00030.safetensors | |
| INFO:torchtune.utils._logging:Model checkpoint of size 4.34 GiB saved to /home/brian/model/Llama3.3-70B-fft-output/epoch_0/model-00020-of-00030.safetensors | |
| INFO:torchtune.utils._logging:Model checkpoint of size 4.34 GiB saved to /home/brian/model/Llama3.3-70B-fft-output/epoch_0/model-00021-of-00030.safetensors | |
| INFO:torchtune.utils._logging:Model checkpoint of size 4.34 GiB saved to /home/brian/model/Llama3.3-70B-fft-output/epoch_0/model-00022-of-00030.safetensors | |
| INFO:torchtune.utils._logging:Model checkpoint of size 4.66 GiB saved to /home/brian/model/Llama3.3-70B-fft-output/epoch_0/model-00023-of-00030.safetensors | |
| INFO:torchtune.utils._logging:Model checkpoint of size 4.63 GiB saved to /home/brian/model/Llama3.3-70B-fft-output/epoch_0/model-00024-of-00030.safetensors | |
| INFO:torchtune.utils._logging:Model checkpoint of size 4.34 GiB saved to /home/brian/model/Llama3.3-70B-fft-output/epoch_0/model-00025-of-00030.safetensors | |
| INFO:torchtune.utils._logging:Model checkpoint of size 4.34 GiB saved to /home/brian/model/Llama3.3-70B-fft-output/epoch_0/model-00026-of-00030.safetensors | |
| INFO:torchtune.utils._logging:Model checkpoint of size 4.34 GiB saved to /home/brian/model/Llama3.3-70B-fft-output/epoch_0/model-00027-of-00030.safetensors | |
| INFO:torchtune.utils._logging:Model checkpoint of size 4.66 GiB saved to /home/brian/model/Llama3.3-70B-fft-output/epoch_0/model-00028-of-00030.safetensors | |
| INFO:torchtune.utils._logging:Model checkpoint of size 4.63 GiB saved to /home/brian/model/Llama3.3-70B-fft-output/epoch_0/model-00029-of-00030.safetensors | |
| INFO:torchtune.utils._logging:Model checkpoint of size 1.96 GiB saved to /home/brian/model/Llama3.3-70B-fft-output/epoch_0/model-00030-of-00030.safetensors | |
| INFO:torchtune.utils._logging:Saving final epoch checkpoint. | |
| INFO:torchtune.utils._logging:The full model checkpoint, including all weights and configurations, has been saved successfully.You can now use this checkpoint for further training or inference. | |
| INFO:torchtune.utils._logging:Saving checkpoint took 336.23 secs | |
| worker-0:544573:545855 [2] NCCL INFO [Service thread] Connection closed by localRank 2 | |
| worker-0:544575:545857 [4] NCCL INFO [Service thread] Connection closed by localRank 4 | |
| worker-0:544572:545853 [1] NCCL INFO [Service thread] Connection closed by localRank 1 | |
| worker-0:544574:545845 [3] NCCL INFO [Service thread] Connection closed by localRank 3 | |
| worker-0:544577:545851 [6] NCCL INFO [Service thread] Connection closed by localRank 6 | |
| worker-0:544576:545847 [5] NCCL INFO [Service thread] Connection closed by localRank 5 | |
| worker-0:544578:545859 [7] NCCL INFO [Service thread] Connection closed by localRank 7 | |
| worker-1:544439:545736 [0] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-1:544441:545635 [2] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-1:544443:545633 [4] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-1:544445:545650 [6] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-1:544440:545654 [1] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-1:544444:545639 [5] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-1:544446:545648 [7] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-1:544442:545644 [3] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-1:544446:560052 [7] NCCL INFO comm 0x561e314b9bd0 rank 1 nranks 2 cudaDev 7 busId b7000 - Abort COMPLETE | |
| worker-1:544446:545742 [7] NCCL INFO [Service thread] Connection closed by localRank 7 | |
| worker-1:544445:560050 [6] NCCL INFO comm 0x558c66abf370 rank 1 nranks 2 cudaDev 6 busId b3000 - Abort COMPLETE | |
| worker-1:544445:545732 [6] NCCL INFO [Service thread] Connection closed by localRank 6 | |
| worker-1:544443:560049 [4] NCCL INFO comm 0x5645eba77300 rank 1 nranks 2 cudaDev 4 busId ab000 - Abort COMPLETE | |
| worker-1:544443:545740 [4] NCCL INFO [Service thread] Connection closed by localRank 4 | |
| worker-1:544440:560046 [1] NCCL INFO comm 0x55a19b08e290 rank 1 nranks 2 cudaDev 1 busId 91000 - Abort COMPLETE | |
| worker-1:544442:560051 [3] NCCL INFO comm 0x55c0f43244b0 rank 1 nranks 2 cudaDev 3 busId 99000 - Abort COMPLETE | |
| worker-1:544440:545730 [1] NCCL INFO [Service thread] Connection closed by localRank 1 | |
| worker-1:544442:545734 [3] NCCL INFO [Service thread] Connection closed by localRank 3 | |
| worker-1:544444:560048 [5] NCCL INFO comm 0x55ca5f9cd020 rank 1 nranks 2 cudaDev 5 busId af000 - Abort COMPLETE | |
| worker-1:544444:545738 [5] NCCL INFO [Service thread] Connection closed by localRank 5 | |
| worker-1:544441:560047 [2] NCCL INFO comm 0x56101a4956f0 rank 1 nranks 2 cudaDev 2 busId 95000 - Abort COMPLETE | |
| worker-1:544441:545744 [2] NCCL INFO [Service thread] Connection closed by localRank 2 | |
| worker-0:544573:561554 [2] NCCL INFO comm 0x5640ba357a80 rank 2 nranks 8 cudaDev 2 busId 95000 - Abort COMPLETE | |
| worker-0:544573:545752 [2] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-0:544572:561553 [1] NCCL INFO comm 0x556975ec4a10 rank 1 nranks 8 cudaDev 1 busId 91000 - Abort COMPLETE | |
| worker-0:544572:545768 [1] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-0:544578:561559 [7] NCCL INFO comm 0x55a99bcd4aa0 rank 7 nranks 8 cudaDev 7 busId b7000 - Abort COMPLETE | |
| worker-0:544578:545760 [7] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-0:544575:561558 [4] NCCL INFO comm 0x5571647ec3b0 rank 4 nranks 8 cudaDev 4 busId ab000 - Abort COMPLETE | |
| worker-0:544575:545750 [4] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-0:544574:561555 [3] NCCL INFO comm 0x555924c564e0 rank 3 nranks 8 cudaDev 3 busId 99000 - Abort COMPLETE | |
| worker-0:544574:545758 [3] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-0:544576:561556 [5] NCCL INFO comm 0x55c6aa5c6b00 rank 5 nranks 8 cudaDev 5 busId af000 - Abort COMPLETE | |
| worker-0:544576:545754 [5] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-0:544577:561557 [6] NCCL INFO comm 0x559a9b5d95f0 rank 6 nranks 8 cudaDev 6 busId b3000 - Abort COMPLETE | |
| worker-0:544577:545762 [6] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-0:544573:561569 [2] NCCL INFO comm 0x5640ba340ff0 rank 0 nranks 2 cudaDev 2 busId 95000 - Abort COMPLETE | |
| worker-0:544573:545509 [2] NCCL INFO [Service thread] Connection closed by localRank 2 | |
| worker-0:544574:545510 [3] NCCL INFO [Service thread] Connection closed by localRank 2 | |
| worker-0:544572:545514 [1] NCCL INFO [Service thread] Connection closed by localRank 2 | |
| worker-0:544571:545513 [0] NCCL INFO [Service thread] Connection closed by localRank 2 | |
| worker-0:544578:545501 [7] NCCL INFO [Service thread] Connection closed by localRank 2 | |
| worker-0:544577:545502 [6] NCCL INFO [Service thread] Connection closed by localRank 2 | |
| worker-0:544576:545503 [5] NCCL INFO [Service thread] Connection closed by localRank 2 | |
| worker-0:544575:545507 [4] NCCL INFO [Service thread] Connection closed by localRank 2 | |
| worker-0:544572:561570 [1] NCCL INFO comm 0x556975eae0b0 rank 0 nranks 2 cudaDev 1 busId 91000 - Abort COMPLETE | |
| worker-0:544571:545513 [0] NCCL INFO [Service thread] Connection closed by localRank 1 | |
| worker-0:544572:545514 [1] NCCL INFO [Service thread] Connection closed by localRank 1 | |
| worker-0:544573:545509 [2] NCCL INFO [Service thread] Connection closed by localRank 1 | |
| worker-0:544574:545510 [3] NCCL INFO [Service thread] Connection closed by localRank 1 | |
| worker-0:544575:545507 [4] NCCL INFO [Service thread] Connection closed by localRank 1 | |
| worker-0:544576:545503 [5] NCCL INFO [Service thread] Connection closed by localRank 1 | |
| worker-0:544577:545502 [6] NCCL INFO [Service thread] Connection closed by localRank 1 | |
| worker-0:544578:545501 [7] NCCL INFO [Service thread] Connection closed by localRank 1 | |
| worker-0:544578:561580 [7] NCCL INFO comm 0x55a99bcbe170 rank 0 nranks 2 cudaDev 7 busId b7000 - Abort COMPLETE | |
| worker-0:544571:545513 [0] NCCL INFO [Service thread] Connection closed by localRank 7 | |
| worker-0:544572:545514 [1] NCCL INFO [Service thread] Connection closed by localRank 7 | |
| worker-0:544573:545509 [2] NCCL INFO [Service thread] Connection closed by localRank 7 | |
| worker-0:544574:545510 [3] NCCL INFO [Service thread] Connection closed by localRank 7 | |
| worker-0:544575:545507 [4] NCCL INFO [Service thread] Connection closed by localRank 7 | |
| worker-0:544576:545503 [5] NCCL INFO [Service thread] Connection closed by localRank 7 | |
| worker-0:544578:545501 [7] NCCL INFO [Service thread] Connection closed by localRank 7 | |
| worker-0:544577:545502 [6] NCCL INFO [Service thread] Connection closed by localRank 7 | |
| worker-0:544575:561581 [4] NCCL INFO comm 0x5571647d5ab0 rank 0 nranks 2 cudaDev 4 busId ab000 - Abort COMPLETE | |
| worker-0:544571:545513 [0] NCCL INFO [Service thread] Connection closed by localRank 4 | |
| worker-0:544572:545514 [1] NCCL INFO [Service thread] Connection closed by localRank 4 | |
| worker-0:544573:545509 [2] NCCL INFO [Service thread] Connection closed by localRank 4 | |
| worker-0:544574:545510 [3] NCCL INFO [Service thread] Connection closed by localRank 4 | |
| worker-0:544575:545507 [4] NCCL INFO [Service thread] Connection closed by localRank 4 | |
| worker-0:544576:545503 [5] NCCL INFO [Service thread] Connection closed by localRank 4 | |
| worker-0:544577:545502 [6] NCCL INFO [Service thread] Connection closed by localRank 4 | |
| worker-0:544578:545501 [7] NCCL INFO [Service thread] Connection closed by localRank 4 | |
| worker-0:544574:561582 [3] NCCL INFO comm 0x555924c3fb10 rank 0 nranks 2 cudaDev 3 busId 99000 - Abort COMPLETE | |
| worker-0:544571:545513 [0] NCCL INFO [Service thread] Connection closed by localRank 3 | |
| worker-0:544572:545514 [1] NCCL INFO [Service thread] Connection closed by localRank 3 | |
| worker-0:544573:545509 [2] NCCL INFO [Service thread] Connection closed by localRank 3 | |
| worker-0:544574:545510 [3] NCCL INFO [Service thread] Connection closed by localRank 3 | |
| worker-0:544575:545507 [4] NCCL INFO [Service thread] Connection closed by localRank 3 | |
| worker-0:544576:545503 [5] NCCL INFO [Service thread] Connection closed by localRank 3 | |
| worker-0:544577:545502 [6] NCCL INFO [Service thread] Connection closed by localRank 3 | |
| worker-0:544578:545501 [7] NCCL INFO [Service thread] Connection closed by localRank 3 | |
| worker-0:544576:561583 [5] NCCL INFO comm 0x55c6aa5b0230 rank 0 nranks 2 cudaDev 5 busId af000 - Abort COMPLETE | |
| worker-0:544571:545513 [0] NCCL INFO [Service thread] Connection closed by localRank 5 | |
| worker-0:544572:545514 [1] NCCL INFO [Service thread] Connection closed by localRank 5 | |
| worker-0:544573:545509 [2] NCCL INFO [Service thread] Connection closed by localRank 5 | |
| worker-0:544574:545510 [3] NCCL INFO [Service thread] Connection closed by localRank 5 | |
| worker-0:544575:545507 [4] NCCL INFO [Service thread] Connection closed by localRank 5 | |
| worker-0:544576:545503 [5] NCCL INFO [Service thread] Connection closed by localRank 5 | |
| worker-0:544577:545502 [6] NCCL INFO [Service thread] Connection closed by localRank 5 | |
| worker-0:544578:545501 [7] NCCL INFO [Service thread] Connection closed by localRank 5 | |
| worker-1:544446:560053 [7] NCCL INFO comm 0x561e314d06d0 rank 7 nranks 8 cudaDev 7 busId b7000 - Abort COMPLETE | |
| worker-1:544440:545412 [1] NCCL INFO [Service thread] Connection closed by localRank 7 | |
| worker-1:544439:545411 [0] NCCL INFO [Service thread] Connection closed by localRank 7 | |
| worker-1:544444:545419 [5] NCCL INFO [Service thread] Connection closed by localRank 7 | |
| worker-1:544441:545409 [2] NCCL INFO [Service thread] Connection closed by localRank 7 | |
| worker-1:544442:545410 [3] NCCL INFO [Service thread] Connection closed by localRank 7 | |
| worker-1:544443:545423 [4] NCCL INFO [Service thread] Connection closed by localRank 7 | |
| worker-1:544445:545421 [6] NCCL INFO [Service thread] Connection closed by localRank 7 | |
| worker-1:544446:545417 [7] NCCL INFO [Service thread] Connection closed by localRank 7 | |
| worker-0:544577:561584 [6] NCCL INFO comm 0x559a9b5c2d10 rank 0 nranks 2 cudaDev 6 busId b3000 - Abort COMPLETE | |
| worker-0:544571:545513 [0] NCCL INFO [Service thread] Connection closed by localRank 6 | |
| worker-0:544572:545514 [1] NCCL INFO [Service thread] Connection closed by localRank 6 | |
| worker-0:544573:545509 [2] NCCL INFO [Service thread] Connection closed by localRank 6 | |
| worker-0:544574:545510 [3] NCCL INFO [Service thread] Connection closed by localRank 6 | |
| worker-0:544575:545507 [4] NCCL INFO [Service thread] Connection closed by localRank 6 | |
| worker-0:544576:545503 [5] NCCL INFO [Service thread] Connection closed by localRank 6 | |
| worker-0:544577:545502 [6] NCCL INFO [Service thread] Connection closed by localRank 6 | |
| worker-0:544578:545501 [7] NCCL INFO [Service thread] Connection closed by localRank 6 | |
| worker-1:544445:560054 [6] NCCL INFO comm 0x558c66ad5e60 rank 6 nranks 8 cudaDev 6 busId b3000 - Abort COMPLETE | |
| worker-1:544440:545412 [1] NCCL INFO [Service thread] Connection closed by localRank 6 | |
| worker-1:544439:545411 [0] NCCL INFO [Service thread] Connection closed by localRank 6 | |
| worker-1:544443:545423 [4] NCCL INFO [Service thread] Connection closed by localRank 6 | |
| worker-1:544444:545419 [5] NCCL INFO [Service thread] Connection closed by localRank 6 | |
| worker-1:544442:545410 [3] NCCL INFO [Service thread] Connection closed by localRank 6 | |
| worker-1:544441:545409 [2] NCCL INFO [Service thread] Connection closed by localRank 6 | |
| worker-1:544445:545421 [6] NCCL INFO [Service thread] Connection closed by localRank 6 | |
| worker-1:544446:545417 [7] NCCL INFO [Service thread] Connection closed by localRank 6 | |
| worker-1:544440:560056 [1] NCCL INFO comm 0x55a19b0a4e30 rank 1 nranks 8 cudaDev 1 busId 91000 - Abort COMPLETE | |
| worker-1:544439:545411 [0] NCCL INFO [Service thread] Connection closed by localRank 1 | |
| worker-1:544440:545412 [1] NCCL INFO [Service thread] Connection closed by localRank 1 | |
| worker-1:544441:545409 [2] NCCL INFO [Service thread] Connection closed by localRank 1 | |
| worker-1:544442:545410 [3] NCCL INFO [Service thread] Connection closed by localRank 1 | |
| worker-1:544443:545423 [4] NCCL INFO [Service thread] Connection closed by localRank 1 | |
| worker-1:544444:545419 [5] NCCL INFO [Service thread] Connection closed by localRank 1 | |
| worker-1:544445:545421 [6] NCCL INFO [Service thread] Connection closed by localRank 1 | |
| worker-1:544446:545417 [7] NCCL INFO [Service thread] Connection closed by localRank 1 | |
| worker-1:544444:560058 [5] NCCL INFO comm 0x55ca5f9e3ac0 rank 5 nranks 8 cudaDev 5 busId af000 - Abort COMPLETE | |
| worker-1:544439:545411 [0] NCCL INFO [Service thread] Connection closed by localRank 5 | |
| worker-1:544440:545412 [1] NCCL INFO [Service thread] Connection closed by localRank 5 | |
| worker-1:544444:545419 [5] NCCL INFO [Service thread] Connection closed by localRank 5 | |
| worker-1:544442:545410 [3] NCCL INFO [Service thread] Connection closed by localRank 5 | |
| worker-1:544441:545409 [2] NCCL INFO [Service thread] Connection closed by localRank 5 | |
| worker-1:544443:545423 [4] NCCL INFO [Service thread] Connection closed by localRank 5 | |
| worker-1:544446:545417 [7] NCCL INFO [Service thread] Connection closed by localRank 5 | |
| worker-1:544445:545421 [6] NCCL INFO [Service thread] Connection closed by localRank 5 | |
| worker-1:544442:560057 [3] NCCL INFO comm 0x55c0f433aef0 rank 3 nranks 8 cudaDev 3 busId 99000 - Abort COMPLETE | |
| worker-1:544439:545411 [0] NCCL INFO [Service thread] Connection closed by localRank 3 | |
| worker-1:544440:545412 [1] NCCL INFO [Service thread] Connection closed by localRank 3 | |
| worker-1:544441:545409 [2] NCCL INFO [Service thread] Connection closed by localRank 3 | |
| worker-1:544442:545410 [3] NCCL INFO [Service thread] Connection closed by localRank 3 | |
| worker-1:544443:545423 [4] NCCL INFO [Service thread] Connection closed by localRank 3 | |
| worker-1:544444:545419 [5] NCCL INFO [Service thread] Connection closed by localRank 3 | |
| worker-1:544445:545421 [6] NCCL INFO [Service thread] Connection closed by localRank 3 | |
| worker-1:544446:545417 [7] NCCL INFO [Service thread] Connection closed by localRank 3 | |
| worker-1:544441:560059 [2] NCCL INFO comm 0x56101a4ac2a0 rank 2 nranks 8 cudaDev 2 busId 95000 - Abort COMPLETE | |
| worker-1:544439:545411 [0] NCCL INFO [Service thread] Connection closed by localRank 2 | |
| worker-1:544443:560055 [4] NCCL INFO comm 0x5645eba8dd80 rank 4 nranks 8 cudaDev 4 busId ab000 - Abort COMPLETE | |
| worker-1:544440:545412 [1] NCCL INFO [Service thread] Connection closed by localRank 2 | |
| worker-1:544441:545409 [2] NCCL INFO [Service thread] Connection closed by localRank 2 | |
| worker-1:544442:545410 [3] NCCL INFO [Service thread] Connection closed by localRank 2 | |
| worker-1:544443:545423 [4] NCCL INFO [Service thread] Connection closed by localRank 2 | |
| worker-1:544444:545419 [5] NCCL INFO [Service thread] Connection closed by localRank 2 | |
| worker-1:544445:545421 [6] NCCL INFO [Service thread] Connection closed by localRank 2 | |
| worker-1:544446:545417 [7] NCCL INFO [Service thread] Connection closed by localRank 2 | |
| worker-1:544439:545411 [0] NCCL INFO [Service thread] Connection closed by localRank 4 | |
| worker-1:544440:545412 [1] NCCL INFO [Service thread] Connection closed by localRank 4 | |
| worker-1:544441:545409 [2] NCCL INFO [Service thread] Connection closed by localRank 4 | |
| worker-1:544442:545410 [3] NCCL INFO [Service thread] Connection closed by localRank 4 | |
| worker-1:544443:545423 [4] NCCL INFO [Service thread] Connection closed by localRank 4 | |
| worker-1:544444:545419 [5] NCCL INFO [Service thread] Connection closed by localRank 4 | |
| worker-1:544445:545421 [6] NCCL INFO [Service thread] Connection closed by localRank 4 | |
| worker-1:544446:545417 [7] NCCL INFO [Service thread] Connection closed by localRank 4 | |
| worker-1:544439:560045 [0] NCCL INFO comm 0x55d1fd231e80 rank 0 nranks 8 cudaDev 0 busId 8d000 - Abort COMPLETE | |
| worker-1:544439:545642 [0] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| 1|776|Loss: 5.039679527282715: 100%|██████████| 776/776 [21:55<00:00, 1.69s/it] | |
| worker-0:544571:545849 [0] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-1:544439:560082 [0] NCCL INFO comm 0x55d1fd21b220 rank 1 nranks 2 cudaDev 0 busId 8d000 - Abort COMPLETE | |
| worker-1:544440:545412 [1] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-1:544439:545411 [0] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-1:544441:545409 [2] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-1:544443:545423 [4] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-1:544442:545410 [3] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-1:544445:545421 [6] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-1:544444:545419 [5] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-1:544446:545417 [7] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-0:544571:561601 [0] NCCL INFO comm 0x55ec66bf52e0 rank 0 nranks 8 cudaDev 0 busId 8d000 - Abort COMPLETE | |
| worker-0:544571:545756 [0] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-1:544446:560075 [7] NCCL INFO comm 0x561dd66d3a80 rank 15 nranks 16 cudaDev 7 busId b7000 - Abort COMPLETE | |
| worker-1:544443:560081 [4] NCCL INFO comm 0x5645e5c93110 rank 12 nranks 16 cudaDev 4 busId ab000 - Abort COMPLETE | |
| worker-1:544445:560076 [6] NCCL INFO comm 0x558c0bcb8dd0 rank 14 nranks 16 cudaDev 6 busId b3000 - Abort COMPLETE | |
| worker-0:544571:561610 [0] NCCL INFO comm 0x55ec66bde920 rank 0 nranks 2 cudaDev 0 busId 8d000 - Abort COMPLETE | |
| worker-0:544572:545514 [1] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-0:544571:545513 [0] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-0:544573:545509 [2] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-0:544574:545510 [3] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-0:544576:545503 [5] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-0:544577:545502 [6] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-0:544575:545507 [4] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-0:544578:545501 [7] NCCL INFO [Service thread] Connection closed by localRank 0 | |
| worker-1:544442:560079 [3] NCCL INFO comm 0x55c0f3540b00 rank 11 nranks 16 cudaDev 3 busId 99000 - Abort COMPLETE | |
| worker-1:544444:560078 [5] NCCL INFO comm 0x55ca04bc7280 rank 13 nranks 16 cudaDev 5 busId af000 - Abort COMPLETE | |
| worker-1:544440:560077 [1] NCCL INFO comm 0x55a1402a7ca0 rank 9 nranks 16 cudaDev 1 busId 91000 - Abort COMPLETE | |
| worker-1:544441:560080 [2] NCCL INFO comm 0x560fbf6ae730 rank 10 nranks 16 cudaDev 2 busId 95000 - Abort COMPLETE | |
| worker-1:544439:560091 [0] NCCL INFO comm 0x55d1a2434c10 rank 8 nranks 16 cudaDev 0 busId 8d000 - Abort COMPLETE | |
| worker-0:544577:561600 [6] NCCL INFO comm 0x559a407dbca0 rank 6 nranks 16 cudaDev 6 busId b3000 - Abort COMPLETE | |
| worker-0:544574:561589 [3] NCCL INFO comm 0x5558c9e593a0 rank 3 nranks 16 cudaDev 3 busId 99000 - Abort COMPLETE | |
| worker-0:544572:561586 [1] NCCL INFO comm 0x55691b0c7fb0 rank 1 nranks 16 cudaDev 1 busId 91000 - Abort COMPLETE | |
| worker-0:544575:561588 [4] NCCL INFO comm 0x5571099efd80 rank 4 nranks 16 cudaDev 4 busId ab000 - Abort COMPLETE | |
| worker-0:544578:561587 [7] NCCL INFO comm 0x55a940ed73e0 rank 7 nranks 16 cudaDev 7 busId b7000 - Abort COMPLETE | |
| worker-0:544576:561590 [5] NCCL INFO comm 0x55c64f7ca4d0 rank 5 nranks 16 cudaDev 5 busId af000 - Abort COMPLETE | |
| worker-0:544573:561585 [2] NCCL INFO comm 0x56405f55b420 rank 2 nranks 16 cudaDev 2 busId 95000 - Abort COMPLETE | |
| worker-0:544571:561611 [0] NCCL INFO comm 0x55ec0bcf2090 rank 0 nranks 16 cudaDev 0 busId 8d000 - Abort COMPLETE | |
| I0410 12:18:49.675000 544254 torch/distributed/elastic/agent/server/api.py:864] [default] worker group successfully finished. Waiting 300 seconds for other agents to finish. | |
| I0410 12:18:49.677000 544254 torch/distributed/elastic/agent/server/api.py:917] Local worker group finished (WorkerState.SUCCEEDED). Waiting 300 seconds for other agents to finish | |
| I0410 12:18:50.314000 544220 torch/distributed/elastic/agent/server/api.py:864] [default] worker group successfully finished. Waiting 300 seconds for other agents to finish. | |
| I0410 12:18:50.316000 544220 torch/distributed/elastic/agent/server/api.py:917] Local worker group finished (WorkerState.SUCCEEDED). Waiting 300 seconds for other agents to finish | |
| I0410 12:18:50.318000 544254 torch/distributed/elastic/agent/server/api.py:931] Done waiting for other agents. Elapsed: 0.640521764755249 seconds | |
| I0410 12:18:50.317000 544220 torch/distributed/elastic/agent/server/api.py:931] Done waiting for other agents. Elapsed: 0.0012269020080566406 seconds | |
| Running with torchrun... | |
| Running with torchrun... |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment