(partial) EasyBuild log for failed build of /apps/c3se-easyconfigs/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb
6.34s call unit/utils/test_init_on_device.py::TestOnDevice::test_on_device[meta]
6.34s call unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_inputs2_outputs1[mask0]
6.32s call unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py::TestCheckpointNonTensorWithGrad::test_ckpt_non_tensor_output[non_tensor4]
6.32s call unit/runtime/zero/test_zero_context.py::TestSerialContext::test_throughput_calculation
6.31s call unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestCheckpointNonTensorOutputOrdering::test_ckpt_non_tensor_output_ordering[non_tensor_output1]
6.30s call unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[log-19]
6.29s call unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py::TestActivationCheckpointWithoutGrad::test_ckpt_inputs2_outputs2[mask1]
6.27s call unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[linear-15]
6.27s call unit/runtime/test_pld.py::TestPLDModel::test_pld_model[0]
6.26s call unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py::TestCheckpointNonTensorWithoutGrad::test_ckpt_non_tensor_output[non_tensor4]
6.26s call unit/runtime/half_precision/test_fp16.py::TestZeroStaticScale::test[False-1]
6.26s call unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py::TestCheckpointNonTensorWithoutGrad::test_ckpt_non_tensor_input[None]
6.25s call unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py::TestActivationCheckpointWithGrad::test_ckpt_inputs1_outputs1[mask0]
6.25s call unit/runtime/test_ds_config_dict.py::TestBasicConfig::test_accelerator
6.24s call unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestCheckpointNonTensor::test_ckpt_non_tensor_output[True]
6.24s call unit/runtime/half_precision/test_fp16.py::TestZeroAllowUntestedOptimizer::test[False-2]
6.23s call unit/runtime/test_ds_config_dict.py::TestInitNoOptimizer::test
6.23s call unit/comm/test_dist.py::TestWorldSizeOverrideDistTest::test_world_size_1
6.22s call unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py::TestCheckpointNonTensorWithGrad::test_ckpt_non_tensor_output[True]
6.22s call unit/runtime/half_precision/test_fp16.py::TestAdamwFP16Basic::test
6.22s call unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[log-10]
6.22s call unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py::TestCheckpointNonTensorWithoutGrad::test_ckpt_non_tensor_output[True]
6.22s call unit/runtime/half_precision/test_dynamic_loss_scale.py::TestFused::test_all_overflow
6.22s call unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py::TestActivationCheckpointWithGrad::test_ckpt_inputs2_outputs1[mask1]
6.22s call unit/runtime/test_lr_schedulers.py::TestLrRange::test[0.01-0.01-19-False]
6.21s call unit/runtime/zero/test_zero.py::TestZeroUnbalancedGradients::test[2]
6.21s call unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestCheckpointNonTensor::test_ckpt_non_tensor_output[non_tensor3]
6.21s call unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py::TestCheckpointNonTensorWithoutGrad::test_ckpt_non_tensor_output[None]
6.21s call unit/runtime/test_ds_config_dict.py::TestBasicConfig::test_check_version
6.20s call unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestCheckpointNonTensor::test_ckpt_non_tensor_input[True]
6.20s call unit/runtime/test_data.py::TestDataLoaderDropLast::test[1-True]
6.19s call unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py::TestActivationCheckpointWithoutGrad::test_ckpt_inputs2_outputs1[mask1]
6.19s call unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py::TestCheckpointNonTensorWithGrad::test_ckpt_non_tensor_input[non_tensor3]
6.19s call unit/runtime/zero/test_zero.py::TestZeroOffloadOptim::test[False]
6.19s call unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestCheckpointNonTensor::test_ckpt_non_tensor_output[None]
6.19s call unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py::TestActivationCheckpointWithGrad::test_ckpt_inputs2_outputs3[mask0]
6.18s call unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_inputs2_outputs2[mask0]
6.18s call unit/runtime/test_ds_initialize.py::TestClientOptimizer::test[None]
6.18s call unit/runtime/zero/test_zero.py::TestIncorectAllgatherBucketSize::test[1000]
6.18s call unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestCheckpointNonTensorOutputOrdering::test_ckpt_non_tensor_output_ordering[non_tensor_output2]
6.18s call unit/runtime/half_precision/test_bf16.py::TestZeroAllowUntestedOptimizer::test
6.17s call unit/runtime/test_ds_initialize.py::TestClientLrSchedulerInit::test_diff_lrscheler_and_callable[None]
6.17s call unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_inputs1_outputs1[mask0]
6.16s call unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[log-15]
6.16s call unit/runtime/test_lr_schedulers.py::TestWarmupCosineLR::test_lr[200-20-0.1-0.2]
6.16s call unit/runtime/test_ds_initialize.py::TestClientLrSchedulerInit::test_same_lrscheler_and_callable[Callable]
6.15s call unit/runtime/zero/test_zero.py::TestZeroAdamOptimizerStepCount::test[2]
6.15s call unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[log-33]
6.14s call unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py::TestCheckpointNonTensorOutputOrderingWithoutGrad::test_ckpt_non_tensor_output_ordering[None]
6.14s call unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py::TestCheckpointNonTensorOutputOrderingWithoutGrad::test_ckpt_non_tensor_output_ordering[non_tensor_output2]
6.14s call unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype0-True-True]
6.14s call unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py::TestActivationCheckpointWithGrad::test_ckpt_inputs2_outputs3[mask1]
6.14s call unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[Callable-Callable]
6.14s call unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_arg_none[mask0]
6.13s call unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype1-False-True]
6.13s call unit/profiling/flops_profiler/test_flops_profiler.py::TestFlopsProfiler::test
6.12s call unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[log-33]
6.12s call unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[log-19]
6.12s call unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py::TestCheckpointNonTensorWithGrad::test_ckpt_non_tensor_input[2]
6.11s call unit/runtime/half_precision/test_fp16.py::TestZeroAllowUntestedOptimizer::test[False-3]
6.10s call unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[Callable-None]
6.09s call unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py::TestCheckpointNonTensorWithGrad::test_ckpt_non_tensor_input[None]
6.09s call unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py::TestActivationCheckpointWithGrad::test_ckpt_arg_none[mask0]
6.09s call unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py::TestActivationCheckpointWithoutGrad::test_ckpt_arg_none[mask0]
6.08s call unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestCheckpointNonTensorOutputOrdering::test_ckpt_non_tensor_output_ordering[non_tensor_output3]
6.08s call unit/comm/test_dist.py::TestDistInferenceAllReduce::test[dtype1]
6.07s call unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype2-True-True]
6.07s call unit/runtime/zero/test_zero.py::TestZeroOffloadOptim::test[True]
6.06s call unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestCheckpointNonTensor::test_ckpt_non_tensor_input[None]
6.06s call unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_inputs1_outputs1[mask1]
6.06s call unit/runtime/test_ds_config_dict.py::TestConfigLoad::test_dict
6.06s call unit/runtime/test_ds_initialize.py::TestClientOptimizer::test[Callable]
6.06s call unit/runtime/half_precision/test_dynamic_loss_scale.py::TestUnfused::test_all_overflow
6.05s call unit/runtime/half_precision/test_fp16.py::TestZeroAllowUntestedOptimizer::test[True-2]
6.05s call unit/runtime/test_data.py::TestDataLoaderDropLast::test[4-False]
6.04s call unit/runtime/test_lr_schedulers.py::TestLrRange::test[0.0001-1e-05-1-True]
6.04s call unit/runtime/test_ds_initialize.py::TestClientLrSchedulerInit::test_same_lrscheler_and_callable[None]
6.02s call unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py::TestCheckpointNonTensorWithGrad::test_ckpt_non_tensor_output[2]
6.01s call unit/runtime/test_lr_schedulers.py::TestOneCycle::test_mom[0.08-0.09-0.001-101]
6.01s call unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py::TestCheckpointNonTensorWithoutGrad::test_ckpt_non_tensor_output[non_tensor3]
6.00s call unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py::TestCheckpointNonTensorOutputOrderingWithGrad::test_ckpt_non_tensor_output_ordering[None]
6.00s call unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestCheckpointNonTensor::test_ckpt_non_tensor_input[non_tensor4]
6.00s call unit/runtime/test_ds_initialize.py::TestClientLrSchedulerInit::test_diff_lrscheler_and_callable[_LRScheduler]
5.99s call unit/runtime/half_precision/test_fp16.py::TestZeroSupportedClientOptimizer::test[FusedAdam-1]
5.99s call unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[Callable-_LRScheduler]
5.99s call unit/checkpoint/test_latest_checkpoint.py::TestLatestCheckpoint::test_missing_latest
5.99s call unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py::TestCheckpointNonTensorWithoutGrad::test_ckpt_non_tensor_input[True]
5.99s call unit/runtime/half_precision/test_fp16.py::TestZeroSupportedClientOptimizer::test[Adam-3]
5.99s call unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestCheckpointNonTensorOutputOrdering::test_ckpt_non_tensor_output_ordering[None]
5.97s call unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint::test_ckpt_inputs2_outputs3[mask1]
5.96s call unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[linear-33]
5.96s call unit/runtime/test_data.py::TestDataLoaderDropLast::test[4-True]
5.96s call unit/runtime/test_lr_schedulers.py::TestSchedulerOptimizerParity::test[OneCycle-params2]
5.93s call unit/runtime/test_lr_schedulers.py::TestLrRange::test[0.0001-0.001-10-True]
5.93s call unit/runtime/zero/test_zero.py::TestZeroPartitionCache::test_training_partition_cache[False]
5.93s call unit/runtime/zero/test_zero_nesting_init.py::TestNestingInit::test_nesting_init
5.92s call unit/profiling/flops_profiler/test_flops_profiler.py::TestFlopsProfiler::test_flops_profiler_in_inference
5.92s call unit/runtime/half_precision/test_bf16.py::TestZeroSupportedClientOptimizer::test[FusedAdam]
5.92s call unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py::TestCheckpointNonTensorWithGrad::test_ckpt_non_tensor_input[non_tensor4]
5.90s call unit/runtime/half_precision/test_fp16.py::TestZeroAllowUntestedOptimizer::test[True-1]
5.90s call unit/runtime/test_lr_schedulers.py::TestGetLrBeforeTrain::test[OneCycle-params2]
5.88s call unit/runtime/test_ds_initialize.py::TestClientLrSchedulerInit::test_diff_lrscheler_and_callable_onecyclelr_steplr[Callable]
5.87s call unit/runtime/test_lr_schedulers.py::TestLrRange::test[1e-05-1e-05-1-False]
5.87s call unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[None-Callable]
5.87s call unit/runtime/test_lr_schedulers.py::TestWarmupCosineLR::test_lr[100-10-0.1-0.2]
5.86s call unit/runtime/zero/test_zero_context.py::TestSerialContext::test_ext_param_getattr
5.86s call unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype2-False-True]
5.85s call unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[Optimizer-None]
5.85s call unit/checkpoint/test_zero_optimizer.py::TestSaveTensorClone::test_save_tensor_clone[False-1]
5.85s call unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForwardSmallBatchSize::test_forward_with_small_bsz[8-7-1024-512-16-3-True-True]
5.84s call unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[log-10]
5.82s call unit/runtime/test_lr_schedulers.py::TestOneCycle::test_mom[0.08-0.09-0.001-100]
5.82s call unit/runtime/zero/test_zero_dynamic_class.py::TestNewClassDeclaredInsideNestingInit::test_new_class_declared_inside_nesting_init
5.81s call unit/runtime/half_precision/test_fp16.py::TestZeroAllowUntestedOptimizer::test[False-1]
5.81s call unit/comm/test_dist.py::TestDistInitNoEnv::test
5.81s call unit/runtime/test_lr_schedulers.py::TestLrRange::test[0.001-0.001-10-False]
5.80s call unit/runtime/half_precision/test_fp16.py::TestZeroStaticScale::test[False-2]
5.80s call unit/runtime/test_lr_schedulers.py::TestOneCycle::test_lr[1e-05-0.01-0.001-10-100]
5.77s call unit/runtime/test_lr_schedulers.py::TestOneCycle::test_lr[0.001-0.1-0-21-21]
5.76s call unit/runtime/test_ds_initialize.py::TestClientLrSchedulerInit::test_diff_lrscheler_and_callable_onecyclelr_steplr[_LRScheduler]
5.76s call unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForwardSmallBatchSize::test_forward_with_small_bsz[8-3-1024-512-16-3-True-False]
5.76s call unit/runtime/zero/test_zero_dynamic_class.py::TestNewClassDeclaredNestingInit::test_new_class_declared_nesting_init
5.75s call unit/runtime/test_ds_config_dict.py::TestArgs::test_none_args
5.74s call unit/checkpoint/test_zero_optimizer.py::TestZeRONonDistributed::test_chmod_exception_handling[1]
5.73s call unit/runtime/test_multi_output_model.py::TestThreeOutputModel::test
5.71s call unit/checkpoint/test_zero_optimizer.py::TestSaveTensorClone::test_save_tensor_clone[False-2]
5.71s call unit/ops/accelerators/test_accelerator_backward.py::TestCUDABackward::test_backward[8-160-128-2-3-True-True-0.1]
5.71s call unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForwardSmallBatchSize::test_forward_with_small_bsz[8-3-1024-512-16-3-False-False]
5.71s call unit/runtime/zero/test_zero_context.py::TestSerialContext::test_subclass_param
5.70s call unit/runtime/zero/test_zero_context.py::TestMiCSGatheredParametersFree::test
5.70s call unit/runtime/zero/test_zero_context_return.py::TestReturnParam::test_ext_param_return
5.70s call unit/runtime/half_precision/test_fp16.py::TestFP16AdamTypes::test[True-AdamW]
5.69s call unit/runtime/test_lr_schedulers.py::TestSchedulerOptimizerParity::test[WarmupDecayLR-params1]
5.69s call unit/utils/test_init_on_device.py::TestOnDevice::test_on_device[cuda:0]
5.69s call unit/comm/test_dist.py::TestDistInferenceAllReduce::test[dtype2]
5.68s call unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestCheckpointNonTensor::test_ckpt_non_tensor_input[non_tensor3]
5.68s call unit/runtime/zero/test_zero.py::TestZero3RepeatForwardLoop::test[True]
5.67s call unit/runtime/zero/test_zero_context_ancestry.py::TestSerialParamInit::test_subclass_param_init
5.65s call unit/runtime/test_lr_schedulers.py::TestWarmupCosineLR::test_lr[600-300-0.1-0.0]
5.65s call unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForwardSmallBatchSize::test_forward_with_small_bsz[8-7-1024-512-16-3-False-True]
5.65s call unit/runtime/test_lr_schedulers.py::TestOneCycle::test_lr[1e-05-0.01-0.001-10-101]
5.64s call unit/runtime/test_pld.py::TestPLDModel::test_pld_model[0.1]
5.64s call unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype1-False-False]
5.63s call unit/runtime/test_lr_schedulers.py::TestOneCycle::test_lr[1e-05-0.1-0-10-0]
5.63s call unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype0-False-False]
5.62s call unit/runtime/test_ds_initialize.py::TestConfigOptimizer::test[False]
5.61s call unit/runtime/test_lr_schedulers.py::TestGetLrBeforeTrain::test[LRRangeTest-params3]
5.60s call unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype0-True-False]
5.60s call unit/runtime/test_ds_config_dict.py::TestConfigLoad::test_json
5.59s call unit/runtime/test_lr_schedulers.py::TestGetLrBeforeTrain::test[WarmupDecayLR-params1]
5.59s call unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[linear-19]
5.59s call unit/runtime/zero/test_zero.py::TestZeroUnbalancedGradients::test[3]
5.58s call unit/runtime/test_ds_config_dict.py::TestArgs::test_no_args
5.58s call unit/runtime/test_pld.py::TestPLDModel::test_pld_model[0.9]
5.57s call unit/runtime/test_lr_schedulers.py::TestSchedulerOptimizerParity::test[LRRangeTest-params3]
5.57s call unit/runtime/test_ds_config_dict.py::TestDistInit::test
5.57s call unit/runtime/test_lr_schedulers.py::TestWarmupCosineLR::test_lr[600-550-0.0-0.0]
5.55s call unit/checkpoint/test_zero_optimizer.py::TestSaveTensorClone::test_save_tensor_clone[True-2]
5.55s call unit/runtime/half_precision/test_fp16.py::TestAdamFP16ZeroOneCycleCompatibility::test[False-1]
5.55s call unit/runtime/zero/test_zero_context.py::TestZeroGatheredParametersFree::test
5.55s call unit/runtime/test_lr_schedulers.py::TestSchedulerOptimizerParity::test[WarmupLR-params0]
5.55s call unit/runtime/test_lr_schedulers.py::TestOneCycle::test_mom[0.08-0.09-0-210]
5.54s call unit/runtime/test_lr_schedulers.py::TestGetLrBeforeTrain::test[WarmupLR-params0]
5.52s call unit/runtime/test_lr_schedulers.py::TestOneCycle::test_lr[0.001-0.1-0.1-21-21]
5.51s call unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[None-_LRScheduler]
5.51s call unit/runtime/test_lr_schedulers.py::TestLrRange::test[0.01-0.01-19-True]
5.51s call unit/runtime/zero/test_zero_context_return.py::TestReturnParam::test_stage_3_output_type[dict]
5.50s call unit/runtime/zero/test_zero.py::TestZeroAdamOptimizerStepCount::test[3]
5.50s call unit/checkpoint/test_zero_optimizer.py::TestZeRONonDistributed::test_chmod_exception_handling[2]
5.49s call unit/runtime/test_ds_initialize.py::TestConfigOptimizer::test[True]
5.49s call unit/runtime/test_ds_config_dict.py::TestDeprecatedDeepScaleConfig::test
5.49s call unit/runtime/test_lr_schedulers.py::TestWarmupCosineLR::test_lr[500-30-0.0-0.2]
5.47s call unit/runtime/half_precision/test_dynamic_loss_scale.py::TestFused::test_no_overflow
5.47s call unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype2-True-False]
5.45s call unit/runtime/zero/test_zero_context_return.py::TestReturnParam::test_stage_3_output_type[tensor]
5.44s call unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[None-None]
5.42s call unit/runtime/half_precision/test_dynamic_loss_scale.py::TestUnfused::test_some_overflow
5.40s call unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[linear-15]
5.38s call unit/checkpoint/test_zero_optimizer.py::TestZeRONonDistributed::test_chmod_exception_handling[3]
5.37s call unit/runtime/half_precision/test_fp16.py::TestZeroStaticScale::test[False-3]
5.36s call unit/ops/adam/test_adamw.py::TestAdamConfigs::test[AdamW-False-False-False-resulting_optimizer0]
5.35s call unit/runtime/zero/test_zero_context.py::TestSerialContext::test_scattered_init_dist
5.35s call unit/runtime/zero/test_zero_context_return.py::TestReturnParam::test_stage_3_output_type[None]
5.33s call unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype2-False-False]
5.29s call unit/checkpoint/test_zero_optimizer.py::TestSaveTensorClone::test_save_tensor_clone[True-1]
5.28s call unit/runtime/zero/test_zero.py::TestZeroAdamOptimizerStepCount::test[1]
5.26s call unit/runtime/zero/test_zero.py::TestZero3RepeatForwardLoop::test[False]
5.09s call unit/runtime/zero/test_zero.py::TestZeroPartitionCache::test_training_partition_cache[True]
4.98s call unit/runtime/zero/test_zero.py::TestZero3DictFwd::test[list]
4.93s call unit/runtime/zero/test_zero.py::TestZero3DictFwd::test[tuple]
4.74s call unit/runtime/zero/test_zero.py::TestZero3DictFwd::test[dict]
2.31s call unit/ops/aio/test_aio.py::TestWrite::test_parallel_write[False-True-False]
1.74s call unit/ops/adam/test_adamw.py::TestAdamConfigs::test[Adam-True-False-True-resulting_optimizer14]
1.73s call unit/ops/adam/test_adamw.py::TestAdamConfigs::test[AdamW-True-False-True-resulting_optimizer6]
1.65s call unit/ops/adam/test_adamw.py::TestAdamConfigs::test[Adam-True-False-False-resulting_optimizer10]
1.45s call unit/ops/aio/test_aio.py::TestWrite::test_parallel_write[False-False-True]
1.32s call unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[64-fp16]
1.21s call unit/ops/aio/test_aio.py::TestRead::test_parallel_read[False-True-True]
1.17s call unit/ops/aio/test_aio.py::TestWrite::test_async_write[False-True-True-False]
1.03s call unit/ops/aio/test_aio.py::TestWrite::test_parallel_write[True-False-True]
1.03s call unit/ops/aio/test_aio.py::TestRead::test_async_read[True-False-False-True]
1.02s call unit/ops/lion/test_lion.py::TestLionConfigs::test[Lion-True-DeepSpeedCPULion]
1.00s call unit/ops/aio/test_aio.py::TestRead::test_parallel_read[True-False-True]
1.00s call unit/ops/aio/test_aio.py::TestWrite::test_async_write[True-True-True-False]
(3081 durations < 1s hidden. Use -vv to show these durations.)
===================================================================================== short test summary info =====================================================================================
FAILED tests/unit/checkpoint/test_latest_checkpoint.py::TestLatestCheckpoint::test_existing_latest - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/checkpoint/test_latest_checkpoint.py::TestLatestCheckpoint::test_missing_latest - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/checkpoint/test_zero_optimizer.py::TestSaveTensorClone::test_save_tensor_clone[True-1] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/checkpoint/test_zero_optimizer.py::TestSaveTensorClone::test_save_tensor_clone[True-2] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/checkpoint/test_zero_optimizer.py::TestSaveTensorClone::test_save_tensor_clone[False-1] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/checkpoint/test_zero_optimizer.py::TestSaveTensorClone::test_save_tensor_clone[False-2] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/checkpoint/test_zero_optimizer.py::TestZeRONonDistributed::test_chmod_exception_handling[1] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/checkpoint/test_zero_optimizer.py::TestZeRONonDistributed::test_chmod_exception_handling[2] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/checkpoint/test_zero_optimizer.py::TestZeRONonDistributed::test_chmod_exception_handling[3] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/launcher/test_user_args.py::test_user_args[True-"I am 6' tall"] - FileNotFoundError: [Errno 2] No such file or directory: 'deepspeed'
FAILED tests/unit/launcher/test_user_args.py::test_user_args[True-'I am 72" tall'] - FileNotFoundError: [Errno 2] No such file or directory: 'deepspeed'
FAILED tests/unit/launcher/test_user_args.py::test_user_args[True-'"translate English to Romanian: "'] - FileNotFoundError: [Errno 2] No such file or directory: 'deepspeed'
FAILED tests/unit/launcher/test_user_args.py::test_user_args[True-I'm going to tell them "DeepSpeed is the best"] - FileNotFoundError: [Errno 2] No such file or directory: 'deepspeed'
FAILED tests/unit/launcher/test_user_args.py::test_user_args[False-"I am 6' tall"] - FileNotFoundError: [Errno 2] No such file or directory: 'deepspeed'
FAILED tests/unit/launcher/test_user_args.py::test_user_args[False-'I am 72" tall'] - FileNotFoundError: [Errno 2] No such file or directory: 'deepspeed'
FAILED tests/unit/launcher/test_user_args.py::test_user_args[False-'"translate English to Romanian: "'] - FileNotFoundError: [Errno 2] No such file or directory: 'deepspeed'
FAILED tests/unit/launcher/test_user_args.py::test_user_args[False-I'm going to tell them "DeepSpeed is the best"] - FileNotFoundError: [Errno 2] No such file or directory: 'deepspeed'
FAILED tests/unit/launcher/test_user_args.py::test_bash_string_args - AssertionError: User args not parsed correctly: xargs: deepspeed: No such file or directory
FAILED tests/unit/ops/accelerators/test_accelerator_backward.py::TestCUDABackward::test_backward[64-160-128-2-24-False-True-0.2] - RuntimeError: Error building extension 'transformer'
FAILED tests/unit/ops/accelerators/test_accelerator_backward.py::TestCUDABackward::test_backward[64-1600-128-2-4-False-True-0.2] - RuntimeError: Error building extension 'transformer'
FAILED tests/unit/ops/accelerators/test_accelerator_backward.py::TestCUDABackward::test_backward[8-1600-128-25-3-True-True-0.05] - RuntimeError: Error building extension 'transformer'
FAILED tests/unit/ops/accelerators/test_accelerator_backward.py::TestCUDABackward::test_backward[8-160-128-2-3-True-True-0.1] - RuntimeError: Error building extension 'transformer'
FAILED tests/unit/ops/accelerators/test_accelerator_backward.py::TestCUDABackward::test_backward[8-1600-128-2-3-True-True-0.05] - RuntimeError: Error building extension 'transformer'
FAILED tests/unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForwardSmallBatchSize::test_forward_with_small_bsz[8-3-1024-512-16-3-True-False] - RuntimeError: Error building extension 'transformer'
FAILED tests/unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForwardSmallBatchSize::test_forward_with_small_bsz[8-7-1024-512-16-3-True-True] - RuntimeError: Error building extension 'transformer'
FAILED tests/unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForwardSmallBatchSize::test_forward_with_small_bsz[8-3-1024-512-16-3-False-False] - RuntimeError: Error building extension 'transformer'
FAILED tests/unit/ops/accelerators/test_accelerator_forward.py::TestCUDAForwardSmallBatchSize::test_forward_with_small_bsz[8-7-1024-512-16-3-False-True] - RuntimeError: Error building extension 'transformer'
FAILED tests/unit/ops/adam/test_adamw.py::TestAdamConfigs::test[AdamW-False-False-False-resulting_optimizer0] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/ops/adam/test_adamw.py::TestAdamConfigs::test[AdamW-False-False-True-resulting_optimizer4] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/adam/test_adamw.py::TestAdamConfigs::test[Adam-False-False-False-resulting_optimizer8] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/adam/test_adamw.py::TestAdamConfigs::test[Adam-False-False-True-resulting_optimizer12] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[64-fp16] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[64-bf16] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[64-fp32] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[22-fp16] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[22-bf16] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[22-fp32] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[128-fp16] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[128-bf16] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[128-fp32] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[1024-fp16] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[1024-bf16] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[1024-fp32] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[1048576-fp16] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[1048576-bf16] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/adam/test_cpu_adam.py::TestCPUAdam::test_fused_adam_equal[1048576-fp32] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/adam/test_hybrid_adam.py::TestHybridAdam::test_hybrid_adam_equal[8-fp16] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/adam/test_hybrid_adam.py::TestHybridAdam::test_hybrid_adam_equal[8-bf16] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/adam/test_hybrid_adam.py::TestHybridAdam::test_hybrid_adam_equal[8-fp32] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/adam/test_hybrid_adam.py::TestHybridAdam::test_hybrid_adam_equal[16-fp16] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/adam/test_hybrid_adam.py::TestHybridAdam::test_hybrid_adam_equal[16-bf16] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/adam/test_hybrid_adam.py::TestHybridAdam::test_hybrid_adam_equal[16-fp32] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/deepspeed4science/test_DS4Sci_EvoformerAttention.py::test_DS4Sci_EvoformerAttention[tensor_shape0-dtype0] - RuntimeError: Error building extension 'evoformer_attn'
FAILED tests/unit/ops/deepspeed4science/test_DS4Sci_EvoformerAttention.py::test_DS4Sci_EvoformerAttention[tensor_shape0-dtype1] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/evoformer_attn/evoformer_attn.so: cannot open shared object file: No such file or di...
FAILED tests/unit/ops/deepspeed4science/test_DS4Sci_EvoformerAttention.py::test_DS4Sci_EvoformerAttention[tensor_shape1-dtype0] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/evoformer_attn/evoformer_attn.so: cannot open shared object file: No such file or di...
FAILED tests/unit/ops/deepspeed4science/test_DS4Sci_EvoformerAttention.py::test_DS4Sci_EvoformerAttention[tensor_shape1-dtype1] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/evoformer_attn/evoformer_attn.so: cannot open shared object file: No such file or di...
FAILED tests/unit/ops/lion/test_cpu_lion.py::TestCPULion::test_fused_lion_equal[64-fp16] - RuntimeError: Error building extension 'fused_lion'
FAILED tests/unit/ops/lion/test_cpu_lion.py::TestCPULion::test_fused_lion_equal[64-bf16] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_lion/fused_lion.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/lion/test_cpu_lion.py::TestCPULion::test_fused_lion_equal[64-fp32] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_lion/fused_lion.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/lion/test_cpu_lion.py::TestCPULion::test_fused_lion_equal[22-fp16] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_lion/fused_lion.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/lion/test_cpu_lion.py::TestCPULion::test_fused_lion_equal[22-bf16] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_lion/fused_lion.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/lion/test_cpu_lion.py::TestCPULion::test_fused_lion_equal[22-fp32] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_lion/fused_lion.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/lion/test_cpu_lion.py::TestCPULion::test_fused_lion_equal[128-fp16] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_lion/fused_lion.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/lion/test_cpu_lion.py::TestCPULion::test_fused_lion_equal[128-bf16] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_lion/fused_lion.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/lion/test_cpu_lion.py::TestCPULion::test_fused_lion_equal[128-fp32] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_lion/fused_lion.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/lion/test_cpu_lion.py::TestCPULion::test_fused_lion_equal[1024-fp16] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_lion/fused_lion.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/lion/test_cpu_lion.py::TestCPULion::test_fused_lion_equal[1024-bf16] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_lion/fused_lion.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/lion/test_cpu_lion.py::TestCPULion::test_fused_lion_equal[1024-fp32] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_lion/fused_lion.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/lion/test_cpu_lion.py::TestCPULion::test_fused_lion_equal[1048576-fp16] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_lion/fused_lion.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/lion/test_cpu_lion.py::TestCPULion::test_fused_lion_equal[1048576-bf16] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_lion/fused_lion.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/lion/test_cpu_lion.py::TestCPULion::test_fused_lion_equal[1048576-fp32] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_lion/fused_lion.so: cannot open shared object file: No such file or directory
FAILED tests/unit/ops/lion/test_lion.py::TestLionConfigs::test[Lion-False-FusedLion] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_lion/fused_lion.so: cannot open shared object file: No such file or directory
FAILED tests/unit/profiling/flops_profiler/test_flops_profiler.py::TestFlopsProfiler::test - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_ds_config_dict.py::TestConfigLoad::test_dict - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_ds_config_dict.py::TestConfigLoad::test_json - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_ds_config_dict.py::TestConfigLoad::test_hjson - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_ds_config_dict.py::TestDeprecatedDeepScaleConfig::test - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_ds_config_dict.py::TestDistInit::test - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_ds_config_dict.py::TestArgs::test_none_args - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_ds_config_dict.py::TestArgs::test_no_args - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_ds_initialize.py::TestClientOptimizer::test[None] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_ds_initialize.py::TestConfigOptimizer::test[True] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_ds_initialize.py::TestConfigOptimizer::test[False] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-fp16-zero1] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-fp16-zero2] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-fp16-zero3] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-fp16-None] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-bf16-zero1] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-bf16-zero2] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-bf16-zero3] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-bf16-None] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-fp32-zero1] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-fp32-zero2] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-fp32-zero3] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-fp32-None] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-fp16-zero1] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-fp16-zero2] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-fp16-zero3] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-fp16-None] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-bf16-zero1] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-bf16-zero2] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-bf16-zero3] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-bf16-None] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-fp32-zero1] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-fp32-zero2] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-fp32-zero3] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-fp32-None] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-fp16-zero1] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-fp16-zero2] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-fp16-zero3] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-fp16-None] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-bf16-zero1] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-bf16-zero2] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-bf16-zero3] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-bf16-None] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-fp32-zero1] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-fp32-zero2] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-fp32-zero3] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-fp32-None] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-fp16-zero1] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-fp16-zero2] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-fp16-zero3] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-fp16-None] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-bf16-zero1] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-bf16-zero2] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-bf16-zero3] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-bf16-None] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-fp32-zero1] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-fp32-zero2] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-fp32-zero3] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-fp32-None] - ImportError: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/fused_adam.so: cannot open shared object file: No such file or directory
FAILED tests/unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[None-None] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[None-_LRScheduler] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[None-Callable] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestGetLrBeforeTrain::test[WarmupLR-params0] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestGetLrBeforeTrain::test[WarmupDecayLR-params1] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestGetLrBeforeTrain::test[OneCycle-params2] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestGetLrBeforeTrain::test[LRRangeTest-params3] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[log-10] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[log-15] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[log-19] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[log-33] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[linear-10] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[linear-15] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[linear-19] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule[linear-33] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[log-10] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[log-15] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[log-19] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[log-33] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[linear-10] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[linear-15] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[linear-19] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule[linear-33] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestSchedulerOptimizerParity::test[WarmupLR-params0] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestSchedulerOptimizerParity::test[WarmupDecayLR-params1] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestSchedulerOptimizerParity::test[OneCycle-params2] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestSchedulerOptimizerParity::test[LRRangeTest-params3] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestLrRange::test[0.0001-1e-05-1-True] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestLrRange::test[1e-05-1e-05-1-False] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestLrRange::test[0.0001-0.001-10-True] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestLrRange::test[0.001-0.001-10-False] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestLrRange::test[0.01-0.01-19-True] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestLrRange::test[0.01-0.01-19-False] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestOneCycle::test_lr[1e-05-0.01-0.001-10-100] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestOneCycle::test_lr[0.001-0.1-0-21-21] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestOneCycle::test_lr[1e-05-0.01-0.001-10-101] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestOneCycle::test_lr[0.001-0.1-0.1-21-21] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestOneCycle::test_lr[1e-05-0.1-0-10-0] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestOneCycle::test_mom[0.08-0.09-0.001-100] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestOneCycle::test_mom[0.08-0.09-0-210] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestOneCycle::test_mom[0.08-0.09-0.001-101] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestOneCycle::test_mom[0.08-0.09-0-211] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestWarmupCosineLR::test_lr[100-10-0.1-0.2] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestWarmupCosineLR::test_lr[200-20-0.1-0.2] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestWarmupCosineLR::test_lr[500-30-0.0-0.2] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestWarmupCosineLR::test_lr[600-300-0.1-0.0] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_lr_schedulers.py::TestWarmupCosineLR::test_lr[600-550-0.0-0.0] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_multi_output_model.py::TestTwoOutputModel::test - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_multi_output_model.py::TestThreeOutputModel::test - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_pld.py::TestPLDModel::test_pld_model[0] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_pld.py::TestPLDModel::test_pld_model[0.1] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_pld.py::TestPLDModel::test_pld_model[0.9] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_pld.py::TestPLDModel::test_pld_model[1.0] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/test_pld.py::TestNonPLDModel::test_non_pld_model - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/half_precision/test_bf16.py::TestAdamBF16ZeroOneCycleCompatibility::test - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/half_precision/test_bf16.py::TestZeroSupportedClientOptimizer::test[FusedAdam] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/half_precision/test_dynamic_loss_scale.py::TestFused::test_no_overflow - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/half_precision/test_dynamic_loss_scale.py::TestFused::test_all_overflow - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/half_precision/test_dynamic_loss_scale.py::TestFused::test_some_overflow - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/half_precision/test_dynamic_loss_scale.py::TestUnfused::test_no_overflow - RuntimeError: Error building extension 'fused_lamb'
FAILED tests/unit/runtime/half_precision/test_dynamic_loss_scale.py::TestUnfused::test_all_overflow - RuntimeError: Error building extension 'fused_lamb'
FAILED tests/unit/runtime/half_precision/test_dynamic_loss_scale.py::TestUnfused::test_some_overflow - RuntimeError: Error building extension 'fused_lamb'
FAILED tests/unit/runtime/half_precision/test_fp16.py::TestAdamFP16ZeroOneCycleCompatibility::test[False-1] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/half_precision/test_fp16.py::TestAdamFP16ZeroOneCycleCompatibility::test[False-2] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/half_precision/test_fp16.py::TestAdamFP16ZeroOneCycleCompatibility::test[False-3] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/half_precision/test_fp16.py::TestZeroStaticScale::test[False-1] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/half_precision/test_fp16.py::TestZeroStaticScale::test[False-2] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/half_precision/test_fp16.py::TestZeroStaticScale::test[False-3] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/half_precision/test_fp16.py::TestZeroSupportedClientOptimizer::test[FusedAdam-1] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/half_precision/test_fp16.py::TestZeroSupportedClientOptimizer::test[FusedAdam-2] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/half_precision/test_fp16.py::TestZeroSupportedClientOptimizer::test[FusedAdam-3] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/half_precision/test_fp16.py::TestFP16AdamTypes::test[True-Adam] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/half_precision/test_fp16.py::TestFP16AdamTypes::test[True-AdamW] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/half_precision/test_fp16.py::TestFP16AdamTypes::test[False-Adam] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/half_precision/test_fp16.py::TestFP16AdamTypes::test[False-AdamW] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/half_precision/test_fp16.py::TestZero3LazyScatter::test - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/zero/test_zero.py::TestZeroUnbalancedGradients::test[1] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/zero/test_zero.py::TestZeroUnbalancedGradients::test[2] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/zero/test_zero.py::TestZeroUnbalancedGradients::test[3] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/zero/test_zero.py::TestZero3RepeatForwardLoop::test[True] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/zero/test_zero.py::TestZero3RepeatForwardLoop::test[False] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/zero/test_zero.py::TestIncorectAllgatherBucketSize::test[1000] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/zero/test_zero.py::TestIncorectAllgatherBucketSize::test[1001] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/zero/test_zero.py::TestZero3DictFwd::test[tuple] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/zero/test_zero.py::TestZero3DictFwd::test[list] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/zero/test_zero.py::TestZero3DictFwd::test[dict] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/zero/test_zero.py::TestZeroAdamOptimizerStepCount::test[1] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/zero/test_zero.py::TestZeroAdamOptimizerStepCount::test[2] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/zero/test_zero.py::TestZeroAdamOptimizerStepCount::test[3] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/zero/test_zero.py::TestZeroPartitionCache::test_training_partition_cache[True] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype0-True-False] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype0-False-False] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype1-True-False] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype1-False-False] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype2-True-False] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype2-False-False] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/zero/test_zero_context.py::TestSerialContext::test_throughput_calculation - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/zero/test_zero_context.py::TestSerialContext::test_ext_param_getattr - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/zero/test_zero_context_return.py::TestReturnParam::test_ext_param_return - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/zero/test_zero_context_return.py::TestReturnParam::test_stage_3_output_type[tensor] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/zero/test_zero_context_return.py::TestReturnParam::test_stage_3_output_type[dict] - RuntimeError: Error building extension 'fused_adam'
FAILED tests/unit/runtime/zero/test_zero_context_return.py::TestReturnParam::test_stage_3_output_type[None] - RuntimeError: Error building extension 'fused_adam'
===================================================== 233 failed, 369 passed, 581 skipped, 4426 deselected, 88 warnings in 2322.76s (0:38:42) =====================================================
(at easybuild/tools/run.py:695 in parse_cmd_output)
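
Editor's note (not part of the original log): the two recurring failure modes above both involve DeepSpeed's JIT-compiled CUDA ops. The "ImportError: ... fused_adam.so: cannot open shared object file" entries point at a missing or unreadable build artifact in the torch extension cache under /dev/shm, while "RuntimeError: Error building extension 'fused_adam'" (and 'fused_lamb') means the on-the-fly compile itself failed. The sketch below is illustrative only, assuming the deepspeed.ops.op_builder API of DeepSpeed 0.14.x; the cache path is a hypothetical example and is not taken from this build.

    import os

    # torch.utils.cpp_extension writes JIT build artifacts (fused_adam.so, ...)
    # under TORCH_EXTENSIONS_DIR; the failures above show that cache living in
    # the XDG cache dir on /dev/shm. The path below is a placeholder, chosen
    # only to demonstrate pointing the cache at a writable node-local location.
    os.environ.setdefault("TORCH_EXTENSIONS_DIR", "/tmp/torch_extensions")

    from deepspeed.ops.op_builder import FusedAdamBuilder

    # .load() triggers the JIT compile on first use. A failed compile surfaces
    # as RuntimeError("Error building extension 'fused_adam'"); a stale cache
    # entry whose .so has disappeared surfaces as the ImportError listed above.
    try:
        fused_adam = FusedAdamBuilder().load()
        print("fused_adam op loaded:", fused_adam)
    except (RuntimeError, ImportError) as err:
        print("fused_adam JIT build/load failed:", err)

If the intent is to avoid JIT compilation during the test run altogether, DeepSpeed also supports pre-building ops at install time via environment variables such as DS_BUILD_FUSED_ADAM=1 (or DS_BUILD_OPS=1), which would remove the dependence on the runtime extension cache; whether that fits this EasyBuild recipe is left to the packager.
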
== 2024-10-30 15:56:41,920 build_log.py:369 WARNING Test failure ignored: 'cmd "export PYTHONPATH=/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages:$PYTHONPATH && mv deepspeed deepspeed.src && pytest tests/unit/ -k "not TestTensorBoard and not TestWandb and not TestCometMonitor" && mv deepspeed.src deepspeed " exited with exit code 1 and output:\n\x1b[1m======================================================================================= test session starts =======================================================================================\x1b[0m\nplatform linux -- Python 3.11.3, pytest-7.4.0, pluggy-1.2.0 -- /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/bin/python\ncachedir: .pytest_cache\nrootdir: /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests\nconfigfile: pytest.ini\nplugins: mock-3.11.1, xdist-3.3.1\n\x1b[1mcollecting ... \x1b[0mcollected 5602 items / 4426 deselected / 7 skipped / 1176 selected\n\ntests/unit/accelerator/test_accelerator.py::test_abstract_methods_defined[deepspeed.accelerator.mps_accelerator] \x1b[32mPASSED\x1b[0m\x1b[33m [ 0%]\x1b[0m\ntests/unit/accelerator/test_accelerator.py::test_abstract_methods_defined[deepspeed.accelerator.npu_accelerator] \x1b[32mPASSED\x1b[0m\x1b[33m [ 0%]\x1b[0m\ntests/unit/accelerator/test_accelerator.py::test_abstract_methods_defined[deepspeed.accelerator.hpu_accelerator] \x1b[32mPASSED\x1b[0m\x1b[33m [ 0%]\x1b[0m\ntests/unit/accelerator/test_accelerator.py::test_abstract_methods_defined[deepspeed.accelerator.cuda_accelerator] \x1b[32mPASSED\x1b[0m\x1b[33m [ 0%]\x1b[0m\ntests/unit/accelerator/test_accelerator.py::test_abstract_methods_defined[deepspeed.accelerator.cpu_accelerator] \x1b[32mPASSED\x1b[0m\x1b[33m [ 0%]\x1b[0m\ntests/unit/accelerator/test_accelerator.py::test_abstract_methods_defined[deepspeed.accelerator.xpu_accelerator] \x1b[32mPASSED\x1b[0m\x1b[33m [ 0%]\x1b[0m\ntests/unit/autotuning/test_autotuning.py::test_command_line \x1b[32mPASSED\x1b[0m\x1b[33m [ 0%]\x1b[0m\ntests/unit/autotuning/test_autotuning.py::test_resource_manager_arg_mappings[None] \x1b[32mPASSED\x1b[0m\x1b[33m [ 0%]\x1b[0m\ntests/unit/autotuning/test_autotuning.py::test_resource_manager_arg_mappings[arg_mappings1] \x1b[32mPASSED\x1b[0m\x1b[33m [ 0%]\x1b[0m\ntests/unit/autotuning/test_autotuning.py::test_resource_manager_arg_mappings[arg_mappings2] \x1b[32mPASSED\x1b[0m\x1b[33m [ 0%]\x1b[0m\ntests/unit/autotuning/test_autotuning.py::test_resource_manager_arg_mappings[arg_mappings3] \x1b[32mPASSED\x1b[0m\x1b[33m [ 0%]\x1b[0m\ntests/unit/autotuning/test_autotuning.py::test_resource_manager_arg_mappings[arg_mappings4] \x1b[32mPASSED\x1b[0m\x1b[33m [ 1%]\x1b[0m\ntests/unit/autotuning/test_autotuning.py::test_autotuner_resources[active_resources0] \x1b[32mPASSED\x1b[0m\x1b[33m [ 1%]\x1b[0m\ntests/unit/autotuning/test_autotuning.py::test_autotuner_resources[active_resources1] \x1b[32mPASSED\x1b[0m\x1b[33m [ 1%]\x1b[0m\ntests/unit/autotuning/test_autotuning.py::test_autotuner_resources[active_resources2] \x1b[32mPASSED\x1b[0m\x1b[33m [ 1%]\x1b[0m\ntests/unit/autotuning/test_autotuning.py::test_autotuner_resources[active_resources3] \x1b[32mPASSED\x1b[0m\x1b[33m [ 1%]\x1b[0m\ntests/unit/checkpoint/test_latest_checkpoint.py::TestLatestCheckpoint::test_existing_latest \x1b[31mFAILED\x1b[0m\x1b[31m [ 1%]\x1b[0m\ntests/unit/checkpoint/test_latest_checkpoint.py::TestLatestCheckpoint::test_missing_latest \x1b[31mFAILED\x1b[0m\x1b[31m [ 
1%]\x1b[0m\ntests/unit/checkpoint/test_lr_scheduler.py::TestLRSchedulerCheckpoint::test_checkpoint_lr_scheduler[0-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, ...)\x1b[31m [ 1%]\x1b[0m\ntests/unit/checkpoint/test_lr_scheduler.py::TestLRSchedulerCheckpoint::test_checkpoint_lr_scheduler[1-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, ...)\x1b[31m [ 1%]\x1b[0m\ntests/unit/checkpoint/test_lr_scheduler.py::TestLRSchedulerCheckpoint::test_checkpoint_lr_scheduler[2-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, ...)\x1b[31m [ 1%]\x1b[0m\ntests/unit/checkpoint/test_lr_scheduler.py::TestLRSchedulerCheckpoint::test_checkpoint_lr_scheduler[2-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1...)\x1b[31m [ 1%]\x1b[0m\ntests/unit/checkpoint/test_lr_scheduler.py::TestLRSchedulerCheckpoint::test_checkpoint_lr_scheduler[3-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, ...)\x1b[31m [ 1%]\x1b[0m\ntests/unit/checkpoint/test_lr_scheduler.py::TestLRSchedulerCheckpoint::test_checkpoint_lr_scheduler[3-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1...)\x1b[31m [ 2%]\x1b[0m\ntests/unit/checkpoint/test_lr_scheduler.py::TestLRSchedulerCheckpoint::test_checkpoint_no_lr_scheduler[0-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 require...)\x1b[31m [ 2%]\x1b[0m\ntests/unit/checkpoint/test_lr_scheduler.py::TestLRSchedulerCheckpoint::test_checkpoint_no_lr_scheduler[1-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 require...)\x1b[31m [ 2%]\x1b[0m\ntests/unit/checkpoint/test_lr_scheduler.py::TestLRSchedulerCheckpoint::test_checkpoint_no_lr_scheduler[2-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 require...)\x1b[31m [ 2%]\x1b[0m\ntests/unit/checkpoint/test_lr_scheduler.py::TestLRSchedulerCheckpoint::test_checkpoint_no_lr_scheduler[2-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required...)\x1b[31m [ 2%]\x1b[0m\ntests/unit/checkpoint/test_lr_scheduler.py::TestLRSchedulerCheckpoint::test_checkpoint_no_lr_scheduler[3-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 require...)\x1b[31m [ 2%]\x1b[0m\ntests/unit/checkpoint/test_lr_scheduler.py::TestLRSchedulerCheckpoint::test_checkpoint_no_lr_scheduler[3-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required...)\x1b[31m [ 2%]\x1b[0m\ntests/unit/checkpoint/test_moe_checkpoint.py::TestMoECheckpoint::test_checkpoint_moe[4] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 4 required, 1 available)\x1b[31m [ 2%]\x1b[0m\ntests/unit/checkpoint/test_moe_checkpoint.py::TestMoECheckpoint::test_checkpoint_moe_and_zero[4-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 4 required, 1 avail...)\x1b[31m [ 2%]\x1b[0m\ntests/unit/checkpoint/test_moe_checkpoint.py::TestMoECheckpoint::test_checkpoint_moe_and_zero[4-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 4 required, 1 avai...)\x1b[31m [ 2%]\x1b[0m\ntests/unit/checkpoint/test_moe_checkpoint.py::TestMoECheckpoint::test_checkpoint_moe_and_zero[2-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 4 
required, 1 avail...)\x1b[31m [ 2%]\x1b[0m\ntests/unit/checkpoint/test_moe_checkpoint.py::TestMoECheckpoint::test_checkpoint_moe_and_zero[2-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 4 required, 1 avai...)\x1b[31m [ 2%]\x1b[0m\ntests/unit/checkpoint/test_other_optimizer.py::TestOtherOptimizerCheckpoint::test_checkpoint_unfused_optimizer \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required...)\x1b[31m [ 3%]\x1b[0m\ntests/unit/checkpoint/test_other_optimizer.py::TestOtherOptimizerCheckpoint::test_checkpoint_fused_optimizer \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, ...)\x1b[31m [ 3%]\x1b[0m\ntests/unit/checkpoint/test_other_optimizer.py::TestOtherOptimizerCheckpoint::test_checkpoint_fp32_optimizer \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1...)\x1b[31m [ 3%]\x1b[0m\ntests/unit/checkpoint/test_pipeline.py::TestPipelineCheckpoint::test_checkpoint_pipe_engine[0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 4 required, 1 available)\x1b[31m [ 3%]\x1b[0m\ntests/unit/checkpoint/test_pipeline.py::TestPipelineCheckpoint::test_checkpoint_pipe_engine[1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 4 required, 1 available)\x1b[31m [ 3%]\x1b[0m\ntests/unit/checkpoint/test_pipeline.py::TestPipelineCheckpoint::test_checkpoint_pipe_module[base_topo0-test_topo0] \x1b[33mSKIPPED\x1b[0m (got empty parameter set [\'base_topo\', \'test_topo\'], functio...)\x1b[31m [ 3%]\x1b[0m\ntests/unit/checkpoint/test_reshape_checkpoint.py::test_reshape_222_to_111 \x1b[32mPASSED\x1b[0m\x1b[31m [ 3%]\x1b[0m\ntests/unit/checkpoint/test_reshape_checkpoint.py::test_reshape_222_to_121 \x1b[32mPASSED\x1b[0m\x1b[31m [ 3%]\x1b[0m\ntests/unit/checkpoint/test_reshape_checkpoint.py::test_reshape_222_to_122 \x1b[32mPASSED\x1b[0m\x1b[31m [ 3%]\x1b[0m\ntests/unit/checkpoint/test_reshape_checkpoint.py::test_reshape_222_to_211 \x1b[32mPASSED\x1b[0m\x1b[31m [ 3%]\x1b[0m\ntests/unit/checkpoint/test_shared_weights.py::TestCheckpointSharedWeights::test_checkpoint_shared_weights \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 a...)\x1b[31m [ 3%]\x1b[0m\ntests/unit/checkpoint/test_sparse.py::TestSparseCheckpoint::test_non_strict_load_sparse[False-False-False-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 requir...)\x1b[31m [ 3%]\x1b[0m\ntests/unit/checkpoint/test_sparse.py::TestSparseCheckpoint::test_non_strict_load_sparse[False-False-True-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 require...)\x1b[31m [ 4%]\x1b[0m\ntests/unit/checkpoint/test_sparse.py::TestSparseCheckpoint::test_non_strict_load_sparse[False-False-True-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required...)\x1b[31m [ 4%]\x1b[0m\ntests/unit/checkpoint/test_sparse.py::TestSparseCheckpoint::test_non_strict_load_sparse[True-False-False-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 require...)\x1b[31m [ 4%]\x1b[0m\ntests/unit/checkpoint/test_sparse.py::TestSparseCheckpoint::test_non_strict_load_sparse[True-False-True-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required...)\x1b[31m [ 4%]\x1b[0m\ntests/unit/checkpoint/test_sparse.py::TestSparseCheckpoint::test_non_strict_load_sparse[True-False-True-True] 
\x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required,...)\x1b[31m [ 4%]\x1b[0m\ntests/unit/checkpoint/test_sparse.py::TestSparseCheckpoint::test_non_strict_load_sparse[True-True-False-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required...)\x1b[31m [ 4%]\x1b[0m\ntests/unit/checkpoint/test_sparse.py::TestSparseCheckpoint::test_non_strict_load_sparse[True-True-True-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required,...)\x1b[31m [ 4%]\x1b[0m\ntests/unit/checkpoint/test_sparse.py::TestSparseCheckpoint::test_non_strict_load_sparse[True-True-True-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, ...)\x1b[31m [ 4%]\x1b[0m\ntests/unit/checkpoint/test_tag_validation.py::TestCheckpointValidationTag::test_checkpoint_unique_tag[FAIL] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1...)\x1b[31m [ 4%]\x1b[0m\ntests/unit/checkpoint/test_tag_validation.py::TestCheckpointValidationTag::test_checkpoint_unique_tag[WARN] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1...)\x1b[31m [ 4%]\x1b[0m\ntests/unit/checkpoint/test_tag_validation.py::TestCheckpointValidationTag::test_checkpoint_unique_tag[IGNORE] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required,...)\x1b[31m [ 4%]\x1b[0m\ntests/unit/checkpoint/test_tag_validation.py::TestCheckpointValidationTag::test_checkpoint_unknown_tag_validation \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 requi...)\x1b[31m [ 5%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to2[False-False-1-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are ava...)\x1b[31m [ 5%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to2[False-False-1-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are ava...)\x1b[31m [ 5%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to2[False-False-1-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are ava...)\x1b[31m [ 5%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to2[False-False-3-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are ava...)\x1b[31m [ 5%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to2[False-False-3-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are ava...)\x1b[31m [ 5%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to2[False-False-3-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are ava...)\x1b[31m [ 5%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to2[False-True-1-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 5%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to2[False-True-1-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 
5%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to2[False-True-1-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 5%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to2[False-True-3-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 5%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to2[False-True-3-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 5%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to2[False-True-3-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 6%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to2[True-False-1-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 6%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to2[True-False-1-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 6%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to2[True-False-1-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 6%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to2[True-False-3-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 6%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to2[True-False-3-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 6%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to2[True-False-3-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 6%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to2[True-True-1-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avail...)\x1b[31m [ 6%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to2[True-True-1-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avail...)\x1b[31m [ 6%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to2[True-True-1-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avail...)\x1b[31m [ 6%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to2[True-True-3-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avail...)\x1b[31m [ 6%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to2[True-True-3-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avail...)\x1b[31m [ 6%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to2[True-True-3-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test 
because not enough GPUs are avail...)\x1b[31m [ 7%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_4to2[False-False-1-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are ava...)\x1b[31m [ 7%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_4to2[False-False-1-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are ava...)\x1b[31m [ 7%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_4to2[False-False-1-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are ava...)\x1b[31m [ 7%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_4to2[False-False-3-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are ava...)\x1b[31m [ 7%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_4to2[False-False-3-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are ava...)\x1b[31m [ 7%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_4to2[False-False-3-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are ava...)\x1b[31m [ 7%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_4to2[False-True-1-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 7%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_4to2[False-True-1-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 7%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_4to2[False-True-1-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 7%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_4to2[False-True-3-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 7%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_4to2[False-True-3-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 7%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_4to2[False-True-3-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 8%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_4to2[True-False-1-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 8%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_4to2[True-False-1-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 8%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_4to2[True-False-1-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 
8%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_4to2[True-False-3-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 8%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_4to2[True-False-3-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 8%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_4to2[True-False-3-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 8%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_4to2[True-True-1-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avail...)\x1b[31m [ 8%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_4to2[True-True-1-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avail...)\x1b[31m [ 8%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_4to2[True-True-1-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avail...)\x1b[31m [ 8%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_4to2[True-True-3-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avail...)\x1b[31m [ 8%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_4to2[True-True-3-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avail...)\x1b[31m [ 9%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_4to2[True-True-3-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avail...)\x1b[31m [ 9%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to4[False-False-1-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are ava...)\x1b[31m [ 9%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to4[False-False-1-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are ava...)\x1b[31m [ 9%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to4[False-False-1-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are ava...)\x1b[31m [ 9%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to4[False-False-3-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are ava...)\x1b[31m [ 9%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to4[False-False-3-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are ava...)\x1b[31m [ 9%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to4[False-False-3-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are ava...)\x1b[31m [ 9%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to4[False-True-1-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test 
because not enough GPUs are avai...)\x1b[31m [ 9%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to4[False-True-1-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 9%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to4[False-True-1-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 9%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to4[False-True-3-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 9%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to4[False-True-3-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 10%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to4[False-True-3-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 10%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to4[True-False-1-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 10%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to4[True-False-1-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 10%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to4[True-False-1-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 10%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to4[True-False-3-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 10%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to4[True-False-3-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 10%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to4[True-False-3-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avai...)\x1b[31m [ 10%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to4[True-True-1-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avail...)\x1b[31m [ 10%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to4[True-True-1-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avail...)\x1b[31m [ 10%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to4[True-True-1-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avail...)\x1b[31m [ 10%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to4[True-True-3-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avail...)\x1b[31m [ 
10%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to4[True-True-3-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avail...)\x1b[31m [ 11%]\x1b[0m\ntests/unit/checkpoint/test_universal_checkpoint.py::TestZeROUniversalCheckpointDP::test_dp_world_size_2to4[True-True-3-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are avail...)\x1b[31m [ 11%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_pipeline_checkpoint_loading[3] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 avail...)\x1b[31m [ 11%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_load_optimizer_state[1-False-Adam] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 a...)\x1b[31m [ 11%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_load_optimizer_state[2-False-Adam] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 a...)\x1b[31m [ 11%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_load_optimizer_state[2-True-deepspeed_adam] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 requ...)\x1b[31m [ 11%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_load_optimizer_state[3-False-Adam] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 a...)\x1b[31m [ 11%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_load_optimizer_state[3-True-deepspeed_adam] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 requ...)\x1b[31m [ 11%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_not_load_optimizer_state[1-False-Adam] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required,...)\x1b[31m [ 11%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_not_load_optimizer_state[2-False-Adam] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required,...)\x1b[31m [ 11%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_not_load_optimizer_state[2-True-deepspeed_adam] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 ...)\x1b[31m [ 11%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_not_load_optimizer_state[3-False-Adam] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required,...)\x1b[31m [ 11%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_not_load_optimizer_state[3-True-deepspeed_adam] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 ...)\x1b[31m [ 12%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_hybrid_optimizer_state[1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 available)\x1b[31m [ 12%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_hybrid_optimizer_state[2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 available)\x1b[31m [ 12%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_load_module_only[0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are 
available: 2 required, 1 available)\x1b[31m [ 12%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_load_module_only[1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 available)\x1b[31m [ 12%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_load_module_only[2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 available)\x1b[31m [ 12%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROCheckpoint::test_load_module_only[3] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 available)\x1b[31m [ 12%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROElasticCheckpoint::test_elastic_checkpoint_fixed_dp[True-True-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available:...)\x1b[31m [ 12%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROElasticCheckpoint::test_elastic_checkpoint_fixed_dp[True-True-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available...)\x1b[31m [ 12%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROElasticCheckpoint::test_elastic_checkpoint_fixed_dp[True-False-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available...)\x1b[31m [ 12%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROElasticCheckpoint::test_elastic_checkpoint_fixed_dp[True-False-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are availabl...)\x1b[31m [ 12%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROElasticCheckpoint::test_elastic_checkpoint_fixed_dp[False-True-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available...)\x1b[31m [ 13%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROElasticCheckpoint::test_elastic_checkpoint_fixed_dp[False-True-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are availabl...)\x1b[31m [ 13%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROElasticCheckpoint::test_elastic_checkpoint_fixed_dp[False-False-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are availabl...)\x1b[31m [ 13%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROElasticCheckpoint::test_elastic_checkpoint_fixed_dp[False-False-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are availab...)\x1b[31m [ 13%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROElasticCheckpoint::test_elastic_checkpoint_change_dp[True-True-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available...)\x1b[31m [ 13%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROElasticCheckpoint::test_elastic_checkpoint_change_dp[True-True-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are availabl...)\x1b[31m [ 13%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROElasticCheckpoint::test_elastic_checkpoint_change_dp[True-False-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are availabl...)\x1b[31m [ 13%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROElasticCheckpoint::test_elastic_checkpoint_change_dp[True-False-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are availab...)\x1b[31m [ 13%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROElasticCheckpoint::test_elastic_checkpoint_change_dp[False-True-True] 
\x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are availabl...)\x1b[31m [ 13%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROElasticCheckpoint::test_elastic_checkpoint_change_dp[False-True-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are availab...)\x1b[31m [ 13%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROElasticCheckpoint::test_elastic_checkpoint_change_dp[False-False-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are availab...)\x1b[31m [ 13%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROElasticCheckpoint::test_elastic_checkpoint_change_dp[False-False-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are availa...)\x1b[31m [ 13%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROSaveLoadEdgeCase::test_immediate_save_load[0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 available)\x1b[31m [ 14%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROSaveLoadEdgeCase::test_immediate_save_load[1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 available)\x1b[31m [ 14%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROSaveLoadEdgeCase::test_immediate_save_load[2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 available)\x1b[31m [ 14%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROSaveLoadEdgeCase::test_immediate_save_load[3] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 available)\x1b[31m [ 14%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROSaveLoadEdgeCase::test_load_immediate_save[0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 available)\x1b[31m [ 14%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROSaveLoadEdgeCase::test_load_immediate_save[1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 available)\x1b[31m [ 14%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROSaveLoadEdgeCase::test_load_immediate_save[2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 available)\x1b[31m [ 14%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROSaveLoadEdgeCase::test_load_immediate_save[3] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 available)\x1b[31m [ 14%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROSaveLoadEdgeCase::test_save_before_accum_grad_is_done[0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required...)\x1b[31m [ 14%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROSaveLoadEdgeCase::test_save_before_accum_grad_is_done[1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required...)\x1b[31m [ 14%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROSaveLoadEdgeCase::test_save_before_accum_grad_is_done[2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required...)\x1b[31m [ 14%]\x1b[0m\ntests/unit/checkpoint/test_zero_optimizer.py::TestZeROSaveLoadEdgeCase::test_save_before_accum_grad_is_done[3] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required...)\x1b[31m [ 
tests/unit/checkpoint/test_zero_optimizer.py
  SKIPPED (not enough GPUs are available: 2 required, 1 available):
    TestZeROCheckpointFrozenWeights::test_load_optimizer_state[1,2,3]
    TestZeROCheckpointFrozenWeights::test_not_load_optimizer_state[1,2,3]
    TestZeROCheckpointFrozenWeights::test_load_module_only[1,2,3]
    TestZeROCheckpointFrozenWeights::test_save_exclude_frozen_weights[1,2]
    TestZeROCheckpointFrozenWeights::test_save_exclude_custom_frozen_weights[1,2]
  FAILED:
    TestSaveTensorClone::test_save_tensor_clone[True-1, True-2, False-1, False-2]
    TestZeRONonDistributed::test_chmod_exception_handling[1,2,3]

tests/unit/comm/test_dist.py
  SKIPPED (not enough GPUs are available: 3 required, 1 available):
    TestInit::test
  SKIPPED (not enough GPUs are available: 2 required, 1 available):
    TestDistArgs::test[hello-icosahedron-1138-purple]
    TestGroupedDistTest::test_one[1138], TestGroupedDistTest::test_two[1138]
    TestWorldSizeOverrideDistTest::test_world_size_2
    TestDistributedFixture::test[2-16, 2-32, 4-16, 4-32]
    TestDistInit::test_already_init[True, False, None]
    TestDistInit::test_no_init[True, False, None]
    TestDistInitWithModel::test_already_init[True, False]
    TestDistInitWithModel::test_no_init[True, False]
  PASSED:
    TestWorldSizeOverrideDistTest::test_world_size_1
    TestDistAllReduce::test
    TestDistInferenceAllReduce::test[dtype0, dtype1, dtype2]
    TestDistInitNoEnv::test

tests/unit/compression/test_compression.py
  SKIPPED (not enough GPUs are available: 2 required, 1 available):
    TestCompression::test_linear_layer_compress
    TestCompression::test_conv1d_convertion
  SKIPPED (megatron-lm is currently broken so this test cannot be run):
    TestCompression::test_mpu_compress

tests/unit/compression/test_dequantization.py
  SKIPPED (not enough GPUs are available: 2 required, 1 available):
    TestDequantization::test_dequantize

tests/unit/elasticity/test_elastic.py
  PASSED:
    test_basic_10k, test_old_version, test_disabled, test_valid_world_size,
    test_invalid_world_size, test_future_elastic_version, test_missing_max_batch,
    test_missing_micro_batch, test_empty_config, test_model_parallel_v1_invalid,
    test_model_parallel_v2_invalid, test_model_parallel_v2_valid, test_proper_mbsz,
    test_invalid_config_values[micro_batch_sizes-value0, min_gpus--1, max_gpus--1,
      micro_batch_sizes-5, micro_batch_sizes-value4, micro_batch_sizes-value5]
  SKIPPED (not enough GPUs are available: 2 required, 1 available):
    TestNonElasticBatchParams::test
    TestNonElasticBatchParamsWithOverride::test
    TestElasticConfigChanged::test

tests/unit/inference/quantization/test_intX_quantization.py
  SKIPPED (not enough GPUs are available: 2 required, 1 available):
    TestQuantizedInt::test_model_quantization[4bits, 8bits]
    TestQuantizedInt::test_quantized_linear[4bits-0, 4bits-1, 8bits-0, 8bits-1]
    TestQuantizedInt::test_float_int4_quantization, TestQuantizedInt::test_half_int4_quantization
    TestQuantizedInt::test_float_int8_quantization, TestQuantizedInt::test_half_int8_quantization
    TestQuantizedInt::test_zero3_int4_post_init_quant[4bits, 8bits]
    TestQuantizedInt::test_zero3_int4_post_init_quant_cpu_offload[4bits, 8bits]
    TestQuantizedInt::test_zero3_int4_post_init_quant_nvme_offload
    TestQuantizedInt::test_zero3_int4_quantized_initialization[4bits, 8bits]
    TestQuantizedInt::test_zero3_int4_quantized_initialization_cpu_offload[4bits, 8bits]
    TestQuantizedInt::test_zero3_int4_quantized_initialization_nvme_offload

tests/unit/launcher/test_ds_arguments.py
  PASSED:
    test_no_ds_arguments_no_ds_parser, test_no_ds_arguments, test_no_ds_enable_argument,
    test_no_ds_config_argument, test_no_ds_parser, test_core_deepscale_arguments,
    test_core_binding_arguments

tests/unit/launcher/test_multinode_runner.py
  PASSED:
    test_pdsh_runner, test_openmpi_runner, test_mpich_runner, test_slurm_runner,
    test_mvapich_runner

tests/unit/launcher/test_run.py
  PASSED:
    test_parser_mutual_exclusive, test_parser_local, test_parser_multinode,
    test_parser_errors, test_num_plus_parser, test_hostfile_good, test_hostfiles_bad

tests/unit/launcher/test_user_args.py
  FAILED:
    test_user_args[True-"I am 6' tall"], test_user_args[False-"I am 6' tall"]
    test_user_args[True-'I am 72" tall'], test_user_args[False-'I am 72" tall']
    test_user_args[True-'"translate English to Romanian: "'],
    test_user_args[False-'"translate English to Romanian: "']
    test_user_args[True-I'm going to tell them "DeepSpeed is the best"],
    test_user_args[False-I'm going to tell them "DeepSpeed is the best"]
    test_bash_string_args

tests/unit/linear/test_ctx.py
  SKIPPED (not enough GPUs are available: 2 required, 1 available):
    TestEngine::test_model
    TestInitTransformers::test_pretrained_init, TestInitTransformers::test_config_init

tests/unit/model_parallelism/test_configurable_parallel_mp.py
  SKIPPED (Megatron-LM package requires Pytorch version >=1.5 and <=1.13):
    TestConfigurableMP::test_gpt2_basic, TestConfigurableMP::test_gpt2_mp2_no_resize
    TestConfigurableResizeMP::test

tests/unit/model_parallelism/test_configurable_parallel_pp.py
  SKIPPED (Megatron-LM package requires Pytorch version >=1.5 and <=1.13):
    TestConfigurablePP::test_pp_basic
    TestConfigurableResizePP::test_world_size_2to1[1-2-1-1]
    TestConfigurableResizePP::test_world_size_4to1[2-2-1-1]
    TestConfigurableResizePP::test_world_size_4to2[2-2-2-1]
    TestConfigurableResizePP::test_world_size_1to4[1-1-2-2]
    TestConfigurableResizePP::test_world_size_2to4[1-2-1-4]
    TestConfigurableResizePP::test_world_size_2to4[2-1-2-2]

tests/unit/moe/test_moe.py
  SKIPPED (not enough GPUs are available: 2 required, 1 available):
    TestSimpleMoE::test[0, 1, 2]
    TestTopk::test
  SKIPPED (not enough GPUs are available: 4 required, 1 available):
    TestMoE::test[True-0-2, True-0-4, True-1-2, True-1-4, True-2-2, True-2-4,
      False-0-2, False-0-4, False-1-2, False-1-4, False-2-2, False-2-4]
    TestPRMoE::test[2-True, 2-False]

tests/unit/moe/test_moe_tp.py
  SKIPPED (not enough GPUs are available: 4 required, 1 available):
    TestMOETensorParallel::test[True-True-1-2, True-True-1-4, True-True-2-2,
      True-False-1-2, True-False-1-4, True-False-2-2, False-True-1-2, False-True-1-4,
      False-True-2-2, False-False-1-2, False-False-1-4, False-False-2-2]

tests/unit/monitor/test_monitor.py
  SKIPPED (not enough GPUs are available: 2 required, 1 available):
    TestCSVMonitor::test_csv_monitor, TestCSVMonitor::test_empty_csv_monitor
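Nearly all of the skips above share one cause: only a single GPU is visible on the build node, while these distributed tests request a world size of 2, 3 or 4. In spirit, the guard that emits this message compares the requested world size with the number of visible devices; the following is a minimal sketch of that kind of check, not DeepSpeed's actual test helper.

```python
# Illustrative only: a guard of this shape produces the repeated
# "not enough GPUs are available" skips seen in the session above.
import pytest
import torch


def skip_unless_gpus(required: int) -> None:
    """Skip the current test when fewer GPUs are visible than it needs."""
    available = torch.cuda.device_count()
    if available < required:
        pytest.skip(
            f"Skipping test because not enough GPUs are available: "
            f"{required} required, {available} available"
        )
```

Because only one device is visible here, every test that asks for two or more ranks is skipped rather than run, so the remaining signal in this session comes from the single-process op, config and scheduler tests.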
tests/unit/ops/accelerators/test_accelerator_backward.py
  FAILED:
    TestCUDABackward::test_backward[64-160-128-2-24-False-True-0.2]
    TestCUDABackward::test_backward[64-1600-128-2-4-False-True-0.2]
    TestCUDABackward::test_backward[8-1600-128-25-3-True-True-0.05]
    TestCUDABackward::test_backward[8-160-128-2-3-True-True-0.1]
    TestCUDABackward::test_backward[8-1600-128-2-3-True-True-0.05]

tests/unit/ops/accelerators/test_accelerator_forward.py
  FAILED:
    TestCUDAForwardSmallBatchSize::test_forward_with_small_bsz[8-3-1024-512-16-3-True-False]
    TestCUDAForwardSmallBatchSize::test_forward_with_small_bsz[8-7-1024-512-16-3-True-True]
    TestCUDAForwardSmallBatchSize::test_forward_with_small_bsz[8-3-1024-512-16-3-False-False]
    TestCUDAForwardSmallBatchSize::test_forward_with_small_bsz[8-7-1024-512-16-3-False-True]
  SKIPPED:
    TestCUDAForwardStochastic::test_forward_stochastic[batch_size0-hidden_size0-seq_len0-heads0-num_layers0-is_preln0-use_fp160]

tests/unit/ops/adagrad/test_cpu_adagrad.py
  PASSED:
    TestCPUAdagrad::test_cpu_adagrad_opt, TestCPUAdagrad::test_cpu_adagrad_opt_sparse_embedding
  SKIPPED (not enough GPUs are available: 2 required, 1 available):
    TestCPUAdagradGPUError::test_cpu_adagrad_gpu_error

tests/unit/ops/adam/test_adamw.py
  FAILED:
    TestAdamConfigs::test[AdamW-False-False-False-resulting_optimizer0]
    TestAdamConfigs::test[AdamW-False-False-True-resulting_optimizer4]
    TestAdamConfigs::test[Adam-False-False-False-resulting_optimizer8]
    TestAdamConfigs::test[Adam-False-False-True-resulting_optimizer12]
  PASSED:
    TestAdamConfigs::test: the remaining twelve AdamW/Adam combinations
      (resulting_optimizer 1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15)

tests/unit/ops/adam/test_cpu_adam.py
  FAILED:
    TestCPUAdam::test_fused_adam_equal: all 15 combinations of
      size {64, 22, 128, 1024, 1048576} and dtype {fp16, bf16, fp32}
  SKIPPED (torch.optim.AdamW with half precision inf/nan output):
    TestCPUAdam::test_torch_adamw_equal[64-fp16, 22-fp16, 128-fp16, 1024-fp16, 1048576-fp16]
  PASSED:
    TestCPUAdam::test_torch_adamw_equal: all bf16 and fp32 variants for
      sizes {64, 22, 128, 1024, 1048576}
  SKIPPED (not enough GPUs are available: 2 required, 1 available):
    TestCPUAdamGPUError::test_cpu_adam_gpu_error

tests/unit/ops/adam/test_hybrid_adam.py
  FAILED:
    TestHybridAdam::test_hybrid_adam_equal[8-fp16, 8-bf16, 8-fp32, 16-fp16, 16-bf16, 16-fp32]

tests/unit/ops/aio/test_aio.py
  PASSED:
    TestRead::test_parallel_read: all 8 True/False combinations
    TestRead::test_async_read: all 16 True/False combinations
    TestWrite::test_parallel_write: all 8 True/False combinations
    TestWrite::test_async_write: all 16 True/False combinations

tests/unit/ops/deepspeed4science/test_DS4Sci_EvoformerAttention.py
  FAILED:
    test_DS4Sci_EvoformerAttention[tensor_shape0-dtype0, tensor_shape0-dtype1,
      tensor_shape1-dtype0, tensor_shape1-dtype1]

tests/unit/ops/lion/test_cpu_lion.py
  FAILED:
    TestCPULion::test_fused_lion_equal: all 15 combinations of
      size {64, 22, 128, 1024, 1048576} and dtype {fp16, bf16, fp32}
  SKIPPED (not enough GPUs are available: 2 required, 1 available):
    TestCPULionGPUError::test_cpu_lion_gpu_error

tests/unit/ops/lion/test_lion.py
  FAILED:
    TestLionConfigs::test[Lion-False-FusedLion]
  PASSED:
    TestLionConfigs::test[Lion-True-DeepSpeedCPULion]

tests/unit/pipe/test_pipe_module.py
  SKIPPED (not enough GPUs are available: 2 required, 1 available):
    TestPipeModuleSequential::test[False, True]

tests/unit/profiling/flops_profiler/test_flops_profiler.py
  FAILED:
    TestFlopsProfiler::test
  PASSED:
    TestFlopsProfiler::test_flops_profiler_in_inference

tests/unit/runtime/test_autocast.py
  SKIPPED (not enough GPUs are available: 2 required, 1 available):
    TestAutoCastDisable::test_missing_amp_autocast[False, True]
    TestAutoCastDisable::test_disable_autocast_linear[False, True]
    TestAutoCastEnable::test_autocast_linear[False-False, False-True, True-False, True-True]
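The cpu_adam, hybrid_adam and cpu_lion failures above are equality checks between DeepSpeed's compiled CPU optimizer extensions and a reference torch optimizer. A rough sketch of the kind of comparison involved, assuming the deepspeed.ops.adam extension builds and loads at all on this node, is shown below; this is simplified and is not the actual test body.

```python
# Simplified sketch of an optimizer-equality check: step DeepSpeed's CPU Adam
# next to torch.optim.AdamW on identical parameters and compare the updates.
# Assumes the deepspeed cpu_adam op can be JIT-built/loaded in this environment.
import torch
from deepspeed.ops.adam import DeepSpeedCPUAdam

size = 64
param_ds = torch.nn.Parameter(torch.randn(size))
param_ref = torch.nn.Parameter(param_ds.detach().clone())

# weight_decay is pinned on both sides so the reference update matches.
opt_ds = DeepSpeedCPUAdam([param_ds], lr=1e-3, weight_decay=0.0)
opt_ref = torch.optim.AdamW([param_ref], lr=1e-3, weight_decay=0.0)

grad = torch.randn(size)
param_ds.grad = grad.clone()
param_ref.grad = grad.clone()
opt_ds.step()
opt_ref.step()

# Expect True when the compiled kernel is sound; the tolerance is illustrative.
print(torch.allclose(param_ds, param_ref, atol=1e-4))
```

When the extension fails to build or load, a comparison like this errors out before any numerics are checked, which is consistent with every size/dtype variant failing at once in the log above.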
41%]\x1b[0m\ntests/unit/runtime/test_autocast.py::TestAutoCastEnable::test_autocast_linear[True-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 available)\x1b[31m [ 41%]\x1b[0m\ntests/unit/runtime/test_autocast.py::TestAutoCastEnable::test_autocast_linear[True-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 available)\x1b[31m [ 41%]\x1b[0m\ntests/unit/runtime/test_data.py::test_repeating_loader \x1b[32mPASSED\x1b[0m\x1b[31m [ 41%]\x1b[0m\ntests/unit/runtime/test_data.py::TestDataLoaderDropLast::test[1-True] \x1b[32mPASSED\x1b[0m\x1b[31m [ 41%]\x1b[0m\ntests/unit/runtime/test_data.py::TestDataLoaderDropLast::test[4-True] \x1b[32mPASSED\x1b[0m\x1b[31m [ 41%]\x1b[0m\ntests/unit/runtime/test_data.py::TestDataLoaderDropLast::test[1-False] \x1b[32mPASSED\x1b[0m\x1b[31m [ 41%]\x1b[0m\ntests/unit/runtime/test_data.py::TestDataLoaderDropLast::test[4-False] \x1b[32mPASSED\x1b[0m\x1b[31m [ 41%]\x1b[0m\ntests/unit/runtime/test_data_efficiency.py::TestDataEfficiency::test_curriculum_learning \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 available)\x1b[31m [ 42%]\x1b[0m\ntests/unit/runtime/test_data_efficiency.py::TestLegacyCurriculumScheduler::test_fixed_discrete \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 available)\x1b[31m [ 42%]\x1b[0m\ntests/unit/runtime/test_data_efficiency.py::TestLegacyCurriculumScheduler::test_fixed_linear \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 available)\x1b[31m [ 42%]\x1b[0m\ntests/unit/runtime/test_ds_config_dict.py::TestBasicConfig::test_accelerator \x1b[32mPASSED\x1b[0m\x1b[31m [ 42%]\x1b[0m\ntests/unit/runtime/test_ds_config_dict.py::TestBasicConfig::test_check_version \x1b[32mPASSED\x1b[0m\x1b[31m [ 42%]\x1b[0m\ntests/unit/runtime/test_ds_config_dict.py::TestBatchConfig::test[2-32-16-1-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 available)\x1b[31m [ 42%]\x1b[0m\ntests/unit/runtime/test_ds_config_dict.py::TestBatchConfig::test[2-32-8-2-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 available)\x1b[31m [ 42%]\x1b[0m\ntests/unit/runtime/test_ds_config_dict.py::TestBatchConfig::test[2-33-17-2-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 available)\x1b[31m [ 42%]\x1b[0m\ntests/unit/runtime/test_ds_config_dict.py::TestBatchConfig::test[2-32-18-1-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 available)\x1b[31m [ 42%]\x1b[0m\ntests/unit/runtime/test_ds_config_dict.py::test_temp_config_json \x1b[32mPASSED\x1b[0m\x1b[31m [ 42%]\x1b[0m\ntests/unit/runtime/test_ds_config_dict.py::test_gather_16bit_params_on_model_save[stage3_gather_16bit_weights_on_model_save] \x1b[32mPASSED\x1b[0m\x1b[31m [ 42%]\x1b[0m\ntests/unit/runtime/test_ds_config_dict.py::test_gather_16bit_params_on_model_save[stage3_gather_fp16_weights_on_model_save] \x1b[32mPASSED\x1b[0m\x1b[31m [ 42%]\x1b[0m\ntests/unit/runtime/test_ds_config_dict.py::test_get_bfloat16_enabled[bf16] \x1b[32mPASSED\x1b[0m\x1b[31m [ 43%]\x1b[0m\ntests/unit/runtime/test_ds_config_dict.py::test_get_bfloat16_enabled[bfloat16] \x1b[32mPASSED\x1b[0m\x1b[31m [ 43%]\x1b[0m\ntests/unit/runtime/test_ds_config_dict.py::TestConfigLoad::test_dict \x1b[31mFAILED\x1b[0m\x1b[31m [ 
43%]\x1b[0m\ntests/unit/runtime/test_ds_config_dict.py::TestConfigLoad::test_json \x1b[31mFAILED\x1b[0m\x1b[31m [ 43%]\x1b[0m\ntests/unit/runtime/test_ds_config_dict.py::TestConfigLoad::test_hjson \x1b[31mFAILED\x1b[0m\x1b[31m [ 43%]\x1b[0m\ntests/unit/runtime/test_ds_config_dict.py::TestDeprecatedDeepScaleConfig::test \x1b[31mFAILED\x1b[0m\x1b[31m [ 43%]\x1b[0m\ntests/unit/runtime/test_ds_config_dict.py::TestDistInit::test \x1b[31mFAILED\x1b[0m\x1b[31m [ 43%]\x1b[0m\ntests/unit/runtime/test_ds_config_dict.py::TestInitNoOptimizer::test \x1b[32mPASSED\x1b[0m\x1b[31m [ 43%]\x1b[0m\ntests/unit/runtime/test_ds_config_dict.py::TestArgs::test_none_args \x1b[31mFAILED\x1b[0m\x1b[31m [ 43%]\x1b[0m\ntests/unit/runtime/test_ds_config_dict.py::TestArgs::test_no_args \x1b[31mFAILED\x1b[0m\x1b[31m [ 43%]\x1b[0m\ntests/unit/runtime/test_ds_config_dict.py::TestNoModel::test \x1b[32mPASSED\x1b[0m\x1b[31m [ 43%]\x1b[0m\ntests/unit/runtime/test_ds_config_model.py::test_only_required_fields \x1b[32mPASSED\x1b[0m\x1b[31m [ 43%]\x1b[0m\ntests/unit/runtime/test_ds_config_model.py::test_config_duplicate_key \x1b[32mPASSED\x1b[0m\x1b[31m [ 44%]\x1b[0m\ntests/unit/runtime/test_ds_config_model.py::test_config_base \x1b[32mPASSED\x1b[0m\x1b[31m [ 44%]\x1b[0m\ntests/unit/runtime/test_ds_config_model.py::test_config_base_deprecatedfield \x1b[32mPASSED\x1b[0m\x1b[31m [ 44%]\x1b[0m\ntests/unit/runtime/test_ds_config_model.py::test_config_base_aliasfield \x1b[32mPASSED\x1b[0m\x1b[31m [ 44%]\x1b[0m\ntests/unit/runtime/test_ds_config_model.py::test_config_base_literalfail[config_dict0] \x1b[32mPASSED\x1b[0m\x1b[31m [ 44%]\x1b[0m\ntests/unit/runtime/test_ds_config_model.py::test_config_base_literalfail[config_dict1] \x1b[32mPASSED\x1b[0m\x1b[31m [ 44%]\x1b[0m\ntests/unit/runtime/test_ds_config_model.py::test_config_base_literalfail[config_dict2] \x1b[32mPASSED\x1b[0m\x1b[31m [ 44%]\x1b[0m\ntests/unit/runtime/test_ds_config_model.py::test_config_base_deprecatedfail \x1b[32mPASSED\x1b[0m\x1b[31m [ 44%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestNoOptim::test[0] \x1b[32mPASSED\x1b[0m\x1b[31m [ 44%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestNoOptim::test[3] \x1b[32mPASSED\x1b[0m\x1b[31m [ 44%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestClientOptimizer::test[None] \x1b[31mFAILED\x1b[0m\x1b[31m [ 44%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestClientOptimizer::test[Optimizer] \x1b[32mPASSED\x1b[0m\x1b[31m [ 44%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestClientOptimizer::test[Callable] \x1b[32mPASSED\x1b[0m\x1b[31m [ 45%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestConfigOptimizer::test[True] \x1b[31mFAILED\x1b[0m\x1b[31m [ 45%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestConfigOptimizer::test[False] \x1b[31mFAILED\x1b[0m\x1b[31m [ 45%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-fp16-zero1] \x1b[31mFAILED\x1b[0m\x1b[31m [ 45%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-fp16-zero2] \x1b[31mFAILED\x1b[0m\x1b[31m [ 45%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-fp16-zero3] \x1b[31mFAILED\x1b[0m\x1b[31m [ 45%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-fp16-amp] \x1b[33mSKIPPED\x1b[0m (Amp is not installed can\'t run amp check)\x1b[31m [ 45%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-fp16-None] \x1b[31mFAILED\x1b[0m\x1b[31m [ 
45%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-bf16-zero1] \x1b[31mFAILED\x1b[0m\x1b[31m [ 45%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-bf16-zero2] \x1b[31mFAILED\x1b[0m\x1b[31m [ 45%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-bf16-zero3] \x1b[31mFAILED\x1b[0m\x1b[31m [ 45%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-bf16-amp] \x1b[33mSKIPPED\x1b[0m (Amp is not installed can\'t run amp check)\x1b[31m [ 46%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-bf16-None] \x1b[31mFAILED\x1b[0m\x1b[31m [ 46%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-fp32-zero1] \x1b[31mFAILED\x1b[0m\x1b[31m [ 46%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-fp32-zero2] \x1b[31mFAILED\x1b[0m\x1b[31m [ 46%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-fp32-zero3] \x1b[31mFAILED\x1b[0m\x1b[31m [ 46%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-fp32-amp] \x1b[33mSKIPPED\x1b[0m (Amp is not installed can\'t run amp check)\x1b[31m [ 46%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[None-fp32-None] \x1b[31mFAILED\x1b[0m\x1b[31m [ 46%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-fp16-zero1] \x1b[31mFAILED\x1b[0m\x1b[31m [ 46%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-fp16-zero2] \x1b[31mFAILED\x1b[0m\x1b[31m [ 46%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-fp16-zero3] \x1b[31mFAILED\x1b[0m\x1b[31m [ 46%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-fp16-amp] \x1b[33mSKIPPED\x1b[0m (Amp is not installed can\'t run amp check)\x1b[31m [ 46%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-fp16-None] \x1b[31mFAILED\x1b[0m\x1b[31m [ 46%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-bf16-zero1] \x1b[31mFAILED\x1b[0m\x1b[31m [ 47%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-bf16-zero2] \x1b[31mFAILED\x1b[0m\x1b[31m [ 47%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-bf16-zero3] \x1b[31mFAILED\x1b[0m\x1b[31m [ 47%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-bf16-amp] \x1b[33mSKIPPED\x1b[0m (Amp is not installed can\'t run amp check)\x1b[31m [ 47%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-bf16-None] \x1b[31mFAILED\x1b[0m\x1b[31m [ 47%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-fp32-zero1] \x1b[31mFAILED\x1b[0m\x1b[31m [ 47%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-fp32-zero2] \x1b[31mFAILED\x1b[0m\x1b[31m [ 47%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-fp32-zero3] \x1b[31mFAILED\x1b[0m\x1b[31m [ 47%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-fp32-amp] \x1b[33mSKIPPED\x1b[0m (Amp is not installed can\'t run amp check)\x1b[31m [ 
47%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp16-fp32-None] \x1b[31mFAILED\x1b[0m\x1b[31m [ 47%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-fp16-zero1] \x1b[31mFAILED\x1b[0m\x1b[31m [ 47%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-fp16-zero2] \x1b[31mFAILED\x1b[0m\x1b[31m [ 47%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-fp16-zero3] \x1b[31mFAILED\x1b[0m\x1b[31m [ 48%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-fp16-amp] \x1b[33mSKIPPED\x1b[0m (Amp is not installed can\'t run amp check)\x1b[31m [ 48%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-fp16-None] \x1b[31mFAILED\x1b[0m\x1b[31m [ 48%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-bf16-zero1] \x1b[31mFAILED\x1b[0m\x1b[31m [ 48%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-bf16-zero2] \x1b[31mFAILED\x1b[0m\x1b[31m [ 48%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-bf16-zero3] \x1b[31mFAILED\x1b[0m\x1b[31m [ 48%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-bf16-amp] \x1b[33mSKIPPED\x1b[0m (Amp is not installed can\'t run amp check)\x1b[31m [ 48%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-bf16-None] \x1b[31mFAILED\x1b[0m\x1b[31m [ 48%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-fp32-zero1] \x1b[31mFAILED\x1b[0m\x1b[31m [ 48%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-fp32-zero2] \x1b[31mFAILED\x1b[0m\x1b[31m [ 48%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-fp32-zero3] \x1b[31mFAILED\x1b[0m\x1b[31m [ 48%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-fp32-amp] \x1b[33mSKIPPED\x1b[0m (Amp is not installed can\'t run amp check)\x1b[31m [ 48%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[bf16-fp32-None] \x1b[31mFAILED\x1b[0m\x1b[31m [ 49%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-fp16-zero1] \x1b[31mFAILED\x1b[0m\x1b[31m [ 49%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-fp16-zero2] \x1b[31mFAILED\x1b[0m\x1b[31m [ 49%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-fp16-zero3] \x1b[31mFAILED\x1b[0m\x1b[31m [ 49%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-fp16-amp] \x1b[33mSKIPPED\x1b[0m (Amp is not installed can\'t run amp check)\x1b[31m [ 49%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-fp16-None] \x1b[31mFAILED\x1b[0m\x1b[31m [ 49%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-bf16-zero1] \x1b[31mFAILED\x1b[0m\x1b[31m [ 49%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-bf16-zero2] \x1b[31mFAILED\x1b[0m\x1b[31m [ 49%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-bf16-zero3] \x1b[31mFAILED\x1b[0m\x1b[31m [ 
49%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-bf16-amp] \x1b[33mSKIPPED\x1b[0m (Amp is not installed can\'t run amp check)\x1b[31m [ 49%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-bf16-None] \x1b[31mFAILED\x1b[0m\x1b[31m [ 49%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-fp32-zero1] \x1b[31mFAILED\x1b[0m\x1b[31m [ 50%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-fp32-zero2] \x1b[31mFAILED\x1b[0m\x1b[31m [ 50%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-fp32-zero3] \x1b[31mFAILED\x1b[0m\x1b[31m [ 50%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-fp32-amp] \x1b[33mSKIPPED\x1b[0m (Amp is not installed can\'t run amp check)\x1b[31m [ 50%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestOptimizerImplementation::test[fp32-fp32-None] \x1b[31mFAILED\x1b[0m\x1b[31m [ 50%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[None-None] \x1b[31mFAILED\x1b[0m\x1b[31m [ 50%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[None-_LRScheduler] \x1b[31mFAILED\x1b[0m\x1b[31m [ 50%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[None-Callable] \x1b[31mFAILED\x1b[0m\x1b[31m [ 50%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[Optimizer-None] \x1b[32mPASSED\x1b[0m\x1b[31m [ 50%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[Optimizer-_LRScheduler] \x1b[32mPASSED\x1b[0m\x1b[31m [ 50%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[Optimizer-Callable] \x1b[32mPASSED\x1b[0m\x1b[31m [ 50%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[Callable-None] \x1b[32mPASSED\x1b[0m\x1b[31m [ 50%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[Callable-_LRScheduler] \x1b[32mPASSED\x1b[0m\x1b[31m [ 51%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestClientLrScheduler::test[Callable-Callable] \x1b[32mPASSED\x1b[0m\x1b[31m [ 51%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestClientLrSchedulerInit::test_same_lrscheler_and_callable[None] \x1b[32mPASSED\x1b[0m\x1b[31m [ 51%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestClientLrSchedulerInit::test_same_lrscheler_and_callable[_LRScheduler] \x1b[32mPASSED\x1b[0m\x1b[31m [ 51%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestClientLrSchedulerInit::test_same_lrscheler_and_callable[Callable] \x1b[32mPASSED\x1b[0m\x1b[31m [ 51%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestClientLrSchedulerInit::test_diff_lrscheler_and_callable[None] \x1b[32mPASSED\x1b[0m\x1b[31m [ 51%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestClientLrSchedulerInit::test_diff_lrscheler_and_callable[_LRScheduler] \x1b[32mPASSED\x1b[0m\x1b[31m [ 51%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestClientLrSchedulerInit::test_diff_lrscheler_and_callable[Callable] \x1b[32mPASSED\x1b[0m\x1b[31m [ 51%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestClientLrSchedulerInit::test_diff_lrscheler_and_callable_onecyclelr_steplr[None] \x1b[32mPASSED\x1b[0m\x1b[31m [ 51%]\x1b[0m\ntests/unit/runtime/test_ds_initialize.py::TestClientLrSchedulerInit::test_diff_lrscheler_and_callable_onecyclelr_steplr[_LRScheduler] \x1b[32mPASSED\x1b[0m\x1b[31m [ 
tests/unit/runtime/test_lr_schedulers.py::TestGetLrBeforeTrain::test
  FAILED: [WarmupLR-params0] [WarmupDecayLR-params1] [OneCycle-params2] [LRRangeTest-params3]
tests/unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_schedule
  FAILED: [log-10] [log-15] [log-19] [log-33] [linear-10] [linear-15] [linear-19] [linear-33]
tests/unit/runtime/test_lr_schedulers.py::TestLrSchedule::test_lr_warmup_decay_schedule
  FAILED: [log-10] [log-15] [log-19] [log-33] [linear-10] [linear-15] [linear-19] [linear-33]
tests/unit/runtime/test_lr_schedulers.py::TestSchedulerOptimizerParity::test
  FAILED: [WarmupLR-params0] [WarmupDecayLR-params1] [OneCycle-params2] [LRRangeTest-params3]
tests/unit/runtime/test_lr_schedulers.py::TestLrRange::test
  FAILED: [0.0001-1e-05-1-True] [1e-05-1e-05-1-False] [0.0001-0.001-10-True] [0.001-0.001-10-False] [0.01-0.01-19-True] [0.01-0.01-19-False]
tests/unit/runtime/test_lr_schedulers.py::TestOneCycle::test_lr
  FAILED: [1e-05-0.01-0.001-10-100] [0.001-0.1-0-21-21] [1e-05-0.01-0.001-10-101] [0.001-0.1-0.1-21-21] [1e-05-0.1-0-10-0]
tests/unit/runtime/test_lr_schedulers.py::TestOneCycle::test_mom
  FAILED: [0.08-0.09-0.001-100] [0.08-0.09-0-210] [0.08-0.09-0.001-101] [0.08-0.09-0-211]
tests/unit/runtime/test_lr_schedulers.py::TestWarmupCosineLR::test_lr
  FAILED: [100-10-0.1-0.2] [200-20-0.1-0.2] [500-30-0.0-0.2] [600-300-0.1-0.0] [600-550-0.0-0.0]
tests/unit/runtime/test_multi_output_model.py::TestTwoOutputModel::test  FAILED
tests/unit/runtime/test_multi_output_model.py::TestThreeOutputModel::test  FAILED
tests/unit/runtime/test_mup_optimizers.py::TestMuPOptimizers::test
  PASSED: [True-MuAdam-Adam] [True-MuAdamW-AdamW] [True-MuSGD-SGD] [False-MuAdam-Adam] [False-MuAdamW-AdamW] [False-MuSGD-SGD]
tests/unit/runtime/test_pld.py::test_pld_schedule
  PASSED: [0] [0.1] [0.9] [1.0]
tests/unit/runtime/test_pld.py::TestPLDModel::test_pld_model
  FAILED: [0] [0.1] [0.9] [1.0]
tests/unit/runtime/test_pld.py::TestNonPLDModel::test_non_pld_model  FAILED
tests/unit/runtime/test_runtime_utils.py::test_call_to_str  PASSED
tests/unit/runtime/test_runtime_utils.py::TestClipGradNorm
  SKIPPED (not enough GPUs are available: 2 required, 1 available): test_gather, test_clipped_val
tests/unit/runtime/test_runtime_utils.py::TestCheckOverflow::test
  SKIPPED (not enough GPUs are available: 2 required, 1 available): [False] [True]
tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestActivationCheckpoint
  PASSED: test_ckpt_inputs1_outputs1[mask0] [mask1], test_ckpt_inputs2_outputs1[mask0] [mask1], test_ckpt_inputs2_outputs2[mask0] [mask1], test_ckpt_inputs2_outputs3[mask0] [mask1], test_ckpt_arg_none[mask0] [mask1]
tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestCheckpointNonTensor
  PASSED: test_ckpt_non_tensor_input[None] [2] [True] [non_tensor3] [non_tensor4], test_ckpt_non_tensor_output[None] [2] [True] [non_tensor3] [non_tensor4]
tests/unit/runtime/activation_checkpointing/test_activation_checkpointing.py::TestCheckpointNonTensorOutputOrdering::test_ckpt_non_tensor_output_ordering
  PASSED: [None] [non_tensor_output1] [non_tensor_output2] [non_tensor_output3]
tests/unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py::TestActivationCheckpointWithGrad
  PASSED: test_ckpt_inputs1_outputs1[mask0] [mask1], test_ckpt_inputs2_outputs1[mask0] [mask1], test_ckpt_inputs2_outputs2[mask0] [mask1], test_ckpt_inputs2_outputs3[mask0] [mask1], test_ckpt_arg_none[mask0] [mask1]
tests/unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py::TestCheckpointNonTensorWithGrad
  PASSED: test_ckpt_non_tensor_input[None] [2] [True] [non_tensor3] [non_tensor4], test_ckpt_non_tensor_output[None] [2] [True] [non_tensor3] [non_tensor4]
tests/unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py::TestCheckpointNonTensorOutputOrderingWithGrad::test_ckpt_non_tensor_output_ordering
  PASSED: [None] [non_tensor_output1] [non_tensor_output2] [non_tensor_output3]
tests/unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py::TestActivationCheckpointWithoutGrad
  PASSED: test_ckpt_inputs1_outputs1[mask0] [mask1], test_ckpt_inputs2_outputs1[mask0] [mask1], test_ckpt_inputs2_outputs2[mask0] [mask1], test_ckpt_inputs2_outputs3[mask0] [mask1], test_ckpt_arg_none[mask0] [mask1]
tests/unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py::TestCheckpointNonTensorWithoutGrad
  PASSED: test_ckpt_non_tensor_input[None] [2] [True] [non_tensor3] [non_tensor4], test_ckpt_non_tensor_output[None] [2] [True] [non_tensor3] [non_tensor4]
tests/unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py::TestCheckpointNonTensorOutputOrderingWithoutGrad::test_ckpt_non_tensor_output_ordering
  PASSED: [None] [non_tensor_output1] [non_tensor_output2] [non_tensor_output3]
tests/unit/runtime/comm/test_coalesced_collectives.py::TestReduceScatterCoalesced
  SKIPPED (not enough GPUs are available: 2 required, 1 available): test_single_input, test_two_inputs
tests/unit/runtime/comm/test_coalesced_collectives.py::TestReduceScatterCoalescedTensorSmallerThanWorldSize::test  SKIPPED (not enough GPUs are available: 2 required, 1 available)
tests/unit/runtime/comm/test_coalesced_collectives.py::TestAllToAllQuantReduceFallback
  SKIPPED (not enough GPUs are available: 2 required, 1 available): test_1d_tensor, test_non_divisible
tests/unit/runtime/compile/test_compile_zero.py::TestZeRO::test_compile_zero
  SKIPPED (not enough GPUs are available: 2 required, 1 available): [none-1-dtype0] [none-1-dtype1] [none-1-dtype2] [none-2-dtype0] [none-2-dtype1] [none-2-dtype2] [none-3-dtype0] [none-3-dtype1] [none-3-dtype2] [cpu-1-dtype0] [cpu-1-dtype1] [cpu-1-dtype2] [cpu-2-dtype0] [cpu-2-dtype1] [cpu-2-dtype2] [cpu-3-dtype0] [cpu-3-dtype1] [cpu-3-dtype2] [nvme-1-dtype0] [nvme-1-dtype1] [nvme-1-dtype2] [nvme-2-dtype0] [nvme-2-dtype1] [nvme-2-dtype2] [nvme-3-dtype0] [nvme-3-dtype1] [nvme-3-dtype2]
tests/unit/runtime/half_precision/test_bf16.py::TestAdamBF16ZeroOneCycleCompatibility::test  FAILED
tests/unit/runtime/half_precision/test_bf16.py::TestZeroAllowUntestedOptimizer::test  PASSED
tests/unit/runtime/half_precision/test_bf16.py::TestZeroEmptyPartition::test  SKIPPED (not enough GPUs are available: 3 required, 1 available)
tests/unit/runtime/half_precision/test_bf16.py::TestZeroSupportedClientOptimizer::test
  PASSED: [Adam]
  FAILED: [FusedAdam]
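Nearly all of the SKIPPED entries in this run share one cause: the test requests a 2-, 3- or 4-GPU world size, while this build node exposes a single GPU. A minimal sketch of such a guard (assumed names, not the DistributedTest machinery DeepSpeed's unit tests actually use) looks like this:

import pytest
import torch

REQUIRED_GPUS = 2  # world size the test wants; this node reports only 1

@pytest.mark.skipif(
    torch.cuda.device_count() < REQUIRED_GPUS,
    reason=f"Skipping test because not enough GPUs are available: "
           f"{REQUIRED_GPUS} required, {torch.cuda.device_count()} available",
)
def test_needs_two_ranks():
    # Placeholder body: a real multi-GPU test would spawn one process per device
    # and exercise collectives across them.
    assert torch.cuda.device_count() >= REQUIRED_GPUS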
tests/unit/runtime/half_precision/test_bf16.py::TestZero2ReduceScatterOff::test  SKIPPED (not enough GPUs are available: 2 required, 1 available)
tests/unit/runtime/half_precision/test_bf16.py::TestZeroEmptyGrad::test  PASSED
tests/unit/runtime/half_precision/test_bf16.py::TestZeroDtypeCocktail::test
  SKIPPED (not enough GPUs are available: 2 required, 1 available): [fp16-fp16] [fp16-bf16] [fp16-fp32] [bf16-fp16] [bf16-bf16] [bf16-fp32] [default-fp16] [default-bf16] [default-fp32]
tests/unit/runtime/half_precision/test_dynamic_loss_scale.py::TestFused
  FAILED: test_no_overflow, test_all_overflow, test_some_overflow
tests/unit/runtime/half_precision/test_dynamic_loss_scale.py::TestUnfused
  FAILED: test_no_overflow, test_all_overflow, test_some_overflow
tests/unit/runtime/half_precision/test_fp16.py::TestLambFP32GradClip::test  SKIPPED (not enough GPUs are available: 2 required, 1 available)
tests/unit/runtime/half_precision/test_fp16.py::TestLambFP16
  SKIPPED (not enough GPUs are available: 2 required, 1 available): test__basic, test_empty_grad
tests/unit/runtime/half_precision/test_fp16.py::TestAdamFP32EmptyGrad::test  SKIPPED (not enough GPUs are available: 2 required, 1 available)
tests/unit/runtime/half_precision/test_fp16.py::TestAdamwFP16Basic::test  PASSED
tests/unit/runtime/half_precision/test_fp16.py::TestFP16OptimizerForMoE
  SKIPPED (not enough GPUs are available: 2 required, 1 available): test_unfused_gradnorm, test_fused_gradnorm, test_lamb_gradnorm[False], test_lamb_gradnorm[True]
tests/unit/runtime/half_precision/test_fp16.py::TestAdamwFP16EmptyGrad::test  PASSED
tests/unit/runtime/half_precision/test_fp16.py::TestAdamFP16ZeroOneCycleCompatibility::test
  PASSED: [True-1] [True-2] [True-3]
  FAILED: [False-1] [False-2] [False-3]
tests/unit/runtime/half_precision/test_fp16.py::TestZeroStaticScale::test
  PASSED: [True-1] [True-2] [True-3]
  FAILED: [False-1] [False-2] [False-3]
tests/unit/runtime/half_precision/test_fp16.py::TestZeroAllowUntestedOptimizer::test
  PASSED: [True-1] [True-2] [True-3] [False-1] [False-2] [False-3]
tests/unit/runtime/half_precision/test_fp16.py::TestZeroEmptyPartition::test
  SKIPPED (not enough GPUs are available: 3 required, 1 available): [True-1] [True-2] [True-3] [False-1] [False-2] [False-3]
tests/unit/runtime/half_precision/test_fp16.py::TestAmp
  SKIPPED (apex/amp is not installed): test_adam_basic, test_lamb_basic, test_adam_O2, test_adam_O2_empty_grad
tests/unit/runtime/half_precision/test_fp16.py::TestZeroSupportedClientOptimizer::test
  FAILED: [FusedAdam-1] [FusedAdam-2] [FusedAdam-3]
  PASSED: [Adam-1] [Adam-2] [Adam-3]
tests/unit/runtime/half_precision/test_fp16.py::TestZero2ReduceScatterOff::test  SKIPPED (not enough GPUs are available: 2 required, 1 available)
tests/unit/runtime/half_precision/test_fp16.py::TestFP16AdamTypes::test
  FAILED: [True-Adam] [True-AdamW] [False-Adam] [False-AdamW]
tests/unit/runtime/half_precision/test_fp16.py::TestZero3LazyScatter::test  FAILED
tests/unit/runtime/half_precision/test_fp16.py::TestZeroEmptyGrad::test
  PASSED: [1] [2] [3]
tests/unit/runtime/half_precision/onebit/test_onebit.py::TestOneBitAdamBasic::test
  SKIPPED (not enough GPUs are available: 2 required, 1 available): [fp32] [fp16]
tests/unit/runtime/half_precision/onebit/test_onebit.py::TestOneBitAdamExpAvgMask::test  SKIPPED (not enough GPUs are available: 2 required, 1 available)
tests/unit/runtime/half_precision/onebit/test_onebit.py::TestOneBitAdamCheckpointing
  SKIPPED (not enough GPUs are available: 2 required, 1 available): test, test_overflow
tests/unit/runtime/half_precision/onebit/test_onebit.py::TestOneBitAdamFP16Pipeline::test[topo_config0]  SKIPPED (not enough GPUs are available: 4 required, 1 available)
tests/unit/runtime/half_precision/onebit/test_onebit.py::TestZeroOneAdamBasic::test
  SKIPPED (not enough GPUs are available: 2 required, 1 available): [fp32] [fp16]
tests/unit/runtime/half_precision/onebit/test_onebit.py::TestZeroOneAdamExpAvgMask::test  SKIPPED (not enough GPUs are available: 2 required, 1 available)
tests/unit/runtime/half_precision/onebit/test_onebit.py::TestZeroOneAdamCheckpointing
  SKIPPED (not enough GPUs are available: 2 required, 1 available): test, test_overflow
tests/unit/runtime/half_precision/onebit/test_onebit.py::TestZeroOneAdamFP16Pipeline::test[topo_config0]  SKIPPED (not enough GPUs are available: 4 required, 1 available)
tests/unit/runtime/half_precision/onebit/test_onebit.py::TestOneBitLambBasic::test
  SKIPPED (not enough GPUs are available: 2 required, 1 available): [fp32] [fp16]
tests/unit/runtime/half_precision/onebit/test_onebit.py::TestOneBitLampExpAvgMask::test  SKIPPED (not enough GPUs are available: 2 required, 1 available)
tests/unit/runtime/half_precision/onebit/test_onebit.py::TestOneBitLambCheckpointing
  SKIPPED (not enough GPUs are available: 2 required, 1 available): test, test_overflow
tests/unit/runtime/half_precision/onebit/test_onebit.py::TestOneBitLambFP16Pipeline::test[topo_config0]  SKIPPED (not enough GPUs are available: 4 required, 1 available)
tests/unit/runtime/pipe/test_pipe.py::TestPipeCifar10
  SKIPPED (not enough GPUs are available: 4 required, 1 available): test_pipe_base[topo_config0] [topo_config1] [topo_config2], test_pipe_use_reentrant[topo_config0] [topo_config1] [topo_config2]
tests/unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_singlestage  PASSED
tests/unit/runtime/pipe/test_pipe_schedule.py::test_pipe_train_schedule_singlestage  PASSED
tests/unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_firststage
  PASSED: [1] [3] [8] [10]
tests/unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_midstage
  PASSED: [1] [3] [8] [10]
tests/unit/runtime/pipe/test_pipe_schedule.py::test_pipe_inference_schedule_laststage
  PASSED: [1] [3] [8] [10]
tests/unit/runtime/pipe/test_pipe_schedule.py::test_pipe_schedule_firststage  PASSED
tests/unit/runtime/pipe/test_pipe_schedule.py::test_pipe_schedule_laststage  PASSED
tests/unit/runtime/pipe/test_pipe_schedule.py::test_pipe_stagequery  PASSED
tests/unit/runtime/pipe/test_topology.py
  PASSED: test_topology_2d, test_topology_dims, test_topology_match, test_topology_rank_repr, test_topology_3d, test_topology_comm_list, test_primes
tests/unit/runtime/pipe/test_topology.py::TestDistributedTopology
  SKIPPED (not enough GPUs are available: 4 required, 1 available): test_grid_pipe_data, test_stage_to_global
tests/unit/runtime/sparse_tensor/test_averaging_sparse_gradients.py::TestSparseAdam::test  SKIPPED (not enough GPUs are available: 2 required, 1 available)
tests/unit/runtime/sparse_tensor/test_csr.py::test_csr_addition_self  PASSED
tests/unit/runtime/sparse_tensor/test_csr.py::test_csr_addition_different  PASSED
tests/unit/runtime/sparse_tensor/test_sparse_grads.py::TestSparseAdam::test  SKIPPED (not enough GPUs are available: 2 required, 1 available)
tests/unit/runtime/utils/test_partition.py::TestPartitionedTensor::test  SKIPPED (not enough GPUs are available: 4 required, 1 available)
tests/unit/runtime/utils/test_partition.py::TestPartitionedTensorUnEven::test  SKIPPED (not enough GPUs are available: 4 required, 1 available)
tests/unit/runtime/utils/test_partition.py::TestPartitionedTensorMeta::test  SKIPPED (not enough GPUs are available: 4 required, 1 available)
tests/unit/runtime/utils/test_partition.py
  PASSED: test_prefix_sum, test_valid_partition, test_short_partition_uniform, test_short_partition, test_easy_balance_uniform, test_easy_balance_balanced, test_int_balanced, test_float_balanced, test_float_midheavy, test_balance_bert
tests/unit/runtime/utils/test_partition.py::test_float_lastheavy  SKIPPED (Variance-minimizing partitioning returns different result.)
tests/unit/runtime/zero/test_ignore_unused_parameters.py::TestStage2IgnoreUnusedParameters::test
  PASSED: [False] [True]
tests/unit/runtime/zero/test_nvme_checkpointing.py::TestNVMeCheckpointing::test_nvme_checkpointing
  PASSED: [cpu-cpu] [cpu-nvme] [nvme-nvme]
tests/unit/runtime/zero/test_zero.py::TestZeroUnbalancedGradients::test
  FAILED: [1] [2] [3]
tests/unit/runtime/zero/test_zero.py::TestZero3RepeatForwardLoop::test
  FAILED: [True] [False]
tests/unit/runtime/zero/test_zero.py::TestZeroToFP32
  SKIPPED (not enough GPUs are available: 2 required, 1 available): test_1_param_group[True-2] [True-3] [False-2] [False-3], test_2_param_groups[True-2] [True-3] [False-2] [False-3]
tests/unit/runtime/zero/test_zero.py::TestIncorectAllgatherBucketSize::test
  FAILED: [1000] [1001]
tests/unit/runtime/zero/test_zero.py::TestPartitionNcclAlignment::test  SKIPPED (not enough GPUs are available: 4 required, 1 available)
tests/unit/runtime/zero/test_zero.py::TestZero3ParamPartitioningBase
  SKIPPED (not enough GPUs are available: 2 required, 1 available): test_param_persistence_threshold[0] [10], test_fp16_enabled[True] [False], test_contiguous_gradients[True] [False], test_offload_optimizer[True] [False], test_zero_grad[True] [False], test_prefetching[True] [False], test_reduce_scatter[True] [False], test_model_class[EltwiseMultiplicationTestNetwork_Dict] [EltwiseMultiplicationTestNetwork_NamedTuple] [EltwiseMultiplicationTestNetwork_namedtuple] [EltwiseMultiplicationTestNetwork_Tuple] [EltwiseMultiplicationTestNetwork_List]
tests/unit/runtime/zero/test_zero.py::TestZero3ParamPartitioningLargeParam::test
  SKIPPED (not enough GPUs are available: 4 required, 1 available): [True-True] [True-False] [False-True] [False-False]
tests/unit/runtime/zero/test_zero.py::TestZero3ParamPartitioningManyParams::test
  SKIPPED (not enough GPUs are available: 2 required, 1 available): [True] [False]
tests/unit/runtime/zero/test_zero.py::TestZero3InitForParentWeightInitialization::test  SKIPPED (not enough GPUs are available: 4 required, 1 available)
tests/unit/runtime/zero/test_zero.py::TestZero3ParamPartitioningBaseBF16::test  SKIPPED (not working)
tests/unit/runtime/zero/test_zero.py::TestParamPartitioningSkipInit::test  SKIPPED (not enough GPUs are available: 2 required, 1 available)
tests/unit/runtime/zero/test_zero.py::TestZeroOffloadStage1::test  SKIPPED (not enough GPUs are available: 2 required, 1 available)
tests/unit/runtime/zero/test_zero.py::TestZero3DictFwd::test
  FAILED: [tuple] [list] [dict]
tests/unit/runtime/zero/test_zero.py::TestZeroAdamOptimizerStepCount::test
  FAILED: [1] [2] [3]
tests/unit/runtime/zero/test_zero.py::TestZeroFrozenWeights::test
  SKIPPED (not enough GPUs are available: 2 required, 1 available): [1] [2] [3]
tests/unit/runtime/zero/test_zero.py::TestZeroOffloadOptim::test
  PASSED: [True] [False]
tests/unit/runtime/zero/test_zero.py::TestZeroPartitionCache::test_training_partition_cache
  FAILED: [True]
  PASSED: [False]
tests/unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups
  PASSED: [dtype0-True-True] [dtype0-False-True] [dtype1-True-True]
  FAILED: [dtype0-True-False] [dtype0-False-False] [dtype1-True-False]
84%]\x1b[0m\ntests/unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype1-False-True] \x1b[32mPASSED\x1b[0m\x1b[31m [ 84%]\x1b[0m\ntests/unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype1-False-False] \x1b[31mFAILED\x1b[0m\x1b[31m [ 84%]\x1b[0m\ntests/unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype2-True-True] \x1b[32mPASSED\x1b[0m\x1b[31m [ 84%]\x1b[0m\ntests/unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype2-True-False] \x1b[31mFAILED\x1b[0m\x1b[31m [ 84%]\x1b[0m\ntests/unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype2-False-True] \x1b[32mPASSED\x1b[0m\x1b[31m [ 84%]\x1b[0m\ntests/unit/runtime/zero/test_zero.py::TestEmptyParameterGroup::test_empty_param_groups[dtype2-False-False] \x1b[31mFAILED\x1b[0m\x1b[31m [ 84%]\x1b[0m\ntests/unit/runtime/zero/test_zero_config.py::test_zero_config_deprecatedfields \x1b[32mPASSED\x1b[0m\x1b[31m [ 84%]\x1b[0m\ntests/unit/runtime/zero/test_zero_config.py::test_zero_config_aliasfields \x1b[32mPASSED\x1b[0m\x1b[31m [ 85%]\x1b[0m\ntests/unit/runtime/zero/test_zero_config.py::test_zero_config_pipeline_loading_checkpoint \x1b[32mPASSED\x1b[0m\x1b[31m [ 85%]\x1b[0m\ntests/unit/runtime/zero/test_zero_config.py::test_zero_config_overlapcomm \x1b[32mPASSED\x1b[0m\x1b[31m [ 85%]\x1b[0m\ntests/unit/runtime/zero/test_zero_config.py::test_zero_config_offload_configs \x1b[32mPASSED\x1b[0m\x1b[31m [ 85%]\x1b[0m\ntests/unit/runtime/zero/test_zero_config.py::test_zero_offload_optimizer_config_pipeline \x1b[32mPASSED\x1b[0m\x1b[31m [ 85%]\x1b[0m\ntests/unit/runtime/zero/test_zero_context.py::TestZeroGatheredParametersFree::test \x1b[32mPASSED\x1b[0m\x1b[31m [ 85%]\x1b[0m\ntests/unit/runtime/zero/test_zero_context.py::TestMiCSGatheredParametersFree::test \x1b[32mPASSED\x1b[0m\x1b[31m [ 85%]\x1b[0m\ntests/unit/runtime/zero/test_zero_context.py::TestSerialContext::test_subclass_param \x1b[32mPASSED\x1b[0m\x1b[31m [ 85%]\x1b[0m\ntests/unit/runtime/zero/test_zero_context.py::TestSerialContext::test_scattered_init_dist \x1b[32mPASSED\x1b[0m\x1b[31m [ 85%]\x1b[0m\ntests/unit/runtime/zero/test_zero_context.py::TestSerialContext::test_scatter_halftype \x1b[32mPASSED\x1b[0m\x1b[31m [ 85%]\x1b[0m\ntests/unit/runtime/zero/test_zero_context.py::TestSerialContext::test_throughput_calculation \x1b[31mFAILED\x1b[0m\x1b[31m [ 85%]\x1b[0m\ntests/unit/runtime/zero/test_zero_context.py::TestSerialContext::test_ext_param_getattr \x1b[31mFAILED\x1b[0m\x1b[31m [ 85%]\x1b[0m\ntests/unit/runtime/zero/test_zero_context.py::TestScatterGather::test \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 available)\x1b[31m [ 86%]\x1b[0m\ntests/unit/runtime/zero/test_zero_context.py::TestGatherUpdate::test \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 available)\x1b[31m [ 86%]\x1b[0m\ntests/unit/runtime/zero/test_zero_context_ancestry.py::TestSerialParamInit::test_subclass_param_init \x1b[32mPASSED\x1b[0m\x1b[31m [ 86%]\x1b[0m\ntests/unit/runtime/zero/test_zero_context_ancestry.py::TestDSInitWZinit::test \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 available)\x1b[31m [ 86%]\x1b[0m\ntests/unit/runtime/zero/test_zero_context_return.py::TestReturnParam::test_ext_param_return \x1b[31mFAILED\x1b[0m\x1b[31m [ 
86%]\x1b[0m\ntests/unit/runtime/zero/test_zero_context_return.py::TestReturnParam::test_ext_param_returnobj \x1b[33mSKIPPED\x1b[0m (WIP)\x1b[31m [ 86%]\x1b[0m\ntests/unit/runtime/zero/test_zero_context_return.py::TestReturnParam::test_stage_3_output_type[tensor] \x1b[31mFAILED\x1b[0m\x1b[31m [ 86%]\x1b[0m\ntests/unit/runtime/zero/test_zero_context_return.py::TestReturnParam::test_stage_3_output_type[dict] \x1b[31mFAILED\x1b[0m\x1b[31m [ 86%]\x1b[0m\ntests/unit/runtime/zero/test_zero_context_return.py::TestReturnParam::test_stage_3_output_type[None] \x1b[31mFAILED\x1b[0m\x1b[31m [ 86%]\x1b[0m\ntests/unit/runtime/zero/test_zero_dynamic_class.py::TestNewClassDeclaredNestingInit::test_new_class_declared_nesting_init \x1b[32mPASSED\x1b[0m\x1b[31m [ 86%]\x1b[0m\ntests/unit/runtime/zero/test_zero_dynamic_class.py::TestNewClassDeclaredInsideNestingInit::test_new_class_declared_inside_nesting_init \x1b[32mPASSED\x1b[0m\x1b[31m [ 86%]\x1b[0m\ntests/unit/runtime/zero/test_zero_leaf_module.py::TestSetZ3LeafModule::test_choose_module_by_counter \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 available)\x1b[31m [ 86%]\x1b[0m\ntests/unit/runtime/zero/test_zero_leaf_module.py::TestSetZ3LeafModule::test_choose_module_by_rank \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 available)\x1b[31m [ 87%]\x1b[0m\ntests/unit/runtime/zero/test_zero_leaf_module.py::TestSetZ3LeafModule::test_no_grad_input_error \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 available)\x1b[31m [ 87%]\x1b[0m\ntests/unit/runtime/zero/test_zero_leaf_module.py::TestSetZ3LeafModule::test_set_unset_leaf_modules \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 available)\x1b[31m [ 87%]\x1b[0m\ntests/unit/runtime/zero/test_zero_leaf_module.py::TestSetZ3LeafModule::test_set_no_match_class \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 available)\x1b[31m [ 87%]\x1b[0m\ntests/unit/runtime/zero/test_zero_nesting_init.py::TestNestingInit::test_nesting_init \x1b[32mPASSED\x1b[0m\x1b[31m [ 87%]\x1b[0m\ntests/unit/runtime/zero/test_zero_nesting_init.py::TestShutdownInNestingInit::test_shutdown_in_nesting_init \x1b[32mPASSED\x1b[0m\x1b[31m [ 87%]\x1b[0m\ntests/unit/runtime/zero/test_zero_nesting_init.py::TestNestedParallelInit::test_nested_parallel_init \x1b[32mPASSED\x1b[0m\x1b[31m [ 87%]\x1b[0m\ntests/unit/runtime/zero/test_zero_offloadpp.py::test_zero_partial_offload_config \x1b[32mPASSED\x1b[0m\x1b[31m [ 87%]\x1b[0m\ntests/unit/runtime/zero/test_zero_offloadpp.py::TestZeroPartialOffloadConfigSweep::test[4-1024] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 4 required, 1 available)\x1b[31m [ 87%]\x1b[0m\ntests/unit/runtime/zero/test_zero_offloadpp.py::TestZeroPartialOffloadConfigSweep::test[8-1024] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 4 required, 1 available)\x1b[31m [ 87%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[none-1-local-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 req...)\x1b[31m [ 87%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[none-1-local-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 re...)\x1b[31m [ 
88%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[none-1-full-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 requ...)\x1b[31m [ 88%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[none-1-full-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 req...)\x1b[31m [ 88%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[none-2-local-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 req...)\x1b[31m [ 88%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[none-2-local-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 re...)\x1b[31m [ 88%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[none-2-full-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 requ...)\x1b[31m [ 88%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[none-2-full-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 req...)\x1b[31m [ 88%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[none-3-local-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 req...)\x1b[31m [ 88%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[none-3-local-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 re...)\x1b[31m [ 88%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[none-3-full-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 requ...)\x1b[31m [ 88%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[none-3-full-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 req...)\x1b[31m [ 88%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[cpu-1-local-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 requ...)\x1b[31m [ 88%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[cpu-1-local-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 req...)\x1b[31m [ 89%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[cpu-1-full-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 requi...)\x1b[31m [ 89%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[cpu-1-full-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 requ...)\x1b[31m [ 89%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[cpu-2-local-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 requ...)\x1b[31m [ 89%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[cpu-2-local-False] \x1b[33mSKIPPED\x1b[0m (Skipping 
test because not enough GPUs are available: 2 req...)\x1b[31m [ 89%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[cpu-2-full-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 requi...)\x1b[31m [ 89%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[cpu-2-full-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 requ...)\x1b[31m [ 89%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[cpu-3-local-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 requ...)\x1b[31m [ 89%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[cpu-3-local-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 req...)\x1b[31m [ 89%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[cpu-3-full-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 requi...)\x1b[31m [ 89%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[cpu-3-full-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 requ...)\x1b[31m [ 89%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[nvme-1-local-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 req...)\x1b[31m [ 89%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[nvme-1-local-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 re...)\x1b[31m [ 90%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[nvme-1-full-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 requ...)\x1b[31m [ 90%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[nvme-1-full-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 req...)\x1b[31m [ 90%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[nvme-2-local-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 req...)\x1b[31m [ 90%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[nvme-2-local-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 re...)\x1b[31m [ 90%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[nvme-2-full-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 requ...)\x1b[31m [ 90%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[nvme-2-full-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 req...)\x1b[31m [ 90%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[nvme-3-local-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 req...)\x1b[31m [ 
90%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[nvme-3-local-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 re...)\x1b[31m [ 90%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[nvme-3-full-True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 requ...)\x1b[31m [ 90%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_zero_fragments[nvme-3-full-False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 req...)\x1b[31m [ 90%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_bf16_fragments[True] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 avai...)\x1b[31m [ 90%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentGet::test_bf16_fragments[False] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 required, 1 ava...)\x1b[31m [ 91%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-1-local-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: ...)\x1b[31m [ 91%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-1-local-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: ...)\x1b[31m [ 91%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-1-local-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: ...)\x1b[31m [ 91%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-1-full-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2...)\x1b[31m [ 91%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-1-full-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2...)\x1b[31m [ 91%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-1-full-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2...)\x1b[31m [ 91%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-2-local-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: ...)\x1b[31m [ 91%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-2-local-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: ...)\x1b[31m [ 91%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-2-local-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: ...)\x1b[31m [ 91%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-2-full-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2...)\x1b[31m [ 91%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-2-full-dtype1] \x1b[33mSKIPPED\x1b[0m 
(Skipping test because not enough GPUs are available: 2...)\x1b[31m [ 92%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-2-full-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2...)\x1b[31m [ 92%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-3-local-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: ...)\x1b[31m [ 92%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-3-local-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: ...)\x1b[31m [ 92%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-3-local-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: ...)\x1b[31m [ 92%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-3-full-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2...)\x1b[31m [ 92%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-3-full-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2...)\x1b[31m [ 92%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[none-3-full-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2...)\x1b[31m [ 92%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-1-local-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2...)\x1b[31m [ 92%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-1-local-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2...)\x1b[31m [ 92%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-1-local-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2...)\x1b[31m [ 92%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-1-full-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 ...)\x1b[31m [ 92%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-1-full-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 ...)\x1b[31m [ 93%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-1-full-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 ...)\x1b[31m [ 93%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-2-local-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2...)\x1b[31m [ 93%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-2-local-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2...)\x1b[31m [ 
93%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-2-local-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2...)\x1b[31m [ 93%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-2-full-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 ...)\x1b[31m [ 93%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-2-full-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 ...)\x1b[31m [ 93%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-2-full-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 ...)\x1b[31m [ 93%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-3-local-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2...)\x1b[31m [ 93%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-3-local-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2...)\x1b[31m [ 93%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-3-local-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2...)\x1b[31m [ 93%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-3-full-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 ...)\x1b[31m [ 93%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-3-full-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 ...)\x1b[31m [ 94%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[cpu-3-full-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2 ...)\x1b[31m [ 94%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[nvme-1-local-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: ...)\x1b[31m [ 94%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[nvme-1-local-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: ...)\x1b[31m [ 94%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[nvme-1-local-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: ...)\x1b[31m [ 94%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[nvme-1-full-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2...)\x1b[31m [ 94%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[nvme-1-full-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2...)\x1b[31m [ 94%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[nvme-1-full-dtype2] \x1b[33mSKIPPED\x1b[0m 
(Skipping test because not enough GPUs are available: 2...)\x1b[31m [ 94%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[nvme-2-local-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: ...)\x1b[31m [ 94%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[nvme-2-local-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: ...)\x1b[31m [ 94%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[nvme-2-local-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: ...)\x1b[31m [ 94%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[nvme-2-full-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2...)\x1b[31m [ 94%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[nvme-2-full-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2...)\x1b[31m [ 95%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[nvme-2-full-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2...)\x1b[31m [ 95%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[nvme-3-local-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: ...)\x1b[31m [ 95%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[nvme-3-local-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: ...)\x1b[31m [ 95%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[nvme-3-local-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: ...)\x1b[31m [ 95%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[nvme-3-full-dtype0] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2...)\x1b[31m [ 95%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[nvme-3-full-dtype1] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2...)\x1b[31m [ 95%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tensor_fragment.py::TestTensorFragmentUpdate::test_zero_fragments[nvme-3-full-dtype2] \x1b[33mSKIPPED\x1b[0m (Skipping test because not enough GPUs are available: 2...)\x1b[31m [ 95%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_init[1-1] \x1b[32mPASSED\x1b[0m\x1b[31m [ 95%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_init[2-2] \x1b[32mPASSED\x1b[0m\x1b[31m [ 95%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_init[5-5] \x1b[32mPASSED\x1b[0m\x1b[31m [ 95%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_init[32-32] \x1b[32mPASSED\x1b[0m\x1b[31m [ 96%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_baddim[0-0] \x1b[32mPASSED\x1b[0m\x1b[31m [ 96%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_baddim[33-33] \x1b[32mPASSED\x1b[0m\x1b[31m [ 96%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_forward[32-32-1-1-False] 
\x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 96%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_forward[32-32-1-1-True] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 96%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_forward[32-32-2-2-False] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 96%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_forward[32-32-2-2-True] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 96%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_forward[23-29-1-1-False] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 96%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_forward[23-29-1-1-True] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 96%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_forward[23-29-2-2-False] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 96%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_forward[23-29-2-2-True] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 96%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_forward[29-23-1-1-False] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 96%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_forward[29-23-1-1-True] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 97%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_forward[29-23-2-2-False] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 97%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_forward[29-23-2-2-True] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 97%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_backward[32-32-1-1-False] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 97%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_backward[32-32-1-1-True] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 97%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_backward[32-32-2-2-False] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 97%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_backward[32-32-2-2-True] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 97%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_backward[23-29-1-1-False] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 97%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_backward[23-29-1-1-True] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 97%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_backward[23-29-2-2-False] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 97%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_backward[23-29-2-2-True] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 
97%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_backward[29-23-1-1-False] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 97%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_backward[29-23-1-1-True] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 98%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_backward[29-23-2-2-False] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 98%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_backward[29-23-2-2-True] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 98%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_returnbias_backward[32-32-1-1-False] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 98%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_returnbias_backward[32-32-1-1-True] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 98%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_returnbias_backward[32-32-2-2-False] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 98%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_returnbias_backward[32-32-2-2-True] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 98%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_returnbias_backward[23-29-1-1-False] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 98%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_returnbias_backward[23-29-1-1-True] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 98%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_returnbias_backward[23-29-2-2-False] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 98%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_returnbias_backward[23-29-2-2-True] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 98%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_returnbias_backward[29-23-1-1-False] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 98%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_returnbias_backward[29-23-1-1-True] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 99%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_returnbias_backward[29-23-2-2-False] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 99%]\x1b[0m\ntests/unit/runtime/zero/test_zero_tiled.py::test_tiled_returnbias_backward[29-23-2-2-True] \x1b[33mSKIPPED\x1b[0m (seeing nondeterministic failures, skipping for now)\x1b[31m [ 99%]\x1b[0m\ntests/unit/runtime/zero/test_zeropp.py::test_zero_hpz_partition_size_config \x1b[32mPASSED\x1b[0m\x1b[31m [ 99%]\x1b[0m\ntests/unit/utils/test_get_optim_files.py::test_get_optim_files[1] \x1b[32mPASSED\x1b[0m\x1b[31m [ 99%]\x1b[0m\ntests/unit/utils/test_get_optim_files.py::test_get_optim_files[2] \x1b[32mPASSED\x1b[0m\x1b[31m [ 99%]\x1b[0m\ntests/unit/utils/test_get_optim_files.py::test_get_optim_files[12] \x1b[32mPASSED\x1b[0m\x1b[31m [ 99%]\x1b[0m\ntests/unit/utils/test_get_optim_files.py::test_get_optim_files[24] 
\x1b[32mPASSED\x1b[0m\x1b[31m [ 99%]\x1b[0m\ntests/unit/utils/test_groups.py::test_get_expert_parallel_ranks \x1b[32mPASSED\x1b[0m\x1b[31m [ 99%]\x1b[0m\ntests/unit/utils/test_init_on_device.py::TestOnDevice::test_on_device[meta] \x1b[32mPASSED\x1b[0m\x1b[31m [ 99%]\x1b[0m\ntests/unit/utils/test_init_on_device.py::TestOnDevice::test_on_device[cuda:0] \x1b[32mPASSED\x1b[0m\x1b[31m [ 99%]\x1b[0m\ntests/unit/utils/test_partition_balanced.py::test_partition_balanced \x1b[32mPASSED\x1b[0m\x1b[31m [100%]\x1b[0m\n\n============================================================================================ FAILURES =============================================================================================\n\x1b[31m\x1b[1m____________________________________________________________________________ TestLatestCheckpoint.test_existing_latest ____________________________________________________________________________\x1b[0m\nmultiprocessing.pool.RemoteTraceback: \n"""\nTraceback (most recent call last):\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2100, in _run_ninja_build\n subprocess.run(\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py", line 571, in run\n raise CalledProcessError(retcode, process.args,\nsubprocess.CalledProcessError: Command \'[\'ninja\', \'-v\']\' returned non-zero exit status 1.\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 125, in worker\n result = (True, func(*args, **kwds))\n ^^^^^^^^^^^^^^^^^^^\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 51, in starmapstar\n return list(itertools.starmap(args[0], args[1]))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 306, in _dist_run\n raise e\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 298, in _dist_run\n self.run(**self._fixture_kwargs)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 438, in run\n self._current_test(**fixture_kwargs)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/checkpoint/test_latest_checkpoint.py", line 35, in test_existing_latest\n checkpoint_correctness_verification(config_dict=config_dict,\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/checkpoint/common.py", line 181, in checkpoint_correctness_verification\n ds_model = create_deepspeed_model(config_dict=config_dict, model=models[0], base_optimizer=base_optimizers[0])\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/checkpoint/common.py", line 158, in create_deepspeed_model\n ds_model, _, _, _ = deepspeed.initialize(config=config_dict,\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/__init__.py", line 181, in initialize\n engine = DeepSpeedEngine(args=args,\n ^^^^^^^^^^^^^^^^^^^^^^^^^^\n File 
"/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 308, in __init__\n self._configure_optimizer(optimizer, model_parameters)\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1267, in _configure_optimizer\n basic_optimizer = self._configure_basic_optimizer(model_parameters)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1344, in _configure_basic_optimizer\n optimizer = FusedAdam(\n ^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py", line 94, in __init__\n fused_adam_cuda = FusedAdamBuilder().load()\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 529, in load\n return self.jit_load(verbose)\n ^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 577, in jit_load\n op_module = load(name=self.name,\n ^^^^^^^^^^^^^^^^^^^^\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1308, in load\n return _jit_compile(\n ^^^^^^^^^^^^^\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1710, in _jit_compile\n _write_ninja_file_and_build_library(\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1823, in _write_ninja_file_and_build_library\n _run_ninja_build(\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2116, in _run_ninja_build\n raise RuntimeError(message) from e\nRuntimeError: Error building extension \'fused_adam\'\n"""\n\n\x1b[33mThe above exception was the direct cause of the following exception:\x1b[0m\n\nitem = <Function test_existing_latest>\n\n \x1b[37m@pytest\x1b[39;49;00m.hookimpl(tryfirst=\x1b[94mTrue\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[94mdef\x1b[39;49;00m \x1b[92mpytest_runtest_call\x1b[39;49;00m(item):\x1b[90m\x1b[39;49;00m\n \x1b[90m# We want to use our own launching function for distributed tests\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96mgetattr\x1b[39;49;00m(item.cls, \x1b[33m"\x1b[39;49;00m\x1b[33mis_dist_test\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m, \x1b[94mFalse\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n dist_test_class = item.cls()\x1b[90m\x1b[39;49;00m\n> dist_test_class(item._request)\x1b[90m\x1b[39;49;00m\n\n\x1b[1m\x1b[31mtests/conftest.py\x1b[0m:69: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:458: in __call__\n \x1b[96mself\x1b[39;49;00m._launch_procs(procs)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:268: in _launch_procs\n \x1b[96mself\x1b[39;49;00m._launch_daemonic_procs(num_procs)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:184: in _launch_daemonic_procs\n skip_msgs = 
skip_msgs_async.get(\x1b[96mself\x1b[39;49;00m.exec_timeout)\x1b[90m\x1b[39;49;00m\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\nself = <multiprocessing.pool.MapResult object at 0x14da6d2b4190>, timeout = 600\n\n \x1b[94mdef\x1b[39;49;00m \x1b[92mget\x1b[39;49;00m(\x1b[96mself\x1b[39;49;00m, timeout=\x1b[94mNone\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m.wait(timeout)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m.ready():\x1b[90m\x1b[39;49;00m\n \x1b[94mraise\x1b[39;49;00m \x1b[96mTimeoutError\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._success:\x1b[90m\x1b[39;49;00m\n \x1b[94mreturn\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._value\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n> \x1b[94mraise\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._value\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mE RuntimeError: Error building extension \'fused_adam\'\x1b[0m\n\n\x1b[1m\x1b[31m/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py\x1b[0m:774: RuntimeError\n-------------------------------------------------------------------------------------- Captured stdout call ---------------------------------------------------------------------------------------\n[2024-10-30 15:18:11,743] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n[2024-10-30 15:18:14,010] [INFO] [comm.py:637:init_distributed] cdb=None\n[2024-10-30 15:18:14,010] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl\n[2024-10-30 15:18:15,057] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.14.5+unknown, git-hash=unknown, git-branch=unknown\n[2024-10-30 15:18:15,095] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False\n1.11.1\n[1/3] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -lineinfo --use_fast_math -gencode=arch=compute_80,code=sm_80 
-gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -std=c++17 -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o \nFAILED: multi_tensor_adam.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -lineinfo --use_fast_math -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -std=c++17 -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[2/3] g++ -MMD -MF fused_adam_frontend.o.d -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -fPIC -std=c++17 -fPIC -O3 -std=c++17 -g -Wno-reorder -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -DBF16_AVAILABLE -c 
/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam/fused_adam_frontend.cpp -o fused_adam_frontend.o \nninja: build stopped: subcommand failed.\n-------------------------------------------------------------------------------------- Captured stderr call ---------------------------------------------------------------------------------------\nUsing /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121 as PyTorch extensions root...\nCreating extension directory /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam...\nDetected CUDA files, patching ldflags\nEmitting ninja build file /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/build.ninja...\nBuilding extension module fused_adam...\nAllowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n\x1b[31m\x1b[1m____________________________________________________________________________ TestLatestCheckpoint.test_missing_latest _____________________________________________________________________________\x1b[0m\nmultiprocessing.pool.RemoteTraceback: \n"""\nTraceback (most recent call last):\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2100, in _run_ninja_build\n subprocess.run(\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py", line 571, in run\n raise CalledProcessError(retcode, process.args,\nsubprocess.CalledProcessError: Command \'[\'ninja\', \'-v\']\' returned non-zero exit status 1.\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 125, in worker\n result = (True, func(*args, **kwds))\n ^^^^^^^^^^^^^^^^^^^\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 51, in starmapstar\n return list(itertools.starmap(args[0], args[1]))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 306, in _dist_run\n raise e\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 298, in _dist_run\n self.run(**self._fixture_kwargs)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 438, in run\n self._current_test(**fixture_kwargs)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/checkpoint/test_latest_checkpoint.py", line 57, in test_missing_latest\n model, _, _, _ = deepspeed.initialize(config=config_dict, model=model, model_parameters=model.parameters())\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/__init__.py", line 181, in initialize\n engine = DeepSpeedEngine(args=args,\n ^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 308, in __init__\n self._configure_optimizer(optimizer, model_parameters)\n File 
"/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1267, in _configure_optimizer\n basic_optimizer = self._configure_basic_optimizer(model_parameters)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1344, in _configure_basic_optimizer\n optimizer = FusedAdam(\n ^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py", line 94, in __init__\n fused_adam_cuda = FusedAdamBuilder().load()\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 529, in load\n return self.jit_load(verbose)\n ^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 577, in jit_load\n op_module = load(name=self.name,\n ^^^^^^^^^^^^^^^^^^^^\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1308, in load\n return _jit_compile(\n ^^^^^^^^^^^^^\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1710, in _jit_compile\n _write_ninja_file_and_build_library(\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1823, in _write_ninja_file_and_build_library\n _run_ninja_build(\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2116, in _run_ninja_build\n raise RuntimeError(message) from e\nRuntimeError: Error building extension \'fused_adam\'\n"""\n\n\x1b[33mThe above exception was the direct cause of the following exception:\x1b[0m\n\nitem = <Function test_missing_latest>\n\n \x1b[37m@pytest\x1b[39;49;00m.hookimpl(tryfirst=\x1b[94mTrue\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[94mdef\x1b[39;49;00m \x1b[92mpytest_runtest_call\x1b[39;49;00m(item):\x1b[90m\x1b[39;49;00m\n \x1b[90m# We want to use our own launching function for distributed tests\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96mgetattr\x1b[39;49;00m(item.cls, \x1b[33m"\x1b[39;49;00m\x1b[33mis_dist_test\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m, \x1b[94mFalse\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n dist_test_class = item.cls()\x1b[90m\x1b[39;49;00m\n> dist_test_class(item._request)\x1b[90m\x1b[39;49;00m\n\n\x1b[1m\x1b[31mtests/conftest.py\x1b[0m:69: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:458: in __call__\n \x1b[96mself\x1b[39;49;00m._launch_procs(procs)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:268: in _launch_procs\n \x1b[96mself\x1b[39;49;00m._launch_daemonic_procs(num_procs)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:184: in _launch_daemonic_procs\n skip_msgs = skip_msgs_async.get(\x1b[96mself\x1b[39;49;00m.exec_timeout)\x1b[90m\x1b[39;49;00m\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\nself = <multiprocessing.pool.MapResult object at 0x14da6d31b490>, timeout = 600\n\n \x1b[94mdef\x1b[39;49;00m \x1b[92mget\x1b[39;49;00m(\x1b[96mself\x1b[39;49;00m, timeout=\x1b[94mNone\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m.wait(timeout)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m.ready():\x1b[90m\x1b[39;49;00m\n \x1b[94mraise\x1b[39;49;00m \x1b[96mTimeoutError\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._success:\x1b[90m\x1b[39;49;00m\n \x1b[94mreturn\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._value\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n> \x1b[94mraise\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._value\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mE RuntimeError: Error building extension \'fused_adam\'\x1b[0m\n\n\x1b[1m\x1b[31m/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py\x1b[0m:774: RuntimeError\n-------------------------------------------------------------------------------------- Captured stdout call ---------------------------------------------------------------------------------------\n[2024-10-30 15:18:58,590] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n[2024-10-30 15:19:00,664] [INFO] [comm.py:637:init_distributed] cdb=None\n[2024-10-30 15:19:00,664] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl\n[2024-10-30 15:19:01,469] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.14.5+unknown, git-hash=unknown, git-branch=unknown\n[2024-10-30 15:19:01,506] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False\n1.11.1\n[1/2] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -lineinfo --use_fast_math -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -std=c++17 -c 
/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o \nFAILED: multi_tensor_adam.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -lineinfo --use_fast_math -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -std=c++17 -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\nninja: build stopped: subcommand failed.\n-------------------------------------------------------------------------------------- Captured stderr call ---------------------------------------------------------------------------------------\nUsing /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121 as PyTorch extensions root...\nDetected CUDA files, patching ldflags\nEmitting ninja build file /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/build.ninja...\nBuilding extension module fused_adam...\nAllowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\n\x1b[31m\x1b[1m_______________________________________________________________________ TestSaveTensorClone.test_save_tensor_clone[True-1] ________________________________________________________________________\x1b[0m\nmultiprocessing.pool.RemoteTraceback: \n"""\nTraceback (most recent call last):\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2100, in _run_ninja_build\n subprocess.run(\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py", line 571, in run\n raise CalledProcessError(retcode, process.args,\nsubprocess.CalledProcessError: Command \'[\'ninja\', \'-v\']\' returned non-zero exit status 1.\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 125, in worker\n result = (True, func(*args, **kwds))\n ^^^^^^^^^^^^^^^^^^^\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 51, in starmapstar\n return list(itertools.starmap(args[0], args[1]))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 306, in _dist_run\n raise e\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 298, in _dist_run\n self.run(**self._fixture_kwargs)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 438, in run\n self._current_test(**fixture_kwargs)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/checkpoint/test_zero_optimizer.py", line 611, in test_save_tensor_clone\n ds_engine, _, _, _ = deepspeed.initialize(model=model, config_params=ds_config)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/__init__.py", line 181, in initialize\n engine = DeepSpeedEngine(args=args,\n ^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 308, in __init__\n self._configure_optimizer(optimizer, model_parameters)\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1267, in _configure_optimizer\n basic_optimizer = self._configure_basic_optimizer(model_parameters)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1344, in _configure_basic_optimizer\n optimizer = FusedAdam(\n ^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py", line 94, in __init__\n fused_adam_cuda = FusedAdamBuilder().load()\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 529, in load\n return self.jit_load(verbose)\n ^^^^^^^^^^^^^^^^^^^^^^\n File 
"/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 577, in jit_load\n op_module = load(name=self.name,\n ^^^^^^^^^^^^^^^^^^^^\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1308, in load\n return _jit_compile(\n ^^^^^^^^^^^^^\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1710, in _jit_compile\n _write_ninja_file_and_build_library(\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1823, in _write_ninja_file_and_build_library\n _run_ninja_build(\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2116, in _run_ninja_build\n raise RuntimeError(message) from e\nRuntimeError: Error building extension \'fused_adam\'\n"""\n\n\x1b[33mThe above exception was the direct cause of the following exception:\x1b[0m\n\nitem = <Function test_save_tensor_clone[True-1]>\n\n \x1b[37m@pytest\x1b[39;49;00m.hookimpl(tryfirst=\x1b[94mTrue\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[94mdef\x1b[39;49;00m \x1b[92mpytest_runtest_call\x1b[39;49;00m(item):\x1b[90m\x1b[39;49;00m\n \x1b[90m# We want to use our own launching function for distributed tests\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96mgetattr\x1b[39;49;00m(item.cls, \x1b[33m"\x1b[39;49;00m\x1b[33mis_dist_test\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m, \x1b[94mFalse\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n dist_test_class = item.cls()\x1b[90m\x1b[39;49;00m\n> dist_test_class(item._request)\x1b[90m\x1b[39;49;00m\n\n\x1b[1m\x1b[31mtests/conftest.py\x1b[0m:69: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:458: in __call__\n \x1b[96mself\x1b[39;49;00m._launch_procs(procs)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:268: in _launch_procs\n \x1b[96mself\x1b[39;49;00m._launch_daemonic_procs(num_procs)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:184: in _launch_daemonic_procs\n skip_msgs = skip_msgs_async.get(\x1b[96mself\x1b[39;49;00m.exec_timeout)\x1b[90m\x1b[39;49;00m\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\nself = <multiprocessing.pool.MapResult object at 0x14da6d59c090>, timeout = 600\n\n \x1b[94mdef\x1b[39;49;00m \x1b[92mget\x1b[39;49;00m(\x1b[96mself\x1b[39;49;00m, timeout=\x1b[94mNone\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m.wait(timeout)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m.ready():\x1b[90m\x1b[39;49;00m\n \x1b[94mraise\x1b[39;49;00m \x1b[96mTimeoutError\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._success:\x1b[90m\x1b[39;49;00m\n \x1b[94mreturn\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._value\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n> \x1b[94mraise\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._value\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mE RuntimeError: Error building extension 
\'fused_adam\'\x1b[0m\n\n\x1b[1m\x1b[31m/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py\x1b[0m:774: RuntimeError\n-------------------------------------------------------------------------------------- Captured stdout call ---------------------------------------------------------------------------------------\n[2024-10-30 15:19:04,702] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n[2024-10-30 15:19:06,536] [INFO] [comm.py:637:init_distributed] cdb=None\n[2024-10-30 15:19:06,536] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl\n[2024-10-30 15:19:07,388] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.14.5+unknown, git-hash=unknown, git-branch=unknown\n[2024-10-30 15:19:07,421] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False\n1.11.1\n[1/2] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -lineinfo --use_fast_math -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -std=c++17 -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o \nFAILED: multi_tensor_adam.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem 
/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -lineinfo --use_fast_math -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -std=c++17 -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\nninja: build stopped: subcommand failed.\n-------------------------------------------------------------------------------------- Captured stderr call ---------------------------------------------------------------------------------------\nUsing /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121 as PyTorch extensions root...\nDetected CUDA files, patching ldflags\nEmitting ninja build file /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/build.ninja...\nBuilding extension module fused_adam...\nAllowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n\x1b[31m\x1b[1m_______________________________________________________________________ TestSaveTensorClone.test_save_tensor_clone[True-2] ________________________________________________________________________\x1b[0m\nmultiprocessing.pool.RemoteTraceback: \n"""\nTraceback (most recent call last):\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2100, in _run_ninja_build\n subprocess.run(\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py", line 571, in run\n raise CalledProcessError(retcode, process.args,\nsubprocess.CalledProcessError: Command \'[\'ninja\', \'-v\']\' returned non-zero exit status 1.\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 125, in worker\n result = (True, func(*args, **kwds))\n ^^^^^^^^^^^^^^^^^^^\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 51, in starmapstar\n return list(itertools.starmap(args[0], args[1]))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 306, in _dist_run\n raise e\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 298, in _dist_run\n self.run(**self._fixture_kwargs)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 438, in run\n self._current_test(**fixture_kwargs)\n File 
"/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/checkpoint/test_zero_optimizer.py", line 611, in test_save_tensor_clone\n ds_engine, _, _, _ = deepspeed.initialize(model=model, config_params=ds_config)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/__init__.py", line 181, in initialize\n engine = DeepSpeedEngine(args=args,\n ^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 308, in __init__\n self._configure_optimizer(optimizer, model_parameters)\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1267, in _configure_optimizer\n basic_optimizer = self._configure_basic_optimizer(model_parameters)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1344, in _configure_basic_optimizer\n optimizer = FusedAdam(\n ^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py", line 94, in __init__\n fused_adam_cuda = FusedAdamBuilder().load()\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 529, in load\n return self.jit_load(verbose)\n ^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 577, in jit_load\n op_module = load(name=self.name,\n ^^^^^^^^^^^^^^^^^^^^\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1308, in load\n return _jit_compile(\n ^^^^^^^^^^^^^\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1710, in _jit_compile\n _write_ninja_file_and_build_library(\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1823, in _write_ninja_file_and_build_library\n _run_ninja_build(\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2116, in _run_ninja_build\n raise RuntimeError(message) from e\nRuntimeError: Error building extension \'fused_adam\'\n"""\n\n\x1b[33mThe above exception was the direct cause of the following exception:\x1b[0m\n\nitem = <Function test_save_tensor_clone[True-2]>\n\n \x1b[37m@pytest\x1b[39;49;00m.hookimpl(tryfirst=\x1b[94mTrue\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[94mdef\x1b[39;49;00m \x1b[92mpytest_runtest_call\x1b[39;49;00m(item):\x1b[90m\x1b[39;49;00m\n \x1b[90m# We want to use our own launching function for distributed tests\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96mgetattr\x1b[39;49;00m(item.cls, \x1b[33m"\x1b[39;49;00m\x1b[33mis_dist_test\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m, \x1b[94mFalse\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n dist_test_class = item.cls()\x1b[90m\x1b[39;49;00m\n> dist_test_class(item._request)\x1b[90m\x1b[39;49;00m\n\n\x1b[1m\x1b[31mtests/conftest.py\x1b[0m:69: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:458: in __call__\n \x1b[96mself\x1b[39;49;00m._launch_procs(procs)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:268: in _launch_procs\n \x1b[96mself\x1b[39;49;00m._launch_daemonic_procs(num_procs)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:184: in _launch_daemonic_procs\n skip_msgs = skip_msgs_async.get(\x1b[96mself\x1b[39;49;00m.exec_timeout)\x1b[90m\x1b[39;49;00m\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\nself = <multiprocessing.pool.MapResult object at 0x14da6d132d50>, timeout = 600\n\n \x1b[94mdef\x1b[39;49;00m \x1b[92mget\x1b[39;49;00m(\x1b[96mself\x1b[39;49;00m, timeout=\x1b[94mNone\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m.wait(timeout)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m.ready():\x1b[90m\x1b[39;49;00m\n \x1b[94mraise\x1b[39;49;00m \x1b[96mTimeoutError\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._success:\x1b[90m\x1b[39;49;00m\n \x1b[94mreturn\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._value\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n> \x1b[94mraise\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._value\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mE RuntimeError: Error building extension \'fused_adam\'\x1b[0m\n\n\x1b[1m\x1b[31m/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py\x1b[0m:774: RuntimeError\n-------------------------------------------------------------------------------------- Captured stdout call ---------------------------------------------------------------------------------------\n[2024-10-30 15:19:10,214] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n[2024-10-30 15:19:12,154] [INFO] [comm.py:637:init_distributed] cdb=None\n[2024-10-30 15:19:12,154] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl\n[2024-10-30 15:19:12,964] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.14.5+unknown, git-hash=unknown, git-branch=unknown\n[2024-10-30 15:19:13,000] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False\n1.11.1\n[1/2] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem 
/apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -lineinfo --use_fast_math -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -std=c++17 -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o \nFAILED: multi_tensor_adam.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -lineinfo --use_fast_math -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -std=c++17 -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\nninja: build stopped: subcommand failed.\n-------------------------------------------------------------------------------------- Captured stderr call ---------------------------------------------------------------------------------------\nUsing /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121 as PyTorch extensions root...\nDetected CUDA files, patching ldflags\nEmitting ninja build file /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/build.ninja...\nBuilding extension module fused_adam...\nAllowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\n\x1b[31m\x1b[1m_______________________________________________________________________ TestSaveTensorClone.test_save_tensor_clone[False-1] _______________________________________________________________________\x1b[0m\nmultiprocessing.pool.RemoteTraceback: \n"""\nTraceback (most recent call last):\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2100, in _run_ninja_build\n subprocess.run(\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py", line 571, in run\n raise CalledProcessError(retcode, process.args,\nsubprocess.CalledProcessError: Command \'[\'ninja\', \'-v\']\' returned non-zero exit status 1.\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 125, in worker\n result = (True, func(*args, **kwds))\n ^^^^^^^^^^^^^^^^^^^\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 51, in starmapstar\n return list(itertools.starmap(args[0], args[1]))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 306, in _dist_run\n raise e\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 298, in _dist_run\n self.run(**self._fixture_kwargs)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 438, in run\n self._current_test(**fixture_kwargs)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/checkpoint/test_zero_optimizer.py", line 611, in test_save_tensor_clone\n ds_engine, _, _, _ = deepspeed.initialize(model=model, config_params=ds_config)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/__init__.py", line 181, in initialize\n engine = DeepSpeedEngine(args=args,\n ^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 308, in __init__\n self._configure_optimizer(optimizer, model_parameters)\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1267, in _configure_optimizer\n basic_optimizer = self._configure_basic_optimizer(model_parameters)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1344, in _configure_basic_optimizer\n optimizer = FusedAdam(\n ^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py", line 94, in __init__\n fused_adam_cuda = FusedAdamBuilder().load()\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 529, in load\n return self.jit_load(verbose)\n ^^^^^^^^^^^^^^^^^^^^^^\n File 
"/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 577, in jit_load\n op_module = load(name=self.name,\n ^^^^^^^^^^^^^^^^^^^^\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1308, in load\n return _jit_compile(\n ^^^^^^^^^^^^^\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1710, in _jit_compile\n _write_ninja_file_and_build_library(\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1823, in _write_ninja_file_and_build_library\n _run_ninja_build(\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2116, in _run_ninja_build\n raise RuntimeError(message) from e\nRuntimeError: Error building extension \'fused_adam\'\n"""\n\n\x1b[33mThe above exception was the direct cause of the following exception:\x1b[0m\n\nitem = <Function test_save_tensor_clone[False-1]>\n\n \x1b[37m@pytest\x1b[39;49;00m.hookimpl(tryfirst=\x1b[94mTrue\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[94mdef\x1b[39;49;00m \x1b[92mpytest_runtest_call\x1b[39;49;00m(item):\x1b[90m\x1b[39;49;00m\n \x1b[90m# We want to use our own launching function for distributed tests\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96mgetattr\x1b[39;49;00m(item.cls, \x1b[33m"\x1b[39;49;00m\x1b[33mis_dist_test\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m, \x1b[94mFalse\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n dist_test_class = item.cls()\x1b[90m\x1b[39;49;00m\n> dist_test_class(item._request)\x1b[90m\x1b[39;49;00m\n\n\x1b[1m\x1b[31mtests/conftest.py\x1b[0m:69: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:458: in __call__\n \x1b[96mself\x1b[39;49;00m._launch_procs(procs)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:268: in _launch_procs\n \x1b[96mself\x1b[39;49;00m._launch_daemonic_procs(num_procs)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:184: in _launch_daemonic_procs\n skip_msgs = skip_msgs_async.get(\x1b[96mself\x1b[39;49;00m.exec_timeout)\x1b[90m\x1b[39;49;00m\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\nself = <multiprocessing.pool.MapResult object at 0x14da6d3703d0>, timeout = 600\n\n \x1b[94mdef\x1b[39;49;00m \x1b[92mget\x1b[39;49;00m(\x1b[96mself\x1b[39;49;00m, timeout=\x1b[94mNone\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m.wait(timeout)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m.ready():\x1b[90m\x1b[39;49;00m\n \x1b[94mraise\x1b[39;49;00m \x1b[96mTimeoutError\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._success:\x1b[90m\x1b[39;49;00m\n \x1b[94mreturn\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._value\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n> \x1b[94mraise\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._value\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mE RuntimeError: Error building extension 
\'fused_adam\'\x1b[0m\n\n\x1b[1m\x1b[31m/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py\x1b[0m:774: RuntimeError\n-------------------------------------------------------------------------------------- Captured stdout call ---------------------------------------------------------------------------------------\n[2024-10-30 15:19:15,926] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n[2024-10-30 15:19:17,944] [INFO] [comm.py:637:init_distributed] cdb=None\n[2024-10-30 15:19:17,944] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl\n[2024-10-30 15:19:18,786] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.14.5+unknown, git-hash=unknown, git-branch=unknown\n[2024-10-30 15:19:18,825] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False\n1.11.1\n[1/2] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -lineinfo --use_fast_math -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -std=c++17 -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o \nFAILED: multi_tensor_adam.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem 
/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -lineinfo --use_fast_math -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -std=c++17 -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\nninja: build stopped: subcommand failed.\n-------------------------------------------------------------------------------------- Captured stderr call ---------------------------------------------------------------------------------------\nUsing /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121 as PyTorch extensions root...\nDetected CUDA files, patching ldflags\nEmitting ninja build file /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/build.ninja...\nBuilding extension module fused_adam...\nAllowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n\x1b[31m\x1b[1m_______________________________________________________________________ TestSaveTensorClone.test_save_tensor_clone[False-2] _______________________________________________________________________\x1b[0m\nmultiprocessing.pool.RemoteTraceback: \n"""\nTraceback (most recent call last):\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2100, in _run_ninja_build\n subprocess.run(\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py", line 571, in run\n raise CalledProcessError(retcode, process.args,\nsubprocess.CalledProcessError: Command \'[\'ninja\', \'-v\']\' returned non-zero exit status 1.\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 125, in worker\n result = (True, func(*args, **kwds))\n ^^^^^^^^^^^^^^^^^^^\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 51, in starmapstar\n return list(itertools.starmap(args[0], args[1]))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 306, in _dist_run\n raise e\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 298, in _dist_run\n self.run(**self._fixture_kwargs)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 438, in run\n self._current_test(**fixture_kwargs)\n File 
"/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/checkpoint/test_zero_optimizer.py", line 611, in test_save_tensor_clone\n ds_engine, _, _, _ = deepspeed.initialize(model=model, config_params=ds_config)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/__init__.py", line 181, in initialize\n engine = DeepSpeedEngine(args=args,\n ^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 308, in __init__\n self._configure_optimizer(optimizer, model_parameters)\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1267, in _configure_optimizer\n basic_optimizer = self._configure_basic_optimizer(model_parameters)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1344, in _configure_basic_optimizer\n optimizer = FusedAdam(\n ^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py", line 94, in __init__\n fused_adam_cuda = FusedAdamBuilder().load()\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 529, in load\n return self.jit_load(verbose)\n ^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 577, in jit_load\n op_module = load(name=self.name,\n ^^^^^^^^^^^^^^^^^^^^\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1308, in load\n return _jit_compile(\n ^^^^^^^^^^^^^\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1710, in _jit_compile\n _write_ninja_file_and_build_library(\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1823, in _write_ninja_file_and_build_library\n _run_ninja_build(\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2116, in _run_ninja_build\n raise RuntimeError(message) from e\nRuntimeError: Error building extension \'fused_adam\'\n"""\n\n\x1b[33mThe above exception was the direct cause of the following exception:\x1b[0m\n\nitem = <Function test_save_tensor_clone[False-2]>\n\n \x1b[37m@pytest\x1b[39;49;00m.hookimpl(tryfirst=\x1b[94mTrue\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[94mdef\x1b[39;49;00m \x1b[92mpytest_runtest_call\x1b[39;49;00m(item):\x1b[90m\x1b[39;49;00m\n \x1b[90m# We want to use our own launching function for distributed tests\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96mgetattr\x1b[39;49;00m(item.cls, \x1b[33m"\x1b[39;49;00m\x1b[33mis_dist_test\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m, \x1b[94mFalse\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n dist_test_class = item.cls()\x1b[90m\x1b[39;49;00m\n> dist_test_class(item._request)\x1b[90m\x1b[39;49;00m\n\n\x1b[1m\x1b[31mtests/conftest.py\x1b[0m:69: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:458: in __call__\n \x1b[96mself\x1b[39;49;00m._launch_procs(procs)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:268: in _launch_procs\n \x1b[96mself\x1b[39;49;00m._launch_daemonic_procs(num_procs)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:184: in _launch_daemonic_procs\n skip_msgs = skip_msgs_async.get(\x1b[96mself\x1b[39;49;00m.exec_timeout)\x1b[90m\x1b[39;49;00m\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\nself = <multiprocessing.pool.MapResult object at 0x14da6d1de990>, timeout = 600\n\n \x1b[94mdef\x1b[39;49;00m \x1b[92mget\x1b[39;49;00m(\x1b[96mself\x1b[39;49;00m, timeout=\x1b[94mNone\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m.wait(timeout)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m.ready():\x1b[90m\x1b[39;49;00m\n \x1b[94mraise\x1b[39;49;00m \x1b[96mTimeoutError\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._success:\x1b[90m\x1b[39;49;00m\n \x1b[94mreturn\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._value\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n> \x1b[94mraise\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._value\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mE RuntimeError: Error building extension \'fused_adam\'\x1b[0m\n\n\x1b[1m\x1b[31m/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py\x1b[0m:774: RuntimeError\n-------------------------------------------------------------------------------------- Captured stdout call ---------------------------------------------------------------------------------------\n[2024-10-30 15:19:21,682] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n[2024-10-30 15:19:23,733] [INFO] [comm.py:637:init_distributed] cdb=None\n[2024-10-30 15:19:23,733] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl\n[2024-10-30 15:19:24,569] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.14.5+unknown, git-hash=unknown, git-branch=unknown\n[2024-10-30 15:19:24,604] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False\n1.11.1\n[1/2] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem 
/apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -lineinfo --use_fast_math -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -std=c++17 -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o \nFAILED: multi_tensor_adam.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -lineinfo --use_fast_math -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -std=c++17 -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\nninja: build stopped: subcommand failed.\n-------------------------------------------------------------------------------------- Captured stderr call ---------------------------------------------------------------------------------------\nUsing /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121 as PyTorch extensions root...\nDetected CUDA files, patching ldflags\nEmitting ninja build file /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/build.ninja...\nBuilding extension module fused_adam...\nAllowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)
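The fused_adam failures in this log all reduce to the same compile error: ninja runs nvcc with a bare -fPIC next to the --compiler-options '-fPIC' that the extension build already passes, and nvcc does not understand gcc-only options such as -fPIC unless they are forwarded with -Xcompiler (or --compiler-options), hence "nvcc fatal : Unknown option '-fPIC'". The sketch below only illustrates that distinction, assuming the stray flag is injected into the nvcc argument list by the surrounding build environment; the helper and the flag set are hypothetical and not taken from DeepSpeed, PyTorch or EasyBuild.

# Illustrative sketch, not part of the build: wrap bare host-compiler flags
# that leak into an nvcc argument list so nvcc forwards them to the host compiler.
HOST_ONLY_FLAGS = {"-fPIC", "-fopenmp", "-Wall"}  # assumed examples

def wrap_host_flags(nvcc_args):
    """Return a copy of nvcc_args with bare host-only flags moved behind -Xcompiler."""
    fixed = []
    for arg in nvcc_args:
        if arg in HOST_ONLY_FLAGS:
            # nvcc rejects these directly; -Xcompiler hands them to gcc instead.
            fixed.extend(["-Xcompiler", arg])
        else:
            fixed.append(arg)
    return fixed

if __name__ == "__main__":
    demo = ["-O3", "--compiler-options", "'-fPIC'", "-fPIC", "-std=c++17"]
    print(" ".join(wrap_host_flags(demo)))
    # -O3 --compiler-options '-fPIC' -Xcompiler -fPIC -std=c++17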
_____________________________________________________________________ TestZeRONonDistributed.test_chmod_exception_handling[1] _____________________________________________________________________
multiprocessing.pool.RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2100, in _run_ninja_build
    subprocess.run(
  File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py", line 571, in run
    raise CalledProcessError(retcode, process.args,
subprocess.CalledProcessError: Command '['ninja', '-v']' returned non-zero exit status 1.

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
                    ^^^^^^^^^^^^^^^^^^^
  File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 51, in starmapstar
    return list(itertools.starmap(args[0], args[1]))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 306, in _dist_run
    raise e
  File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 298, in _dist_run
    self.run(**self._fixture_kwargs)
  File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 438, in run
    self._current_test(**fixture_kwargs)
  File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/checkpoint/test_zero_optimizer.py", line 642, in test_chmod_exception_handling
    engine, _, _, _ = deepspeed.initialize(args=args,
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/__init__.py", line 181, in initialize
    engine = DeepSpeedEngine(args=args,
             ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 308, in __init__
    self._configure_optimizer(optimizer, model_parameters)
  File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1267, in _configure_optimizer
    basic_optimizer = self._configure_basic_optimizer(model_parameters)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1344, in _configure_basic_optimizer
    optimizer = FusedAdam(
                ^^^^^^^^^^
  File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py", line 94, in __init__
    fused_adam_cuda = FusedAdamBuilder().load()
                      ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 529, in load
    return self.jit_load(verbose)
           ^^^^^^^^^^^^^^^^^^^^^^
  File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 577, in jit_load
    op_module = load(name=self.name,
                ^^^^^^^^^^^^^^^^^^^^
  File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1308, in load
    return _jit_compile(
           ^^^^^^^^^^^^^
  File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1710, in _jit_compile
    _write_ninja_file_and_build_library(
  File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1823, in _write_ninja_file_and_build_library
    _run_ninja_build(
  File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2116, in _run_ninja_build
    raise RuntimeError(message) from e
RuntimeError: Error building extension 'fused_adam'
"""

The above exception was the direct cause of the following exception:

item = <Function test_chmod_exception_handling[1]>

    @pytest.hookimpl(tryfirst=True)
    def pytest_runtest_call(item):
        # We want to use our own launching function for distributed tests
        if getattr(item.cls, "is_dist_test", False):
            dist_test_class = item.cls()
>           dist_test_class(item._request)

tests/conftest.py:69: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
tests/unit/common.py:458: in __call__
    self._launch_procs(procs)
tests/unit/common.py:268: in _launch_procs
    self._launch_daemonic_procs(num_procs)
tests/unit/common.py:184: in _launch_daemonic_procs
    skip_msgs = skip_msgs_async.get(self.exec_timeout)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <multiprocessing.pool.MapResult object at 0x14da6d4aaed0>, timeout = 600

    def get(self, timeout=None):
        self.wait(timeout)
        if not self.ready():
            raise TimeoutError
        if self._success:
            return self._value
        else:
>           raise self._value
E           RuntimeError: Error building extension 'fused_adam'

/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py:774: 
RuntimeError\n-------------------------------------------------------------------------------------- Captured stdout call ---------------------------------------------------------------------------------------\n[2024-10-30 15:19:27,449] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n[2024-10-30 15:19:29,474] [INFO] [logging.py:96:log_dist] [Rank -1] DeepSpeed info: version=0.14.5+unknown, git-hash=unknown, git-branch=unknown\n[2024-10-30 15:19:29,475] [INFO] [comm.py:637:init_distributed] cdb=None\n[2024-10-30 15:19:29,475] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl\n[2024-10-30 15:19:30,268] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False\n1.11.1\n[1/2] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -lineinfo --use_fast_math -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -std=c++17 -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o \nFAILED: multi_tensor_adam.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem 
/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -lineinfo --use_fast_math -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -std=c++17 -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\nninja: build stopped: subcommand failed.\n-------------------------------------------------------------------------------------- Captured stderr call ---------------------------------------------------------------------------------------\nUsing /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121 as PyTorch extensions root...\nDetected CUDA files, patching ldflags\nEmitting ninja build file /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/build.ninja...\nBuilding extension module fused_adam...\nAllowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n\x1b[31m\x1b[1m_____________________________________________________________________ TestZeRONonDistributed.test_chmod_exception_handling[2] _____________________________________________________________________\x1b[0m\nmultiprocessing.pool.RemoteTraceback: \n"""\nTraceback (most recent call last):\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2100, in _run_ninja_build\n subprocess.run(\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py", line 571, in run\n raise CalledProcessError(retcode, process.args,\nsubprocess.CalledProcessError: Command \'[\'ninja\', \'-v\']\' returned non-zero exit status 1.\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 125, in worker\n result = (True, func(*args, **kwds))\n ^^^^^^^^^^^^^^^^^^^\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 51, in starmapstar\n return list(itertools.starmap(args[0], args[1]))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 306, in _dist_run\n raise e\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 298, in _dist_run\n self.run(**self._fixture_kwargs)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 438, in run\n self._current_test(**fixture_kwargs)\n File 
"/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/checkpoint/test_zero_optimizer.py", line 642, in test_chmod_exception_handling\n engine, _, _, _ = deepspeed.initialize(args=args,\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/__init__.py", line 181, in initialize\n engine = DeepSpeedEngine(args=args,\n ^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 308, in __init__\n self._configure_optimizer(optimizer, model_parameters)\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1267, in _configure_optimizer\n basic_optimizer = self._configure_basic_optimizer(model_parameters)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1344, in _configure_basic_optimizer\n optimizer = FusedAdam(\n ^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py", line 94, in __init__\n fused_adam_cuda = FusedAdamBuilder().load()\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 529, in load\n return self.jit_load(verbose)\n ^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 577, in jit_load\n op_module = load(name=self.name,\n ^^^^^^^^^^^^^^^^^^^^\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1308, in load\n return _jit_compile(\n ^^^^^^^^^^^^^\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1710, in _jit_compile\n _write_ninja_file_and_build_library(\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1823, in _write_ninja_file_and_build_library\n _run_ninja_build(\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2116, in _run_ninja_build\n raise RuntimeError(message) from e\nRuntimeError: Error building extension \'fused_adam\'\n"""\n\n\x1b[33mThe above exception was the direct cause of the following exception:\x1b[0m\n\nitem = <Function test_chmod_exception_handling[2]>\n\n \x1b[37m@pytest\x1b[39;49;00m.hookimpl(tryfirst=\x1b[94mTrue\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[94mdef\x1b[39;49;00m \x1b[92mpytest_runtest_call\x1b[39;49;00m(item):\x1b[90m\x1b[39;49;00m\n \x1b[90m# We want to use our own launching function for distributed tests\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96mgetattr\x1b[39;49;00m(item.cls, \x1b[33m"\x1b[39;49;00m\x1b[33mis_dist_test\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m, \x1b[94mFalse\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n dist_test_class = item.cls()\x1b[90m\x1b[39;49;00m\n> dist_test_class(item._request)\x1b[90m\x1b[39;49;00m\n\n\x1b[1m\x1b[31mtests/conftest.py\x1b[0m:69: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:458: in __call__\n \x1b[96mself\x1b[39;49;00m._launch_procs(procs)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:268: in _launch_procs\n \x1b[96mself\x1b[39;49;00m._launch_daemonic_procs(num_procs)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:184: in _launch_daemonic_procs\n skip_msgs = skip_msgs_async.get(\x1b[96mself\x1b[39;49;00m.exec_timeout)\x1b[90m\x1b[39;49;00m\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\nself = <multiprocessing.pool.MapResult object at 0x14da6d1dca50>, timeout = 600\n\n \x1b[94mdef\x1b[39;49;00m \x1b[92mget\x1b[39;49;00m(\x1b[96mself\x1b[39;49;00m, timeout=\x1b[94mNone\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m.wait(timeout)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m.ready():\x1b[90m\x1b[39;49;00m\n \x1b[94mraise\x1b[39;49;00m \x1b[96mTimeoutError\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._success:\x1b[90m\x1b[39;49;00m\n \x1b[94mreturn\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._value\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n> \x1b[94mraise\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._value\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mE RuntimeError: Error building extension \'fused_adam\'\x1b[0m\n\n\x1b[1m\x1b[31m/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py\x1b[0m:774: RuntimeError\n-------------------------------------------------------------------------------------- Captured stdout call ---------------------------------------------------------------------------------------\n[2024-10-30 15:19:33,184] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n[2024-10-30 15:19:35,097] [INFO] [logging.py:96:log_dist] [Rank -1] DeepSpeed info: version=0.14.5+unknown, git-hash=unknown, git-branch=unknown\n[2024-10-30 15:19:35,098] [INFO] [comm.py:637:init_distributed] cdb=None\n[2024-10-30 15:19:35,098] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl\n[2024-10-30 15:19:35,912] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False\n1.11.1\n[1/2] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem 
/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -lineinfo --use_fast_math -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -std=c++17 -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o \nFAILED: multi_tensor_adam.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -lineinfo --use_fast_math -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -std=c++17 -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\nninja: build stopped: subcommand failed.\n-------------------------------------------------------------------------------------- Captured stderr call ---------------------------------------------------------------------------------------\nUsing /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121 as PyTorch extensions root...\nDetected CUDA files, patching ldflags\nEmitting ninja build file /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/build.ninja...\nBuilding extension module fused_adam...\nAllowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\n\x1b[31m\x1b[1m_____________________________________________________________________ TestZeRONonDistributed.test_chmod_exception_handling[3] _____________________________________________________________________\x1b[0m\nmultiprocessing.pool.RemoteTraceback: \n"""\nTraceback (most recent call last):\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2100, in _run_ninja_build\n subprocess.run(\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py", line 571, in run\n raise CalledProcessError(retcode, process.args,\nsubprocess.CalledProcessError: Command \'[\'ninja\', \'-v\']\' returned non-zero exit status 1.\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 125, in worker\n result = (True, func(*args, **kwds))\n ^^^^^^^^^^^^^^^^^^^\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 51, in starmapstar\n return list(itertools.starmap(args[0], args[1]))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 306, in _dist_run\n raise e\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 298, in _dist_run\n self.run(**self._fixture_kwargs)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 438, in run\n self._current_test(**fixture_kwargs)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/checkpoint/test_zero_optimizer.py", line 642, in test_chmod_exception_handling\n engine, _, _, _ = deepspeed.initialize(args=args,\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/__init__.py", line 181, in initialize\n engine = DeepSpeedEngine(args=args,\n ^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 308, in __init__\n self._configure_optimizer(optimizer, model_parameters)\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1267, in _configure_optimizer\n basic_optimizer = self._configure_basic_optimizer(model_parameters)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/runtime/engine.py", line 1344, in _configure_basic_optimizer\n optimizer = FusedAdam(\n ^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/adam/fused_adam.py", line 94, in __init__\n fused_adam_cuda = FusedAdamBuilder().load()\n ^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 529, in load\n return self.jit_load(verbose)\n ^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 577, in jit_load\n op_module = 
load(name=self.name,\n ^^^^^^^^^^^^^^^^^^^^\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1308, in load\n return _jit_compile(\n ^^^^^^^^^^^^^\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1710, in _jit_compile\n _write_ninja_file_and_build_library(\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1823, in _write_ninja_file_and_build_library\n _run_ninja_build(\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2116, in _run_ninja_build\n raise RuntimeError(message) from e\nRuntimeError: Error building extension \'fused_adam\'\n"""\n\n\x1b[33mThe above exception was the direct cause of the following exception:\x1b[0m\n\nitem = <Function test_chmod_exception_handling[3]>\n\n \x1b[37m@pytest\x1b[39;49;00m.hookimpl(tryfirst=\x1b[94mTrue\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[94mdef\x1b[39;49;00m \x1b[92mpytest_runtest_call\x1b[39;49;00m(item):\x1b[90m\x1b[39;49;00m\n \x1b[90m# We want to use our own launching function for distributed tests\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96mgetattr\x1b[39;49;00m(item.cls, \x1b[33m"\x1b[39;49;00m\x1b[33mis_dist_test\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m, \x1b[94mFalse\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n dist_test_class = item.cls()\x1b[90m\x1b[39;49;00m\n> dist_test_class(item._request)\x1b[90m\x1b[39;49;00m\n\n\x1b[1m\x1b[31mtests/conftest.py\x1b[0m:69: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:458: in __call__\n \x1b[96mself\x1b[39;49;00m._launch_procs(procs)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:268: in _launch_procs\n \x1b[96mself\x1b[39;49;00m._launch_daemonic_procs(num_procs)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:184: in _launch_daemonic_procs\n skip_msgs = skip_msgs_async.get(\x1b[96mself\x1b[39;49;00m.exec_timeout)\x1b[90m\x1b[39;49;00m\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\nself = <multiprocessing.pool.MapResult object at 0x14da6d4aa2d0>, timeout = 600\n\n \x1b[94mdef\x1b[39;49;00m \x1b[92mget\x1b[39;49;00m(\x1b[96mself\x1b[39;49;00m, timeout=\x1b[94mNone\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m.wait(timeout)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m.ready():\x1b[90m\x1b[39;49;00m\n \x1b[94mraise\x1b[39;49;00m \x1b[96mTimeoutError\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._success:\x1b[90m\x1b[39;49;00m\n \x1b[94mreturn\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._value\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n> \x1b[94mraise\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._value\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mE RuntimeError: Error building extension \'fused_adam\'\x1b[0m\n\n\x1b[1m\x1b[31m/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py\x1b[0m:774: 
RuntimeError\n-------------------------------------------------------------------------------------- Captured stdout call ---------------------------------------------------------------------------------------\n[2024-10-30 15:19:38,724] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n[2024-10-30 15:19:40,594] [INFO] [logging.py:96:log_dist] [Rank -1] DeepSpeed info: version=0.14.5+unknown, git-hash=unknown, git-branch=unknown\n[2024-10-30 15:19:40,595] [INFO] [comm.py:637:init_distributed] cdb=None\n[2024-10-30 15:19:40,595] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl\n[2024-10-30 15:19:41,383] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False\n1.11.1\n[1/2] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -lineinfo --use_fast_math -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -std=c++17 -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o \nFAILED: multi_tensor_adam.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=fused_adam -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem 
/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 -DVERSION_GE_1_1 -DVERSION_GE_1_3 -DVERSION_GE_1_5 -lineinfo --use_fast_math -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -std=c++17 -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/adam/multi_tensor_adam.cu -o multi_tensor_adam.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\nninja: build stopped: subcommand failed.\n-------------------------------------------------------------------------------------- Captured stderr call ---------------------------------------------------------------------------------------\nUsing /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121 as PyTorch extensions root...\nDetected CUDA files, patching ldflags\nEmitting ninja build file /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/fused_adam/build.ninja...\nBuilding extension module fused_adam...\nAllowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n\x1b[31m\x1b[1m_______________________________________________________________________________ test_user_args[True-"I am 6\' tall"] _______________________________________________________________________________\x1b[0m\n\ncmd = (\'deepspeed\', \'--force_multi\', \'--num_nodes\', \'1\', \'--num_gpus\', \'1\', ...), multi_node = True\n\n \x1b[37m@pytest\x1b[39;49;00m.mark.parametrize(\x1b[33m"\x1b[39;49;00m\x1b[33mprompt\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m, [\x1b[90m\x1b[39;49;00m\n \x1b[90m \x1b[39;49;00m\x1b[33m\'\'\'"I am 6\' tall"\'\'\'\x1b[39;49;00m, \x1b[33m"""\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33mI am 72\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m\x1b[33m tall\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33m"""\x1b[39;49;00m, \x1b[33m"""\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m\x1b[33mtranslate English to Romanian: \x1b[39;49;00m\x1b[33m"\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33m"""\x1b[39;49;00m,\x1b[90m\x1b[39;49;00m\n \x1b[90m \x1b[39;49;00m\x1b[33m\'\'\'I\'m going to tell them "DeepSpeed is the best"\'\'\'\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n ])\x1b[90m\x1b[39;49;00m\n \x1b[37m@pytest\x1b[39;49;00m.mark.parametrize(\x1b[33m"\x1b[39;49;00m\x1b[33mmulti_node\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m, [\x1b[94mTrue\x1b[39;49;00m, \x1b[94mFalse\x1b[39;49;00m])\x1b[90m\x1b[39;49;00m\n \x1b[94mdef\x1b[39;49;00m \x1b[92mtest_user_args\x1b[39;49;00m(cmd, multi_node):\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m multi_node \x1b[95mand\x1b[39;49;00m get_accelerator().device_name() == \x1b[33m"\x1b[39;49;00m\x1b[33mcpu\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n pytest.skip(\x1b[33m"\x1b[39;49;00m\x1b[33mCPU accelerator does not support this test yet\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n> p = subprocess.Popen(cmd, 
stdout=subprocess.PIPE, stderr=subprocess.PIPE)\x1b[90m\x1b[39;49;00m\n\n\x1b[1m\x1b[31mtests/unit/launcher/test_user_args.py\x1b[0m:49: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\x1b[1m\x1b[31m/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py\x1b[0m:1024: in __init__\n \x1b[96mself\x1b[39;49;00m._execute_child(args, executable, preexec_fn, close_fds,\x1b[90m\x1b[39;49;00m\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\nself = <Popen: returncode: 255 args: (\'deepspeed\', \'--force_multi\', \'--num_nodes\', ...>, args = [\'deepspeed\', \'--force_multi\', \'--num_nodes\', \'1\', \'--num_gpus\', \'1\', ...]\nexecutable = b\'deepspeed\', preexec_fn = None, close_fds = True, pass_fds = (), cwd = None, env = None, startupinfo = None, creationflags = 0, shell = False, p2cread = -1, p2cwrite = -1\nc2pread = 48, c2pwrite = 49, errread = 50, errwrite = 51, restore_signals = True, gid = None, gids = None, uid = None, umask = -1, start_new_session = False, process_group = -1\n\n \x1b[94mdef\x1b[39;49;00m \x1b[92m_execute_child\x1b[39;49;00m(\x1b[96mself\x1b[39;49;00m, args, executable, preexec_fn, close_fds,\x1b[90m\x1b[39;49;00m\n pass_fds, cwd, env,\x1b[90m\x1b[39;49;00m\n startupinfo, creationflags, shell,\x1b[90m\x1b[39;49;00m\n p2cread, p2cwrite,\x1b[90m\x1b[39;49;00m\n c2pread, c2pwrite,\x1b[90m\x1b[39;49;00m\n errread, errwrite,\x1b[90m\x1b[39;49;00m\n restore_signals,\x1b[90m\x1b[39;49;00m\n gid, gids, uid, umask,\x1b[90m\x1b[39;49;00m\n start_new_session, process_group):\x1b[90m\x1b[39;49;00m\n \x1b[90m \x1b[39;49;00m\x1b[33m"""Execute program (POSIX version)"""\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96misinstance\x1b[39;49;00m(args, (\x1b[96mstr\x1b[39;49;00m, \x1b[96mbytes\x1b[39;49;00m)):\x1b[90m\x1b[39;49;00m\n args = [args]\x1b[90m\x1b[39;49;00m\n \x1b[94melif\x1b[39;49;00m \x1b[96misinstance\x1b[39;49;00m(args, os.PathLike):\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m shell:\x1b[90m\x1b[39;49;00m\n \x1b[94mraise\x1b[39;49;00m \x1b[96mTypeError\x1b[39;49;00m(\x1b[33m\'\x1b[39;49;00m\x1b[33mpath-like args is not allowed when \x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[33m\'\x1b[39;49;00m\x1b[33mshell is true\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n args = [args]\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n args = \x1b[96mlist\x1b[39;49;00m(args)\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m shell:\x1b[90m\x1b[39;49;00m\n \x1b[90m# On Android the default shell is at \'/system/bin/sh\'.\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n unix_shell = (\x1b[33m\'\x1b[39;49;00m\x1b[33m/system/bin/sh\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m \x1b[94mif\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[96mhasattr\x1b[39;49;00m(sys, \x1b[33m\'\x1b[39;49;00m\x1b[33mgetandroidapilevel\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m) \x1b[94melse\x1b[39;49;00m \x1b[33m\'\x1b[39;49;00m\x1b[33m/bin/sh\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n args = [unix_shell, \x1b[33m"\x1b[39;49;00m\x1b[33m-c\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m] + args\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m 
executable:\x1b[90m\x1b[39;49;00m\n args[\x1b[94m0\x1b[39;49;00m] = executable\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m executable \x1b[95mis\x1b[39;49;00m \x1b[94mNone\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n executable = args[\x1b[94m0\x1b[39;49;00m]\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n sys.audit(\x1b[33m"\x1b[39;49;00m\x1b[33msubprocess.Popen\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m, executable, args, cwd, env)\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m (_USE_POSIX_SPAWN\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m os.path.dirname(executable)\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m preexec_fn \x1b[95mis\x1b[39;49;00m \x1b[94mNone\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m close_fds\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m pass_fds\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m cwd \x1b[95mis\x1b[39;49;00m \x1b[94mNone\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m (p2cread == -\x1b[94m1\x1b[39;49;00m \x1b[95mor\x1b[39;49;00m p2cread > \x1b[94m2\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m (c2pwrite == -\x1b[94m1\x1b[39;49;00m \x1b[95mor\x1b[39;49;00m c2pwrite > \x1b[94m2\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m (errwrite == -\x1b[94m1\x1b[39;49;00m \x1b[95mor\x1b[39;49;00m errwrite > \x1b[94m2\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m start_new_session\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m process_group == -\x1b[94m1\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m gid \x1b[95mis\x1b[39;49;00m \x1b[94mNone\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m gids \x1b[95mis\x1b[39;49;00m \x1b[94mNone\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m uid \x1b[95mis\x1b[39;49;00m \x1b[94mNone\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m umask < \x1b[94m0\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m._posix_spawn(args, executable, env, restore_signals,\x1b[90m\x1b[39;49;00m\n p2cread, p2cwrite,\x1b[90m\x1b[39;49;00m\n c2pread, c2pwrite,\x1b[90m\x1b[39;49;00m\n errread, errwrite)\x1b[90m\x1b[39;49;00m\n \x1b[94mreturn\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n orig_executable = executable\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[90m# For transferring possible exec failure from child to parent.\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# Data format: "exception name:hex errno:description"\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# Pickle is not used; it is complex and involves memory allocation.\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n errpipe_read, errpipe_write = os.pipe()\x1b[90m\x1b[39;49;00m\n \x1b[90m# errpipe_write must not be in the standard io 0, 1, or 2 fd range.\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n low_fds_to_close = []\x1b[90m\x1b[39;49;00m\n \x1b[94mwhile\x1b[39;49;00m errpipe_write < \x1b[94m3\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n low_fds_to_close.append(errpipe_write)\x1b[90m\x1b[39;49;00m\n errpipe_write = os.dup(errpipe_write)\x1b[90m\x1b[39;49;00m\n \x1b[94mfor\x1b[39;49;00m low_fd \x1b[95min\x1b[39;49;00m low_fds_to_close:\x1b[90m\x1b[39;49;00m\n os.close(low_fd)\x1b[90m\x1b[39;49;00m\n \x1b[94mtry\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[94mtry\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[90m# We must avoid complex work that could 
involve\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# malloc or free in the child process to avoid\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# potential deadlocks, thus we do all this here.\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# and pass it to fork_exec()\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m env \x1b[95mis\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m \x1b[94mNone\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n env_list = []\x1b[90m\x1b[39;49;00m\n \x1b[94mfor\x1b[39;49;00m k, v \x1b[95min\x1b[39;49;00m env.items():\x1b[90m\x1b[39;49;00m\n k = os.fsencode(k)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[33mb\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33m=\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m \x1b[95min\x1b[39;49;00m k:\x1b[90m\x1b[39;49;00m\n \x1b[94mraise\x1b[39;49;00m \x1b[96mValueError\x1b[39;49;00m(\x1b[33m"\x1b[39;49;00m\x1b[33millegal environment variable name\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n env_list.append(k + \x1b[33mb\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33m=\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m + os.fsencode(v))\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n env_list = \x1b[94mNone\x1b[39;49;00m \x1b[90m# Use execv instead of execve.\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n executable = os.fsencode(executable)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m os.path.dirname(executable):\x1b[90m\x1b[39;49;00m\n executable_list = (executable,)\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[90m# This matches the behavior of os._execvpe().\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n executable_list = \x1b[96mtuple\x1b[39;49;00m(\x1b[90m\x1b[39;49;00m\n os.path.join(os.fsencode(\x1b[96mdir\x1b[39;49;00m), executable)\x1b[90m\x1b[39;49;00m\n \x1b[94mfor\x1b[39;49;00m \x1b[96mdir\x1b[39;49;00m \x1b[95min\x1b[39;49;00m os.get_exec_path(env))\x1b[90m\x1b[39;49;00m\n fds_to_keep = \x1b[96mset\x1b[39;49;00m(pass_fds)\x1b[90m\x1b[39;49;00m\n fds_to_keep.add(errpipe_write)\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m.pid = _fork_exec(\x1b[90m\x1b[39;49;00m\n args, executable_list,\x1b[90m\x1b[39;49;00m\n close_fds, \x1b[96mtuple\x1b[39;49;00m(\x1b[96msorted\x1b[39;49;00m(\x1b[96mmap\x1b[39;49;00m(\x1b[96mint\x1b[39;49;00m, fds_to_keep))),\x1b[90m\x1b[39;49;00m\n cwd, env_list,\x1b[90m\x1b[39;49;00m\n p2cread, p2cwrite, c2pread, c2pwrite,\x1b[90m\x1b[39;49;00m\n errread, errwrite,\x1b[90m\x1b[39;49;00m\n errpipe_read, errpipe_write,\x1b[90m\x1b[39;49;00m\n restore_signals, start_new_session,\x1b[90m\x1b[39;49;00m\n process_group, gid, gids, uid, umask,\x1b[90m\x1b[39;49;00m\n preexec_fn, _USE_VFORK)\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m._child_created = \x1b[94mTrue\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mfinally\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[90m# be sure the FD is closed no matter what\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n os.close(errpipe_write)\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m._close_pipe_fds(p2cread, p2cwrite,\x1b[90m\x1b[39;49;00m\n c2pread, c2pwrite,\x1b[90m\x1b[39;49;00m\n errread, errwrite)\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[90m# Wait for exec to fail or succeed; possibly raising an\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# exception (limited in size)\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n errpipe_data = \x1b[96mbytearray\x1b[39;49;00m()\x1b[90m\x1b[39;49;00m\n \x1b[94mwhile\x1b[39;49;00m 
\x1b[94mTrue\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n part = os.read(errpipe_read, \x1b[94m50000\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n errpipe_data += part\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m part \x1b[95mor\x1b[39;49;00m \x1b[96mlen\x1b[39;49;00m(errpipe_data) > \x1b[94m50000\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[94mbreak\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mfinally\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[90m# be sure the FD is closed no matter what\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n os.close(errpipe_read)\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m errpipe_data:\x1b[90m\x1b[39;49;00m\n \x1b[94mtry\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n pid, sts = os.waitpid(\x1b[96mself\x1b[39;49;00m.pid, \x1b[94m0\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m pid == \x1b[96mself\x1b[39;49;00m.pid:\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m._handle_exitstatus(sts)\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m.returncode = sys.maxsize\x1b[90m\x1b[39;49;00m\n \x1b[94mexcept\x1b[39;49;00m \x1b[96mChildProcessError\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[94mpass\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[94mtry\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n exception_name, hex_errno, err_msg = (\x1b[90m\x1b[39;49;00m\n errpipe_data.split(\x1b[33mb\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33m:\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m, \x1b[94m2\x1b[39;49;00m))\x1b[90m\x1b[39;49;00m\n \x1b[90m# The encoding here should match the encoding\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# written in by the subprocess implementations\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# like _posixsubprocess\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n err_msg = err_msg.decode()\x1b[90m\x1b[39;49;00m\n \x1b[94mexcept\x1b[39;49;00m \x1b[96mValueError\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n exception_name = \x1b[33mb\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33mSubprocessError\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n hex_errno = \x1b[33mb\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33m0\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n err_msg = \x1b[33m\'\x1b[39;49;00m\x1b[33mBad exception data from child: \x1b[39;49;00m\x1b[33m{!r}\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m.format(\x1b[90m\x1b[39;49;00m\n \x1b[96mbytes\x1b[39;49;00m(errpipe_data))\x1b[90m\x1b[39;49;00m\n child_exception_type = \x1b[96mgetattr\x1b[39;49;00m(\x1b[90m\x1b[39;49;00m\n builtins, exception_name.decode(\x1b[33m\'\x1b[39;49;00m\x1b[33mascii\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m),\x1b[90m\x1b[39;49;00m\n SubprocessError)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96missubclass\x1b[39;49;00m(child_exception_type, \x1b[96mOSError\x1b[39;49;00m) \x1b[95mand\x1b[39;49;00m hex_errno:\x1b[90m\x1b[39;49;00m\n errno_num = \x1b[96mint\x1b[39;49;00m(hex_errno, \x1b[94m16\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n child_exec_never_called = (err_msg == \x1b[33m"\x1b[39;49;00m\x1b[33mnoexec\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m child_exec_never_called:\x1b[90m\x1b[39;49;00m\n err_msg = \x1b[33m"\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# The error must be from chdir(cwd).\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n err_filename = cwd\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n err_filename = orig_executable\x1b[90m\x1b[39;49;00m\n 
            if errno_num != 0:
                err_msg = os.strerror(errno_num)
>           raise child_exception_type(errno_num, err_msg, err_filename)
E           FileNotFoundError: [Errno 2] No such file or directory: 'deepspeed'

/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py:1917: FileNotFoundError
_______________________________________ test_user_args[True-'I am 72" tall'] _______________________________________

cmd = ('deepspeed', '--force_multi', '--num_nodes', '1', '--num_gpus', '1', ...), multi_node = True

    @pytest.mark.parametrize("prompt", [
        '''"I am 6' tall"''', """'I am 72" tall'""", """'"translate English to Romanian: "'""",
        '''I'm going to tell them "DeepSpeed is the best"'''
    ])
    @pytest.mark.parametrize("multi_node", [True, False])
    def test_user_args(cmd, multi_node):
        if multi_node and get_accelerator().device_name() == "cpu":
            pytest.skip("CPU accelerator does not support this test yet")
>       p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

tests/unit/launcher/test_user_args.py:49:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py:1024: in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <Popen: returncode: 255 args: ('deepspeed', '--force_multi', '--num_nodes', ...>, args = ['deepspeed', '--force_multi', '--num_nodes', '1', '--num_gpus', '1', ...]
executable = b'deepspeed', preexec_fn = None, close_fds = True, pass_fds = (), cwd = None, env = None, startupinfo = None, creationflags = 0, shell = False, p2cread = -1, p2cwrite = -1
c2pread = 48, c2pwrite = 49, errread = 50, errwrite = 51, restore_signals = True, gid = None, gids = None, uid = None, umask = -1, start_new_session = False, process_group = -1

    def _execute_child(self, args, executable, preexec_fn, close_fds,
                       pass_fds, cwd, env,
                       startupinfo, creationflags, shell,
                       p2cread, p2cwrite,
                       c2pread, c2pwrite,
                       errread, errwrite,
                       restore_signals,
                       gid, gids, uid, umask,
                       start_new_session, process_group):
        """Execute program (POSIX version)"""
        ...  # (full CPython _execute_child listing omitted; it is identical in every traceback in this section)
                if errno_num != 0:
                    err_msg = os.strerror(errno_num)
>           raise child_exception_type(errno_num, err_msg, err_filename)
E           FileNotFoundError: [Errno 2] No such file or directory: 'deepspeed'

/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py:1917: FileNotFoundError
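Each of these failures has the same root cause: subprocess.Popen() is handed the bare command name 'deepspeed', and because no directory on the PATH of the pytest process contains the deepspeed console script, _execute_child() exhausts os.get_exec_path() and raises FileNotFoundError before the launcher ever starts. A minimal diagnostic sketch, standard library only and not part of the test suite, that performs essentially the same lookup from the same Python environment:

    import os
    import shutil

    # subprocess resolves a command with no directory component by trying each
    # entry of os.get_exec_path() in turn; shutil.which() performs essentially
    # the same PATH search.
    name = "deepspeed"

    resolved = shutil.which(name)
    if resolved is None:
        print(f"{name!r} not found on PATH; Popen would raise FileNotFoundError")
        for directory in os.get_exec_path():
            print("  searched:", directory)
    else:
        print(f"{name!r} resolves to {resolved}")
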
____________________________ test_user_args[True-'"translate English to Romanian: "'] _____________________________

cmd = ('deepspeed', '--force_multi', '--num_nodes', '1', '--num_gpus', '1', ...), multi_node = True

>       p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

tests/unit/launcher/test_user_args.py:49:
/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py:1024: in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py:1917: in _execute_child
>           raise child_exception_type(errno_num, err_msg, err_filename)
E           FileNotFoundError: [Errno 2] No such file or directory: 'deepspeed'
_______________________ test_user_args[True-I'm going to tell them "DeepSpeed is the best"] _______________________

cmd = ('deepspeed', '--force_multi', '--num_nodes', '1', '--num_gpus', '1', ...), multi_node = True

>       p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

tests/unit/launcher/test_user_args.py:49:
/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py:1024: in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py:1917: in _execute_child
>           raise child_exception_type(errno_num, err_msg, err_filename)
E           FileNotFoundError: [Errno 2] No such file or directory: 'deepspeed'
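In other words, these tests fail at process creation and never exercise the launcher's argument handling at all. If one wanted to re-run this test file by hand against an existing install, it should be enough to put the directory holding the deepspeed entry-point script on PATH first. A sketch under that assumption; the bin directory below is a placeholder, not a path taken from this log, and the smoke test merely asks the launcher for its usual --help output:

    import os
    import subprocess

    # Placeholder: the real scripts directory comes from the DeepSpeed installdir,
    # which is not shown in this log excerpt.
    deepspeed_bin = "/path/to/DeepSpeed-0.14.5/bin"

    env = dict(os.environ)
    env["PATH"] = deepspeed_bin + os.pathsep + env.get("PATH", "")

    # With the scripts directory on PATH, the same bare-name lookup that failed
    # above succeeds; here we just run the launcher's help as a smoke test.
    result = subprocess.run(["deepspeed", "--help"], env=env,
                            capture_output=True, text=True)
    print(result.returncode)
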
______________________________________ test_user_args[False-"I am 6' tall"] _______________________________________

cmd = ('deepspeed', '--num_nodes', '1', '--num_gpus', '1', local('/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/pytest-of-c3-builder/pytest-0/test_user_args_False__I_am_6__0/user_arg_test.py'), ...)
multi_node = False

>       p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

tests/unit/launcher/test_user_args.py:49:
/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py:1024: in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py:1917: in _execute_child
>           raise child_exception_type(errno_num, err_msg, err_filename)
E           FileNotFoundError: [Errno 2] No such file or directory: 'deepspeed'

______________________________________ test_user_args[False-'I am 72" tall'] ______________________________________

cmd = ('deepspeed', '--num_nodes', '1', '--num_gpus', '1', local('/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/pytest-of-c3-builder/pytest-0/test_user_args_False__I_am_72_0/user_arg_test.py'), ...)
multi_node = False

    @pytest.mark.parametrize("prompt", [
        '''"I am 6' tall"''', """'I am 72" tall'""", """'"translate English to Romanian: "'""",
        '''I'm going to tell them "DeepSpeed is the best"'''
    ])
    @pytest.mark.parametrize("multi_node", [True, False])
    def test_user_args(cmd, multi_node):
        if multi_node and get_accelerator().device_name() == "cpu":
            pytest.skip("CPU accelerator does not support this test yet")
>       p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

tests/unit/launcher/test_user_args.py:49:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py:1024: in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <Popen: returncode: 255 args: ('deepspeed', '--num_nodes', '1', '--num_gpus'...>
args = ['deepspeed', '--num_nodes', '1', '--num_gpus', '1', local('/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/pytest-of-c3-builder/pytest-0/test_user_args_False__I_am_72_0/user_arg_test.py'), ...]
executable = b'deepspeed', preexec_fn = None, close_fds = True, pass_fds = (), cwd = None, env = None, startupinfo = None, creationflags = 0, shell = False, p2cread = -1, p2cwrite = -1
c2pread = 48, c2pwrite = 49, errread = 50, errwrite = 51, restore_signals = True, gid = None, gids = None, uid = None, umask = -1, start_new_session = False, process_group = -1

    def _execute_child(self, args, executable, preexec_fn, close_fds,
                       pass_fds, cwd, env,
                       startupinfo, creationflags, shell,
                       p2cread, p2cwrite,
                       c2pread, c2pwrite,
                       errread, errwrite,
                       restore_signals,
                       gid, gids, uid, umask,
                       start_new_session, process_group):
        """Execute program (POSIX version)"""

        if isinstance(args, (str, bytes)):
            args = [args]
        elif isinstance(args, os.PathLike):
            if shell:
                raise TypeError('path-like args is not allowed 
when \x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[33m\'\x1b[39;49;00m\x1b[33mshell is true\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n args = [args]\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n args = \x1b[96mlist\x1b[39;49;00m(args)\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m shell:\x1b[90m\x1b[39;49;00m\n \x1b[90m# On Android the default shell is at \'/system/bin/sh\'.\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n unix_shell = (\x1b[33m\'\x1b[39;49;00m\x1b[33m/system/bin/sh\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m \x1b[94mif\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[96mhasattr\x1b[39;49;00m(sys, \x1b[33m\'\x1b[39;49;00m\x1b[33mgetandroidapilevel\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m) \x1b[94melse\x1b[39;49;00m \x1b[33m\'\x1b[39;49;00m\x1b[33m/bin/sh\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n args = [unix_shell, \x1b[33m"\x1b[39;49;00m\x1b[33m-c\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m] + args\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m executable:\x1b[90m\x1b[39;49;00m\n args[\x1b[94m0\x1b[39;49;00m] = executable\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m executable \x1b[95mis\x1b[39;49;00m \x1b[94mNone\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n executable = args[\x1b[94m0\x1b[39;49;00m]\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n sys.audit(\x1b[33m"\x1b[39;49;00m\x1b[33msubprocess.Popen\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m, executable, args, cwd, env)\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m (_USE_POSIX_SPAWN\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m os.path.dirname(executable)\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m preexec_fn \x1b[95mis\x1b[39;49;00m \x1b[94mNone\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m close_fds\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m pass_fds\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m cwd \x1b[95mis\x1b[39;49;00m \x1b[94mNone\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m (p2cread == -\x1b[94m1\x1b[39;49;00m \x1b[95mor\x1b[39;49;00m p2cread > \x1b[94m2\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m (c2pwrite == -\x1b[94m1\x1b[39;49;00m \x1b[95mor\x1b[39;49;00m c2pwrite > \x1b[94m2\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m (errwrite == -\x1b[94m1\x1b[39;49;00m \x1b[95mor\x1b[39;49;00m errwrite > \x1b[94m2\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m start_new_session\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m process_group == -\x1b[94m1\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m gid \x1b[95mis\x1b[39;49;00m \x1b[94mNone\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m gids \x1b[95mis\x1b[39;49;00m \x1b[94mNone\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m uid \x1b[95mis\x1b[39;49;00m \x1b[94mNone\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m umask < \x1b[94m0\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m._posix_spawn(args, executable, env, restore_signals,\x1b[90m\x1b[39;49;00m\n p2cread, p2cwrite,\x1b[90m\x1b[39;49;00m\n c2pread, c2pwrite,\x1b[90m\x1b[39;49;00m\n errread, errwrite)\x1b[90m\x1b[39;49;00m\n \x1b[94mreturn\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n orig_executable = executable\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[90m# For 
transferring possible exec failure from child to parent.\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# Data format: "exception name:hex errno:description"\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# Pickle is not used; it is complex and involves memory allocation.\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n errpipe_read, errpipe_write = os.pipe()\x1b[90m\x1b[39;49;00m\n \x1b[90m# errpipe_write must not be in the standard io 0, 1, or 2 fd range.\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n low_fds_to_close = []\x1b[90m\x1b[39;49;00m\n \x1b[94mwhile\x1b[39;49;00m errpipe_write < \x1b[94m3\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n low_fds_to_close.append(errpipe_write)\x1b[90m\x1b[39;49;00m\n errpipe_write = os.dup(errpipe_write)\x1b[90m\x1b[39;49;00m\n \x1b[94mfor\x1b[39;49;00m low_fd \x1b[95min\x1b[39;49;00m low_fds_to_close:\x1b[90m\x1b[39;49;00m\n os.close(low_fd)\x1b[90m\x1b[39;49;00m\n \x1b[94mtry\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[94mtry\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[90m# We must avoid complex work that could involve\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# malloc or free in the child process to avoid\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# potential deadlocks, thus we do all this here.\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# and pass it to fork_exec()\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m env \x1b[95mis\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m \x1b[94mNone\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n env_list = []\x1b[90m\x1b[39;49;00m\n \x1b[94mfor\x1b[39;49;00m k, v \x1b[95min\x1b[39;49;00m env.items():\x1b[90m\x1b[39;49;00m\n k = os.fsencode(k)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[33mb\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33m=\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m \x1b[95min\x1b[39;49;00m k:\x1b[90m\x1b[39;49;00m\n \x1b[94mraise\x1b[39;49;00m \x1b[96mValueError\x1b[39;49;00m(\x1b[33m"\x1b[39;49;00m\x1b[33millegal environment variable name\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n env_list.append(k + \x1b[33mb\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33m=\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m + os.fsencode(v))\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n env_list = \x1b[94mNone\x1b[39;49;00m \x1b[90m# Use execv instead of execve.\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n executable = os.fsencode(executable)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m os.path.dirname(executable):\x1b[90m\x1b[39;49;00m\n executable_list = (executable,)\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[90m# This matches the behavior of os._execvpe().\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n executable_list = \x1b[96mtuple\x1b[39;49;00m(\x1b[90m\x1b[39;49;00m\n os.path.join(os.fsencode(\x1b[96mdir\x1b[39;49;00m), executable)\x1b[90m\x1b[39;49;00m\n \x1b[94mfor\x1b[39;49;00m \x1b[96mdir\x1b[39;49;00m \x1b[95min\x1b[39;49;00m os.get_exec_path(env))\x1b[90m\x1b[39;49;00m\n fds_to_keep = \x1b[96mset\x1b[39;49;00m(pass_fds)\x1b[90m\x1b[39;49;00m\n fds_to_keep.add(errpipe_write)\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m.pid = _fork_exec(\x1b[90m\x1b[39;49;00m\n args, executable_list,\x1b[90m\x1b[39;49;00m\n close_fds, \x1b[96mtuple\x1b[39;49;00m(\x1b[96msorted\x1b[39;49;00m(\x1b[96mmap\x1b[39;49;00m(\x1b[96mint\x1b[39;49;00m, fds_to_keep))),\x1b[90m\x1b[39;49;00m\n cwd, env_list,\x1b[90m\x1b[39;49;00m\n p2cread, p2cwrite, c2pread, c2pwrite,\x1b[90m\x1b[39;49;00m\n errread, errwrite,\x1b[90m\x1b[39;49;00m\n errpipe_read, 
errpipe_write,\x1b[90m\x1b[39;49;00m\n restore_signals, start_new_session,\x1b[90m\x1b[39;49;00m\n process_group, gid, gids, uid, umask,\x1b[90m\x1b[39;49;00m\n preexec_fn, _USE_VFORK)\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m._child_created = \x1b[94mTrue\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mfinally\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[90m# be sure the FD is closed no matter what\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n os.close(errpipe_write)\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m._close_pipe_fds(p2cread, p2cwrite,\x1b[90m\x1b[39;49;00m\n c2pread, c2pwrite,\x1b[90m\x1b[39;49;00m\n errread, errwrite)\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[90m# Wait for exec to fail or succeed; possibly raising an\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# exception (limited in size)\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n errpipe_data = \x1b[96mbytearray\x1b[39;49;00m()\x1b[90m\x1b[39;49;00m\n \x1b[94mwhile\x1b[39;49;00m \x1b[94mTrue\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n part = os.read(errpipe_read, \x1b[94m50000\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n errpipe_data += part\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m part \x1b[95mor\x1b[39;49;00m \x1b[96mlen\x1b[39;49;00m(errpipe_data) > \x1b[94m50000\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[94mbreak\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mfinally\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[90m# be sure the FD is closed no matter what\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n os.close(errpipe_read)\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m errpipe_data:\x1b[90m\x1b[39;49;00m\n \x1b[94mtry\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n pid, sts = os.waitpid(\x1b[96mself\x1b[39;49;00m.pid, \x1b[94m0\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m pid == \x1b[96mself\x1b[39;49;00m.pid:\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m._handle_exitstatus(sts)\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m.returncode = sys.maxsize\x1b[90m\x1b[39;49;00m\n \x1b[94mexcept\x1b[39;49;00m \x1b[96mChildProcessError\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[94mpass\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[94mtry\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n exception_name, hex_errno, err_msg = (\x1b[90m\x1b[39;49;00m\n errpipe_data.split(\x1b[33mb\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33m:\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m, \x1b[94m2\x1b[39;49;00m))\x1b[90m\x1b[39;49;00m\n \x1b[90m# The encoding here should match the encoding\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# written in by the subprocess implementations\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# like _posixsubprocess\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n err_msg = err_msg.decode()\x1b[90m\x1b[39;49;00m\n \x1b[94mexcept\x1b[39;49;00m \x1b[96mValueError\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n exception_name = \x1b[33mb\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33mSubprocessError\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n hex_errno = \x1b[33mb\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33m0\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n err_msg = \x1b[33m\'\x1b[39;49;00m\x1b[33mBad exception data from child: \x1b[39;49;00m\x1b[33m{!r}\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m.format(\x1b[90m\x1b[39;49;00m\n \x1b[96mbytes\x1b[39;49;00m(errpipe_data))\x1b[90m\x1b[39;49;00m\n child_exception_type = 
\x1b[96mgetattr\x1b[39;49;00m(\x1b[90m\x1b[39;49;00m\n builtins, exception_name.decode(\x1b[33m\'\x1b[39;49;00m\x1b[33mascii\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m),\x1b[90m\x1b[39;49;00m\n SubprocessError)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96missubclass\x1b[39;49;00m(child_exception_type, \x1b[96mOSError\x1b[39;49;00m) \x1b[95mand\x1b[39;49;00m hex_errno:\x1b[90m\x1b[39;49;00m\n errno_num = \x1b[96mint\x1b[39;49;00m(hex_errno, \x1b[94m16\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n child_exec_never_called = (err_msg == \x1b[33m"\x1b[39;49;00m\x1b[33mnoexec\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m child_exec_never_called:\x1b[90m\x1b[39;49;00m\n err_msg = \x1b[33m"\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# The error must be from chdir(cwd).\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n err_filename = cwd\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n err_filename = orig_executable\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m errno_num != \x1b[94m0\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n err_msg = os.strerror(errno_num)\x1b[90m\x1b[39;49;00m\n> \x1b[94mraise\x1b[39;49;00m child_exception_type(errno_num, err_msg, err_filename)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mE FileNotFoundError: [Errno 2] No such file or directory: \'deepspeed\'\x1b[0m\n\n\x1b[1m\x1b[31m/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py\x1b[0m:1917: FileNotFoundError\n\x1b[31m\x1b[1m____________________________________________________________________ test_user_args[False-\'"translate English to Romanian: "\'] ____________________________________________________________________\x1b[0m\n\ncmd = (\'deepspeed\', \'--num_nodes\', \'1\', \'--num_gpus\', \'1\', local(\'/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/pytest-of-c3-builder/pytest-0/test_user_args_False___transla0/user_arg_test.py\'), ...)\nmulti_node = False\n\n \x1b[37m@pytest\x1b[39;49;00m.mark.parametrize(\x1b[33m"\x1b[39;49;00m\x1b[33mprompt\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m, [\x1b[90m\x1b[39;49;00m\n \x1b[90m \x1b[39;49;00m\x1b[33m\'\'\'"I am 6\' tall"\'\'\'\x1b[39;49;00m, \x1b[33m"""\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33mI am 72\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m\x1b[33m tall\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33m"""\x1b[39;49;00m, \x1b[33m"""\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m\x1b[33mtranslate English to Romanian: \x1b[39;49;00m\x1b[33m"\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33m"""\x1b[39;49;00m,\x1b[90m\x1b[39;49;00m\n \x1b[90m \x1b[39;49;00m\x1b[33m\'\'\'I\'m going to tell them "DeepSpeed is the best"\'\'\'\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n ])\x1b[90m\x1b[39;49;00m\n \x1b[37m@pytest\x1b[39;49;00m.mark.parametrize(\x1b[33m"\x1b[39;49;00m\x1b[33mmulti_node\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m, [\x1b[94mTrue\x1b[39;49;00m, \x1b[94mFalse\x1b[39;49;00m])\x1b[90m\x1b[39;49;00m\n \x1b[94mdef\x1b[39;49;00m \x1b[92mtest_user_args\x1b[39;49;00m(cmd, multi_node):\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m multi_node \x1b[95mand\x1b[39;49;00m get_accelerator().device_name() == \x1b[33m"\x1b[39;49;00m\x1b[33mcpu\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n pytest.skip(\x1b[33m"\x1b[39;49;00m\x1b[33mCPU accelerator does not support this test yet\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n> p = subprocess.Popen(cmd, stdout=subprocess.PIPE, 
stderr=subprocess.PIPE)\x1b[90m\x1b[39;49;00m\n\n\x1b[1m\x1b[31mtests/unit/launcher/test_user_args.py\x1b[0m:49: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\x1b[1m\x1b[31m/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py\x1b[0m:1024: in __init__\n \x1b[96mself\x1b[39;49;00m._execute_child(args, executable, preexec_fn, close_fds,\x1b[90m\x1b[39;49;00m\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\nself = <Popen: returncode: 255 args: (\'deepspeed\', \'--num_nodes\', \'1\', \'--num_gpus\'...>\nargs = [\'deepspeed\', \'--num_nodes\', \'1\', \'--num_gpus\', \'1\', local(\'/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/pytest-of-c3-builder/pytest-0/test_user_args_False___transla0/user_arg_test.py\'), ...]\nexecutable = b\'deepspeed\', preexec_fn = None, close_fds = True, pass_fds = (), cwd = None, env = None, startupinfo = None, creationflags = 0, shell = False, p2cread = -1, p2cwrite = -1\nc2pread = 48, c2pwrite = 49, errread = 50, errwrite = 51, restore_signals = True, gid = None, gids = None, uid = None, umask = -1, start_new_session = False, process_group = -1\n\n \x1b[94mdef\x1b[39;49;00m \x1b[92m_execute_child\x1b[39;49;00m(\x1b[96mself\x1b[39;49;00m, args, executable, preexec_fn, close_fds,\x1b[90m\x1b[39;49;00m\n pass_fds, cwd, env,\x1b[90m\x1b[39;49;00m\n startupinfo, creationflags, shell,\x1b[90m\x1b[39;49;00m\n p2cread, p2cwrite,\x1b[90m\x1b[39;49;00m\n c2pread, c2pwrite,\x1b[90m\x1b[39;49;00m\n errread, errwrite,\x1b[90m\x1b[39;49;00m\n restore_signals,\x1b[90m\x1b[39;49;00m\n gid, gids, uid, umask,\x1b[90m\x1b[39;49;00m\n start_new_session, process_group):\x1b[90m\x1b[39;49;00m\n \x1b[90m \x1b[39;49;00m\x1b[33m"""Execute program (POSIX version)"""\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96misinstance\x1b[39;49;00m(args, (\x1b[96mstr\x1b[39;49;00m, \x1b[96mbytes\x1b[39;49;00m)):\x1b[90m\x1b[39;49;00m\n args = [args]\x1b[90m\x1b[39;49;00m\n \x1b[94melif\x1b[39;49;00m \x1b[96misinstance\x1b[39;49;00m(args, os.PathLike):\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m shell:\x1b[90m\x1b[39;49;00m\n \x1b[94mraise\x1b[39;49;00m \x1b[96mTypeError\x1b[39;49;00m(\x1b[33m\'\x1b[39;49;00m\x1b[33mpath-like args is not allowed when \x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[33m\'\x1b[39;49;00m\x1b[33mshell is true\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n args = [args]\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n args = \x1b[96mlist\x1b[39;49;00m(args)\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m shell:\x1b[90m\x1b[39;49;00m\n \x1b[90m# On Android the default shell is at \'/system/bin/sh\'.\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n unix_shell = (\x1b[33m\'\x1b[39;49;00m\x1b[33m/system/bin/sh\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m \x1b[94mif\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[96mhasattr\x1b[39;49;00m(sys, \x1b[33m\'\x1b[39;49;00m\x1b[33mgetandroidapilevel\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m) \x1b[94melse\x1b[39;49;00m \x1b[33m\'\x1b[39;49;00m\x1b[33m/bin/sh\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n args = [unix_shell, 
\x1b[33m"\x1b[39;49;00m\x1b[33m-c\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m] + args\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m executable:\x1b[90m\x1b[39;49;00m\n args[\x1b[94m0\x1b[39;49;00m] = executable\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m executable \x1b[95mis\x1b[39;49;00m \x1b[94mNone\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n executable = args[\x1b[94m0\x1b[39;49;00m]\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n sys.audit(\x1b[33m"\x1b[39;49;00m\x1b[33msubprocess.Popen\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m, executable, args, cwd, env)\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m (_USE_POSIX_SPAWN\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m os.path.dirname(executable)\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m preexec_fn \x1b[95mis\x1b[39;49;00m \x1b[94mNone\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m close_fds\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m pass_fds\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m cwd \x1b[95mis\x1b[39;49;00m \x1b[94mNone\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m (p2cread == -\x1b[94m1\x1b[39;49;00m \x1b[95mor\x1b[39;49;00m p2cread > \x1b[94m2\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m (c2pwrite == -\x1b[94m1\x1b[39;49;00m \x1b[95mor\x1b[39;49;00m c2pwrite > \x1b[94m2\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m (errwrite == -\x1b[94m1\x1b[39;49;00m \x1b[95mor\x1b[39;49;00m errwrite > \x1b[94m2\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m start_new_session\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m process_group == -\x1b[94m1\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m gid \x1b[95mis\x1b[39;49;00m \x1b[94mNone\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m gids \x1b[95mis\x1b[39;49;00m \x1b[94mNone\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m uid \x1b[95mis\x1b[39;49;00m \x1b[94mNone\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m umask < \x1b[94m0\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m._posix_spawn(args, executable, env, restore_signals,\x1b[90m\x1b[39;49;00m\n p2cread, p2cwrite,\x1b[90m\x1b[39;49;00m\n c2pread, c2pwrite,\x1b[90m\x1b[39;49;00m\n errread, errwrite)\x1b[90m\x1b[39;49;00m\n \x1b[94mreturn\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n orig_executable = executable\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[90m# For transferring possible exec failure from child to parent.\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# Data format: "exception name:hex errno:description"\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# Pickle is not used; it is complex and involves memory allocation.\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n errpipe_read, errpipe_write = os.pipe()\x1b[90m\x1b[39;49;00m\n \x1b[90m# errpipe_write must not be in the standard io 0, 1, or 2 fd range.\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n low_fds_to_close = []\x1b[90m\x1b[39;49;00m\n \x1b[94mwhile\x1b[39;49;00m errpipe_write < \x1b[94m3\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n low_fds_to_close.append(errpipe_write)\x1b[90m\x1b[39;49;00m\n errpipe_write = os.dup(errpipe_write)\x1b[90m\x1b[39;49;00m\n \x1b[94mfor\x1b[39;49;00m low_fd \x1b[95min\x1b[39;49;00m low_fds_to_close:\x1b[90m\x1b[39;49;00m\n os.close(low_fd)\x1b[90m\x1b[39;49;00m\n \x1b[94mtry\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n 
\x1b[94mtry\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[90m# We must avoid complex work that could involve\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# malloc or free in the child process to avoid\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# potential deadlocks, thus we do all this here.\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# and pass it to fork_exec()\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m env \x1b[95mis\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m \x1b[94mNone\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n env_list = []\x1b[90m\x1b[39;49;00m\n \x1b[94mfor\x1b[39;49;00m k, v \x1b[95min\x1b[39;49;00m env.items():\x1b[90m\x1b[39;49;00m\n k = os.fsencode(k)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[33mb\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33m=\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m \x1b[95min\x1b[39;49;00m k:\x1b[90m\x1b[39;49;00m\n \x1b[94mraise\x1b[39;49;00m \x1b[96mValueError\x1b[39;49;00m(\x1b[33m"\x1b[39;49;00m\x1b[33millegal environment variable name\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n env_list.append(k + \x1b[33mb\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33m=\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m + os.fsencode(v))\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n env_list = \x1b[94mNone\x1b[39;49;00m \x1b[90m# Use execv instead of execve.\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n executable = os.fsencode(executable)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m os.path.dirname(executable):\x1b[90m\x1b[39;49;00m\n executable_list = (executable,)\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[90m# This matches the behavior of os._execvpe().\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n executable_list = \x1b[96mtuple\x1b[39;49;00m(\x1b[90m\x1b[39;49;00m\n os.path.join(os.fsencode(\x1b[96mdir\x1b[39;49;00m), executable)\x1b[90m\x1b[39;49;00m\n \x1b[94mfor\x1b[39;49;00m \x1b[96mdir\x1b[39;49;00m \x1b[95min\x1b[39;49;00m os.get_exec_path(env))\x1b[90m\x1b[39;49;00m\n fds_to_keep = \x1b[96mset\x1b[39;49;00m(pass_fds)\x1b[90m\x1b[39;49;00m\n fds_to_keep.add(errpipe_write)\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m.pid = _fork_exec(\x1b[90m\x1b[39;49;00m\n args, executable_list,\x1b[90m\x1b[39;49;00m\n close_fds, \x1b[96mtuple\x1b[39;49;00m(\x1b[96msorted\x1b[39;49;00m(\x1b[96mmap\x1b[39;49;00m(\x1b[96mint\x1b[39;49;00m, fds_to_keep))),\x1b[90m\x1b[39;49;00m\n cwd, env_list,\x1b[90m\x1b[39;49;00m\n p2cread, p2cwrite, c2pread, c2pwrite,\x1b[90m\x1b[39;49;00m\n errread, errwrite,\x1b[90m\x1b[39;49;00m\n errpipe_read, errpipe_write,\x1b[90m\x1b[39;49;00m\n restore_signals, start_new_session,\x1b[90m\x1b[39;49;00m\n process_group, gid, gids, uid, umask,\x1b[90m\x1b[39;49;00m\n preexec_fn, _USE_VFORK)\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m._child_created = \x1b[94mTrue\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mfinally\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[90m# be sure the FD is closed no matter what\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n os.close(errpipe_write)\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m._close_pipe_fds(p2cread, p2cwrite,\x1b[90m\x1b[39;49;00m\n c2pread, c2pwrite,\x1b[90m\x1b[39;49;00m\n errread, errwrite)\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[90m# Wait for exec to fail or succeed; possibly raising an\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# exception (limited in size)\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n errpipe_data = 
\x1b[96mbytearray\x1b[39;49;00m()\x1b[90m\x1b[39;49;00m\n \x1b[94mwhile\x1b[39;49;00m \x1b[94mTrue\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n part = os.read(errpipe_read, \x1b[94m50000\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n errpipe_data += part\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m part \x1b[95mor\x1b[39;49;00m \x1b[96mlen\x1b[39;49;00m(errpipe_data) > \x1b[94m50000\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[94mbreak\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mfinally\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[90m# be sure the FD is closed no matter what\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n os.close(errpipe_read)\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m errpipe_data:\x1b[90m\x1b[39;49;00m\n \x1b[94mtry\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n pid, sts = os.waitpid(\x1b[96mself\x1b[39;49;00m.pid, \x1b[94m0\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m pid == \x1b[96mself\x1b[39;49;00m.pid:\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m._handle_exitstatus(sts)\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m.returncode = sys.maxsize\x1b[90m\x1b[39;49;00m\n \x1b[94mexcept\x1b[39;49;00m \x1b[96mChildProcessError\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[94mpass\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[94mtry\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n exception_name, hex_errno, err_msg = (\x1b[90m\x1b[39;49;00m\n errpipe_data.split(\x1b[33mb\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33m:\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m, \x1b[94m2\x1b[39;49;00m))\x1b[90m\x1b[39;49;00m\n \x1b[90m# The encoding here should match the encoding\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# written in by the subprocess implementations\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# like _posixsubprocess\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n err_msg = err_msg.decode()\x1b[90m\x1b[39;49;00m\n \x1b[94mexcept\x1b[39;49;00m \x1b[96mValueError\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n exception_name = \x1b[33mb\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33mSubprocessError\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n hex_errno = \x1b[33mb\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33m0\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n err_msg = \x1b[33m\'\x1b[39;49;00m\x1b[33mBad exception data from child: \x1b[39;49;00m\x1b[33m{!r}\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m.format(\x1b[90m\x1b[39;49;00m\n \x1b[96mbytes\x1b[39;49;00m(errpipe_data))\x1b[90m\x1b[39;49;00m\n child_exception_type = \x1b[96mgetattr\x1b[39;49;00m(\x1b[90m\x1b[39;49;00m\n builtins, exception_name.decode(\x1b[33m\'\x1b[39;49;00m\x1b[33mascii\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m),\x1b[90m\x1b[39;49;00m\n SubprocessError)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96missubclass\x1b[39;49;00m(child_exception_type, \x1b[96mOSError\x1b[39;49;00m) \x1b[95mand\x1b[39;49;00m hex_errno:\x1b[90m\x1b[39;49;00m\n errno_num = \x1b[96mint\x1b[39;49;00m(hex_errno, \x1b[94m16\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n child_exec_never_called = (err_msg == \x1b[33m"\x1b[39;49;00m\x1b[33mnoexec\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m child_exec_never_called:\x1b[90m\x1b[39;49;00m\n err_msg = \x1b[33m"\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# The error must be from chdir(cwd).\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n err_filename = cwd\x1b[90m\x1b[39;49;00m\n 
\x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n err_filename = orig_executable\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m errno_num != \x1b[94m0\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n err_msg = os.strerror(errno_num)\x1b[90m\x1b[39;49;00m\n> \x1b[94mraise\x1b[39;49;00m child_exception_type(errno_num, err_msg, err_filename)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mE FileNotFoundError: [Errno 2] No such file or directory: \'deepspeed\'\x1b[0m\n\n\x1b[1m\x1b[31m/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py\x1b[0m:1917: FileNotFoundError\n\x1b[31m\x1b[1m______________________________________________________________ test_user_args[False-I\'m going to tell them "DeepSpeed is the best"] _______________________________________________________________\x1b[0m\n\ncmd = (\'deepspeed\', \'--num_nodes\', \'1\', \'--num_gpus\', \'1\', local(\'/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/pytest-of-c3-builder/pytest-0/test_user_args_False_I_m_going0/user_arg_test.py\'), ...)\nmulti_node = False\n\n \x1b[37m@pytest\x1b[39;49;00m.mark.parametrize(\x1b[33m"\x1b[39;49;00m\x1b[33mprompt\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m, [\x1b[90m\x1b[39;49;00m\n \x1b[90m \x1b[39;49;00m\x1b[33m\'\'\'"I am 6\' tall"\'\'\'\x1b[39;49;00m, \x1b[33m"""\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33mI am 72\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m\x1b[33m tall\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33m"""\x1b[39;49;00m, \x1b[33m"""\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m\x1b[33mtranslate English to Romanian: \x1b[39;49;00m\x1b[33m"\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33m"""\x1b[39;49;00m,\x1b[90m\x1b[39;49;00m\n \x1b[90m \x1b[39;49;00m\x1b[33m\'\'\'I\'m going to tell them "DeepSpeed is the best"\'\'\'\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n ])\x1b[90m\x1b[39;49;00m\n \x1b[37m@pytest\x1b[39;49;00m.mark.parametrize(\x1b[33m"\x1b[39;49;00m\x1b[33mmulti_node\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m, [\x1b[94mTrue\x1b[39;49;00m, \x1b[94mFalse\x1b[39;49;00m])\x1b[90m\x1b[39;49;00m\n \x1b[94mdef\x1b[39;49;00m \x1b[92mtest_user_args\x1b[39;49;00m(cmd, multi_node):\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m multi_node \x1b[95mand\x1b[39;49;00m get_accelerator().device_name() == \x1b[33m"\x1b[39;49;00m\x1b[33mcpu\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n pytest.skip(\x1b[33m"\x1b[39;49;00m\x1b[33mCPU accelerator does not support this test yet\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n> p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)\x1b[90m\x1b[39;49;00m\n\n\x1b[1m\x1b[31mtests/unit/launcher/test_user_args.py\x1b[0m:49: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\x1b[1m\x1b[31m/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py\x1b[0m:1024: in __init__\n \x1b[96mself\x1b[39;49;00m._execute_child(args, executable, preexec_fn, close_fds,\x1b[90m\x1b[39;49;00m\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\nself = <Popen: returncode: 255 args: (\'deepspeed\', \'--num_nodes\', \'1\', \'--num_gpus\'...>\nargs = [\'deepspeed\', \'--num_nodes\', \'1\', \'--num_gpus\', \'1\', 
local(\'/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/pytest-of-c3-builder/pytest-0/test_user_args_False_I_m_going0/user_arg_test.py\'), ...]\nexecutable = b\'deepspeed\', preexec_fn = None, close_fds = True, pass_fds = (), cwd = None, env = None, startupinfo = None, creationflags = 0, shell = False, p2cread = -1, p2cwrite = -1\nc2pread = 48, c2pwrite = 49, errread = 50, errwrite = 51, restore_signals = True, gid = None, gids = None, uid = None, umask = -1, start_new_session = False, process_group = -1\n\n \x1b[94mdef\x1b[39;49;00m \x1b[92m_execute_child\x1b[39;49;00m(\x1b[96mself\x1b[39;49;00m, args, executable, preexec_fn, close_fds,\x1b[90m\x1b[39;49;00m\n pass_fds, cwd, env,\x1b[90m\x1b[39;49;00m\n startupinfo, creationflags, shell,\x1b[90m\x1b[39;49;00m\n p2cread, p2cwrite,\x1b[90m\x1b[39;49;00m\n c2pread, c2pwrite,\x1b[90m\x1b[39;49;00m\n errread, errwrite,\x1b[90m\x1b[39;49;00m\n restore_signals,\x1b[90m\x1b[39;49;00m\n gid, gids, uid, umask,\x1b[90m\x1b[39;49;00m\n start_new_session, process_group):\x1b[90m\x1b[39;49;00m\n \x1b[90m \x1b[39;49;00m\x1b[33m"""Execute program (POSIX version)"""\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96misinstance\x1b[39;49;00m(args, (\x1b[96mstr\x1b[39;49;00m, \x1b[96mbytes\x1b[39;49;00m)):\x1b[90m\x1b[39;49;00m\n args = [args]\x1b[90m\x1b[39;49;00m\n \x1b[94melif\x1b[39;49;00m \x1b[96misinstance\x1b[39;49;00m(args, os.PathLike):\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m shell:\x1b[90m\x1b[39;49;00m\n \x1b[94mraise\x1b[39;49;00m \x1b[96mTypeError\x1b[39;49;00m(\x1b[33m\'\x1b[39;49;00m\x1b[33mpath-like args is not allowed when \x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[33m\'\x1b[39;49;00m\x1b[33mshell is true\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n args = [args]\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n args = \x1b[96mlist\x1b[39;49;00m(args)\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m shell:\x1b[90m\x1b[39;49;00m\n \x1b[90m# On Android the default shell is at \'/system/bin/sh\'.\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n unix_shell = (\x1b[33m\'\x1b[39;49;00m\x1b[33m/system/bin/sh\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m \x1b[94mif\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[96mhasattr\x1b[39;49;00m(sys, \x1b[33m\'\x1b[39;49;00m\x1b[33mgetandroidapilevel\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m) \x1b[94melse\x1b[39;49;00m \x1b[33m\'\x1b[39;49;00m\x1b[33m/bin/sh\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n args = [unix_shell, \x1b[33m"\x1b[39;49;00m\x1b[33m-c\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m] + args\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m executable:\x1b[90m\x1b[39;49;00m\n args[\x1b[94m0\x1b[39;49;00m] = executable\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m executable \x1b[95mis\x1b[39;49;00m \x1b[94mNone\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n executable = args[\x1b[94m0\x1b[39;49;00m]\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n sys.audit(\x1b[33m"\x1b[39;49;00m\x1b[33msubprocess.Popen\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m, executable, args, cwd, env)\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m (_USE_POSIX_SPAWN\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m os.path.dirname(executable)\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m preexec_fn \x1b[95mis\x1b[39;49;00m \x1b[94mNone\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m 
close_fds\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m pass_fds\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m cwd \x1b[95mis\x1b[39;49;00m \x1b[94mNone\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m (p2cread == -\x1b[94m1\x1b[39;49;00m \x1b[95mor\x1b[39;49;00m p2cread > \x1b[94m2\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m (c2pwrite == -\x1b[94m1\x1b[39;49;00m \x1b[95mor\x1b[39;49;00m c2pwrite > \x1b[94m2\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m (errwrite == -\x1b[94m1\x1b[39;49;00m \x1b[95mor\x1b[39;49;00m errwrite > \x1b[94m2\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m start_new_session\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m process_group == -\x1b[94m1\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m gid \x1b[95mis\x1b[39;49;00m \x1b[94mNone\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m gids \x1b[95mis\x1b[39;49;00m \x1b[94mNone\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m uid \x1b[95mis\x1b[39;49;00m \x1b[94mNone\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[95mand\x1b[39;49;00m umask < \x1b[94m0\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m._posix_spawn(args, executable, env, restore_signals,\x1b[90m\x1b[39;49;00m\n p2cread, p2cwrite,\x1b[90m\x1b[39;49;00m\n c2pread, c2pwrite,\x1b[90m\x1b[39;49;00m\n errread, errwrite)\x1b[90m\x1b[39;49;00m\n \x1b[94mreturn\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n orig_executable = executable\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[90m# For transferring possible exec failure from child to parent.\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# Data format: "exception name:hex errno:description"\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# Pickle is not used; it is complex and involves memory allocation.\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n errpipe_read, errpipe_write = os.pipe()\x1b[90m\x1b[39;49;00m\n \x1b[90m# errpipe_write must not be in the standard io 0, 1, or 2 fd range.\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n low_fds_to_close = []\x1b[90m\x1b[39;49;00m\n \x1b[94mwhile\x1b[39;49;00m errpipe_write < \x1b[94m3\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n low_fds_to_close.append(errpipe_write)\x1b[90m\x1b[39;49;00m\n errpipe_write = os.dup(errpipe_write)\x1b[90m\x1b[39;49;00m\n \x1b[94mfor\x1b[39;49;00m low_fd \x1b[95min\x1b[39;49;00m low_fds_to_close:\x1b[90m\x1b[39;49;00m\n os.close(low_fd)\x1b[90m\x1b[39;49;00m\n \x1b[94mtry\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[94mtry\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[90m# We must avoid complex work that could involve\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# malloc or free in the child process to avoid\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# potential deadlocks, thus we do all this here.\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# and pass it to fork_exec()\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m env \x1b[95mis\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m \x1b[94mNone\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n env_list = []\x1b[90m\x1b[39;49;00m\n \x1b[94mfor\x1b[39;49;00m k, v \x1b[95min\x1b[39;49;00m env.items():\x1b[90m\x1b[39;49;00m\n k = os.fsencode(k)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[33mb\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33m=\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m \x1b[95min\x1b[39;49;00m k:\x1b[90m\x1b[39;49;00m\n \x1b[94mraise\x1b[39;49;00m 
\x1b[96mValueError\x1b[39;49;00m(\x1b[33m"\x1b[39;49;00m\x1b[33millegal environment variable name\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n env_list.append(k + \x1b[33mb\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33m=\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m + os.fsencode(v))\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n env_list = \x1b[94mNone\x1b[39;49;00m \x1b[90m# Use execv instead of execve.\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n executable = os.fsencode(executable)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m os.path.dirname(executable):\x1b[90m\x1b[39;49;00m\n executable_list = (executable,)\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[90m# This matches the behavior of os._execvpe().\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n executable_list = \x1b[96mtuple\x1b[39;49;00m(\x1b[90m\x1b[39;49;00m\n os.path.join(os.fsencode(\x1b[96mdir\x1b[39;49;00m), executable)\x1b[90m\x1b[39;49;00m\n \x1b[94mfor\x1b[39;49;00m \x1b[96mdir\x1b[39;49;00m \x1b[95min\x1b[39;49;00m os.get_exec_path(env))\x1b[90m\x1b[39;49;00m\n fds_to_keep = \x1b[96mset\x1b[39;49;00m(pass_fds)\x1b[90m\x1b[39;49;00m\n fds_to_keep.add(errpipe_write)\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m.pid = _fork_exec(\x1b[90m\x1b[39;49;00m\n args, executable_list,\x1b[90m\x1b[39;49;00m\n close_fds, \x1b[96mtuple\x1b[39;49;00m(\x1b[96msorted\x1b[39;49;00m(\x1b[96mmap\x1b[39;49;00m(\x1b[96mint\x1b[39;49;00m, fds_to_keep))),\x1b[90m\x1b[39;49;00m\n cwd, env_list,\x1b[90m\x1b[39;49;00m\n p2cread, p2cwrite, c2pread, c2pwrite,\x1b[90m\x1b[39;49;00m\n errread, errwrite,\x1b[90m\x1b[39;49;00m\n errpipe_read, errpipe_write,\x1b[90m\x1b[39;49;00m\n restore_signals, start_new_session,\x1b[90m\x1b[39;49;00m\n process_group, gid, gids, uid, umask,\x1b[90m\x1b[39;49;00m\n preexec_fn, _USE_VFORK)\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m._child_created = \x1b[94mTrue\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mfinally\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[90m# be sure the FD is closed no matter what\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n os.close(errpipe_write)\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m._close_pipe_fds(p2cread, p2cwrite,\x1b[90m\x1b[39;49;00m\n c2pread, c2pwrite,\x1b[90m\x1b[39;49;00m\n errread, errwrite)\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[90m# Wait for exec to fail or succeed; possibly raising an\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# exception (limited in size)\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n errpipe_data = \x1b[96mbytearray\x1b[39;49;00m()\x1b[90m\x1b[39;49;00m\n \x1b[94mwhile\x1b[39;49;00m \x1b[94mTrue\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n part = os.read(errpipe_read, \x1b[94m50000\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n errpipe_data += part\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m part \x1b[95mor\x1b[39;49;00m \x1b[96mlen\x1b[39;49;00m(errpipe_data) > \x1b[94m50000\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[94mbreak\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mfinally\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[90m# be sure the FD is closed no matter what\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n os.close(errpipe_read)\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m errpipe_data:\x1b[90m\x1b[39;49;00m\n \x1b[94mtry\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n pid, sts = os.waitpid(\x1b[96mself\x1b[39;49;00m.pid, \x1b[94m0\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m pid == 
\x1b[96mself\x1b[39;49;00m.pid:\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m._handle_exitstatus(sts)\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m.returncode = sys.maxsize\x1b[90m\x1b[39;49;00m\n \x1b[94mexcept\x1b[39;49;00m \x1b[96mChildProcessError\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n \x1b[94mpass\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m\x1b[39;49;00m\n \x1b[94mtry\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n exception_name, hex_errno, err_msg = (\x1b[90m\x1b[39;49;00m\n errpipe_data.split(\x1b[33mb\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33m:\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m, \x1b[94m2\x1b[39;49;00m))\x1b[90m\x1b[39;49;00m\n \x1b[90m# The encoding here should match the encoding\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# written in by the subprocess implementations\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# like _posixsubprocess\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n err_msg = err_msg.decode()\x1b[90m\x1b[39;49;00m\n \x1b[94mexcept\x1b[39;49;00m \x1b[96mValueError\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n exception_name = \x1b[33mb\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33mSubprocessError\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n hex_errno = \x1b[33mb\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[33m0\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n err_msg = \x1b[33m\'\x1b[39;49;00m\x1b[33mBad exception data from child: \x1b[39;49;00m\x1b[33m{!r}\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m.format(\x1b[90m\x1b[39;49;00m\n \x1b[96mbytes\x1b[39;49;00m(errpipe_data))\x1b[90m\x1b[39;49;00m\n child_exception_type = \x1b[96mgetattr\x1b[39;49;00m(\x1b[90m\x1b[39;49;00m\n builtins, exception_name.decode(\x1b[33m\'\x1b[39;49;00m\x1b[33mascii\x1b[39;49;00m\x1b[33m\'\x1b[39;49;00m),\x1b[90m\x1b[39;49;00m\n SubprocessError)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96missubclass\x1b[39;49;00m(child_exception_type, \x1b[96mOSError\x1b[39;49;00m) \x1b[95mand\x1b[39;49;00m hex_errno:\x1b[90m\x1b[39;49;00m\n errno_num = \x1b[96mint\x1b[39;49;00m(hex_errno, \x1b[94m16\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n child_exec_never_called = (err_msg == \x1b[33m"\x1b[39;49;00m\x1b[33mnoexec\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m child_exec_never_called:\x1b[90m\x1b[39;49;00m\n err_msg = \x1b[33m"\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[90m# The error must be from chdir(cwd).\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n err_filename = cwd\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n err_filename = orig_executable\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m errno_num != \x1b[94m0\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n err_msg = os.strerror(errno_num)\x1b[90m\x1b[39;49;00m\n> \x1b[94mraise\x1b[39;49;00m child_exception_type(errno_num, err_msg, err_filename)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mE FileNotFoundError: [Errno 2] No such file or directory: \'deepspeed\'\x1b[0m\n\n\x1b[1m\x1b[31m/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py\x1b[0m:1917: FileNotFoundError\n\x1b[31m\x1b[1m______________________________________________________________________________________ test_bash_string_args ______________________________________________________________________________________\x1b[0m\n\ntmpdir = local(\'/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/pytest-of-c3-builder/pytest-0/test_bash_string_args0\')\nuser_script_fp = 
______________________________________________________________________________________ test_bash_string_args ______________________________________________________________________________________

tmpdir = local('/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/pytest-of-c3-builder/pytest-0/test_bash_string_args0')
user_script_fp = local('/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/pytest-of-c3-builder/pytest-0/test_bash_string_args0/user_arg_test.py')

    def test_bash_string_args(tmpdir, user_script_fp):
        bash_script = f"""
            ARGS="--prompt 'DeepSpeed is the best'"
            echo ${{ARGS}}|xargs deepspeed --num_nodes 1 --num_gpus 1 {user_script_fp}
            """
    
        bash_fp = tmpdir.join("bash_script.sh")
        with open(bash_fp, "w") as f:
            f.write(bash_script)
    
        p = subprocess.Popen(["bash", bash_fp], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = p.communicate()
>       assert "ARG PARSE SUCCESS" in out.decode("utf-8"), f"User args not parsed correctly: {err.decode('utf-8')}"
E       AssertionError: User args not parsed correctly: xargs: deepspeed: No such file or directory
E       
E       assert 'ARG PARSE SUCCESS' in ''
E        +  where '' = <built-in method decode of bytes object at 0x14dd146d59c8>('utf-8')
E        +  where <built-in method decode of bytes object at 0x14dd146d59c8> = b''.decode

tests/unit/launcher/test_user_args.py:66: AssertionError
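All of the launcher failures above share one root cause: the deepspeed console script is not on PATH inside the build environment, so subprocess.Popen(('deepspeed', ...)) in test_user_args, and xargs deepspeed in test_bash_string_args, fail with "No such file or directory" before any launcher logic runs. This usually points at the installation's bin/ directory not being on PATH when the unit tests are invoked. A minimal sketch of the check involved, using a hypothetical guard that is not part of the DeepSpeed test suite:

    import shutil
    import subprocess

    # Popen resolves a bare command name against PATH, exactly as the tests above do;
    # if the entry point is missing it raises FileNotFoundError: ... 'deepspeed'.
    if shutil.which("deepspeed") is None:
        print("deepspeed launcher not on PATH; launcher tests cannot run")
    else:
        p = subprocess.Popen(["deepspeed", "--help"],
                             stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        p.communicate()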
_________________________________________________________________ TestCUDABackward.test_backward[64-160-128-2-24-False-True-0.2] __________________________________________________________________
multiprocessing.pool.RemoteTraceback: 
"""
Traceback (most recent call last):
  File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2100, in _run_ninja_build
    subprocess.run(
  File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py", line 571, in run
    raise CalledProcessError(retcode, process.args,
subprocess.CalledProcessError: Command '['ninja', '-v']' returned non-zero exit status 1.

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
  File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 51, in starmapstar
    return list(itertools.starmap(args[0], args[1]))
  File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 306, in _dist_run
    raise e
  File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 298, in _dist_run
    self.run(**self._fixture_kwargs)
  File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 438, in run
    self._current_test(**fixture_kwargs)
  File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_backward.py", line 282, in test_backward
    run_backward(ds_config, seq_len, atol=atol, verbose=True)
  File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_backward.py", line 218, in run_backward
    bert_encoder, ds_encoder = create_models(ds_config)
  File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_backward.py", line 198, in create_models
    ds_encoder = DSEncoder(ds_config, weights, biases)
  File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_backward.py", line 101, in __init__
    self.layer = nn.ModuleList([
  File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_backward.py", line 102, in <listcomp>
    copy.deepcopy(DeepSpeedTransformerLayer(config, weights, biases)) for _ in range(config.num_hidden_layers)
  File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/transformer/transformer.py", line 363, in __init__
    transformer_cuda_module = TransformerBuilder().load()
  File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 529, in load
    return self.jit_load(verbose)
  File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 577, in jit_load
    op_module = load(name=self.name,
  File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1308, in load
    return _jit_compile(
  File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1710, in _jit_compile
    _write_ninja_file_and_build_library(
  File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1823, in _write_ninja_file_and_build_library
    _run_ninja_build(
  File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2116, in _run_ninja_build
    raise RuntimeError(message) from e
RuntimeError: Error building extension 'transformer'
"""
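The RemoteTraceback above was raised inside a worker process: DeepSpeed's distributed unit tests run each test body through a multiprocessing pool (tests/unit/common.py), and an exception raised in the worker is pickled and re-raised in the parent when the pooled result is fetched, which is why the same RuntimeError reappears in the pytest traceback below. A minimal, self-contained sketch of that propagation, with illustrative names that are not taken from the test suite:

    import multiprocessing

    def _worker(_):
        # Stand-in for the JIT build step that fails inside the test worker.
        raise RuntimeError("Error building extension 'transformer'")

    if __name__ == "__main__":
        with multiprocessing.Pool(1) as pool:
            result = pool.map_async(_worker, [0])
            # .get() re-raises the worker's exception in the parent process,
            # mirroring skip_msgs_async.get(self.exec_timeout) with its 600 s timeout.
            result.get(timeout=600)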

The above exception was the direct cause of the following exception:

item = <Function test_backward[64-160-128-2-24-False-True-0.2]>

    @pytest.hookimpl(tryfirst=True)
    def pytest_runtest_call(item):
        # We want to use our own launching function for distributed tests
        if getattr(item.cls, "is_dist_test", False):
            dist_test_class = item.cls()
>           dist_test_class(item._request)

tests/conftest.py:69: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
tests/unit/common.py:458: in __call__
    self._launch_procs(procs)
tests/unit/common.py:268: in _launch_procs
    self._launch_daemonic_procs(num_procs)
tests/unit/common.py:184: in _launch_daemonic_procs
    skip_msgs = skip_msgs_async.get(self.exec_timeout)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <multiprocessing.pool.MapResult object at 0x14da6c91e950>, timeout = 600

    def get(self, timeout=None):
        self.wait(timeout)
        if not self.ready():
            raise TimeoutError
        if self._success:
            return self._value
        else:
>           raise self._value
E           RuntimeError: Error building extension 'transformer'

/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py:774: RuntimeError
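The RuntimeError itself comes from the JIT build shown in the captured stdout below: constructing a DeepSpeedTransformerLayer makes TransformerBuilder compile the 'transformer' op on the fly through torch.utils.cpp_extension.load and ninja. The nvcc command lines it generates carry a bare -fPIC in addition to --compiler-options '-fPIC', and nvcc 12.1 aborts with "Unknown option '-fPIC'", so the CUDA kernels fail to compile and ninja returns a non-zero status. Host-compiler flags such as -fPIC are only valid for nvcc when forwarded via -Xcompiler / --compiler-options; a minimal sketch of that distinction (hypothetical flag handling, not the PyTorch or DeepSpeed build code):

    # A bare -fPIC is a gcc (host compiler) flag, not an nvcc option, so nvcc rejects it.
    # Wrapping it with -Xcompiler (equivalent to --compiler-options) forwards it to gcc.
    bad_cmd  = ["nvcc", "-c", "transform_kernels.cu", "-o", "transform_kernels.cuda.o", "-fPIC"]
    good_cmd = ["nvcc", "-c", "transform_kernels.cu", "-o", "transform_kernels.cuda.o",
                "-Xcompiler", "-fPIC"]
    print(" ".join(good_cmd))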
\'initializer_range\': 0.02, \'fp16\': True, \'pre_layer_norm\': False, \'local_rank\': -1, \'seed\': -1, \'normalize_invertible\': False, \'gelu_checkpoint\': False, \'adjust_init_range\': True, \'test_gemm\': False, \'layer_norm_eps\': 1e-12, \'training\': True, \'is_grad_enabled\': True, \'attn_dropout_checkpoint\': False, \'stochastic_mode\': False, \'return_tuple\': False}\n[1/9] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/transform_kernels.cu -o transform_kernels.cuda.o \nFAILED: transform_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 
-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/transform_kernels.cu -o transform_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[2/9] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/gelu_kernels.cu -o gelu_kernels.cuda.o \nFAILED: gelu_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 
-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/gelu_kernels.cu -o gelu_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[3/9] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/cublas_wrappers.cu -o cublas_wrappers.cuda.o \nFAILED: cublas_wrappers.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 
-gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/cublas_wrappers.cu -o cublas_wrappers.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[4/9] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/normalize_kernels.cu -o normalize_kernels.cuda.o \nFAILED: normalize_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ 
--threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/normalize_kernels.cu -o normalize_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[5/9] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/softmax_kernels.cu -o softmax_kernels.cuda.o \nFAILED: softmax_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ 
--threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/softmax_kernels.cu -o softmax_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[6/9] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/general_kernels.cu -o general_kernels.cuda.o \nFAILED: general_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ 
--threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/general_kernels.cu -o general_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[7/9] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/dropout_kernels.cu -o dropout_kernels.cuda.o \nFAILED: dropout_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ 
--threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/dropout_kernels.cu -o dropout_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[8/9] g++ -MMD -MF ds_transformer_cuda.o.d -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -fPIC -std=c++17 -fPIC -O3 -std=c++17 -g -Wno-reorder -DBF16_AVAILABLE -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/ds_transformer_cuda.cpp -o ds_transformer_cuda.o \nninja: build stopped: subcommand failed.\n-------------------------------------------------------------------------------------- Captured stderr call ---------------------------------------------------------------------------------------\nUsing /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121 as PyTorch extensions root...\nCreating extension directory /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/transformer...\nDetected CUDA files, patching ldflags\nEmitting ninja build file /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/transformer/build.ninja...\nBuilding extension module transformer...\nAllowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\n\x1b[31m\x1b[1m_________________________________________________________________ TestCUDABackward.test_backward[64-1600-128-2-4-False-True-0.2] __________________________________________________________________\x1b[0m\nmultiprocessing.pool.RemoteTraceback: \n"""\nTraceback (most recent call last):\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2100, in _run_ninja_build\n subprocess.run(\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py", line 571, in run\n raise CalledProcessError(retcode, process.args,\nsubprocess.CalledProcessError: Command \'[\'ninja\', \'-v\']\' returned non-zero exit status 1.\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 125, in worker\n result = (True, func(*args, **kwds))\n ^^^^^^^^^^^^^^^^^^^\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 51, in starmapstar\n return list(itertools.starmap(args[0], args[1]))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 306, in _dist_run\n raise e\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 298, in _dist_run\n self.run(**self._fixture_kwargs)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 438, in run\n self._current_test(**fixture_kwargs)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_backward.py", line 282, in test_backward\n run_backward(ds_config, seq_len, atol=atol, verbose=True)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_backward.py", line 218, in run_backward\n bert_encoder, ds_encoder = create_models(ds_config)\n ^^^^^^^^^^^^^^^^^^^^^^^^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_backward.py", line 198, in create_models\n ds_encoder = DSEncoder(ds_config, weights, biases)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_backward.py", line 101, in __init__\n self.layer = nn.ModuleList([\n ^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_backward.py", line 102, in <listcomp>\n copy.deepcopy(DeepSpeedTransformerLayer(config, weights, biases)) for _ in range(config.num_hidden_layers)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/transformer/transformer.py", line 363, in __init__\n transformer_cuda_module = TransformerBuilder().load()\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 529, in load\n return self.jit_load(verbose)\n ^^^^^^^^^^^^^^^^^^^^^^\n File 
"/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 577, in jit_load\n op_module = load(name=self.name,\n ^^^^^^^^^^^^^^^^^^^^\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1308, in load\n return _jit_compile(\n ^^^^^^^^^^^^^\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1710, in _jit_compile\n _write_ninja_file_and_build_library(\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1823, in _write_ninja_file_and_build_library\n _run_ninja_build(\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2116, in _run_ninja_build\n raise RuntimeError(message) from e\nRuntimeError: Error building extension \'transformer\'\n"""\n\n\x1b[33mThe above exception was the direct cause of the following exception:\x1b[0m\n\nitem = <Function test_backward[64-1600-128-2-4-False-True-0.2]>\n\n \x1b[37m@pytest\x1b[39;49;00m.hookimpl(tryfirst=\x1b[94mTrue\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[94mdef\x1b[39;49;00m \x1b[92mpytest_runtest_call\x1b[39;49;00m(item):\x1b[90m\x1b[39;49;00m\n \x1b[90m# We want to use our own launching function for distributed tests\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96mgetattr\x1b[39;49;00m(item.cls, \x1b[33m"\x1b[39;49;00m\x1b[33mis_dist_test\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m, \x1b[94mFalse\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n dist_test_class = item.cls()\x1b[90m\x1b[39;49;00m\n> dist_test_class(item._request)\x1b[90m\x1b[39;49;00m\n\n\x1b[1m\x1b[31mtests/conftest.py\x1b[0m:69: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:458: in __call__\n \x1b[96mself\x1b[39;49;00m._launch_procs(procs)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:268: in _launch_procs\n \x1b[96mself\x1b[39;49;00m._launch_daemonic_procs(num_procs)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:184: in _launch_daemonic_procs\n skip_msgs = skip_msgs_async.get(\x1b[96mself\x1b[39;49;00m.exec_timeout)\x1b[90m\x1b[39;49;00m\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\nself = <multiprocessing.pool.MapResult object at 0x14da6d463ad0>, timeout = 600\n\n \x1b[94mdef\x1b[39;49;00m \x1b[92mget\x1b[39;49;00m(\x1b[96mself\x1b[39;49;00m, timeout=\x1b[94mNone\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m.wait(timeout)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m.ready():\x1b[90m\x1b[39;49;00m\n \x1b[94mraise\x1b[39;49;00m \x1b[96mTimeoutError\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._success:\x1b[90m\x1b[39;49;00m\n \x1b[94mreturn\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._value\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n> \x1b[94mraise\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._value\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mE RuntimeError: Error 
building extension \'transformer\'\x1b[0m\n\n\x1b[1m\x1b[31m/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py\x1b[0m:774: RuntimeError\n-------------------------------------------------------------------------------------- Captured stdout call ---------------------------------------------------------------------------------------\n[2024-10-30 15:21:01,012] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)\nBetter speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\nBetter speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\n[2024-10-30 15:21:03,315] [INFO] [comm.py:637:init_distributed] cdb=None\n[2024-10-30 15:21:03,315] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl\n1.11.1\nDeepSpeed Transformer config is {\'layer_id\': 0, \'batch_size\': 64, \'hidden_size\': 1600, \'intermediate_size\': 1600, \'heads\': 2, \'attn_dropout_ratio\': 0.0, \'hidden_dropout_ratio\': 0.0, \'num_hidden_layers\': 4, \'initializer_range\': 0.02, \'fp16\': True, \'pre_layer_norm\': False, \'local_rank\': -1, \'seed\': -1, \'normalize_invertible\': False, \'gelu_checkpoint\': False, \'adjust_init_range\': True, \'test_gemm\': False, \'layer_norm_eps\': 1e-12, \'training\': True, \'is_grad_enabled\': True, \'attn_dropout_checkpoint\': False, \'stochastic_mode\': False, \'return_tuple\': False}\n[1/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/transform_kernels.cu -o transform_kernels.cuda.o \nFAILED: transform_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" 
-I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/transform_kernels.cu -o transform_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[2/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/normalize_kernels.cu -o normalize_kernels.cuda.o \nFAILED: normalize_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" 
-DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/normalize_kernels.cu -o normalize_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[3/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/gelu_kernels.cu -o gelu_kernels.cuda.o \nFAILED: gelu_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" 
-DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/gelu_kernels.cu -o gelu_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[4/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/cublas_wrappers.cu -o cublas_wrappers.cuda.o \nFAILED: cublas_wrappers.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" 
-DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/cublas_wrappers.cu -o cublas_wrappers.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[5/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/dropout_kernels.cu -o dropout_kernels.cuda.o \nFAILED: dropout_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" 
-DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/dropout_kernels.cu -o dropout_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[6/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/general_kernels.cu -o general_kernels.cuda.o \nFAILED: general_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" 
-DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/general_kernels.cu -o general_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[7/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/softmax_kernels.cu -o softmax_kernels.cuda.o \nFAILED: softmax_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" 
-DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/softmax_kernels.cu -o softmax_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\nninja: build stopped: subcommand failed.\n-------------------------------------------------------------------------------------- Captured stderr call ---------------------------------------------------------------------------------------\nUsing /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121 as PyTorch extensions root...\nDetected CUDA files, patching ldflags\nEmitting ninja build file /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/transformer/build.ninja...\nBuilding extension module transformer...\nAllowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\n\x1b[31m\x1b[1m_________________________________________________________________ TestCUDABackward.test_backward[8-1600-128-25-3-True-True-0.05] __________________________________________________________________\x1b[0m\nmultiprocessing.pool.RemoteTraceback: \n"""\nTraceback (most recent call last):\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2100, in _run_ninja_build\n subprocess.run(\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py", line 571, in run\n raise CalledProcessError(retcode, process.args,\nsubprocess.CalledProcessError: Command \'[\'ninja\', \'-v\']\' returned non-zero exit status 1.\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 125, in worker\n result = (True, func(*args, **kwds))\n ^^^^^^^^^^^^^^^^^^^\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 51, in starmapstar\n return list(itertools.starmap(args[0], args[1]))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 306, in _dist_run\n raise e\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 298, in _dist_run\n self.run(**self._fixture_kwargs)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 438, in run\n self._current_test(**fixture_kwargs)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_backward.py", line 282, in test_backward\n run_backward(ds_config, seq_len, atol=atol, verbose=True)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_backward.py", line 218, in run_backward\n bert_encoder, ds_encoder = create_models(ds_config)\n ^^^^^^^^^^^^^^^^^^^^^^^^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_backward.py", line 198, in create_models\n ds_encoder = DSEncoder(ds_config, weights, biases)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_backward.py", line 101, in __init__\n self.layer = nn.ModuleList([\n ^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_backward.py", line 102, in <listcomp>\n copy.deepcopy(DeepSpeedTransformerLayer(config, weights, biases)) for _ in range(config.num_hidden_layers)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/transformer/transformer.py", line 363, in __init__\n transformer_cuda_module = TransformerBuilder().load()\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 529, in load\n return self.jit_load(verbose)\n ^^^^^^^^^^^^^^^^^^^^^^\n File 
"/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 577, in jit_load\n op_module = load(name=self.name,\n ^^^^^^^^^^^^^^^^^^^^\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1308, in load\n return _jit_compile(\n ^^^^^^^^^^^^^\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1710, in _jit_compile\n _write_ninja_file_and_build_library(\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1823, in _write_ninja_file_and_build_library\n _run_ninja_build(\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2116, in _run_ninja_build\n raise RuntimeError(message) from e\nRuntimeError: Error building extension \'transformer\'\n"""\n\n\x1b[33mThe above exception was the direct cause of the following exception:\x1b[0m\n\nitem = <Function test_backward[8-1600-128-25-3-True-True-0.05]>\n\n \x1b[37m@pytest\x1b[39;49;00m.hookimpl(tryfirst=\x1b[94mTrue\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[94mdef\x1b[39;49;00m \x1b[92mpytest_runtest_call\x1b[39;49;00m(item):\x1b[90m\x1b[39;49;00m\n \x1b[90m# We want to use our own launching function for distributed tests\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96mgetattr\x1b[39;49;00m(item.cls, \x1b[33m"\x1b[39;49;00m\x1b[33mis_dist_test\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m, \x1b[94mFalse\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n dist_test_class = item.cls()\x1b[90m\x1b[39;49;00m\n> dist_test_class(item._request)\x1b[90m\x1b[39;49;00m\n\n\x1b[1m\x1b[31mtests/conftest.py\x1b[0m:69: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:458: in __call__\n \x1b[96mself\x1b[39;49;00m._launch_procs(procs)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:268: in _launch_procs\n \x1b[96mself\x1b[39;49;00m._launch_daemonic_procs(num_procs)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:184: in _launch_daemonic_procs\n skip_msgs = skip_msgs_async.get(\x1b[96mself\x1b[39;49;00m.exec_timeout)\x1b[90m\x1b[39;49;00m\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\nself = <multiprocessing.pool.MapResult object at 0x14da6d1dd390>, timeout = 600\n\n \x1b[94mdef\x1b[39;49;00m \x1b[92mget\x1b[39;49;00m(\x1b[96mself\x1b[39;49;00m, timeout=\x1b[94mNone\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m.wait(timeout)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m.ready():\x1b[90m\x1b[39;49;00m\n \x1b[94mraise\x1b[39;49;00m \x1b[96mTimeoutError\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._success:\x1b[90m\x1b[39;49;00m\n \x1b[94mreturn\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._value\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n> \x1b[94mraise\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._value\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mE RuntimeError: Error 
building extension \'transformer\'\x1b[0m\n\n\x1b[1m\x1b[31m/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py\x1b[0m:774: RuntimeError\n-------------------------------------------------------------------------------------- Captured stdout call ---------------------------------------------------------------------------------------\n[2024-10-30 15:21:07,328] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)\nBetter speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\nBetter speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\n[2024-10-30 15:21:09,251] [INFO] [comm.py:637:init_distributed] cdb=None\n[2024-10-30 15:21:09,251] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl\n1.11.1\nDeepSpeed Transformer config is {\'layer_id\': 0, \'batch_size\': 8, \'hidden_size\': 1600, \'intermediate_size\': 1600, \'heads\': 25, \'attn_dropout_ratio\': 0.0, \'hidden_dropout_ratio\': 0.0, \'num_hidden_layers\': 3, \'initializer_range\': 0.02, \'fp16\': True, \'pre_layer_norm\': True, \'local_rank\': -1, \'seed\': -1, \'normalize_invertible\': False, \'gelu_checkpoint\': False, \'adjust_init_range\': True, \'test_gemm\': False, \'layer_norm_eps\': 1e-12, \'training\': True, \'is_grad_enabled\': True, \'attn_dropout_checkpoint\': False, \'stochastic_mode\': False, \'return_tuple\': False}\n[1/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/cublas_wrappers.cu -o cublas_wrappers.cuda.o \nFAILED: cublas_wrappers.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" 
-I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/cublas_wrappers.cu -o cublas_wrappers.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[2/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/normalize_kernels.cu -o normalize_kernels.cuda.o \nFAILED: normalize_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" 
-DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/normalize_kernels.cu -o normalize_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[3/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/softmax_kernels.cu -o softmax_kernels.cuda.o \nFAILED: softmax_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" 
-DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/softmax_kernels.cu -o softmax_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[4/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/dropout_kernels.cu -o dropout_kernels.cuda.o \nFAILED: dropout_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" 
-DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/dropout_kernels.cu -o dropout_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[5/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/transform_kernels.cu -o transform_kernels.cuda.o \nFAILED: transform_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" 
-DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/transform_kernels.cu -o transform_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[6/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/general_kernels.cu -o general_kernels.cuda.o \nFAILED: general_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" 
-DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/general_kernels.cu -o general_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[7/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/gelu_kernels.cu -o gelu_kernels.cuda.o \nFAILED: gelu_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" 
-DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/gelu_kernels.cu -o gelu_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\nninja: build stopped: subcommand failed.\n-------------------------------------------------------------------------------------- Captured stderr call ---------------------------------------------------------------------------------------\nUsing /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121 as PyTorch extensions root...\nDetected CUDA files, patching ldflags\nEmitting ninja build file /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/transformer/build.ninja...\nBuilding extension module transformer...\nAllowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\n\x1b[31m\x1b[1m___________________________________________________________________ TestCUDABackward.test_backward[8-160-128-2-3-True-True-0.1] ___________________________________________________________________\x1b[0m\nmultiprocessing.pool.RemoteTraceback: \n"""\nTraceback (most recent call last):\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2100, in _run_ninja_build\n subprocess.run(\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py", line 571, in run\n raise CalledProcessError(retcode, process.args,\nsubprocess.CalledProcessError: Command \'[\'ninja\', \'-v\']\' returned non-zero exit status 1.\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 125, in worker\n result = (True, func(*args, **kwds))\n ^^^^^^^^^^^^^^^^^^^\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 51, in starmapstar\n return list(itertools.starmap(args[0], args[1]))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 306, in _dist_run\n raise e\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 298, in _dist_run\n self.run(**self._fixture_kwargs)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 438, in run\n self._current_test(**fixture_kwargs)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_backward.py", line 282, in test_backward\n run_backward(ds_config, seq_len, atol=atol, verbose=True)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_backward.py", line 218, in run_backward\n bert_encoder, ds_encoder = create_models(ds_config)\n ^^^^^^^^^^^^^^^^^^^^^^^^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_backward.py", line 198, in create_models\n ds_encoder = DSEncoder(ds_config, weights, biases)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_backward.py", line 101, in __init__\n self.layer = nn.ModuleList([\n ^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_backward.py", line 102, in <listcomp>\n copy.deepcopy(DeepSpeedTransformerLayer(config, weights, biases)) for _ in range(config.num_hidden_layers)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/transformer/transformer.py", line 363, in __init__\n transformer_cuda_module = TransformerBuilder().load()\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 529, in load\n return self.jit_load(verbose)\n ^^^^^^^^^^^^^^^^^^^^^^\n File 
"/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 577, in jit_load\n op_module = load(name=self.name,\n ^^^^^^^^^^^^^^^^^^^^\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1308, in load\n return _jit_compile(\n ^^^^^^^^^^^^^\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1710, in _jit_compile\n _write_ninja_file_and_build_library(\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1823, in _write_ninja_file_and_build_library\n _run_ninja_build(\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2116, in _run_ninja_build\n raise RuntimeError(message) from e\nRuntimeError: Error building extension \'transformer\'\n"""\n\n\x1b[33mThe above exception was the direct cause of the following exception:\x1b[0m\n\nitem = <Function test_backward[8-160-128-2-3-True-True-0.1]>\n\n \x1b[37m@pytest\x1b[39;49;00m.hookimpl(tryfirst=\x1b[94mTrue\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[94mdef\x1b[39;49;00m \x1b[92mpytest_runtest_call\x1b[39;49;00m(item):\x1b[90m\x1b[39;49;00m\n \x1b[90m# We want to use our own launching function for distributed tests\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96mgetattr\x1b[39;49;00m(item.cls, \x1b[33m"\x1b[39;49;00m\x1b[33mis_dist_test\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m, \x1b[94mFalse\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n dist_test_class = item.cls()\x1b[90m\x1b[39;49;00m\n> dist_test_class(item._request)\x1b[90m\x1b[39;49;00m\n\n\x1b[1m\x1b[31mtests/conftest.py\x1b[0m:69: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:458: in __call__\n \x1b[96mself\x1b[39;49;00m._launch_procs(procs)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:268: in _launch_procs\n \x1b[96mself\x1b[39;49;00m._launch_daemonic_procs(num_procs)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:184: in _launch_daemonic_procs\n skip_msgs = skip_msgs_async.get(\x1b[96mself\x1b[39;49;00m.exec_timeout)\x1b[90m\x1b[39;49;00m\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\nself = <multiprocessing.pool.MapResult object at 0x14da6f524ed0>, timeout = 600\n\n \x1b[94mdef\x1b[39;49;00m \x1b[92mget\x1b[39;49;00m(\x1b[96mself\x1b[39;49;00m, timeout=\x1b[94mNone\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m.wait(timeout)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m.ready():\x1b[90m\x1b[39;49;00m\n \x1b[94mraise\x1b[39;49;00m \x1b[96mTimeoutError\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._success:\x1b[90m\x1b[39;49;00m\n \x1b[94mreturn\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._value\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n> \x1b[94mraise\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._value\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mE RuntimeError: Error building 
extension \'transformer\'\x1b[0m\n\n\x1b[1m\x1b[31m/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py\x1b[0m:774: RuntimeError\n-------------------------------------------------------------------------------------- Captured stdout call ---------------------------------------------------------------------------------------\n[2024-10-30 15:21:13,866] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)\nBetter speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\nBetter speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\n[2024-10-30 15:21:15,833] [INFO] [comm.py:637:init_distributed] cdb=None\n[2024-10-30 15:21:15,833] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl\n1.11.1\nDeepSpeed Transformer config is {\'layer_id\': 0, \'batch_size\': 8, \'hidden_size\': 160, \'intermediate_size\': 160, \'heads\': 2, \'attn_dropout_ratio\': 0.0, \'hidden_dropout_ratio\': 0.0, \'num_hidden_layers\': 3, \'initializer_range\': 0.02, \'fp16\': True, \'pre_layer_norm\': True, \'local_rank\': -1, \'seed\': -1, \'normalize_invertible\': False, \'gelu_checkpoint\': False, \'adjust_init_range\': True, \'test_gemm\': False, \'layer_norm_eps\': 1e-12, \'training\': True, \'is_grad_enabled\': True, \'attn_dropout_checkpoint\': False, \'stochastic_mode\': False, \'return_tuple\': False}\n[1/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/cublas_wrappers.cu -o cublas_wrappers.cuda.o \nFAILED: cublas_wrappers.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" 
-I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/cublas_wrappers.cu -o cublas_wrappers.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[2/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/transform_kernels.cu -o transform_kernels.cuda.o \nFAILED: transform_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" 
-DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/transform_kernels.cu -o transform_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[3/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/normalize_kernels.cu -o normalize_kernels.cuda.o \nFAILED: normalize_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" 
-DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/normalize_kernels.cu -o normalize_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[4/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/dropout_kernels.cu -o dropout_kernels.cuda.o \nFAILED: dropout_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" 
-DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/dropout_kernels.cu -o dropout_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[5/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/gelu_kernels.cu -o gelu_kernels.cuda.o \nFAILED: gelu_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" 
-DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/gelu_kernels.cu -o gelu_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[6/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/softmax_kernels.cu -o softmax_kernels.cuda.o \nFAILED: softmax_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" 
-DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/softmax_kernels.cu -o softmax_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[7/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/general_kernels.cu -o general_kernels.cuda.o \nFAILED: general_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" 
-DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/general_kernels.cu -o general_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\nninja: build stopped: subcommand failed.\n-------------------------------------------------------------------------------------- Captured stderr call ---------------------------------------------------------------------------------------\nUsing /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121 as PyTorch extensions root...\nDetected CUDA files, patching ldflags\nEmitting ninja build file /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/transformer/build.ninja...\nBuilding extension module transformer...\nAllowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\n\x1b[31m\x1b[1m__________________________________________________________________ TestCUDABackward.test_backward[8-1600-128-2-3-True-True-0.05] __________________________________________________________________\x1b[0m\nmultiprocessing.pool.RemoteTraceback: \n"""\nTraceback (most recent call last):\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2100, in _run_ninja_build\n subprocess.run(\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py", line 571, in run\n raise CalledProcessError(retcode, process.args,\nsubprocess.CalledProcessError: Command \'[\'ninja\', \'-v\']\' returned non-zero exit status 1.\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 125, in worker\n result = (True, func(*args, **kwds))\n ^^^^^^^^^^^^^^^^^^^\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 51, in starmapstar\n return list(itertools.starmap(args[0], args[1]))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 306, in _dist_run\n raise e\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 298, in _dist_run\n self.run(**self._fixture_kwargs)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 438, in run\n self._current_test(**fixture_kwargs)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_backward.py", line 282, in test_backward\n run_backward(ds_config, seq_len, atol=atol, verbose=True)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_backward.py", line 218, in run_backward\n bert_encoder, ds_encoder = create_models(ds_config)\n ^^^^^^^^^^^^^^^^^^^^^^^^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_backward.py", line 198, in create_models\n ds_encoder = DSEncoder(ds_config, weights, biases)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_backward.py", line 101, in __init__\n self.layer = nn.ModuleList([\n ^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_backward.py", line 102, in <listcomp>\n copy.deepcopy(DeepSpeedTransformerLayer(config, weights, biases)) for _ in range(config.num_hidden_layers)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/transformer/transformer.py", line 363, in __init__\n transformer_cuda_module = TransformerBuilder().load()\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 529, in load\n return self.jit_load(verbose)\n ^^^^^^^^^^^^^^^^^^^^^^\n File 
"/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 577, in jit_load\n op_module = load(name=self.name,\n ^^^^^^^^^^^^^^^^^^^^\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1308, in load\n return _jit_compile(\n ^^^^^^^^^^^^^\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1710, in _jit_compile\n _write_ninja_file_and_build_library(\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1823, in _write_ninja_file_and_build_library\n _run_ninja_build(\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2116, in _run_ninja_build\n raise RuntimeError(message) from e\nRuntimeError: Error building extension \'transformer\'\n"""\n\n\x1b[33mThe above exception was the direct cause of the following exception:\x1b[0m\n\nitem = <Function test_backward[8-1600-128-2-3-True-True-0.05]>\n\n \x1b[37m@pytest\x1b[39;49;00m.hookimpl(tryfirst=\x1b[94mTrue\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[94mdef\x1b[39;49;00m \x1b[92mpytest_runtest_call\x1b[39;49;00m(item):\x1b[90m\x1b[39;49;00m\n \x1b[90m# We want to use our own launching function for distributed tests\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96mgetattr\x1b[39;49;00m(item.cls, \x1b[33m"\x1b[39;49;00m\x1b[33mis_dist_test\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m, \x1b[94mFalse\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n dist_test_class = item.cls()\x1b[90m\x1b[39;49;00m\n> dist_test_class(item._request)\x1b[90m\x1b[39;49;00m\n\n\x1b[1m\x1b[31mtests/conftest.py\x1b[0m:69: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:458: in __call__\n \x1b[96mself\x1b[39;49;00m._launch_procs(procs)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:268: in _launch_procs\n \x1b[96mself\x1b[39;49;00m._launch_daemonic_procs(num_procs)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:184: in _launch_daemonic_procs\n skip_msgs = skip_msgs_async.get(\x1b[96mself\x1b[39;49;00m.exec_timeout)\x1b[90m\x1b[39;49;00m\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\nself = <multiprocessing.pool.MapResult object at 0x14da6d381390>, timeout = 600\n\n \x1b[94mdef\x1b[39;49;00m \x1b[92mget\x1b[39;49;00m(\x1b[96mself\x1b[39;49;00m, timeout=\x1b[94mNone\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m.wait(timeout)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m.ready():\x1b[90m\x1b[39;49;00m\n \x1b[94mraise\x1b[39;49;00m \x1b[96mTimeoutError\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._success:\x1b[90m\x1b[39;49;00m\n \x1b[94mreturn\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._value\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n> \x1b[94mraise\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._value\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mE RuntimeError: Error building 
extension \'transformer\'\x1b[0m\n\n\x1b[1m\x1b[31m/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py\x1b[0m:774: RuntimeError\n-------------------------------------------------------------------------------------- Captured stdout call ---------------------------------------------------------------------------------------\n[2024-10-30 15:21:19,561] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)\nBetter speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\nBetter speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\n[2024-10-30 15:21:21,503] [INFO] [comm.py:637:init_distributed] cdb=None\n[2024-10-30 15:21:21,504] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl\n1.11.1\nDeepSpeed Transformer config is {\'layer_id\': 0, \'batch_size\': 8, \'hidden_size\': 1600, \'intermediate_size\': 1600, \'heads\': 2, \'attn_dropout_ratio\': 0.0, \'hidden_dropout_ratio\': 0.0, \'num_hidden_layers\': 3, \'initializer_range\': 0.02, \'fp16\': True, \'pre_layer_norm\': True, \'local_rank\': -1, \'seed\': -1, \'normalize_invertible\': False, \'gelu_checkpoint\': False, \'adjust_init_range\': True, \'test_gemm\': False, \'layer_norm_eps\': 1e-12, \'training\': True, \'is_grad_enabled\': True, \'attn_dropout_checkpoint\': False, \'stochastic_mode\': False, \'return_tuple\': False}\n[1/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/transform_kernels.cu -o transform_kernels.cuda.o \nFAILED: transform_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" 
-I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/transform_kernels.cu -o transform_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[2/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/cublas_wrappers.cu -o cublas_wrappers.cuda.o \nFAILED: cublas_wrappers.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" 
-I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/cublas_wrappers.cu -o cublas_wrappers.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[3/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/gelu_kernels.cu -o gelu_kernels.cuda.o \nFAILED: gelu_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" 
-I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/gelu_kernels.cu -o gelu_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[4/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/dropout_kernels.cu -o dropout_kernels.cuda.o \nFAILED: dropout_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" 
-I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/dropout_kernels.cu -o dropout_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[5/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/normalize_kernels.cu -o normalize_kernels.cuda.o \nFAILED: normalize_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" 
-DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/normalize_kernels.cu -o normalize_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[6/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/softmax_kernels.cu -o softmax_kernels.cuda.o \nFAILED: softmax_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" 
-DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/softmax_kernels.cu -o softmax_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[7/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/general_kernels.cu -o general_kernels.cuda.o \nFAILED: general_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" 
-DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/general_kernels.cu -o general_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\nninja: build stopped: subcommand failed.\n-------------------------------------------------------------------------------------- Captured stderr call ---------------------------------------------------------------------------------------\nUsing /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121 as PyTorch extensions root...\nDetected CUDA files, patching ldflags\nEmitting ninja build file /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/transformer/build.ninja...\nBuilding extension module transformer...\nAllowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\n\x1b[31m\x1b[1m_____________________________________________________ TestCUDAForwardSmallBatchSize.test_forward_with_small_bsz[8-3-1024-512-16-3-True-False] _____________________________________________________\x1b[0m\nmultiprocessing.pool.RemoteTraceback: \n"""\nTraceback (most recent call last):\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2100, in _run_ninja_build\n subprocess.run(\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py", line 571, in run\n raise CalledProcessError(retcode, process.args,\nsubprocess.CalledProcessError: Command \'[\'ninja\', \'-v\']\' returned non-zero exit status 1.\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 125, in worker\n result = (True, func(*args, **kwds))\n ^^^^^^^^^^^^^^^^^^^\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 51, in starmapstar\n return list(itertools.starmap(args[0], args[1]))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 306, in _dist_run\n raise e\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 298, in _dist_run\n self.run(**self._fixture_kwargs)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 438, in run\n self._current_test(**fixture_kwargs)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_forward.py", line 286, in test_forward_with_small_bsz\n run_forward(ds_config, seq_len, atol=3e-2, test_bsz=small_bsz)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_forward.py", line 167, in run_forward\n bert_encoder, ds_encoder = create_models(ds_config)\n ^^^^^^^^^^^^^^^^^^^^^^^^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_forward.py", line 147, in create_models\n ds_encoder = DSEncoder(ds_config, weights, biases)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_forward.py", line 52, in __init__\n self.layer = nn.ModuleList([\n ^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_forward.py", line 53, in <listcomp>\n copy.deepcopy(DeepSpeedTransformerLayer(config, weights, biases)) for _ in range(config.num_hidden_layers)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/transformer/transformer.py", line 363, in __init__\n transformer_cuda_module = TransformerBuilder().load()\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 529, in load\n return self.jit_load(verbose)\n ^^^^^^^^^^^^^^^^^^^^^^\n File 
"/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 577, in jit_load\n op_module = load(name=self.name,\n ^^^^^^^^^^^^^^^^^^^^\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1308, in load\n return _jit_compile(\n ^^^^^^^^^^^^^\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1710, in _jit_compile\n _write_ninja_file_and_build_library(\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1823, in _write_ninja_file_and_build_library\n _run_ninja_build(\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2116, in _run_ninja_build\n raise RuntimeError(message) from e\nRuntimeError: Error building extension \'transformer\'\n"""\n\n\x1b[33mThe above exception was the direct cause of the following exception:\x1b[0m\n\nitem = <Function test_forward_with_small_bsz[8-3-1024-512-16-3-True-False]>\n\n \x1b[37m@pytest\x1b[39;49;00m.hookimpl(tryfirst=\x1b[94mTrue\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[94mdef\x1b[39;49;00m \x1b[92mpytest_runtest_call\x1b[39;49;00m(item):\x1b[90m\x1b[39;49;00m\n \x1b[90m# We want to use our own launching function for distributed tests\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96mgetattr\x1b[39;49;00m(item.cls, \x1b[33m"\x1b[39;49;00m\x1b[33mis_dist_test\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m, \x1b[94mFalse\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n dist_test_class = item.cls()\x1b[90m\x1b[39;49;00m\n> dist_test_class(item._request)\x1b[90m\x1b[39;49;00m\n\n\x1b[1m\x1b[31mtests/conftest.py\x1b[0m:69: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:458: in __call__\n \x1b[96mself\x1b[39;49;00m._launch_procs(procs)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:268: in _launch_procs\n \x1b[96mself\x1b[39;49;00m._launch_daemonic_procs(num_procs)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:184: in _launch_daemonic_procs\n skip_msgs = skip_msgs_async.get(\x1b[96mself\x1b[39;49;00m.exec_timeout)\x1b[90m\x1b[39;49;00m\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\nself = <multiprocessing.pool.MapResult object at 0x14da6d0c2810>, timeout = 600\n\n \x1b[94mdef\x1b[39;49;00m \x1b[92mget\x1b[39;49;00m(\x1b[96mself\x1b[39;49;00m, timeout=\x1b[94mNone\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m.wait(timeout)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m.ready():\x1b[90m\x1b[39;49;00m\n \x1b[94mraise\x1b[39;49;00m \x1b[96mTimeoutError\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._success:\x1b[90m\x1b[39;49;00m\n \x1b[94mreturn\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._value\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n> \x1b[94mraise\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._value\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mE RuntimeError: 
Error building extension \'transformer\'\x1b[0m\n\n\x1b[1m\x1b[31m/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py\x1b[0m:774: RuntimeError\n-------------------------------------------------------------------------------------- Captured stdout call ---------------------------------------------------------------------------------------\n[2024-10-30 15:21:26,074] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)\nBetter speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\nBetter speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\n[2024-10-30 15:21:28,037] [INFO] [comm.py:637:init_distributed] cdb=None\n[2024-10-30 15:21:28,037] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl\n1.11.1\nDeepSpeed Transformer config is {\'layer_id\': 0, \'batch_size\': 8, \'hidden_size\': 1024, \'intermediate_size\': 4096, \'heads\': 16, \'attn_dropout_ratio\': 0.0, \'hidden_dropout_ratio\': 0.0, \'num_hidden_layers\': 3, \'initializer_range\': 0.02, \'fp16\': False, \'pre_layer_norm\': True, \'local_rank\': -1, \'seed\': -1, \'normalize_invertible\': False, \'gelu_checkpoint\': False, \'adjust_init_range\': True, \'test_gemm\': False, \'layer_norm_eps\': 1e-12, \'training\': True, \'is_grad_enabled\': True, \'attn_dropout_checkpoint\': False, \'stochastic_mode\': False, \'return_tuple\': False}\n[1/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/cublas_wrappers.cu -o cublas_wrappers.cuda.o \nFAILED: cublas_wrappers.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" 
-I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/cublas_wrappers.cu -o cublas_wrappers.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[2/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/gelu_kernels.cu -o gelu_kernels.cuda.o \nFAILED: gelu_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" 
-I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/gelu_kernels.cu -o gelu_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[3/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/dropout_kernels.cu -o dropout_kernels.cuda.o \nFAILED: dropout_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" 
-I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/dropout_kernels.cu -o dropout_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[4/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/normalize_kernels.cu -o normalize_kernels.cuda.o \nFAILED: normalize_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" 
-DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/normalize_kernels.cu -o normalize_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[5/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/transform_kernels.cu -o transform_kernels.cuda.o \nFAILED: transform_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" 
-DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/transform_kernels.cu -o transform_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[6/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/general_kernels.cu -o general_kernels.cuda.o \nFAILED: general_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" 
-DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/general_kernels.cu -o general_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[7/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/softmax_kernels.cu -o softmax_kernels.cuda.o \nFAILED: softmax_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" 
-DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/softmax_kernels.cu -o softmax_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\nninja: build stopped: subcommand failed.\n-------------------------------------------------------------------------------------- Captured stderr call ---------------------------------------------------------------------------------------\nUsing /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121 as PyTorch extensions root...\nDetected CUDA files, patching ldflags\nEmitting ninja build file /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/transformer/build.ninja...\nBuilding extension module transformer...\nAllowing ninja to set a default number of workers... 
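Every compile step in the captured stdout above fails identically: besides the properly wrapped --compiler-options '-fPIC', a bare -fPIC ends up on the nvcc command line, and nvcc aborts with "nvcc fatal : Unknown option '-fPIC'" because -fPIC is a host-compiler (gcc) flag, not an nvcc flag. Where the stray flag is injected from is not visible in this log. The sketch below only illustrates that distinction through torch.utils.cpp_extension.load(), the same entry point the DeepSpeed op builder ends up in; the extension names, the dummy.cu source and the flag lists are made up for illustration and are not part of the failing build.

    # Minimal sketch (assumptions: a CUDA toolchain and a CUDA-enabled PyTorch are
    # available; "demo_ok"/"demo_bad" and dummy.cu are illustrative only).
    import pathlib
    from torch.utils.cpp_extension import load

    # An empty extension module: we only care about whether nvcc accepts the flags.
    pathlib.Path("dummy.cu").write_text(
        "#include <torch/extension.h>\n"
        "PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {}\n"
    )

    # Host-compiler flags have to be forwarded to gcc, e.g. via -Xcompiler:
    load(name="demo_ok", sources=["dummy.cu"],
         extra_cuda_cflags=["-Xcompiler", "-fPIC"], verbose=True)

    # Handing the flag to nvcc bare reproduces the failure seen in this log:
    #   nvcc fatal : Unknown option '-fPIC'
    try:
        load(name="demo_bad", sources=["dummy.cu"],
             extra_cuda_cflags=["-fPIC"], verbose=True)
    except RuntimeError as err:
        print(err)  # RuntimeError: Error building extension 'demo_bad'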
______________________________ TestCUDAForwardSmallBatchSize.test_forward_with_small_bsz[8-7-1024-512-16-3-True-True] ______________________________
multiprocessing.pool.RemoteTraceback:
"""
Traceback (most recent call last):
  File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2100, in _run_ninja_build
    subprocess.run(
  File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py", line 571, in run
    raise CalledProcessError(retcode, process.args,
subprocess.CalledProcessError: Command '['ninja', '-v']' returned non-zero exit status 1.

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 125, in worker
    result = (True, func(*args, **kwds))
                    ^^^^^^^^^^^^^^^^^^^
  File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 51, in starmapstar
    return list(itertools.starmap(args[0], args[1]))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 306, in _dist_run
    raise e
  File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 298, in _dist_run
    self.run(**self._fixture_kwargs)
  File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 438, in run
    self._current_test(**fixture_kwargs)
  File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_forward.py", line 286, in test_forward_with_small_bsz
    run_forward(ds_config, seq_len, atol=3e-2, test_bsz=small_bsz)
  File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_forward.py", line 167, in run_forward
    bert_encoder, ds_encoder = create_models(ds_config)
                               ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_forward.py", line 147, in create_models
    ds_encoder = DSEncoder(ds_config, weights, biases)
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_forward.py", line 52, in __init__
    self.layer = nn.ModuleList([
                 ^
  File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_forward.py", line 53, in <listcomp>
    copy.deepcopy(DeepSpeedTransformerLayer(config, weights, biases)) for _ in range(config.num_hidden_layers)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/transformer/transformer.py", line 363, in __init__
    transformer_cuda_module = TransformerBuilder().load()
                              ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 529, in load
    return self.jit_load(verbose)
           ^^^^^^^^^^^^^^^^^^^^^^
  File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 577, in jit_load
    op_module = load(name=self.name,
                ^^^^^^^^^^^^^^^^^^^^
  File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1308, in load
    return _jit_compile(
           ^^^^^^^^^^^^^
  File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1710, in _jit_compile
    _write_ninja_file_and_build_library(
  File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1823, in _write_ninja_file_and_build_library
    _run_ninja_build(
  File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2116, in _run_ninja_build
    raise RuntimeError(message) from e
RuntimeError: Error building extension 'transformer'
"""

The above exception was the direct cause of the following exception:

item = <Function test_forward_with_small_bsz[8-7-1024-512-16-3-True-True]>

    @pytest.hookimpl(tryfirst=True)
    def pytest_runtest_call(item):
        # We want to use our own launching function for distributed tests
        if getattr(item.cls, "is_dist_test", False):
            dist_test_class = item.cls()
>           dist_test_class(item._request)

tests/conftest.py:69:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
tests/unit/common.py:458: in __call__
    self._launch_procs(procs)
tests/unit/common.py:268: in _launch_procs
    self._launch_daemonic_procs(num_procs)
tests/unit/common.py:184: in _launch_daemonic_procs
    skip_msgs = skip_msgs_async.get(self.exec_timeout)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <multiprocessing.pool.MapResult object at 0x14da6d1fce50>, timeout = 600

    def get(self, timeout=None):
        self.wait(timeout)
        if not self.ready():
            raise TimeoutError
        if self._success:
            return self._value
        else:
>           raise self._value
E   RuntimeError: Error building extension 'transformer'

/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py:774: RuntimeError
---------------------------------------- Captured stdout call ----------------------------------------
[2024-10-30 15:21:31,966] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
[2024-10-30 15:21:34,021] [INFO] [comm.py:637:init_distributed] cdb=None
[2024-10-30 15:21:34,021] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
1.11.1
DeepSpeed Transformer config is {'layer_id': 0, 'batch_size': 8, 'hidden_size': 1024, 'intermediate_size': 4096, 'heads': 16, 'attn_dropout_ratio': 0.0, 'hidden_dropout_ratio': 0.0, 'num_hidden_layers': 3, 'initializer_range': 0.02, 'fp16': True, 'pre_layer_norm': True, 'local_rank': -1, 'seed': -1, 'normalize_invertible': False, 'gelu_checkpoint': False, 'adjust_init_range': True, 'test_gemm': False, 'layer_norm_eps': 1e-12, 'training': True, 'is_grad_enabled': True, 'attn_dropout_checkpoint': False, 'stochastic_mode': False, 'return_tuple': False}
[1/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc [same flags as in the previous test] -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/cublas_wrappers.cu -o cublas_wrappers.cuda.o
FAILED: cublas_wrappers.cuda.o
nvcc fatal : Unknown option '-fPIC'
[2/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc [same flags as in the previous test] -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/transform_kernels.cu -o transform_kernels.cuda.o
FAILED: transform_kernels.cuda.o
nvcc fatal : Unknown option '-fPIC'
[3/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc [same flags as in the previous test] -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/gelu_kernels.cu -o gelu_kernels.cuda.o
FAILED: gelu_kernels.cuda.o
nvcc fatal : Unknown option '-fPIC'
[4/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc [same flags as in the previous test] -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/dropout_kernels.cu -o dropout_kernels.cuda.o
FAILED: dropout_kernels.cuda.o
nvcc fatal : Unknown option '-fPIC'
[5/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc [same flags as in the previous test] -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/softmax_kernels.cu -o softmax_kernels.cuda.o
FAILED: softmax_kernels.cuda.o
nvcc fatal : Unknown option '-fPIC'
[6/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc [same flags as in the previous test] -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/normalize_kernels.cu -o normalize_kernels.cuda.o
FAILED: normalize_kernels.cuda.o
nvcc fatal : Unknown option '-fPIC'
[7/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc [same flags as in the previous test] -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/general_kernels.cu -o general_kernels.cuda.o
FAILED: general_kernels.cuda.o
nvcc fatal : Unknown option '-fPIC'
ninja: build stopped: subcommand failed.
---------------------------------------- Captured stderr call ----------------------------------------
Using /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/transformer/build.ninja...
Building extension module transformer...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
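The traceback repeated for each parametrisation shows the exact call that triggers the JIT build: DeepSpeedTransformerLayer.__init__ calls TransformerBuilder().load(), which hands the op sources to torch.utils.cpp_extension.load() and from there to ninja/nvcc. When chasing a failure like this, it can be easier to reproduce that single step outside the pytest harness. A minimal sketch, assuming TransformerBuilder is importable from deepspeed.ops.op_builder as in the package layout shown above, and using an illustrative TORCH_EXTENSIONS_DIR so that the emitted build.ninja and the exact nvcc command lines can be inspected afterwards:

    # Reproduce only the JIT compilation of the 'transformer' op (debugging aid,
    # not part of the EasyBuild procedure; the build directory is illustrative).
    import os

    # torch.utils.cpp_extension honours TORCH_EXTENSIONS_DIR as its build root.
    os.environ.setdefault("TORCH_EXTENSIONS_DIR", "/tmp/torch_extensions_debug")

    from deepspeed.ops.op_builder import TransformerBuilder

    # Same call the tests reach via DeepSpeedTransformerLayer.__init__; with the
    # stray -fPIC on the nvcc command line it raises
    #   RuntimeError: Error building extension 'transformer'
    module = TransformerBuilder().load(verbose=True)
    print(module)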
(overridable by setting the environment variable MAX_JOBS=N)\n\x1b[31m\x1b[1m____________________________________________________ TestCUDAForwardSmallBatchSize.test_forward_with_small_bsz[8-3-1024-512-16-3-False-False] _____________________________________________________\x1b[0m\nmultiprocessing.pool.RemoteTraceback: \n"""\nTraceback (most recent call last):\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2100, in _run_ninja_build\n subprocess.run(\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py", line 571, in run\n raise CalledProcessError(retcode, process.args,\nsubprocess.CalledProcessError: Command \'[\'ninja\', \'-v\']\' returned non-zero exit status 1.\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 125, in worker\n result = (True, func(*args, **kwds))\n ^^^^^^^^^^^^^^^^^^^\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 51, in starmapstar\n return list(itertools.starmap(args[0], args[1]))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 306, in _dist_run\n raise e\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 298, in _dist_run\n self.run(**self._fixture_kwargs)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 438, in run\n self._current_test(**fixture_kwargs)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_forward.py", line 286, in test_forward_with_small_bsz\n run_forward(ds_config, seq_len, atol=3e-2, test_bsz=small_bsz)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_forward.py", line 167, in run_forward\n bert_encoder, ds_encoder = create_models(ds_config)\n ^^^^^^^^^^^^^^^^^^^^^^^^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_forward.py", line 147, in create_models\n ds_encoder = DSEncoder(ds_config, weights, biases)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_forward.py", line 52, in __init__\n self.layer = nn.ModuleList([\n ^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_forward.py", line 53, in <listcomp>\n copy.deepcopy(DeepSpeedTransformerLayer(config, weights, biases)) for _ in range(config.num_hidden_layers)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/transformer/transformer.py", line 363, in __init__\n transformer_cuda_module = TransformerBuilder().load()\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 529, in load\n return self.jit_load(verbose)\n ^^^^^^^^^^^^^^^^^^^^^^\n File 
"/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 577, in jit_load\n op_module = load(name=self.name,\n ^^^^^^^^^^^^^^^^^^^^\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1308, in load\n return _jit_compile(\n ^^^^^^^^^^^^^\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1710, in _jit_compile\n _write_ninja_file_and_build_library(\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1823, in _write_ninja_file_and_build_library\n _run_ninja_build(\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2116, in _run_ninja_build\n raise RuntimeError(message) from e\nRuntimeError: Error building extension \'transformer\'\n"""\n\n\x1b[33mThe above exception was the direct cause of the following exception:\x1b[0m\n\nitem = <Function test_forward_with_small_bsz[8-3-1024-512-16-3-False-False]>\n\n \x1b[37m@pytest\x1b[39;49;00m.hookimpl(tryfirst=\x1b[94mTrue\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[94mdef\x1b[39;49;00m \x1b[92mpytest_runtest_call\x1b[39;49;00m(item):\x1b[90m\x1b[39;49;00m\n \x1b[90m# We want to use our own launching function for distributed tests\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96mgetattr\x1b[39;49;00m(item.cls, \x1b[33m"\x1b[39;49;00m\x1b[33mis_dist_test\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m, \x1b[94mFalse\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n dist_test_class = item.cls()\x1b[90m\x1b[39;49;00m\n> dist_test_class(item._request)\x1b[90m\x1b[39;49;00m\n\n\x1b[1m\x1b[31mtests/conftest.py\x1b[0m:69: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:458: in __call__\n \x1b[96mself\x1b[39;49;00m._launch_procs(procs)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:268: in _launch_procs\n \x1b[96mself\x1b[39;49;00m._launch_daemonic_procs(num_procs)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:184: in _launch_daemonic_procs\n skip_msgs = skip_msgs_async.get(\x1b[96mself\x1b[39;49;00m.exec_timeout)\x1b[90m\x1b[39;49;00m\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\nself = <multiprocessing.pool.MapResult object at 0x14da6d22d5d0>, timeout = 600\n\n \x1b[94mdef\x1b[39;49;00m \x1b[92mget\x1b[39;49;00m(\x1b[96mself\x1b[39;49;00m, timeout=\x1b[94mNone\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m.wait(timeout)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m.ready():\x1b[90m\x1b[39;49;00m\n \x1b[94mraise\x1b[39;49;00m \x1b[96mTimeoutError\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._success:\x1b[90m\x1b[39;49;00m\n \x1b[94mreturn\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._value\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n> \x1b[94mraise\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._value\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mE RuntimeError: 
Error building extension \'transformer\'\x1b[0m\n\n\x1b[1m\x1b[31m/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py\x1b[0m:774: RuntimeError\n-------------------------------------------------------------------------------------- Captured stdout call ---------------------------------------------------------------------------------------\n[2024-10-30 15:21:37,863] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)\nBetter speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\nBetter speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\n[2024-10-30 15:21:39,789] [INFO] [comm.py:637:init_distributed] cdb=None\n[2024-10-30 15:21:39,789] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl\n1.11.1\nDeepSpeed Transformer config is {\'layer_id\': 0, \'batch_size\': 8, \'hidden_size\': 1024, \'intermediate_size\': 4096, \'heads\': 16, \'attn_dropout_ratio\': 0.0, \'hidden_dropout_ratio\': 0.0, \'num_hidden_layers\': 3, \'initializer_range\': 0.02, \'fp16\': False, \'pre_layer_norm\': False, \'local_rank\': -1, \'seed\': -1, \'normalize_invertible\': False, \'gelu_checkpoint\': False, \'adjust_init_range\': True, \'test_gemm\': False, \'layer_norm_eps\': 1e-12, \'training\': True, \'is_grad_enabled\': True, \'attn_dropout_checkpoint\': False, \'stochastic_mode\': False, \'return_tuple\': False}\n[1/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/cublas_wrappers.cu -o cublas_wrappers.cuda.o \nFAILED: cublas_wrappers.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" 
-I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/cublas_wrappers.cu -o cublas_wrappers.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[2/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/transform_kernels.cu -o transform_kernels.cuda.o \nFAILED: transform_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" 
-DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/transform_kernels.cu -o transform_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[3/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/gelu_kernels.cu -o gelu_kernels.cuda.o \nFAILED: gelu_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" 
-DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/gelu_kernels.cu -o gelu_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[4/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/normalize_kernels.cu -o normalize_kernels.cuda.o \nFAILED: normalize_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" 
-DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/normalize_kernels.cu -o normalize_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[5/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/dropout_kernels.cu -o dropout_kernels.cuda.o \nFAILED: dropout_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" 
-DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/dropout_kernels.cu -o dropout_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[6/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/general_kernels.cu -o general_kernels.cuda.o \nFAILED: general_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" 
-DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/general_kernels.cu -o general_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[7/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/softmax_kernels.cu -o softmax_kernels.cuda.o \nFAILED: softmax_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" 
-DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/softmax_kernels.cu -o softmax_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\nninja: build stopped: subcommand failed.\n-------------------------------------------------------------------------------------- Captured stderr call ---------------------------------------------------------------------------------------\nUsing /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121 as PyTorch extensions root...\nDetected CUDA files, patching ldflags\nEmitting ninja build file /dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/xdg-cache-home/torch_extensions/py311_cu121/transformer/build.ninja...\nBuilding extension module transformer...\nAllowing ninja to set a default number of workers... 
(overridable by setting the environment variable MAX_JOBS=N)\n\x1b[31m\x1b[1m_____________________________________________________ TestCUDAForwardSmallBatchSize.test_forward_with_small_bsz[8-7-1024-512-16-3-False-True] _____________________________________________________\x1b[0m\nmultiprocessing.pool.RemoteTraceback: \n"""\nTraceback (most recent call last):\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2100, in _run_ninja_build\n subprocess.run(\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/subprocess.py", line 571, in run\n raise CalledProcessError(retcode, process.args,\nsubprocess.CalledProcessError: Command \'[\'ninja\', \'-v\']\' returned non-zero exit status 1.\n\nThe above exception was the direct cause of the following exception:\n\nTraceback (most recent call last):\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 125, in worker\n result = (True, func(*args, **kwds))\n ^^^^^^^^^^^^^^^^^^^\n File "/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py", line 51, in starmapstar\n return list(itertools.starmap(args[0], args[1]))\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 306, in _dist_run\n raise e\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 298, in _dist_run\n self.run(**self._fixture_kwargs)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/common.py", line 438, in run\n self._current_test(**fixture_kwargs)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_forward.py", line 286, in test_forward_with_small_bsz\n run_forward(ds_config, seq_len, atol=3e-2, test_bsz=small_bsz)\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_forward.py", line 167, in run_forward\n bert_encoder, ds_encoder = create_models(ds_config)\n ^^^^^^^^^^^^^^^^^^^^^^^^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_forward.py", line 147, in create_models\n ds_encoder = DSEncoder(ds_config, weights, biases)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_forward.py", line 52, in __init__\n self.layer = nn.ModuleList([\n ^\n File "/dev/shm/DeepSpeed/0.14.5/foss-2023a-CUDA-12.1.1/DeepSpeed/DeepSpeed-0.14.5/tests/unit/ops/accelerators/test_accelerator_forward.py", line 53, in <listcomp>\n copy.deepcopy(DeepSpeedTransformerLayer(config, weights, biases)) for _ in range(config.num_hidden_layers)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/transformer/transformer.py", line 363, in __init__\n transformer_cuda_module = TransformerBuilder().load()\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File "/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 529, in load\n return self.jit_load(verbose)\n ^^^^^^^^^^^^^^^^^^^^^^\n File 
"/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/op_builder/builder.py", line 577, in jit_load\n op_module = load(name=self.name,\n ^^^^^^^^^^^^^^^^^^^^\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1308, in load\n return _jit_compile(\n ^^^^^^^^^^^^^\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1710, in _jit_compile\n _write_ninja_file_and_build_library(\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 1823, in _write_ninja_file_and_build_library\n _run_ninja_build(\n File "/apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/utils/cpp_extension.py", line 2116, in _run_ninja_build\n raise RuntimeError(message) from e\nRuntimeError: Error building extension \'transformer\'\n"""\n\n\x1b[33mThe above exception was the direct cause of the following exception:\x1b[0m\n\nitem = <Function test_forward_with_small_bsz[8-7-1024-512-16-3-False-True]>\n\n \x1b[37m@pytest\x1b[39;49;00m.hookimpl(tryfirst=\x1b[94mTrue\x1b[39;49;00m)\x1b[90m\x1b[39;49;00m\n \x1b[94mdef\x1b[39;49;00m \x1b[92mpytest_runtest_call\x1b[39;49;00m(item):\x1b[90m\x1b[39;49;00m\n \x1b[90m# We want to use our own launching function for distributed tests\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96mgetattr\x1b[39;49;00m(item.cls, \x1b[33m"\x1b[39;49;00m\x1b[33mis_dist_test\x1b[39;49;00m\x1b[33m"\x1b[39;49;00m, \x1b[94mFalse\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n dist_test_class = item.cls()\x1b[90m\x1b[39;49;00m\n> dist_test_class(item._request)\x1b[90m\x1b[39;49;00m\n\n\x1b[1m\x1b[31mtests/conftest.py\x1b[0m:69: \n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:458: in __call__\n \x1b[96mself\x1b[39;49;00m._launch_procs(procs)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:268: in _launch_procs\n \x1b[96mself\x1b[39;49;00m._launch_daemonic_procs(num_procs)\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mtests/unit/common.py\x1b[0m:184: in _launch_daemonic_procs\n skip_msgs = skip_msgs_async.get(\x1b[96mself\x1b[39;49;00m.exec_timeout)\x1b[90m\x1b[39;49;00m\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _\n\nself = <multiprocessing.pool.MapResult object at 0x14da6c933650>, timeout = 600\n\n \x1b[94mdef\x1b[39;49;00m \x1b[92mget\x1b[39;49;00m(\x1b[96mself\x1b[39;49;00m, timeout=\x1b[94mNone\x1b[39;49;00m):\x1b[90m\x1b[39;49;00m\n \x1b[96mself\x1b[39;49;00m.wait(timeout)\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[95mnot\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m.ready():\x1b[90m\x1b[39;49;00m\n \x1b[94mraise\x1b[39;49;00m \x1b[96mTimeoutError\x1b[39;49;00m\x1b[90m\x1b[39;49;00m\n \x1b[94mif\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._success:\x1b[90m\x1b[39;49;00m\n \x1b[94mreturn\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._value\x1b[90m\x1b[39;49;00m\n \x1b[94melse\x1b[39;49;00m:\x1b[90m\x1b[39;49;00m\n> \x1b[94mraise\x1b[39;49;00m \x1b[96mself\x1b[39;49;00m._value\x1b[90m\x1b[39;49;00m\n\x1b[1m\x1b[31mE RuntimeError: 
Error building extension \'transformer\'\x1b[0m\n\n\x1b[1m\x1b[31m/apps/Test/software/Python/3.11.3-GCCcore-12.3.0/lib/python3.11/multiprocessing/pool.py\x1b[0m:774: RuntimeError\n-------------------------------------------------------------------------------------- Captured stdout call ---------------------------------------------------------------------------------------\n[2024-10-30 15:21:43,518] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)\nBetter speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\nBetter speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\n[2024-10-30 15:21:45,484] [INFO] [comm.py:637:init_distributed] cdb=None\n[2024-10-30 15:21:45,485] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl\n1.11.1\nDeepSpeed Transformer config is {\'layer_id\': 0, \'batch_size\': 8, \'hidden_size\': 1024, \'intermediate_size\': 4096, \'heads\': 16, \'attn_dropout_ratio\': 0.0, \'hidden_dropout_ratio\': 0.0, \'num_hidden_layers\': 3, \'initializer_range\': 0.02, \'fp16\': True, \'pre_layer_norm\': False, \'local_rank\': -1, \'seed\': -1, \'normalize_invertible\': False, \'gelu_checkpoint\': False, \'adjust_init_range\': True, \'test_gemm\': False, \'layer_norm_eps\': 1e-12, \'training\': True, \'is_grad_enabled\': True, \'attn_dropout_checkpoint\': False, \'stochastic_mode\': False, \'return_tuple\': False}\n[1/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/transform_kernels.cu -o transform_kernels.cuda.o \nFAILED: transform_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" 
-I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/transform_kernels.cu -o transform_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[2/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/cublas_wrappers.cu -o cublas_wrappers.cuda.o \nFAILED: cublas_wrappers.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" 
-I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/cublas_wrappers.cu -o cublas_wrappers.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[3/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/gelu_kernels.cu -o gelu_kernels.cuda.o \nFAILED: gelu_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" 
-I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/gelu_kernels.cu -o gelu_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[4/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/normalize_kernels.cu -o normalize_kernels.cuda.o \nFAILED: normalize_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" 
-I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/normalize_kernels.cu -o normalize_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[5/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/dropout_kernels.cu -o dropout_kernels.cuda.o \nFAILED: dropout_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" 
-I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/dropout_kernels.cu -o dropout_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[6/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/softmax_kernels.cu -o softmax_kernels.cuda.o \nFAILED: softmax_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" 
-I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/softmax_kernels.cu -o softmax_kernels.cuda.o \nnvcc fatal : Unknown option \'-fPIC\'\n[7/8] /apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" -I/cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/includes -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/torch/csrc/api/include -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/TH -isystem /apps/Test/software/PyTorch/2.1.2-foss-2023a-CUDA-12.1.1/lib/python3.11/site-packages/torch/include/THC -isystem /apps/Common/software/CUDA/12.1.1/include -isystem /apps/Test/software/Python/3.11.3-GCCcore-12.3.0/include/python3.11 -D_GLIBCXX_USE_CXX11_ABI=1 -D__CUDA_NO_HALF_OPERATORS__ -D__CUDA_NO_HALF_CONVERSIONS__ -D__CUDA_NO_BFLOAT16_CONVERSIONS__ -D__CUDA_NO_HALF2_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_80,code=sm_80 --compiler-options \'-fPIC\' -fPIC -O3 --use_fast_math -std=c++17 -U__CUDA_NO_HALF_OPERATORS__ -U__CUDA_NO_HALF_CONVERSIONS__ -U__CUDA_NO_HALF2_OPERATORS__ --threads=8 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_80,code=compute_80 -DBF16_AVAILABLE -U__CUDA_NO_BFLOAT16_OPERATORS__ -U__CUDA_NO_BFLOAT162_OPERATORS__ -U__CUDA_NO_BFLOAT16_CONVERSIONS__ -c /cephyr/NOBACKUP/priv/c3-staff/eb-tmp/eb-lksbku1s/tmp0p2puzw9/lib/python3.11/site-packages/deepspeed/ops/csrc/transformer/general_kernels.cu -o general_kernels.cuda.o \nFAILED: general_kernels.cuda.o \n/apps/Common/software/CUDA/12.1.1/bin/nvcc -ccbin gcc -DTORCH_EXTENSION_NAME=transformer -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\\"_gcc\\" -DPYBIND11_STDLIB=\\"_libstdcpp\\" -DPYBIND11_BUILD_ABI=\\"_cxxabi1017\\" 
-I/cephyr/NOBA
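Each of the JIT-compiled transformer kernels above fails for the same reason: nvcc aborts with "nvcc fatal : Unknown option '-fPIC'" because a bare host-compiler flag was appended to its command line, even though the same flag is already forwarded correctly through --compiler-options '-fPIC'. As a minimal, hypothetical sketch (not the actual DeepSpeed, PyTorch, or EasyBuild code), the Python below illustrates the general idea of wrapping host-only flags with -Xcompiler, nvcc's alias for --compiler-options, so they are handed to gcc instead of being rejected; the helper name and the flag set are illustrative assumptions.

    # Hypothetical sketch, not the DeepSpeed/EasyBuild code: nvcc does not accept
    # host-compiler options such as -fPIC directly; they must be forwarded with
    # -Xcompiler (an alias of --compiler-options). This helper rewrites a flag
    # list accordingly before nvcc is invoked.
    HOST_ONLY_FLAGS = {"-fPIC"}  # assumed example set of host-compiler-only flags

    def wrap_host_flags(nvcc_flags):
        """Return a copy of nvcc_flags with host-only options wrapped for nvcc."""
        wrapped = []
        for flag in nvcc_flags:
            if flag in HOST_ONLY_FLAGS:
                # e.g. "-fPIC" becomes "-Xcompiler -fPIC", which nvcc passes on to gcc
                wrapped.extend(["-Xcompiler", flag])
            else:
                wrapped.append(flag)
        return wrapped

    if __name__ == "__main__":
        flags = ["-O3", "--use_fast_math", "-std=c++17", "-fPIC"]
        print(wrap_host_flags(flags))
        # -> ['-O3', '--use_fast_math', '-std=c++17', '-Xcompiler', '-fPIC']

In the log above, the duplicate bare -fPIC most likely comes from compiler flags injected into the nvcc invocation by the build environment; filtering or wrapping such flags as sketched is one common way this class of failure is avoided.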