#!/usr/bin/env bash
# Debug script to see the full FBGEMM-GPU build error for B200
CONTAINER="nvcr.io/nvidia/pytorch:25.10-py3"
echo "=========================================="
echo "Debugging FBGEMM-GPU Build for B200"
echo "=========================================="
echo ""
docker run --gpus '"device=0"' --rm \
  --network host \
  --ipc=host \
  -v "$(pwd)":/workspace \
  -w /workspace \
  "$CONTAINER" \
  bash -c "
set -e
echo '1. Installing build dependencies...'
apt-get update -qq
apt-get install -y -qq git cmake ninja-build
pip install -q setuptools_git_versioning scikit-build
echo ''
echo '2. PyTorch and CUDA versions:'
python -c 'import torch; print(\"PyTorch:\", torch.__version__); print(\"CUDA:\", torch.version.cuda)'
echo ''
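# Sketch: list the CUDA archs baked into the container's PyTorch wheel.
# If sm_100a already appears here, the prebuilt wheel covers B200 and the
# from-source build mainly serves to reproduce the FBGEMM error.
echo '2b. CUDA archs compiled into the container PyTorch wheel:'
python -c 'import torch; print(torch.cuda.get_arch_list())'
echo ''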
echo '3. Cloning FBGEMM...'
cd /tmp
rm -rf FBGEMM
git clone -q --recursive https://github.com/pytorch/FBGEMM.git
cd FBGEMM/fbgemm_gpu
echo ''
echo '4. Setting build environment for B200 using sm_100a...'
# Per GitHub issue #4975, FBGEMM supports sm_100a (not plain sm_100)
# sm_100a = B200 with full feature support
unset TORCH_CUDA_ARCH_LIST
export MAX_JOBS=8  # Number of parallel compile jobs for the extension build
export USE_CUDA=1
echo \"MAX_JOBS=\$MAX_JOBS\"
echo 'Building for CUDA arch: 10.0a (sm_100a - B200 with FBGEMM support)'
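# Sanity check (a sketch; assumes a driver new enough to support the
# compute_cap query field): a B200 should report 10.0 here.
nvidia-smi --query-gpu=compute_cap --format=csv,noheader || true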
echo ''
echo '5. Building FBGEMM-GPU with sm_100a for B200...'
echo '=========================================='
# Use sm_100a, which FBGEMM may have better support for
python setup.py install \
  --build-variant=cuda \
  --build-target=default \
  -DTORCH_CUDA_ARCH_LIST='10.0a' \
  2>&1 | tee /tmp/fbgemm_build_full.log
BUILD_EXIT_CODE=\${PIPESTATUS[0]}
echo ''
echo '=========================================='
echo 'Build exit code:' \$BUILD_EXIT_CODE
echo ''
if [ \$BUILD_EXIT_CODE -ne 0 ]; then
  echo '❌ Build FAILED. Checking CMake errors...'
  echo ''
  echo 'CMake configuration errors:'
  grep -A 20 'CMake Error' /tmp/fbgemm_build_full.log || echo 'No CMake Error found in log'
  echo ''
  echo 'Checking for CUDA architecture issues:'
  grep -i 'arch\|sm_\|compute capability' /tmp/fbgemm_build_full.log || echo 'No architecture messages'
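  echo ''
  # Sketch: nvcc rejects arch targets it does not know with an
  # 'Unsupported gpu architecture' fatal error, so grep for it explicitly.
  echo 'Checking for nvcc unsupported-architecture errors:'
  grep -i 'unsupported gpu architecture' /tmp/fbgemm_build_full.log || echo 'No nvcc architecture errors'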
  echo ''
  echo 'Last 50 lines of build log:'
  tail -50 /tmp/fbgemm_build_full.log
  exit 1
fi
echo ''
echo '6. Testing FBGEMM-GPU import...'
python -c '
import sys
try:
    import fbgemm_gpu
    print(\"✓ FBGEMM-GPU imported successfully\")
    print(\"FBGEMM-GPU location:\", fbgemm_gpu.__file__)
except Exception as e:
    print(\"❌ Failed to import fbgemm_gpu:\", e)
    sys.exit(1)
'
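echo ''
echo '6b. Spot-checking a registered FBGEMM op...'
# Sketch: asynchronous_complete_cumsum is a long-standing fbgemm_gpu op and is
# assumed to be registered by this build; adjust the op name if it is not.
python -c '
import sys
import torch
import fbgemm_gpu
try:
    op = torch.ops.fbgemm.asynchronous_complete_cumsum
    print(\"✓ torch.ops.fbgemm.asynchronous_complete_cumsum is registered\")
except Exception as e:
    print(\"❌ FBGEMM op lookup failed:\", e)
    sys.exit(1)
'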
echo ''
echo '7. Testing basic CUDA ops...'
python -c '
import torch
import fbgemm_gpu
if torch.cuda.is_available():
    print(\"✓ CUDA available:\", torch.cuda.get_device_name(0))
    print(\"✓ Compute capability:\", torch.cuda.get_device_capability(0))
    # Confirm plain CUDA tensor ops still work with fbgemm_gpu loaded
    x = torch.randn(10, 10).cuda()
    y = x @ x
    print(\"✓ Basic CUDA ops work\")
else:
    print(\"❌ CUDA not available\")
'
echo ''
echo '=========================================='
echo '✅ ALL TESTS PASSED!'
echo '=========================================='
" 2>&1 | tee fbgemm_debug.log
| echo "" | |
| echo "Debug log saved to: fbgemm_debug.log" | |
| echo "" | |