Skip to content

Instantly share code, notes, and snippets.

@AmosLewis
Created October 31, 2025 09:50
Show Gist options
  • Save AmosLewis/bae7dd671e0f82f4a97eb1d510a758b2 to your computer and use it in GitHub Desktop.
#!/usr/bin/env bash
# Debug script to see the full FBGEMM-GPU build error for B200.
#
# Launches an NGC PyTorch container, builds FBGEMM-GPU from source for
# sm_100a (B200), and tees the complete build output to fbgemm_debug.log
# so failures can be inspected after the run.
#
# Strict mode: -e aborts on unhandled failures, -u on unset variables.
# pipefail is deliberately omitted so the `docker run … | tee` pipeline
# below still reaches the final "log saved" message when the build fails.
set -eu

# NGC PyTorch image expected to ship a CUDA toolchain with B200 support.
readonly CONTAINER="nvcr.io/nvidia/pytorch:25.10-py3"

echo "=========================================="
echo "Debugging FBGEMM-GPU Build for B200"
echo "=========================================="
echo ""
# Build and smoke-test FBGEMM-GPU inside the container, mirroring all output
# (stdout+stderr) into fbgemm_debug.log on the host.
# Quoting note: the inner script is passed in a double-quoted string, so
# host-side expansions happen immediately while `\$`-escaped variables
# (MAX_JOBS, BUILD_EXIT_CODE, PIPESTATUS) expand in the container shell.
docker run --gpus '"device=0"' --rm \
  --network host \
  --ipc=host \
  -v "$(pwd)":/workspace \
  -w /workspace \
  "$CONTAINER" \
  bash -c "
set -e
echo '1. Installing build dependencies...'
apt-get update -qq
apt-get install -y -qq git cmake ninja-build
pip install -q setuptools_git_versioning scikit-build
echo ''
echo '2. PyTorch and CUDA versions:'
python -c 'import torch; print(\"PyTorch:\", torch.__version__); print(\"CUDA:\", torch.version.cuda)'
echo ''
echo '3. Cloning FBGEMM...'
cd /tmp
rm -rf FBGEMM
git clone -q --recursive https://github.com/pytorch/FBGEMM.git
cd FBGEMM/fbgemm_gpu
echo ''
echo '4. Setting build environment for B200 using sm_100a...'
# Per GitHub issue #4975, FBGEMM supports sm_100a (not plain sm_100);
# the arch is passed explicitly to setup.py below, so the env var must
# not override it.
unset TORCH_CUDA_ARCH_LIST
export MAX_JOBS=8 # Increase parallel build jobs
export USE_CUDA=1
echo \"MAX_JOBS=\$MAX_JOBS\"
echo 'Building for CUDA arch: 10.0a (sm_100a - B200 with FBGEMM support)'
echo ''
echo '5. Building FBGEMM-GPU with sm_100a for B200...'
echo '=========================================='
# Use sm_100a which FBGEMM may have better support for.
python setup.py install \
--build-variant=cuda \
--build-target=default \
-DTORCH_CUDA_ARCH_LIST='10.0a' \
2>&1 | tee /tmp/fbgemm_build_full.log
# PIPESTATUS[0] is the setup.py exit code; the pipeline status is tee's.
BUILD_EXIT_CODE=\${PIPESTATUS[0]}
echo ''
echo '=========================================='
echo 'Build exit code:' \$BUILD_EXIT_CODE
echo ''
if [ \$BUILD_EXIT_CODE -ne 0 ]; then
echo '❌ Build FAILED. Checking CMake error...'
echo ''
echo 'CMake configuration errors:'
grep -A 20 'CMake Error' /tmp/fbgemm_build_full.log || echo 'No CMake Error found in log'
echo ''
echo 'Checking for CUDA architecture issues:'
grep -i 'arch\|sm_\|compute capability' /tmp/fbgemm_build_full.log || echo 'No architecture messages'
echo ''
echo 'Last 50 lines of build log:'
tail -50 /tmp/fbgemm_build_full.log
exit 1
fi
echo ''
echo '6. Testing FBGEMM-GPU import...'
python -c '
import sys
try:
import fbgemm_gpu
print(\"✓ FBGEMM-GPU imported successfully\")
print(\"FBGEMM-GPU location:\", fbgemm_gpu.__file__)
except Exception as e:
print(\"❌ Failed to import fbgemm_gpu:\", e)
sys.exit(1)
'
echo ''
echo '7. Testing basic CUDA ops...'
python -c '
import torch
import fbgemm_gpu
if torch.cuda.is_available():
print(\"✓ CUDA available:\", torch.cuda.get_device_name(0))
print(\"✓ Compute capability:\", torch.cuda.get_device_capability(0))
# Try a simple FBGEMM-GPU operation
print(\"Testing FBGEMM-GPU CUDA ops...\")
x = torch.randn(10, 10).cuda()
print(\"✓ Basic CUDA ops work\")
else:
print(\"❌ CUDA not available\")
'
echo ''
echo '=========================================='
echo '✅ ALL TESTS PASSED!'
echo '=========================================='
" 2>&1 | tee fbgemm_debug.log
# Tell the user where the mirrored build output ended up.
printf '\n'
printf 'Debug log saved to: %s\n' "fbgemm_debug.log"
printf '\n'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment