# Skip to content

# Instantly share code, notes, and snippets.

# @zhanwenchen
# Last active June 24, 2024 05:46
# Show Gist options
#   • Save zhanwenchen/3f75f8e0df5bc736bd7766c678614f9d to your computer and use it in GitHub Desktop.
# Save zhanwenchen/3f75f8e0df5bc736bd7766c678614f9d to your computer and use it in GitHub Desktop.
# Modify apt sources lists: remove the old CUDA/GDS entries.
# Absolute paths are used so the step neither depends on nor changes the
# current directory; -f keeps it re-runnable when a file is already gone.
# (The original listed cuda-ubuntu2004-11-7-local.list twice, which made rm
# report an error for the second occurrence.)
sudo rm -f -- \
  /etc/apt/sources.list.d/gds-11-7.conf \
  /etc/apt/sources.list.d/cuda-12-3.conf \
  /etc/apt/sources.list.d/cuda-12-2.conf \
  /etc/apt/sources.list.d/cuda-12-1.conf \
  /etc/apt/sources.list.d/989_cuda-11.conf \
  /etc/apt/sources.list.d/cuda-ubuntu2004-11-7-local.list
# Modify apt preferences: drop the old pin files.
sudo rm -f -- \
  /etc/apt/preferences.d/cuda-repository-pin-600 \
  /etc/apt/preferences.d/nvidia-fabricmanager
# Startup shell environment variables (manual edits in an editor).
sudo vim /etc/profile.d/dlami.sh # comment out both
# /etc/environment is read by pam_env as plain KEY="value" lines, not by a
# shell, so ${VAR:+...} expansions are NOT evaluated there. Use literal values:
#   1. Add /usr/local/cuda/bin: to the front of the existing PATH= value.
#   2. Add a new line: LD_LIBRARY_PATH="/usr/local/lib:/usr/local/cuda/lib64"
sudo vim /etc/environment
# Add nvidia repo sources.
# Download into $HOME: the preceding steps leave the shell inside a root-owned
# /etc/apt directory where an unprivileged wget cannot write its output file.
# The steps are chained so the package is only installed if the download worked.
cd "${HOME}" \
  && wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb \
  && sudo apt install ./cuda-keyring_1.1-1_all.deb
# nvidia-driver-550
sudo apt update && sudo apt install nvidia-driver-550
# Reboot so the new driver is loaded. Everything below must be run manually
# after logging back in; nothing after this line executes in the same session.
sudo reboot
# CUDA 12.4
sudo apt install cuda-toolkit-12-4
# Register the CUDA library directory with the dynamic linker. The grep guard
# makes the append idempotent, so re-running does not add duplicate lines.
grep -qxF '/usr/local/cuda/lib64' /etc/ld.so.conf \
  || echo '/usr/local/cuda/lib64' | sudo tee -a /etc/ld.so.conf > /dev/null
sudo ldconfig
# /etc/environment is parsed by pam_env as literal KEY="value" lines; shell
# expansions such as ${LD_LIBRARY_PATH:+...} are not evaluated there.
# On a new line (skip if it is already present), type:
#   LD_LIBRARY_PATH="/usr/local/lib:/usr/local/cuda/lib64"
sudo vim /etc/environment
# Put the CUDA binaries on PATH for interactive shells (appended only once).
grep -qxF 'export PATH=/usr/local/cuda/bin:$PATH' ~/.bashrc 2> /dev/null \
  || echo 'export PATH=/usr/local/cuda/bin:$PATH' >> ~/.bashrc
# Mount the EFS share from ~/.bashrc. Two fixes over a bare `sudo mount ... efs`:
#   * the mount point is absolute, so it does not depend on the directory the
#     shell happens to start in (assumes the intended directory is ~/efs —
#     TODO confirm);
#   * `mountpoint -q` skips the mount when it is already active, so each new
#     shell does not stack another mount or prompt for sudo again.
grep -qF 'fs-67607860.efs.us-west-2.amazonaws.com:/' ~/.bashrc 2> /dev/null \
  || echo 'mountpoint -q "$HOME/efs" || sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport fs-67607860.efs.us-west-2.amazonaws.com:/ "$HOME/efs"' >> ~/.bashrc
# cuDNN 9.1 for CUDA 12.4
# The version patterns are quoted so the shell passes the `*` through to apt
# unchanged instead of attempting pathname expansion on it (which would drop
# or rewrite the argument under nullglob/failglob or with a matching file).
sudo apt install 'cudnn9-cuda-12=9.1*'
sudo apt install 'libcudnn9-samples=9.1*'
# NCCL, etc
# Remove the 535 Fabric Manager before installing the 550 packages below.
sudo apt remove nvidia-fabricmanager-535
sudo apt install libcusparse-12-4 libcusparse-dev-12-4 libcusparselt0 libcusparselt-dev # CUSparse and CUSparse-LT
sudo apt install nvidia-fabricmanager-550 nvidia-fabricmanager-dev-550 cuda-drivers-fabricmanager-550 # NVIDIA Fabric Manager for NVLink/NVSwitch
sudo apt install libnccl2 libnccl-dev # NCCL
# Version patterns are quoted so the shell does not glob-expand the `*`.
sudo apt install 'libxnvctrl0=550.*' 'nvidia-settings=550.*' # Miscellaneous NVIDIA management tools
sudo apt install nvidia-container-toolkit # (Optional) NVIDIA Docker
### Start NVIDIA Fabric Manager service to avoid Error 802 - System Not Initialized
# (The markdown code fences that surrounded these two commands were removed:
# in a shell, the backticks are parsed as command substitutions.)
sudo systemctl enable nvidia-fabricmanager.service
sudo systemctl start nvidia-fabricmanager.service
# Open the DLAMI profile script and make sure it contains the assignments
# between the two ######### markers below. Each value is prepended to whatever
# the variable already holds; the `${VAR:+:$VAR}` form adds the ':' separator
# only when the variable was non-empty.
sudo vim /etc/profile.d/dlami.sh
#########
LD_LIBRARY_PATH=/opt/amazon/efa/lib:/opt/amazon/openmpi/lib:/opt/aws-ofi-nccl/lib:/usr/local/cuda/lib:/usr/local/cuda/lib64:/usr/local/cuda:/usr/local/cuda/targets/x86_64-linux/lib/:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/usr/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
PATH=/usr/local/cuda/bin:/usr/local/cuda/include${PATH:+:$PATH}
export LD_LIBRARY_PATH PATH
#########
# Build PyTorch from source.
# Step 1: create and populate the conda build environment.
conda create -n clean_pytorch_ffmpeg_build \
  cmake ninja \
  intel::mkl-static intel::mkl-include \
  astunparse "expecttest!=0.2.0" hypothesis numpy psutil pyyaml requests \
  setuptools "typing-extensions>=4.8.0" sympy filelock networkx jinja2 fsspec
conda activate clean_pytorch_ffmpeg_build
conda install -c pytorch magma-cuda124
pip install types-dataclasses "optree>=0.9.1" lark
# Step 2: fetch the v2.3.1 sources (with submodules) into ~/pytorch and enter
# the checkout. The chain stops at the first failing step.
cd \
  && git clone --recursive --single-branch --branch v2.3.1 https://github.com/pytorch/pytorch.git \
  && cd pytorch
git submodule sync
git submodule update --init --recursive
# ------------------------------------------------------------------------------
# TODO: Manually patch ${HOME}/pytorch/aten/src/ATen/core/boxing/impl/boxing.h
#       (lines 36-48) as described in
#       <https://github.com/pytorch/pytorch/issues/122169#issuecomment-2146155541>
# ------------------------------------------------------------------------------
# Build
export TORCH_CUDA_ARCH_LIST="8.0" # NOTE: For V100, it's 7.0. See https://developer.nvidia.com/cuda-gpus
export _GLIBCXX_USE_CXX11_ABI=1
# Fall back to the conda installation root when no environment is active.
# `type -P` forces a PATH lookup, so it still yields a file path when `conda`
# is also defined as a shell function.
export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-$(dirname "$(type -P conda)")/../}"
export USE_SYSTEM_NCCL=1
export NCCL_ROOT=/usr
export NCCL_INCLUDE_DIR=/usr/include # Also need this for suppressing "COULD NOT FIND NCCL"
# Point the environment's libstdc++ at the system one. Fixes:
#   ImportError: ${CONDA_PREFIX}/bin/../lib/libstdc++.so.6: version 'GLIBCXX_3.4.32' not found
#   (required by ${CONDA_PREFIX}/lib/python3.12/site-packages/torch/lib/libtorch_python.so)
# The :? guard aborts instead of writing to /lib/libstdc++.so.6 when no conda
# environment is active.
ln -sf /usr/lib/x86_64-linux-gnu/libstdc++.so.6 "${CONDA_PREFIX:?activate the conda build environment first}/lib/libstdc++.so.6"
python setup.py clean && echo "Done Cleaning"
# pipefail (scoped to the subshell) makes the pipeline report the build's exit
# status rather than tee's, so the success message is printed only when the
# build actually succeeded. The build takes a while; wait for it to finish.
(set -o pipefail; python setup.py install |& tee install_pytorch.log) && echo "DONE building pytorch"
# Test
cd # Need to get out of the build directory. Otherwise bad things happen.
python -c "import torch; print(torch.cuda.is_available())"
# Run the remaining checks through a here-document so the Python statements are
# executed by Python rather than being interpreted as shell commands. Results
# are printed explicitly because a non-interactive interpreter does not echo
# expression values.
python - <<'PY'
import torch

# Check CUDA is working
print(torch.rand(2, 3, device='cuda') @ torch.rand(3, 2, device='cuda'))
# Check MAGMA-CUDA is working
print(torch.svd(torch.rand(3, 3, device='cuda')))
PY
# torchvision
# Clone v0.18.1 into ~/vision and build it inside the PyTorch build environment.
cd && git clone --recursive --single-branch --branch v0.18.1 https://github.com/pytorch/vision.git && cd vision
conda activate clean_pytorch_ffmpeg_build
export TORCH_CUDA_ARCH_LIST="8.0" # NOTE: For V100, it's 7.0. See https://developer.nvidia.com/cuda-gpus
export TORCHVISION_INCLUDE=/usr/local/include:/usr/local/include/ffnvcodec:/usr/local/cuda/include # for cuviddec.h and nvcuvid.h
# /usr/local/lib was listed twice in the original value; the duplicate is dropped.
export TORCHVISION_LIBRARY=/usr/local/lib:/usr/lib/x86_64-linux-gnu:/usr/local/cuda/lib64 # for libnvcuvid.so
export _GLIBCXX_USE_CXX11_ABI=1
python setup.py install
# torchaudio
# Clone v2.3.1 into ~/audio and build it with CUDA and OpenMP enabled.
cd && git clone --recursive --single-branch --branch v2.3.1 https://github.com/pytorch/audio.git && cd audio
git submodule sync
git submodule update --init --recursive
export USE_CUDA=1
export USE_OPENMP=1
# The log is written to $HOME rather than inside the source tree so it survives
# the cleanup below. `|&` captures stderr as well, and pipefail (scoped to the
# subshell) propagates the build's exit status through tee. The checkout is
# removed only after a successful build, so a failed build can be inspected.
(set -o pipefail; python setup.py install |& tee "${HOME}/install_torchaudio.log") \
  && cd \
  && rm -rf -- "${HOME}/audio"
# transformers and other huggingface libraries
# (peft was listed twice in the original command; the duplicate is removed.)
pip install transformers accelerate safetensors peft huggingface_hub timm
# go from starter env: clone the build environment into a project environment.
conda create -n blip2 --clone clean_pytorch_ffmpeg_build
conda activate blip2
# Point the new environment's libstdc++ at the system one. Fixes:
#   ImportError: ${CONDA_PREFIX}/bin/../lib/libstdc++.so.6: version 'GLIBCXX_3.4.32' not found
#   (required by ${CONDA_PREFIX}/lib/python3.12/site-packages/torch/lib/libtorch_python.so)
# The :? guard aborts instead of writing to /lib/libstdc++.so.6 when no conda
# environment is active.
ln -sf /usr/lib/x86_64-linux-gnu/libstdc++.so.6 "${CONDA_PREFIX:?activate the blip2 environment first}/lib/libstdc++.so.6"
# pip install project-specific dependencies for blip2
# remove open3d from requirements.txt
# NOTE: `-e .` installs whatever project is in the current directory; cd into
# the project checkout before running this.
pip install -e . --verbose
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment