Last active
June 24, 2024 05:46
-
-
Save zhanwenchen/3f75f8e0df5bc736bd7766c678614f9d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Modify apt sources lists: remove the listed CUDA/GDS entries.
# Each file is named exactly once (a repeated name makes rm report
# "No such file or directory"), and rm only runs if the cd succeeded.
cd /etc/apt/sources.list.d/ &&
  sudo rm gds-11-7.conf cuda-12-3.conf cuda-12-2.conf cuda-12-1.conf \
    989_cuda-11.conf cuda-ubuntu2004-11-7-local.list
# Modify apt preferences: remove the listed preference files.
cd /etc/apt/preferences.d &&
  sudo rm cuda-repository-pin-600 nvidia-fabricmanager
# Startup shell environment variables (manual steps: each command opens an editor).
# In dlami.sh: comment out "both" entries -- presumably the two existing
# LD_LIBRARY_PATH / PATH export lines; TODO confirm against the file on your image.
sudo vim /etc/profile.d/dlami.sh # comment out both
# In /etc/environment:
#   1. add /usr/local/cuda/bin: to the front of PATH=
#   2. add a new line:
#      LD_LIBRARY_PATH="/usr/local/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
# NOTE(review): /etc/environment is normally read by pam_env rather than a shell,
# so the ${...:+...} part is probably taken literally instead of expanded --
# confirm, and if so use the plain value "/usr/local/lib:/usr/local/cuda/lib64".
sudo vim /etc/environment
# Add nvidia repo sources.
# Download into the home directory: after the apt cleanup the working
# directory is /etc/apt/preferences.d, which is root-owned, so a plain wget
# there cannot write the .deb. Each step runs only if the previous one worked.
cd &&
  wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb &&
  sudo apt install ./cuda-keyring_1.1-1_all.deb
# nvidia-driver-550
sudo apt update
sudo apt install nvidia-driver-550
# Rebooting ends this session; run the steps below after logging back in.
sudo reboot
# CUDA 12.4
sudo apt install cuda-toolkit-12-4
# Add the CUDA library directory to the dynamic linker configuration.
# The grep guard keeps the entry from being appended again on a re-run.
grep -qxF -- '/usr/local/cuda/lib64' /etc/ld.so.conf ||
  printf '%s\n' '/usr/local/cuda/lib64' | sudo tee -a /etc/ld.so.conf > /dev/null
sudo ldconfig
# Manual step: on a new line, type
#   LD_LIBRARY_PATH="/usr/local/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
sudo vim /etc/environment
# Lines appended to ~/.bashrc (single-quoted so $PATH is written literally and
# expanded when the shell starts, not now). Each is added only if it is not
# already present, so re-running these steps does not duplicate it.
cuda_path_line='export PATH=/usr/local/cuda/bin:$PATH'
grep -qxF -- "$cuda_path_line" ~/.bashrc ||
  printf '%s\n' "$cuda_path_line" >> ~/.bashrc
# NOTE: the mount target "efs" is a relative path, resolved against the
# directory the shell starts in.
efs_mount_line='sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2,noresvport fs-67607860.efs.us-west-2.amazonaws.com:/ efs'
grep -qxF -- "$efs_mount_line" ~/.bashrc ||
  printf '%s\n' "$efs_mount_line" >> ~/.bashrc
# cuDNN 9.1 for CUDA 12.4
# The version patterns are quoted so the shell hands them to apt verbatim
# instead of attempting filename expansion on the trailing '*'.
sudo apt install 'cudnn9-cuda-12=9.1*'
sudo apt install 'libcudnn9-samples=9.1*'
# NCCL, etc
sudo apt remove nvidia-fabricmanager-535
sudo apt install libcusparse-12-4 libcusparse-dev-12-4 libcusparselt0 libcusparselt-dev # CUSparse and CUSparse-LT
sudo apt install nvidia-fabricmanager-550 nvidia-fabricmanager-dev-550 cuda-drivers-fabricmanager-550 # NVIDIA Fabric Manager for NVLink/NVSwitch
sudo apt install libnccl2 libnccl-dev # NCCL
# Version patterns are quoted so the shell does not try to glob-expand '550.*'.
sudo apt install 'libxnvctrl0=550.*' 'nvidia-settings=550.*' # Miscellaneous NVIDIA management tools
sudo apt install nvidia-container-toolkit # 2.5 (Optional) NVIDIA Docker
# Start NVIDIA Fabric Manager service to avoid Error 802 - System Not Initialized
# (Markdown code fences removed: backticks start a command substitution in a shell file.)
sudo systemctl enable nvidia-fabricmanager.service
sudo systemctl start nvidia-fabricmanager.service
# Edit dlami profile script to include the following
sudo vim /etc/profile.d/dlami.sh
######### lines to put in /etc/profile.d/dlami.sh
# Values are quoted so they stay a single word in any shell that sources the file.
export LD_LIBRARY_PATH="/opt/amazon/efa/lib:/opt/amazon/openmpi/lib:/opt/aws-ofi-nccl/lib:/usr/local/cuda/lib:/usr/local/cuda/lib64:/usr/local/cuda:/usr/local/cuda/targets/x86_64-linux/lib/:/usr/local/cuda/extras/CUPTI/lib64:/usr/local/lib:/usr/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export PATH="/usr/local/cuda/bin:/usr/local/cuda/include${PATH:+:$PATH}"
#########
# build pytorch
# 1. Dependencies
conda create -n clean_pytorch_ffmpeg_build cmake ninja intel::mkl-static intel::mkl-include astunparse "expecttest!=0.2.0" hypothesis numpy psutil pyyaml requests setuptools "typing-extensions>=4.8.0" sympy filelock networkx jinja2 fsspec
conda activate clean_pytorch_ffmpeg_build
conda install -c pytorch magma-cuda124
pip install types-dataclasses "optree>=0.9.1" lark
# 2. PyTorch sources
# The submodule commands are chained onto the clone so they only run inside
# ~/pytorch; if the clone or cd fails they are skipped instead of running in
# whatever directory the shell happens to be in.
cd && git clone --recursive --single-branch --branch v2.3.1 https://github.com/pytorch/pytorch.git && cd pytorch &&
  git submodule sync &&
  git submodule update --init --recursive
################################################################################
# TODO: Monkey-patch ${HOME}/pytorch/aten/src/ATen/core/boxing/impl/boxing.h (lines 36-48)
#       according to <https://github.com/pytorch/pytorch/issues/122169#issuecomment-2146155541>
################################################################################
# Build
export TORCH_CUDA_ARCH_LIST="8.0" # NOTE: For V100, it's 7.0. See https://developer.nvidia.com/cuda-gpus
export _GLIBCXX_USE_CXX11_ABI=1
# Fall back to the conda installation root when no environment is active.
# `which` is kept on purpose: it searches PATH for the conda executable.
export CMAKE_PREFIX_PATH="${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"}"
export USE_SYSTEM_NCCL=1
export NCCL_ROOT=/usr
export NCCL_INCLUDE_DIR=/usr/include # Also need this for suppressing "COULD NOT FIND NCCL"
# Fixes ImportError: ${CONDA_PREFIX}/bin/../lib/libstdc++.so.6: version `GLIBCXX_3.4.32' not found
# (required by ${CONDA_PREFIX}/lib/python3.12/site-packages/torch/lib/libtorch_python.so).
# ${CONDA_PREFIX:?} aborts the command when no conda env is active, so the
# link is never created under /lib by accident.
ln -sf /usr/lib/x86_64-linux-gnu/libstdc++.so.6 "${CONDA_PREFIX:?activate the conda env first}/lib/libstdc++.so.6"
python setup.py clean && echo "Done Cleaning"
# pipefail makes the subshell report a failed build; without it the status is
# tee's, and "DONE" is printed even when setup.py failed. A '#' only starts a
# comment at the beginning of a word, hence the space before it below.
(set -o pipefail; python setup.py install |& tee install_pytorch.log) && echo "DONE building pytorch" # Wait 10 mins for it to finish.
# Test
cd # Need to get out of the build directory. Otherwise bad things happen.
python -c "import torch; print(torch.cuda.is_available()); exit()"
# The checks below are Python, not shell, so they are fed to the interpreter
# through a quoted heredoc. Results are wrapped in print() because a
# non-interactive interpreter does not echo expression values.
python - <<'PY'
import torch
# Check CUDA is working
print(torch.rand(2, 3, device='cuda') @ torch.rand(3, 2, device='cuda'))
# Check MAGMA-CUDA is working
print(torch.svd(torch.rand(3, 3, device='cuda')))
PY
# torchvision
cd && git clone --recursive --single-branch --branch v0.18.1 https://github.com/pytorch/vision.git && cd vision
conda activate clean_pytorch_ffmpeg_build
export TORCH_CUDA_ARCH_LIST="8.0" # NOTE: For V100, it's 7.0. See https://developer.nvidia.com/cuda-gpus
export TORCHVISION_INCLUDE=/usr/local/include:/usr/local/include/ffnvcodec:/usr/local/cuda/include # for cuviddec.h and nvcuvid.h
# /usr/local/lib was listed twice in the original value; once is enough.
export TORCHVISION_LIBRARY=/usr/local/lib:/usr/lib/x86_64-linux-gnu:/usr/local/cuda/lib64 # for libnvcuvid.so
export _GLIBCXX_USE_CXX11_ABI=1
python setup.py install
# torchaudio
# Submodule commands are chained so they only run inside ~/audio.
cd && git clone --recursive --single-branch --branch v2.3.1 https://github.com/pytorch/audio.git && cd audio &&
  git submodule sync &&
  git submodule update --init --recursive
export USE_CUDA=1
export USE_OPENMP=1
# pipefail makes the subshell fail when setup.py fails (otherwise the status is
# tee's). The source tree, which also holds install_torchaudio.log, is removed
# only after a successful install, so a failed build keeps its log.
(set -o pipefail; python setup.py install | tee install_torchaudio.log) &&
  cd && rm -rf audio
# transformers and other huggingface libraries (peft was listed twice; once is enough)
pip install transformers accelerate safetensors peft huggingface_hub timm
# go from starter env
conda create -n blip2 --clone clean_pytorch_ffmpeg_build
conda activate blip2
# Fixes ImportError: ${CONDA_PREFIX}/bin/../lib/libstdc++.so.6: version `GLIBCXX_3.4.32' not found
# (required by ${CONDA_PREFIX}/lib/python3.12/site-packages/torch/lib/libtorch_python.so).
# ${CONDA_PREFIX:?} aborts the command when no conda env is active.
ln -sf /usr/lib/x86_64-linux-gnu/libstdc++.so.6 "${CONDA_PREFIX:?activate the conda env first}/lib/libstdc++.so.6"
# pip install project-specific dependencies for blip2
# remove open3d from requirements.txt
pip install -e . --verbose
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment