Consider an example:

import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp

class TwoLinLayerNet(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # the original snippet was cut off after the class line;
        # the layer sizes below are assumed for illustration
        self.a = torch.nn.Linear(10, 10, bias=False)
        self.b = torch.nn.Linear(10, 1, bias=False)

    def forward(self, x):
        return self.a(x), self.b(x)
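The dist and mp imports hint that this class was headed for a DistributedDataParallel run. Continuing the snippet above, a minimal sketch of that (world size, backend, address/port, and tensor shapes are assumptions, not from the original):

def worker(rank: int, world_size: int):
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")  # assumed single-host setup
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    ddp = torch.nn.parallel.DistributedDataParallel(TwoLinLayerNet())
    out_a, out_b = ddp(torch.randn(4, 10))  # batch of 4, 10 features
    (out_a.sum() + out_b.sum()).backward()  # DDP all-reduces gradients here
    dist.destroy_process_group()

if __name__ == "__main__":
    mp.spawn(worker, args=(2,), nprocs=2, join=True)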
A bash preamble for pinning CUDA/cuDNN/TensorRT versions on Ubuntu:

#!/bin/bash
set -e -u -o pipefail -o noglob
set -x
CUDA_VERSION=${CUDA_VERSION:-10.2}
CUDNN_VERSION=${CUDNN_VERSION:-7}
TENSORRT_VERSION=${TENSORRT_VERSION:-7}
UBUNTU_RELEASE=$(lsb_release -rs)    # e.g. 18.04
DISTRO=ubuntu${UBUNTU_RELEASE//\./}  # e.g. ubuntu1804
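The script is truncated here. One plausible continuation, assuming NVIDIA's apt repo is already configured, interpolates the variables into pinned package names (cuda-toolkit-10-2, libcudnn7, libnvinfer7 follow NVIDIA's usual naming, but verify against the repo before running):

# hypothetical continuation, not in the original script
PKGS=(
    "cuda-toolkit-${CUDA_VERSION//\./-}"  # e.g. cuda-toolkit-10-2
    "libcudnn${CUDNN_VERSION}"            # e.g. libcudnn7
    "libnvinfer${TENSORRT_VERSION}"       # e.g. libnvinfer7
)
sudo apt-get update
sudo apt-get install -y "${PKGS[@]}"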
Sync a local file to HDFS every hour while a given process is alive, then do one final sync after it exits:

PID=123
path=/home/user123/file.txt
while test -d /proc/$PID; do
    hdfs dfs -put -f "$path"
    echo "[$(date '+%Y-%m-%d %H:%M:%S')] synced from $path, waiting for 1h..."
    sleep 1h
done
echo "Final sync of $path"
sleep 1h  # grace period so the finished process's last writes land on disk
hdfs dfs -put -f "$path"
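To keep the loop running after the shell closes, launch it detached (the script name here is hypothetical):

nohup ./sync_to_hdfs.sh > sync.log 2>&1 &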
A point-to-point send/recv example with torch.distributed:

import os
import torch as th
import torch.distributed as dist
import torch.multiprocessing as mp

def run(rank: int, value: float, src: int, dst: int):
    tensor = th.FloatTensor([value])  # or .to(f"cuda:{rank}") for GPU tensors
    print(f"[rk={rank}] tensor before send-recv: {tensor}")
    if rank == src:
        req = dist.isend(tensor=tensor, dst=dst)
    else:  # the receiving branch was missing from the truncated snippet
        req = dist.irecv(tensor=tensor, src=src)
    req.wait()  # block until the non-blocking op completes
    print(f"[rk={rank}] tensor after send-recv: {tensor}")
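The snippet stops before any process group is created; a minimal launcher for it could look like this (backend, address/port, and the concrete value/src/dst are assumptions):

def init_process(rank: int, world_size: int):
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29501")
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    run(rank, value=42.0, src=0, dst=1)  # rank 0 sends 42.0 to rank 1
    dist.destroy_process_group()

if __name__ == "__main__":
    mp.spawn(init_process, args=(2,), nprocs=2, join=True)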
How to remove NVIDIA drivers:

Plan A (driver installed via the .run installer):
just run "sudo nvidia-uninstall"

Plan B (driver installed via apt):
# step 1: become root
sudo su
# step 2: purge every package with "nvidia" in its name
dpkg -l | grep -i nvidia | awk '{print $2}' | xargs -n1 apt-get purge -y
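To confirm the purge worked, both checks below should come back empty or fail:

dpkg -l | grep -i nvidia          # no packages should be listed
command -v nvidia-smi || echo "driver binaries are gone"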
Find commits by a string of code, for example by the name of a function:

git rev-list --all --since 2022-04-01 --until 2022-05-01 | xargs git grep "<some pattern>"
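For instance, to see which April 2022 commits contain a function named train_epoch (the name is hypothetical), add -n to get line numbers in the matches:

git rev-list --all --since 2022-04-01 --until 2022-05-01 | xargs git grep -n "def train_epoch"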