Skip to content

Instantly share code, notes, and snippets.

@stas00
Forked from jeffra/all_reduce_bench.py
Last active August 14, 2023 15:58
Show Gist options
  • Save stas00/ec5e197b15e2e7aea0153f54d2f97c15 to your computer and use it in GitHub Desktop.
Save stas00/ec5e197b15e2e7aea0153f54d2f97c15 to your computer and use it in GitHub Desktop.
Need to adapt it to the newer version from https://gist.github.com/jeffra/b5e80466b4c86be00ea3b6f130fb7a36
# python -m torch.distributed.launch --nproc_per_node=2 all_reduce_bench.py
import torch
import torch.distributed as dist
import time
import argparse
import os
import fcntl
TRIALS = 5  # number of timed all-reduce repetitions per rank

# Benchmark payload dimensions: an N x M float32 tensor,
# i.e. N*M*4 bytes (~4 GB here) all-reduced each trial.
N = 500000
M = 2000
def printflock(*msgs):
    """Print `msgs` while holding an exclusive flock on this script file.

    The lock serializes output from concurrently running ranks so their
    lines do not interleave on the shared console.
    """
    with open(__file__, "r") as lock_fh:
        fcntl.flock(lock_fh, fcntl.LOCK_EX)
        try:
            print(*msgs)
        finally:
            fcntl.flock(lock_fh, fcntl.LOCK_UN)
def timed_allreduce(mat, global_rank):
    """All-reduce `mat` once and report this rank's duration and bandwidth.

    NOTE(review): reading ``mat[0][0]`` on the host forces completion of the
    (possibly asynchronous) collective, so the measured duration covers the
    transfer itself, not just the launch — presumably why the "ignore me"
    print is kept.
    """
    start = time.perf_counter()
    dist.all_reduce(mat)
    printflock(f"ignore me {int(mat[0][0])}")  # required due to lazy evaluation
    elapsed = time.perf_counter() - start

    payload = M * N * 4                      # float32 tensor size in bytes
    world = dist.get_world_size()
    # coarse "algo" throughput estimate: 2x the payload over the wire, in bits/s
    algo_bps = ((M * N * 4 * 2) / elapsed) * 8
    # NCCL-style bus bandwidth: size/t scaled by 2*(n-1)/n, converted to bits/s
    bus_bps = (payload / elapsed) * (2 * (world - 1) / world) * 8

    printflock(f"{global_rank}:\n",
               f"duration: {elapsed:.4f} sec\n",
               f"algo throughput: {algo_bps:.4f} bps, {algo_bps/1e9:.4f} Gbps\n",
               f"busbw: {bus_bps / 1e9:.4f} Gbps"
               )
def run(local_rank):
    """Benchmark body for one rank: build the test tensor on this rank's GPU
    and time TRIALS consecutive all-reduce operations over it."""
    rank = dist.get_rank()
    printflock(f"{rank} data size: {M*N*4/1e9} GB")
    payload = torch.rand(N, M, dtype=torch.float32).cuda(local_rank)
    for _ in range(TRIALS):
        timed_allreduce(payload, rank)
def init_processes(local_rank, fn, backend='nccl'):
    """Pin this process to its GPU, join the process group, then run `fn`.

    local_rank: GPU index for this process on the local node.
    fn: callable invoked as fn(local_rank) once the group is initialized.
    backend: torch.distributed backend name (default 'nccl').
    """
    # Set the device before init so NCCL binds to the right GPU.
    torch.cuda.set_device(local_rank)
    # Rendezvous parameters (rank, world size, addr) come from the launcher's
    # environment variables — the env:// default of init_process_group.
    dist.init_process_group(backend)
    fn(local_rank)
if __name__ == "__main__":
    # torch.distributed.launch passes --local_rank on the command line, but
    # newer launchers (torchrun, launch with --use_env) export LOCAL_RANK in
    # the environment instead. Fall back to it so `args.local_rank` is never
    # None (which would crash the "%d" formatting below).
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_rank", type=int,
                        default=int(os.environ.get("LOCAL_RANK", 0)))
    args = parser.parse_args()
    rank = args.local_rank
    printflock("local_rank: %d" % rank)
    init_processes(local_rank=rank, fn=run)
@stas00
Copy link
Author

stas00 commented Feb 22, 2021

I run this on my setup, which is just PCIe gen 3, and indeed busbw reported 32 Gbps (4 GB/s), which is 1/4th of the PCIe spec.

$ python -m torch.distributed.launch --nproc_per_node=2 all_reduce_bench.py
*****************************************
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
*****************************************
local_rank: 0
local_rank: 1
0 data size: 0.04 GB
1 data size: 0.04 GB
ignore me 1.45694899559021
0:
 duration: 0.0479 sec
 algo throughput: 13357626061.9765 bps, 13.3576 Gbps
 busbw: 6.6788  Gbps
ignore me 1.45694899559021
1:
 duration: 0.7282 sec
 algo throughput: 878847649.2571 bps, 0.8788 Gbps
 busbw: 0.4394  Gbps
ignore me 2.91389799118042
0:
 duration: 0.0101 sec
 algo throughput: 63678619875.8684 bps, 63.6786 Gbps
 busbw: 31.8393  Gbps
ignore me 2.91389799118042
1:
 duration: 0.0101 sec
 algo throughput: 63409811761.2994 bps, 63.4098 Gbps
 busbw: 31.7049  Gbps
ignore me 5.82779598236084
0:
 duration: 0.0101 sec
 algo throughput: 63311281457.9174 bps, 63.3113 Gbps
 busbw: 31.6556  Gbps
ignore me 5.82779598236084
1:
 duration: 0.0101 sec
 algo throughput: 63416737223.7072 bps, 63.4167 Gbps
 busbw: 31.7084  Gbps
ignore me 11.65559196472168
0:
 duration: 0.0101 sec
 algo throughput: 63367171743.0794 bps, 63.3672 Gbps
 busbw: 31.6836  Gbps
ignore me 11.65559196472168
1:
 duration: 0.0101 sec
 algo throughput: 63351658993.6394 bps, 63.3517 Gbps
 busbw: 31.6758  Gbps
ignore me 23.31118392944336
0:
 duration: 0.0101 sec
 algo throughput: 63423304204.8762 bps, 63.4233 Gbps
 busbw: 31.7117  Gbps
ignore me 23.31118392944336
1:
 duration: 0.0101 sec
 algo throughput: 63372241101.8591 bps, 63.3722 Gbps
 busbw: 31.6861  Gbps

From: https://www.trentonsystems.com/blog/pci-express-interface

Summary of PCI Express Interface Parameters:

-----
    Base Clock Speed: PCIe 3.0 = 8.0GHz, PCIe 2.0 = 5.0GHz, PCIe 1.1 = 2.5GHz
    Data Rate: PCIe 3.0 = 1000MB/s, PCIe 2.0 = 500MB/s, PCIe 1.1 = 250MB/s
    Total Bandwidth: (x16 link): PCIe 3.0 = 32GB/s, PCIe 2.0 = 16GB/s, PCIe 1.1 = 8GB/s
    Data Transfer Rate: PCIe 3.0 = 8.0GT/s, PCIe 2.0= 5.0GT/s, PCIe 1.1 = 2.5GT/s

but this looks wrong: the Total Bandwidth (x16 link) for PCIe 3.0 should be 16GB/s per direction.


Samyam Rajbhandari writes:

These are just two different ways of measuring the bandwidth. The busbw is the more accurate one, the other one is a more coarse grain estimate. These numbers will converge to the same as we increase the number of GPUs but on few GPUs they can be up to 2x off. They are just measuring GPU-to-GPU so should be close to PCI-E.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment