gVNIC benchmarks

Benchmark workload: https://github.com/skypilot-org/skypilot/blob/master/examples/torch_ddp_benchmark/torch_ddp_benchmark.yaml

Setup: 2x A100:8 nodes on GCP (2 nodes with 8x A100 GPUs each, 16 GPUs total). gVNIC (Google's virtual NIC, driver "gve") replaces the default VirtIO network interface and is required for GCP's higher network bandwidth tiers, so it mainly affects the cross-node (16 GPU) rows below.
$ sky launch -c a100 examples/torch_ddp_benchmark/torch_ddp_benchmark.yaml
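Both runs use the same launch command; only the NIC type differs between them. A minimal sketch of toggling it, assuming a SkyPilot release that exposes the gcp.enable_gvnic key in ~/.sky/config.yaml (the ethtool check is a generic way to confirm which NIC driver a node ended up with; it is not part of the benchmark):

$ cat >> ~/.sky/config.yaml <<'EOF'
gcp:
  enable_gvnic: true   # request nic-type=GVNIC instead of the default VirtIO NIC
EOF
$ sky launch -c a100 examples/torch_ddp_benchmark/torch_ddp_benchmark.yaml
$ ssh a100 ethtool -i ens4   # gVNIC reports driver "gve"; VirtIO reports "virtio_net" (interface name may differ)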
With gVNIC:
(head, rank=0, pid=7056) -----------------------------------
(head, rank=0, pid=7056) PyTorch distributed benchmark suite
(head, rank=0, pid=7056) -----------------------------------
(head, rank=0, pid=7056)
(head, rank=0, pid=7056) * PyTorch version: 2.4.1+cu121
(head, rank=0, pid=7056) * CUDA version: 12.1
(head, rank=0, pid=7056) * Distributed backend: nccl
(head, rank=0, pid=7056) * Maximum bucket size: 25MB
(head, rank=0, pid=7056)
(head, rank=0, pid=7056) --- nvidia-smi topo -m ---
(head, rank=0, pid=7056)
(head, rank=0, pid=7056) GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 CPU Affinity NUMA Affinity GPU NUMA ID
(head, rank=0, pid=7056) GPU0 X NV12 NV12 NV12 NV12 NV12 NV12 NV12 0-23,48-71 0 N/A
(head, rank=0, pid=7056) GPU1 NV12 X NV12 NV12 NV12 NV12 NV12 NV12 0-23,48-71 0 N/A
(head, rank=0, pid=7056) GPU2 NV12 NV12 X NV12 NV12 NV12 NV12 NV12 0-23,48-71 0 N/A
(head, rank=0, pid=7056) GPU3 NV12 NV12 NV12 X NV12 NV12 NV12 NV12 0-23,48-71 0 N/A
(head, rank=0, pid=7056) GPU4 NV12 NV12 NV12 NV12 X NV12 NV12 NV12 24-47,72-95 1 N/A
(head, rank=0, pid=7056) GPU5 NV12 NV12 NV12 NV12 NV12 X NV12 NV12 24-47,72-95 1 N/A
(head, rank=0, pid=7056) GPU6 NV12 NV12 NV12 NV12 NV12 NV12 X NV12 24-47,72-95 1 N/A
(head, rank=0, pid=7056) GPU7 NV12 NV12 NV12 NV12 NV12 NV12 NV12 X 24-47,72-95 1 N/A
(head, rank=0, pid=7056)
(head, rank=0, pid=7056) Legend:
(head, rank=0, pid=7056)
(head, rank=0, pid=7056) X = Self
(head, rank=0, pid=7056) SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
(head, rank=0, pid=7056) NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
(head, rank=0, pid=7056) PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
(head, rank=0, pid=7056) PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
(head, rank=0, pid=7056) PIX = Connection traversing at most a single PCIe bridge
(head, rank=0, pid=7056) NV# = Connection traversing a bonded set of # NVLinks
(head, rank=0, pid=7056)
(head, rank=0, pid=7056) --------------------------
(head, rank=0, pid=7056)
(head, rank=0, pid=7056)
(head, rank=0, pid=7056) Benchmark: resnet50 with batch size 32
(head, rank=0, pid=7056)
(head, rank=0, pid=7056) sec/iter ex/sec sec/iter ex/sec sec/iter ex/sec sec/iter ex/sec
(head, rank=0, pid=7056) 1 GPUs -- no ddp: p50: 0.041s 789/s p75: 0.041s 789/s p90: 0.041s 788/s p95: 0.041s 788/s
(head, rank=0, pid=7056) 1 GPUs -- 1M/1G: p50: 0.040s 790/s p75: 0.041s 789/s p90: 0.041s 789/s p95: 0.041s 789/s
(head, rank=0, pid=7056) 2 GPUs -- 1M/2G: p50: 0.042s 755/s p75: 0.042s 754/s p90: 0.042s 754/s p95: 0.042s 753/s
(head, rank=0, pid=7056) 4 GPUs -- 1M/4G: p50: 0.043s 749/s p75: 0.043s 748/s p90: 0.043s 747/s p95: 0.043s 747/s
(head, rank=0, pid=7056) 8 GPUs -- 1M/8G: p50: 0.043s 745/s p75: 0.047s 682/s p90: 0.047s 679/s p95: 0.047s 679/s
(head, rank=0, pid=7056) 16 GPUs -- 2M/8G: p50: 0.051s 631/s p75: 0.051s 629/s p90: 0.051s 625/s p95: 0.051s 623/s
(head, rank=0, pid=7056)
(head, rank=0, pid=7056) Benchmark: resnet101 with batch size 32
(head, rank=0, pid=7056)
(head, rank=0, pid=7056) sec/iter ex/sec sec/iter ex/sec sec/iter ex/sec sec/iter ex/sec
(head, rank=0, pid=7056) 1 GPUs -- no ddp: p50: 0.063s 506/s p75: 0.063s 505/s p90: 0.063s 505/s p95: 0.063s 505/s
(head, rank=0, pid=7056) 1 GPUs -- 1M/1G: p50: 0.063s 506/s p75: 0.063s 505/s p90: 0.064s 501/s p95: 0.064s 500/s
(head, rank=0, pid=7056) 2 GPUs -- 1M/2G: p50: 0.066s 482/s p75: 0.066s 482/s p90: 0.067s 481/s p95: 0.067s 480/s
(head, rank=0, pid=7056) 4 GPUs -- 1M/4G: p50: 0.067s 474/s p75: 0.068s 468/s p90: 0.071s 450/s p95: 0.071s 449/s
(head, rank=0, pid=7056) 8 GPUs -- 1M/8G: p50: 0.068s 467/s p75: 0.069s 465/s p90: 0.069s 463/s p95: 0.069s 463/s
(head, rank=0, pid=7056) 16 GPUs -- 2M/8G: p50: 0.081s 394/s p75: 0.087s 368/s p90: 0.098s 326/s p95: 0.101s 316/s
(head, rank=0, pid=7056)
(head, rank=0, pid=7056) Benchmark: resnext50_32x4d with batch size 32
(head, rank=0, pid=7056)
(head, rank=0, pid=7056) sec/iter ex/sec sec/iter ex/sec sec/iter ex/sec sec/iter ex/sec
(head, rank=0, pid=7056) 1 GPUs -- no ddp: p50: 0.051s 623/s p75: 0.051s 623/s p90: 0.051s 622/s p95: 0.051s 622/s
(head, rank=0, pid=7056) 1 GPUs -- 1M/1G: p50: 0.051s 623/s p75: 0.051s 623/s p90: 0.051s 622/s p95: 0.051s 622/s
(head, rank=0, pid=7056) 2 GPUs -- 1M/2G: p50: 0.054s 596/s p75: 0.054s 595/s p90: 0.054s 594/s p95: 0.054s 594/s
(head, rank=0, pid=7056) 4 GPUs -- 1M/4G: p50: 0.054s 594/s p75: 0.054s 593/s p90: 0.054s 592/s p95: 0.054s 592/s
(head, rank=0, pid=7056) 8 GPUs -- 1M/8G: p50: 0.054s 591/s p75: 0.054s 590/s p90: 0.054s 589/s p95: 0.054s 589/s
(head, rank=0, pid=7056) 16 GPUs -- 2M/8G: p50: 0.061s 523/s p75: 0.061s 522/s p90: 0.061s 520/s p95: 0.061s 520/s
(head, rank=0, pid=7056)
(head, rank=0, pid=7056) Benchmark: resnext101_32x8d with batch size 32
(head, rank=0, pid=7056)
(head, rank=0, pid=7056) sec/iter ex/sec sec/iter ex/sec sec/iter ex/sec sec/iter ex/sec
(head, rank=0, pid=7056) 1 GPUs -- no ddp: p50: 0.129s 248/s p75: 0.129s 248/s p90: 0.129s 248/s p95: 0.129s 248/s
(head, rank=0, pid=7056) 1 GPUs -- 1M/1G: p50: 0.129s 248/s p75: 0.129s 248/s p90: 0.129s 247/s p95: 0.129s 247/s
(head, rank=0, pid=7056) 2 GPUs -- 1M/2G: p50: 0.132s 242/s p75: 0.132s 242/s p90: 0.132s 241/s p95: 0.132s 241/s
(head, rank=0, pid=7056) 4 GPUs -- 1M/4G: p50: 0.133s 241/s p75: 0.133s 241/s p90: 0.133s 241/s p95: 0.133s 241/s
(head, rank=0, pid=7056) 8 GPUs -- 1M/8G: p50: 0.133s 239/s p75: 0.134s 239/s p90: 0.134s 239/s p95: 0.134s 239/s
(head, rank=0, pid=7056) 16 GPUs -- 2M/8G: p50: 0.162s 197/s p75: 0.162s 197/s p90: 0.163s 196/s p95: 0.164s 195/s
Without gVNIC:
(head, rank=0, pid=7792) -----------------------------------
(head, rank=0, pid=7792) PyTorch distributed benchmark suite
(head, rank=0, pid=7792) -----------------------------------
(head, rank=0, pid=7792)
(head, rank=0, pid=7792) * PyTorch version: 2.4.1+cu121
(head, rank=0, pid=7792) * CUDA version: 12.1
(head, rank=0, pid=7792) * Distributed backend: nccl
(head, rank=0, pid=7792) * Maximum bucket size: 25MB
(head, rank=0, pid=7792)
(head, rank=0, pid=7792) --- nvidia-smi topo -m ---
(head, rank=0, pid=7792)
(head, rank=0, pid=7792) GPU0 GPU1 GPU2 GPU3 GPU4 GPU5 GPU6 GPU7 CPU Affinity NUMA Affinity GPU NUMA ID
(head, rank=0, pid=7792) GPU0 X NV12 NV12 NV12 NV12 NV12 NV12 NV12 0-23,48-71 0 N/A
(head, rank=0, pid=7792) GPU1 NV12 X NV12 NV12 NV12 NV12 NV12 NV12 0-23,48-71 0 N/A
(head, rank=0, pid=7792) GPU2 NV12 NV12 X NV12 NV12 NV12 NV12 NV12 0-23,48-71 0 N/A
(head, rank=0, pid=7792) GPU3 NV12 NV12 NV12 X NV12 NV12 NV12 NV12 0-23,48-71 0 N/A
(head, rank=0, pid=7792) GPU4 NV12 NV12 NV12 NV12 X NV12 NV12 NV12 24-47,72-95 1 N/A
(head, rank=0, pid=7792) GPU5 NV12 NV12 NV12 NV12 NV12 X NV12 NV12 24-47,72-95 1 N/A
(head, rank=0, pid=7792) GPU6 NV12 NV12 NV12 NV12 NV12 NV12 X NV12 24-47,72-95 1 N/A
(head, rank=0, pid=7792) GPU7 NV12 NV12 NV12 NV12 NV12 NV12 NV12 X 24-47,72-95 1 N/A
(head, rank=0, pid=7792)
(head, rank=0, pid=7792) Legend:
(head, rank=0, pid=7792)
(head, rank=0, pid=7792) X = Self
(head, rank=0, pid=7792) SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
(head, rank=0, pid=7792) NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node
(head, rank=0, pid=7792) PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
(head, rank=0, pid=7792) PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
(head, rank=0, pid=7792) PIX = Connection traversing at most a single PCIe bridge
(head, rank=0, pid=7792) NV# = Connection traversing a bonded set of # NVLinks
(head, rank=0, pid=7792)
(head, rank=0, pid=7792) --------------------------
(head, rank=0, pid=7792)
(head, rank=0, pid=7792)
(head, rank=0, pid=7792) Benchmark: resnet50 with batch size 32
(head, rank=0, pid=7792)
(head, rank=0, pid=7792) sec/iter ex/sec sec/iter ex/sec sec/iter ex/sec sec/iter ex/sec
(head, rank=0, pid=7792) 1 GPUs -- no ddp: p50: 0.041s 786/s p75: 0.041s 786/s p90: 0.041s 781/s p95: 0.041s 781/s
(head, rank=0, pid=7792) 1 GPUs -- 1M/1G: p50: 0.041s 786/s p75: 0.041s 786/s p90: 0.041s 786/s p95: 0.041s 786/s
(head, rank=0, pid=7792) 2 GPUs -- 1M/2G: p50: 0.043s 751/s p75: 0.043s 750/s p90: 0.043s 749/s p95: 0.043s 749/s
(head, rank=0, pid=7792) 4 GPUs -- 1M/4G: p50: 0.043s 747/s p75: 0.043s 746/s p90: 0.043s 745/s p95: 0.043s 744/s
(head, rank=0, pid=7792) 8 GPUs -- 1M/8G: p50: 0.043s 745/s p75: 0.043s 744/s p90: 0.043s 737/s p95: 0.046s 695/s
(head, rank=0, pid=7792) 16 GPUs -- 2M/8G: p50: 0.071s 449/s p75: 0.072s 446/s p90: 0.072s 444/s p95: 0.073s 440/s
(head, rank=0, pid=7792)
(head, rank=0, pid=7792) Benchmark: resnet101 with batch size 32
(head, rank=0, pid=7792)
(head, rank=0, pid=7792) sec/iter ex/sec sec/iter ex/sec sec/iter ex/sec sec/iter ex/sec
(head, rank=0, pid=7792) 1 GPUs -- no ddp: p50: 0.064s 500/s p75: 0.064s 499/s p90: 0.064s 497/s p95: 0.064s 497/s
(head, rank=0, pid=7792) 1 GPUs -- 1M/1G: p50: 0.064s 497/s p75: 0.064s 496/s p90: 0.065s 495/s p95: 0.065s 495/s
(head, rank=0, pid=7792) 2 GPUs -- 1M/2G: p50: 0.067s 478/s p75: 0.067s 478/s p90: 0.068s 472/s p95: 0.068s 472/s
(head, rank=0, pid=7792) 4 GPUs -- 1M/4G: p50: 0.068s 469/s p75: 0.069s 461/s p90: 0.071s 452/s p95: 0.076s 420/s
(head, rank=0, pid=7792) 8 GPUs -- 1M/8G: p50: 0.068s 468/s p75: 0.069s 466/s p90: 0.072s 444/s p95: 0.072s 443/s
(head, rank=0, pid=7792) 16 GPUs -- 2M/8G: p50: 0.125s 256/s p75: 0.126s 253/s p90: 0.130s 245/s p95: 0.133s 240/s
(head, rank=0, pid=7792)
(head, rank=0, pid=7792) Benchmark: resnext50_32x4d with batch size 32
(head, rank=0, pid=7792)
(head, rank=0, pid=7792) sec/iter ex/sec sec/iter ex/sec sec/iter ex/sec sec/iter ex/sec
(head, rank=0, pid=7792) 1 GPUs -- no ddp: p50: 0.052s 620/s p75: 0.052s 620/s p90: 0.052s 620/s p95: 0.052s 619/s
(head, rank=0, pid=7792) 1 GPUs -- 1M/1G: p50: 0.052s 620/s p75: 0.052s 620/s p90: 0.052s 620/s p95: 0.052s 620/s
(head, rank=0, pid=7792) 2 GPUs -- 1M/2G: p50: 0.054s 594/s p75: 0.054s 594/s p90: 0.054s 593/s p95: 0.054s 593/s
(head, rank=0, pid=7792) 4 GPUs -- 1M/4G: p50: 0.054s 592/s p75: 0.054s 591/s p90: 0.054s 591/s p95: 0.054s 589/s
(head, rank=0, pid=7792) 8 GPUs -- 1M/8G: p50: 0.054s 590/s p75: 0.054s 590/s p90: 0.054s 589/s p95: 0.054s 589/s
(head, rank=0, pid=7792) 16 GPUs -- 2M/8G: p50: 0.070s 457/s p75: 0.071s 452/s p90: 0.071s 449/s p95: 0.072s 443/s
(head, rank=0, pid=7792)
(head, rank=0, pid=7792) Benchmark: resnext101_32x8d with batch size 32
(head, rank=0, pid=7792)
(head, rank=0, pid=7792) sec/iter ex/sec sec/iter ex/sec sec/iter ex/sec sec/iter ex/sec
(head, rank=0, pid=7792) 1 GPUs -- no ddp: p50: 0.129s 247/s p75: 0.129s 247/s p90: 0.129s 247/s p95: 0.130s 247/s
(head, rank=0, pid=7792) 1 GPUs -- 1M/1G: p50: 0.129s 247/s p75: 0.129s 247/s p90: 0.129s 247/s p95: 0.129s 247/s
(head, rank=0, pid=7792) 2 GPUs -- 1M/2G: p50: 0.132s 242/s p75: 0.132s 241/s p90: 0.132s 241/s p95: 0.132s 241/s
(head, rank=0, pid=7792) 4 GPUs -- 1M/4G: p50: 0.133s 241/s p75: 0.133s 241/s p90: 0.133s 240/s p95: 0.133s 240/s
(head, rank=0, pid=7792) 8 GPUs -- 1M/8G: p50: 0.133s 239/s p75: 0.133s 239/s p90: 0.134s 239/s p95: 0.134s 239/s
(head, rank=0, pid=7792) 16 GPUs -- 2M/8G: p50: 0.289s 110/s p75: 0.290s 110/s p90: 0.291s 109/s p95: 0.291s 109/s
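Summary (p50 throughput at 16 GPUs, 2 nodes x 8, taken from the runs above; the single-node rows are essentially identical in both runs, so the gap comes from cross-node gradient traffic):

model               with gVNIC   without gVNIC   speedup
resnet50                631/s          449/s      1.41x
resnet101               394/s          256/s      1.54x
resnext50_32x4d         523/s          457/s      1.14x
resnext101_32x8d        197/s          110/s      1.79x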