Skip to content

Instantly share code, notes, and snippets.

View woshiyyya's full-sized avatar
zzz

Yunxuan Xiao woshiyyya

zzz
View GitHub Profile
@woshiyyya
woshiyyya / error.log
Last active September 6, 2024 18:13
zbh1 debug
Traceback (most recent call last):
File "/home/ray/default/skeleton_zb_h1.py", line 106, in <module>
ray.get(dag.execute(1))
File "/home/ray/anaconda3/lib/python3.10/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
return fn(*args, **kwargs)
File "/home/ray/anaconda3/lib/python3.10/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
return func(*args, **kwargs)
File "/home/ray/anaconda3/lib/python3.10/site-packages/ray/_private/worker.py", line 2648, in get
return object_refs.get(timeout=timeout)
File "/home/ray/anaconda3/lib/python3.10/site-packages/ray/experimental/compiled_dag_ref.py", line 90, in get
import ray
import ray.cluster_utils
from ray.experimental.channel.torch_tensor_type import TorchTensorType
from ray.dag import InputNode, MultiOutputNode
from typing import Optional
from ray.dag.compiled_dag_node import CompiledDAG
from argparse import ArgumentError, ArgumentParser
@woshiyyya
woshiyyya / adag.py
Last active October 9, 2024 03:48
Use aDAG to train a Llama-2-7B model with zero-bubble pipeline parallelism
def generate_zbh1_dag(workers, num_microbatches):
num_workers = len(workers)
num_lead_microbatches = num_workers
with InputNode() as inp:
fwd_queues = [[] for _ in range(num_workers)]
bwd_queues = [[] for _ in range(num_workers)]
# Once a worker's counter reaches 0, it cannot execute another fwd until it
# executes a bwd first.
fwd_counter = [num_lead_microbatches - i for i in range(num_workers)]