Skip to content

Instantly share code, notes, and snippets.

View woshiyyya's full-sized avatar
zzz

Yunxuan Xiao woshiyyya

zzz
View GitHub Profile
task.
class_name: RayTrainWorker
actor_id: 9e6790a209b7c509e64301f305000000
pid: 35979
namespace: f205d617-4ee1-4fae-a76a-c3f2382b7527
ip: 172.24.101.245
The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker exits unexpectedly. Worker exits with an exit code None. Traceback (most recent call last):
File "python/ray/_raylet.pyx", line 1883, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1984, in ray._raylet.execute_task
File "python/ray/_raylet.pyx", line 1889, in ray._raylet.execute_task
@woshiyyya
woshiyyya / minimal_train.py
Last active June 3, 2024 20:42
Minimal training script for testing train dashboard
import ray
from ray.train.torch import TorchTrainer
from ray.train import RunConfig, ScalingConfig
import time
def train_func():
print("Training Starts")
time.sleep(100)
datasets = {
@woshiyyya
woshiyyya / multi_input_dag.py
Created July 1, 2024 20:01
Simple DAG with multiple inputs
from ray.dag.input_node import InputNode
from ray.dag.output_node import MultiOutputNode
import ray
@ray.remote
class Worker:
def __init__(self, rank):
self.rank = rank
self.logs = []
@woshiyyya
woshiyyya / PP.md
Last active October 9, 2024 03:48
pseudo code for PP with Ray DAG

@woshiyyya
woshiyyya / test_case_1.py
Last active July 18, 2024 22:47
Train Dashboard BugBash
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
from ray.train.torch import TorchTrainer
from ray.train import ScalingConfig, RunConfig
@woshiyyya
woshiyyya / benchmark_adag.py
Last active August 30, 2024 23:57
Benchmark NCCL Data Transfer
import ray
import torch
from ray.experimental.channel.torch_tensor_type import TorchTensorType
# shape = (4, 8192)
shape = (4, 24576)
@ray.remote(num_gpus=1)
class MyActor:
@woshiyyya
woshiyyya / train_tp_timeout.py
Created August 19, 2024 23:54
DistMM DAG Timeout Failure
import ray
from ray.air.util.torch_dist import _init_torch_distributed
from ray.air._internal.util import find_free_port
from ray.dag.input_node import InputNode
from ray.dag.output_node import MultiOutputNode
from ray.experimental.channel.torch_tensor_type import TorchTensorType
import os
import torch
import torch.nn as nn
from torch.nn import functional as F
@woshiyyya
woshiyyya / channel_error.py
Created August 20, 2024 00:28
DAG NCCL channel error when binding with a node of the same actor
import ray
import torch
from ray.dag.input_node import InputNode
from ray.dag.output_node import MultiOutputNode
from ray.experimental.channel.torch_tensor_type import TorchTensorType
@ray.remote(num_gpus=1)
class MyActor:
def __init__(self):
pass
@woshiyyya
woshiyyya / test.py
Created August 22, 2024 21:22
ADAG hides the actual method error stack trace, printing a timeout error instead
import ray
from ray.air.util.torch_dist import _init_torch_distributed
from ray.air._internal.util import find_free_port
from ray.dag.input_node import InputNode
from ray.dag.output_node import MultiOutputNode
from ray.experimental.channel.torch_tensor_type import TorchTensorType
import os
import torch
import torch.nn as nn
from torch.nn import functional as F
@woshiyyya
woshiyyya / dtensor_2d_llama.py
Created September 5, 2024 01:05
TP + FSDP with PyTorch DTensor
import sys
import os
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
from log_utils import rank_log, get_logger, verify_min_gpu_count
# ---- GPU check ------------