This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| task. | |
| class_name: RayTrainWorker | |
| actor_id: 9e6790a209b7c509e64301f305000000 | |
| pid: 35979 | |
| namespace: f205d617-4ee1-4fae-a76a-c3f2382b7527 | |
| ip: 172.24.101.245 | |
| The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker exits unexpectedly. Worker exits with an exit code None. Traceback (most recent call last): | |
| File "python/ray/_raylet.pyx", line 1883, in ray._raylet.execute_task | |
| File "python/ray/_raylet.pyx", line 1984, in ray._raylet.execute_task | |
| File "python/ray/_raylet.pyx", line 1889, in ray._raylet.execute_task |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import ray | |
| from ray.train.torch import TorchTrainer | |
| from ray.train import RunConfig, ScalingConfig | |
| import time | |
| def train_func(): | |
| print("Training Starts") | |
| time.sleep(100) | |
| datasets = { |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from ray.dag.input_node import InputNode | |
| from ray.dag.output_node import MultiOutputNode | |
| import ray | |
| @ray.remote | |
| class Worker: | |
| def __init__(self, rank): | |
| self.rank = rank | |
| self.logs = [] |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from __future__ import print_function | |
| import argparse | |
| import torch | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| import torch.optim as optim | |
| from torchvision import datasets, transforms | |
| from torch.optim.lr_scheduler import StepLR | |
| from ray.train.torch import TorchTrainer | |
| from ray.train import ScalingConfig, RunConfig |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import ray | |
| import torch | |
| from ray.experimental.channel.torch_tensor_type import TorchTensorType | |
| # shape = (4, 8192) | |
| shape = (4, 24576) | |
| @ray.remote(num_gpus=1) | |
| class MyActor: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import ray | |
| from ray.air.util.torch_dist import _init_torch_distributed | |
| from ray.air._internal.util import find_free_port | |
| from ray.dag.input_node import InputNode | |
| from ray.dag.output_node import MultiOutputNode | |
| from ray.experimental.channel.torch_tensor_type import TorchTensorType | |
| import os | |
| import torch | |
| import torch.nn as nn | |
| from torch.nn import functional as F |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import ray | |
| import torch | |
| from ray.dag.input_node import InputNode | |
| from ray.dag.output_node import MultiOutputNode | |
| from ray.experimental.channel.torch_tensor_type import TorchTensorType | |
| @ray.remote(num_gpus=1) | |
| class MyActor: | |
| def __init__(self): | |
| pass |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import ray | |
| from ray.air.util.torch_dist import _init_torch_distributed | |
| from ray.air._internal.util import find_free_port | |
| from ray.dag.input_node import InputNode | |
| from ray.dag.output_node import MultiOutputNode | |
| from ray.experimental.channel.torch_tensor_type import TorchTensorType | |
| import os | |
| import torch | |
| import torch.nn as nn | |
| from torch.nn import functional as F |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import sys | |
| import os | |
| import torch | |
| import torch.distributed as dist | |
| import torch.nn as nn | |
| import torch.nn.functional as F | |
| from log_utils import rank_log, get_logger, verify_min_gpu_count | |
| # ---- GPU check ------------ |
