Skip to content

Instantly share code, notes, and snippets.

View woshiyyya's full-sized avatar
zzz

Yunxuan Xiao woshiyyya

zzz
View GitHub Profile
accelerate==0.19.0
adal==1.2.7
aiofiles==22.1.0
aiohttp==3.8.5
aiohttp-cors==0.7.0
aiorwlock==1.3.0
aiosignal==1.3.1
aiosqlite==0.19.0
alabaster==0.7.13
anyio==3.7.1
@woshiyyya
woshiyyya / torch_ddp.py
Last active October 23, 2023 21:16
Torch_DDP_Example
import os
import tempfile
import torch
from torch import nn
from torch.nn.parallel import DistributedDataParallel
import ray
from ray.train import Checkpoint, CheckpointConfig, RunConfig, ScalingConfig
from ray.train.torch import TorchTrainer
import torch
import torch.nn as nn
import torch.optim as optim
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_backend # noqa: F401
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer, prepare_model
from ray.train.torch.xla import TorchXLAConfig
# This script is tested with the PR(https://github.com/ray-project/ray/pull/39130) from AWS team.
# It configures the required environment variables for Neuron XLA.
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_backend # noqa: F401
"""
Cluster: 16 x A10G GPUs
Command: python precompute_latents.py --subset_size 50 --mode debug
"""
import argparse
import io
import pandas as pd
import pyarrow.dataset as pds
import os
import os
import time
import torch
from torch import nn
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer
from ray.train.torch.xla import TorchXLAConfig
from torchvision.datasets import mnist
import ray
import ray.train
import numpy as np
from ray.train.torch import TorchTrainer
from ray.train import ScalingConfig
from dataclasses import dataclass
@dataclass
class DummyDataclass:
#!/usr/bin/env python3
# pylint: skip-file
import os
import torch
from torch import distributed as dist
from torchvision.models import resnet18
from torchvision.datasets import FashionMNIST
from torchvision.transforms import ToTensor, Normalize, Compose
from torch.utils.data import DataLoader, DistributedSampler
import ray
ray.init()
node_resources = {}
for node in ray.nodes():
print(node, "\n")
node_resources[node["NodeID"]] = node["Resources"]
import ray
@woshiyyya
woshiyyya / run.py
Created April 9, 2024 18:14
Test Async Actor DDP
from collections import defaultdict
from ray.train._internal.utils import get_address_and_port
import ray
import os
import torch
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP
import time