Skip to content

Instantly share code, notes, and snippets.

@qpwo
Last active January 14, 2025 22:14
Show Gist options
  • Save qpwo/f2e5d2a99e775f7ac54e05c6254191ae to your computer and use it in GitHub Desktop.
Save qpwo/f2e5d2a99e775f7ac54e05c6254191ae to your computer and use it in GitHub Desktop.
# modal run issue.py
import os
import subprocess
import time
import modal
import modal.experimental
import modal.gpu
import modal.volume
# Container image used by the app, built one layer at a time:
# Debian slim with Python 3.12 as the base.
image = modal.Image.debian_slim(python_version="3.12")
# Python packages installed with pip (numpy is held below 2.x).
image = image.pip_install("torch", "numpy<2")
# System tools; `curl` and `ip` (from iproute2) are invoked later by the script.
image = image.apt_install("curl", "git", "iproute2")
# Fetch the sglang sources into the image.
image = image.run_commands("git clone https://github.com/sgl-project/sglang.git")
# Editable install of sglang with all extras, using the flashinfer wheel index
# as an additional place to look for packages.
image = image.run_commands(
    "cd sglang && pip install -e 'python[all]' --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/"
)
# The app is named after this script's file name.
_app_name = os.path.basename(__file__)
# Named secret that must provide the HF_TOKEN key.
_hf_secret = modal.Secret.from_name("huggingface-secret", required_keys=["HF_TOKEN"])
app = modal.App(_app_name, image=image, secrets=[_hf_secret])
# Cluster topology.
n_nodes: int = 2  # number of containers requested for the cluster
n_proc_per_node: int = 2  # GPU count requested per container
total_gpus: int = n_proc_per_node * n_nodes  # GPUs across the whole cluster
def shellbg(cmd: str) -> subprocess.Popen:
    """Start *cmd* in a background shell and return immediately.

    The command line is echoed to stdout (flushed right away) and then passed
    to ``subprocess.Popen`` with ``shell=True``, so shell syntax such as
    ``export``, ``;`` and backslash line continuations is honoured. The child
    inherits this process's stdout/stderr.

    Args:
        cmd: Shell command line to execute. It is interpreted by the shell,
            so only pass trusted text.

    Returns:
        The ``subprocess.Popen`` handle of the started shell. Fire-and-forget
        callers may ignore it; others can ``wait()``, ``poll()`` or
        ``terminate()`` the process.
    """
    print(f"Running:\n\t$ {cmd}", flush=True)
    # Hand the handle back instead of dropping it, so the caller can observe
    # the exit status or stop the process if it needs to.
    return subprocess.Popen(cmd, shell=True)
def mysleep(seconds):
    """Block for roughly *seconds* seconds using many very short sleeps.

    Author's original rationale for not using one long sleep:
    "long sleeps block stdout network sync?" -- so the wait is sliced into
    ticks of at most one millisecond.

    The wait is bounded by a monotonic deadline rather than by counting
    ticks, so per-tick overhead does not stretch the total duration, and
    fractional durations are accepted.

    Args:
        seconds: Duration to wait, as an int or float. Zero or a negative
            value returns immediately.
    """
    deadline = time.monotonic() + seconds
    while True:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return
        # Never sleep past the deadline; otherwise keep the 1 ms tick.
        time.sleep(min(0.001, remaining))
@app.function(
    gpu=modal.gpu.H100(count=n_proc_per_node),
    timeout=24 * 3600,
    cpu=64,
    memory=344_064,  # in megabytes
)  # pyright: ignore
@modal.experimental.clustered(size=n_nodes)
def fire_it_up():
    """Launch an sglang server on this cluster node and poll it forever.

    Steps, in order:
      1. Read the cluster info (rank and container IPs) and print it together
         with the MODAL_TASK_ID environment variable.
      2. Dump the network interfaces with ``ip address show``.
      3. On rank 0 only, run ``sglang.check_env``.
      4. Start ``sglang.launch_server`` in the background, pointing the
         distributed init address at the first container IP.
      5. Every 10 seconds query the local ``/v1/models`` endpoint, and on
         every third iteration also print GPU power and memory statistics.

    All shell commands are started in the background via ``shellbg``; their
    exit status is not checked.

    Raises:
        KeyError: if MODAL_TASK_ID is not set in the environment.
    """
    cluster_info = modal.experimental.get_cluster_info()
    node_rank = cluster_info.rank
    num_nodes = len(cluster_info.container_ips)
    main_addr = cluster_info.container_ips[0]
    my_addr = cluster_info.container_ips[node_rank]
    task_id = os.environ["MODAL_TASK_ID"]

    # NOTE: the local names above appear verbatim in the output through the
    # f-string "{name=}" form, so renaming them changes what gets printed.
    print(f"{cluster_info=}")
    print(f"hello from {node_rank=}, {main_addr=}, {my_addr=}, {num_nodes=}, {task_id=}")
    shellbg("ip address show")

    # Tensor-parallel degree: one shard per GPU across every node.
    tp_size = num_nodes * n_proc_per_node

    print("\n\n\n")
    mysleep(5)
    if node_rank == 0:
        shellbg("python3 -m sglang.check_env")
    # NOTE(review): this wait is assumed to apply to every node, not only to
    # rank 0 -- confirm against the original script's indentation.
    mysleep(15)
    print("\n\n\n")

    # The command text is kept flush-left so the string handed to the shell
    # stays exactly as written.
    launch_cmd = f"""
export GLOO_SOCKET_IFNAME='eth1' MASTER_ADDR={main_addr} MASTER_PORT=1234;
echo GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME and MASTER_ADDR=$MASTER_ADDR and MASTER_PORT=$MASTER_PORT;
python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B \
--tp-size {tp_size} --nnodes {num_nodes} --node-rank {node_rank} \
--port 2242 --dist-init-addr [{main_addr}]:1234 \
--disable-cuda-graph --trust-remote-code
"""
    shellbg(launch_cmd)

    models_probe = "curl http://localhost:2242/v1/models"
    gpu_stats = "nvidia-smi --query-gpu=power.draw,memory.used,memory.free --format=csv"
    for poll_count in range(10_000_000):
        shellbg(models_probe)
        # GPU statistics on iterations 1, 4, 7, ...
        if poll_count % 3 == 1:
            shellbg(gpu_stats)
        mysleep(10)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment