qpwo · January 14, 2025 22:14
diff --git a/issue.py b/issue.py
 # modal run issue.py
 import os
 import subprocess
 import time

 import modal
 import modal.experimental
 import modal.gpu
 import modal.volume

 # Container image used by the app: Debian slim with Python 3.12, torch and
 # numpy<2 from pip, and curl / git / iproute2 from apt (the function below
 # shells out to `curl` and `ip address show`). sglang is cloned from GitHub
 # and installed in editable mode with the `all` extra.
 # NOTE(review): torch is not pinned here, while the --find-links URL targets
 # the cu124 / torch2.4 flashinfer wheels -- confirm the versions line up.
 image = (
    modal.Image.debian_slim(python_version="3.12")
    .pip_install("torch", "numpy<2")
    .apt_install("curl", "git", "iproute2")
    .run_commands("git clone https://github.com/sgl-project/sglang.git")
    .run_commands(
        "cd sglang && pip install -e 'python[all]' --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/"
    )
 )

 # Modal app named after this script's file name. `image` is passed as the
 # app-level image, and the "huggingface-secret" secret is required to
 # contain an HF_TOKEN key.
 # NOTE(review): presumably HF_TOKEN is what lets the server download the
 # meta-llama model -- confirm.
 app = modal.App(
    os.path.basename(__file__),
    image=image,
    secrets=[modal.Secret.from_name("huggingface-secret", required_keys=["HF_TOKEN"])],
 )

 # Cluster shape: n_nodes is the cluster size passed to `clustered(...)`, and
 # n_proc_per_node is the GPU count requested for each container.
 n_nodes = 2
 n_proc_per_node = 2
 # Not referenced anywhere else in the visible code; fire_it_up computes its
 # own tensor-parallel size from the live cluster info.
 total_gpus = n_nodes * n_proc_per_node


 def shellbg(cmd: str):
    """Echo ``cmd`` to stdout, then launch it through the shell in the background.

    The child process is not waited on and its handle is discarded, so the
    call returns ``None`` as soon as the process has been started.
    """
    banner = "Running:\n\t$ {}".format(cmd)
    # Flush right away so the banner is emitted before the child starts writing.
    print(banner, flush=True)
    subprocess.Popen(args=cmd, shell=True)


 def mysleep(seconds):
    """Block for about ``seconds`` seconds, napping in slices of at most 1 ms.

    The short slices are kept on purpose: the original note on this helper
    asks whether long sleeps block stdout network sync, so the wait is never
    handed to a single long ``time.sleep`` call.

    The wait is measured against ``time.monotonic()`` rather than by counting
    naps, so the overhead of each ``time.sleep`` call does not stretch the
    total, and fractional values such as ``0.5`` are accepted. Zero or
    negative values return immediately.
    """
    deadline = time.monotonic() + seconds
    while True:
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            return
        # Never overshoot the deadline on the final slice.
        time.sleep(min(0.001, remaining))


 # Each container requests n_proc_per_node H100 GPUs, 64 CPUs and 344_064 MB
 # of memory; the function is clustered across n_nodes containers.
 # NOTE(review): timeout=3600 * 24 is presumably one day in seconds -- confirm
 # against the Modal docs.
 @app.function(
    gpu=modal.gpu.H100(count=n_proc_per_node),
    timeout=3600 * 24,
    cpu=64,
    memory=344_064,  # in megabytes
 )  # pyright: ignore
 @modal.experimental.clustered(size=n_nodes)
 def fire_it_up():
    """Start an sglang server on this cluster node and poll it indefinitely.

    Steps, in order:
      1. Read the cluster info and print this node's rank and addresses.
      2. Print the network interfaces (`ip address show`) and, on rank 0
         only, run `python3 -m sglang.check_env`.
      3. Launch `sglang.launch_server` in the background with a tensor-parallel
         size of (number of nodes * n_proc_per_node).
      4. Loop: curl the local `/v1/models` endpoint, and on every third
         iteration also print GPU power/memory via `nvidia-smi`.

    All shell commands are started with `shellbg`, so none of them is waited
    on and their exit status is never inspected.
    """
    cluster_info = modal.experimental.get_cluster_info()
    node_rank = cluster_info.rank
    # The first container's IP is used as the rendezvous address for all nodes.
    main_addr = cluster_info.container_ips[0]
    my_addr = cluster_info.container_ips[node_rank]
    num_nodes = len(cluster_info.container_ips)
    task_id = os.environ["MODAL_TASK_ID"]  # raises KeyError if unset
    print(f"{cluster_info=}")
    print(f"hello from {node_rank=}, {main_addr=}, {my_addr=}, {num_nodes=}, {task_id=}")

    # Dump the interfaces so the GLOO_SOCKET_IFNAME choice below can be checked
    # against the log output.
    shellbg("ip address show")

    # Tensor-parallel size spans every GPU in the cluster.
    tp = num_nodes * n_proc_per_node

    # The sleeps give the background diagnostics time to print before the
    # server launch; the blank lines just separate sections in the log.
    print("\n\n\n")
    mysleep(5)
    if node_rank == 0:
        shellbg("python3 -m sglang.check_env")
    mysleep(15)
    print("\n\n\n")

    # The trailing backslashes sit in a non-raw string, so Python removes each
    # backslash-newline pair and the python3 command reaches the shell as a
    # single line.
    # NOTE(review): 'eth1' is presumably the inter-node interface and the
    # [..] around main_addr presumably marks an IPv6 literal -- confirm.
    shellbg(
        f"""
        export GLOO_SOCKET_IFNAME='eth1' MASTER_ADDR={main_addr} MASTER_PORT=1234;
        echo GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME and MASTER_ADDR=$MASTER_ADDR and MASTER_PORT=$MASTER_PORT;
        python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B \
            --tp-size {tp}  --nnodes {num_nodes} --node-rank {node_rank} \
            --port 2242 --dist-init-addr [{main_addr}]:1234 \
            --disable-cuda-graph --trust-remote-code
    """
    )
    # Effectively an endless health-check loop; the curl result is only
    # printed by the child process, never inspected here.
    for i in range(10000000):
        shellbg("curl http://localhost:2242/v1/models")
        if i % 3 == 1:
            shellbg("nvidia-smi --query-gpu=power.draw,memory.used,memory.free --format=csv")
        mysleep(10)
	# modal run issue.py
	import os
	import subprocess
	import time

	import modal
	import modal.experimental
	import modal.gpu
	import modal.volume

	# Container image used by the app: Debian slim with Python 3.12, torch and
	# numpy<2 from pip, and curl / git / iproute2 from apt (the function below
	# shells out to `curl` and `ip address show`). sglang is cloned from GitHub
	# and installed in editable mode with the `all` extra.
	# NOTE(review): torch is not pinned here, while the --find-links URL targets
	# the cu124 / torch2.4 flashinfer wheels -- confirm the versions line up.
	image = (
	modal.Image.debian_slim(python_version="3.12")
	.pip_install("torch", "numpy<2")
	.apt_install("curl", "git", "iproute2")
	.run_commands("git clone https://github.com/sgl-project/sglang.git")
	.run_commands(
	"cd sglang && pip install -e 'python[all]' --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/"
	)
	)

	# Modal app named after this script's file name. `image` is passed as the
	# app-level image, and the "huggingface-secret" secret is required to
	# contain an HF_TOKEN key.
	# NOTE(review): presumably HF_TOKEN is what lets the server download the
	# meta-llama model -- confirm.
	app = modal.App(
	os.path.basename(__file__),
	image=image,
	secrets=[modal.Secret.from_name("huggingface-secret", required_keys=["HF_TOKEN"])],
	)

	# Cluster shape: n_nodes is the cluster size passed to `clustered(...)`, and
	# n_proc_per_node is the GPU count requested for each container.
	n_nodes = 2
	n_proc_per_node = 2
	# Not referenced anywhere else in the visible code; fire_it_up computes its
	# own tensor-parallel size from the live cluster info.
	total_gpus = n_nodes * n_proc_per_node


	def shellbg(cmd: str):
	    """Echo ``cmd`` to stdout, then launch it through the shell in the background.

	    The child process is not waited on and its handle is discarded, so the
	    call returns ``None`` as soon as the process has been started.
	    """
	    # Flush right away so the banner is emitted before the child starts writing.
	    print(f"Running:\n\t$ {cmd}", flush=True)
	    subprocess.Popen(cmd, shell=True)


	def mysleep(seconds):
	    """Block for about ``seconds`` seconds, napping in slices of at most 1 ms.

	    The short slices are kept on purpose: the original note on this helper
	    asks whether long sleeps block stdout network sync, so the wait is never
	    handed to a single long ``time.sleep`` call.

	    The wait is measured against ``time.monotonic()`` rather than by counting
	    naps, so the overhead of each ``time.sleep`` call does not stretch the
	    total, and fractional values such as ``0.5`` are accepted. Zero or
	    negative values return immediately.
	    """
	    deadline = time.monotonic() + seconds
	    while True:
	        remaining = deadline - time.monotonic()
	        if remaining <= 0:
	            return
	        # Never overshoot the deadline on the final slice.
	        time.sleep(min(0.001, remaining))


	# Each container requests n_proc_per_node H100 GPUs, 64 CPUs and 344_064 MB
	# of memory; the function is clustered across n_nodes containers.
	# NOTE(review): timeout=3600 * 24 is presumably one day in seconds -- confirm
	# against the Modal docs.
	@app.function(
	    gpu=modal.gpu.H100(count=n_proc_per_node),
	    timeout=3600 * 24,
	    cpu=64,
	    memory=344_064,  # in megabytes
	)  # pyright: ignore
	@modal.experimental.clustered(size=n_nodes)
	def fire_it_up():
	    """Start an sglang server on this cluster node and poll it indefinitely.

	    Steps, in order:
	      1. Read the cluster info and print this node's rank and addresses.
	      2. Print the network interfaces (`ip address show`) and, on rank 0
	         only, run `python3 -m sglang.check_env`.
	      3. Launch `sglang.launch_server` in the background with a
	         tensor-parallel size of (number of nodes * n_proc_per_node).
	      4. Loop: curl the local `/v1/models` endpoint, and on every third
	         iteration also print GPU power/memory via `nvidia-smi`.

	    All shell commands are started with `shellbg`, so none of them is
	    waited on and their exit status is never inspected.
	    """
	    cluster_info = modal.experimental.get_cluster_info()
	    node_rank = cluster_info.rank
	    # The first container's IP is used as the rendezvous address for all nodes.
	    main_addr = cluster_info.container_ips[0]
	    my_addr = cluster_info.container_ips[node_rank]
	    num_nodes = len(cluster_info.container_ips)
	    task_id = os.environ["MODAL_TASK_ID"]  # raises KeyError if unset
	    print(f"{cluster_info=}")
	    print(f"hello from {node_rank=}, {main_addr=}, {my_addr=}, {num_nodes=}, {task_id=}")

	    # Dump the interfaces so the GLOO_SOCKET_IFNAME choice below can be
	    # checked against the log output.
	    shellbg("ip address show")

	    # Tensor-parallel size spans every GPU in the cluster.
	    tp = num_nodes * n_proc_per_node

	    # The sleeps give the background diagnostics time to print before the
	    # server launch; the blank lines just separate sections in the log.
	    print("\n\n\n")
	    mysleep(5)
	    if node_rank == 0:
	        shellbg("python3 -m sglang.check_env")
	    mysleep(15)
	    print("\n\n\n")

	    # The trailing backslashes sit in a non-raw string, so Python removes
	    # each backslash-newline pair and the python3 command reaches the shell
	    # as a single line. The string's own lines are left exactly as they
	    # were so the command text is unchanged.
	    # NOTE(review): 'eth1' is presumably the inter-node interface and the
	    # [..] around main_addr presumably marks an IPv6 literal -- confirm.
	    shellbg(
	        f"""
	export GLOO_SOCKET_IFNAME='eth1' MASTER_ADDR={main_addr} MASTER_PORT=1234;
	echo GLOO_SOCKET_IFNAME=$GLOO_SOCKET_IFNAME and MASTER_ADDR=$MASTER_ADDR and MASTER_PORT=$MASTER_PORT;
	python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B \
	--tp-size {tp} --nnodes {num_nodes} --node-rank {node_rank} \
	--port 2242 --dist-init-addr [{main_addr}]:1234 \
	--disable-cuda-graph --trust-remote-code
	"""
	    )
	    # Effectively an endless health-check loop; the curl result is only
	    # printed by the child process, never inspected here.
	    for i in range(10000000):
	        shellbg("curl http://localhost:2242/v1/models")
	        if i % 3 == 1:
	            shellbg("nvidia-smi --query-gpu=power.draw,memory.used,memory.free --format=csv")
	        mysleep(10)