Skip to content

Instantly share code, notes, and snippets.

View kabouzeid's full-sized avatar

Karim Abou Zeid kabouzeid

View GitHub Profile
# Copyright (c) Karim Abou Zeid
from typing import Any
import lightning.pytorch as pl
import torch
from lightning.pytorch.callbacks.callback import Callback
from lightning.pytorch.utilities.exceptions import MisconfigurationException
from lightning.pytorch.utilities.types import STEP_OUTPUT
from typing_extensions import override
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.13"
# dependencies = [
# "rich",
# ]
# ///
import argparse
import json
#!/usr/bin/env bash
# Exit on the first failing command (-e), treat unset variables as errors (-u),
# and make a pipeline fail if any stage of it fails (pipefail).
set -euo pipefail
# Launch torchrun using values taken from the SLURM environment:
#   --nproc-per-node  <- $SLURM_GPUS_ON_NODE
#   --nnodes          <- $SLURM_JOB_NUM_NODES
#   --node_rank       <- $SLURM_NODEID
#   --master_addr     <- first hostname printed by `scontrol show hostnames`
#                        for $SLURM_JOB_NODELIST
#   --master_port     <- 10000 + the last four characters of $SLURM_JOBID
# Every continued line must have whitespace before its trailing backslash;
# otherwise the shell glues the next option onto the current word.
# The command continues on the lines that follow this section.
torchrun \
--nproc-per-node=$SLURM_GPUS_ON_NODE \
--nnodes=$SLURM_JOB_NUM_NODES \
--node_rank=$SLURM_NODEID \
--master_addr=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n1) \
--master_port=$(expr 10000 + $(echo -n $SLURM_JOBID | tail -c 4)) \