Skip to content

Instantly share code, notes, and snippets.

@rajagurunath
Created October 23, 2025 03:45
Show Gist options
  • Save rajagurunath/64744bd359105c1191561f98c469fd9f to your computer and use it in GitHub Desktop.
#!/bin/bash
# nanochat Training Script for IONet CaaS (Gist Version)
# This script is designed to be hosted on GitHub Gist for easy updates
# without rebuilding Docker images
#
# Required env (optional unless noted): R2_ENDPOINT_URL, R2_ACCESS_KEY_ID,
# R2_SECRET_ACCESS_KEY, R2_BUCKET_NAME, WANDB_API_KEY, WANDB_RUN, KEEP_ALIVE
set -eo pipefail # Exit on error; also fail pipelines when any stage fails (e.g. nvidia-smi | wc -l)
echo "=================================="
echo "nanochat IONet Training (Gist)"
echo "=================================="
echo "Start time: $(date)"
# Ensure PATH includes uv and cargo
export PATH="/root/.local/bin:/root/.cargo/bin:$PATH"
# R2 configuration (empty values mean "R2 disabled"; checked before any upload)
R2_ENDPOINT_URL=${R2_ENDPOINT_URL:-""}
R2_ACCESS_KEY_ID=${R2_ACCESS_KEY_ID:-""}
R2_SECRET_ACCESS_KEY=${R2_SECRET_ACCESS_KEY:-""}
R2_BUCKET_NAME=${R2_BUCKET_NAME:-"llm-exploration"}
echo ""
echo "=================================="
echo "Step 1: Pre-flight Checks"
echo "=================================="
# Check GPU availability; GPU_COUNT is used later by the torchrun invocations.
echo "Checking GPU availability..."
if command -v nvidia-smi &> /dev/null; then
  nvidia-smi --query-gpu=name,memory.total --format=csv,noheader
  GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
  echo "✓ Found $GPU_COUNT GPU(s)"
else
  echo "❌ nvidia-smi not found!"
  exit 1
fi
# Verify tools are installed (report-only; missing tools fail later at point of use)
echo "Verifying installed tools..."
echo " uv: $(uv --version 2>&1 || echo 'NOT FOUND')"
echo " rust: $(rustc --version 2>&1 || echo 'NOT FOUND')"
echo " aws: $(aws --version 2>&1 || echo 'NOT FOUND')"
# Configure R2 if credentials provided
if [ -n "$R2_ENDPOINT_URL" ] && [ -n "$R2_ACCESS_KEY_ID" ]; then
  echo "Configuring R2 access..."
  mkdir -p ~/.aws
  cat > ~/.aws/credentials <<EOF
[default]
aws_access_key_id = $R2_ACCESS_KEY_ID
aws_secret_access_key = $R2_SECRET_ACCESS_KEY
EOF
  cat > ~/.aws/config <<EOF
[default]
region = auto
output = json
EOF
  # Test R2 connectivity with a throwaway file (mktemp: unpredictable name,
  # and removed on BOTH branches — the original leaked it on failure).
  echo "Testing R2 connectivity..."
  TEST_FILE=$(mktemp)
  echo "IONet nanochat - R2 test at $(date)" > "$TEST_FILE"
  if aws s3 cp "$TEST_FILE" "s3://$R2_BUCKET_NAME/" --endpoint-url "$R2_ENDPOINT_URL" 2>/dev/null; then
    echo "✓ R2 upload test successful"
    # Best-effort cleanup of the remote test object; must not abort under set -e
    aws s3 rm "s3://$R2_BUCKET_NAME/$(basename "$TEST_FILE")" --endpoint-url "$R2_ENDPOINT_URL" 2>/dev/null || true
  else
    echo "⚠️ R2 upload test failed - continuing anyway"
  fi
  rm -f -- "$TEST_FILE"
else
  echo "⚠️ R2 not configured - uploads will be skipped"
fi
echo ""
echo "=================================="
echo "Step 2: Setup nanochat Environment"
echo "=================================="
cd /workspace
# Clone nanochat (idempotent: skip if a previous run already cloned it)
if [ ! -d "nanochat" ]; then
  echo "Cloning nanochat repository..."
  git clone https://github.com/karpathy/nanochat.git
fi
cd nanochat
# Setup environment variables
export OMP_NUM_THREADS=1
export NANOCHAT_BASE_DIR="${NANOCHAT_BASE_DIR:-$HOME/.cache/nanochat}"
mkdir -p "$NANOCHAT_BASE_DIR"
# Create venv and install dependencies
echo "Setting up Python virtual environment with uv..."
if [ ! -d ".venv" ]; then
  uv venv
fi
echo "Installing Python dependencies (this may take 5-10 minutes)..."
uv sync --extra gpu
# Activate venv
source .venv/bin/activate
# Verify Python environment
echo "Python environment:"
python --version
pip --version
# Test wandb if configured. Read the key from the environment inside Python
# rather than interpolating it into the command line, so the secret is not
# visible in `ps` output and cannot break the -c string quoting.
if [ -n "$WANDB_API_KEY" ]; then
  echo "Configuring wandb..."
  python3 -c "import os, wandb; wandb.login(key=os.environ['WANDB_API_KEY'])" 2>/dev/null && echo "✓ wandb configured" || echo "⚠️ wandb test failed"
fi
# Default the wandb run name: unset/empty becomes "dummy" (wandb disabled),
# anything else is kept as-is — same outcome as the original if/else ladder.
export WANDB_RUN="${WANDB_RUN:-dummy}"
echo ""
echo "=================================="
echo "Step 3: Training Pipeline"
echo "=================================="
echo "This will take approximately 4 hours..."
# Reset report
echo "Initializing training report..."
python -m nanochat.report reset
# Build tokenizer (Rust extension built into the active venv)
echo "Building Rust tokenizer..."
uv run maturin develop --release --manifest-path rustbpe/Cargo.toml
# Download initial dataset (enough shards to start tokenizer training)
echo "Downloading initial dataset (~800MB)..."
python -m nanochat.dataset -n 8
# Download remaining data in background; reaped via `wait` below so a failed
# download aborts the run before pretraining starts on incomplete data.
echo "Starting background dataset download (~24GB)..."
python -m nanochat.dataset -n 240 &
DATASET_DOWNLOAD_PID=$!
# Train tokenizer
echo "Training tokenizer on 2B characters..."
python -m scripts.tok_train --max_chars=2000000000
python -m scripts.tok_eval
# Download eval bundle (idempotent: skipped when already unpacked)
echo "Downloading evaluation bundle..."
EVAL_BUNDLE_URL="https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip"
if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then
  curl -L -o eval_bundle.zip "$EVAL_BUNDLE_URL"
  unzip -q eval_bundle.zip
  rm eval_bundle.zip
  mv eval_bundle "$NANOCHAT_BASE_DIR"
fi
# Wait for dataset download; `wait` propagates the job's exit status, and
# set -e turns a failed download into a hard stop here.
echo "Waiting for dataset download to complete..."
wait "$DATASET_DOWNLOAD_PID"
# Pretraining: base model on the FineWeb-style shards downloaded above.
echo ""
echo "Starting PRETRAINING (d20 model, 561M parameters)..."
echo "This is the longest phase (~2-3 hours)"
torchrun --standalone --nproc_per_node="$GPU_COUNT" -m scripts.base_train -- --depth=20 --run="$WANDB_RUN"
torchrun --standalone --nproc_per_node="$GPU_COUNT" -m scripts.base_loss
torchrun --standalone --nproc_per_node="$GPU_COUNT" -m scripts.base_eval
# Download identity conversations (used by midtraining/SFT data mix)
echo "Downloading identity conversations..."
curl -L -o "$NANOCHAT_BASE_DIR/identity_conversations.jsonl" https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl
# Midtraining
echo ""
echo "Starting MIDTRAINING..."
torchrun --standalone --nproc_per_node="$GPU_COUNT" -m scripts.mid_train -- --run="$WANDB_RUN"
torchrun --standalone --nproc_per_node="$GPU_COUNT" -m scripts.chat_eval -- -i mid
# Supervised finetuning
echo ""
echo "Starting SUPERVISED FINETUNING..."
torchrun --standalone --nproc_per_node="$GPU_COUNT" -m scripts.chat_sft -- --run="$WANDB_RUN"
torchrun --standalone --nproc_per_node="$GPU_COUNT" -m scripts.chat_eval -- -i sft
# Generate final report
echo "Generating final training report..."
python -m nanochat.report generate
echo ""
echo "=================================="
echo "Step 4: Upload to R2"
echo "=================================="
if [ -n "$R2_ENDPOINT_URL" ] && [ -n "$R2_ACCESS_KEY_ID" ]; then
  echo "Uploading model artifacts to R2..."
  # Create timestamp for this run; all artifacts go under one prefix
  TIMESTAMP=$(date +%Y%m%d-%H%M%S)
  MODEL_PREFIX="nanochat-d20-$TIMESTAMP"
  # Upload checkpoints (each stage's checkpoint is optional — skip if absent)
  for checkpoint in base.pt mid.pt sft.pt; do
    if [ -f "$NANOCHAT_BASE_DIR/checkpoints/$checkpoint" ]; then
      echo "Uploading $checkpoint..."
      aws s3 cp "$NANOCHAT_BASE_DIR/checkpoints/$checkpoint" \
        "s3://$R2_BUCKET_NAME/$MODEL_PREFIX/checkpoints/$checkpoint" \
        --endpoint-url "$R2_ENDPOINT_URL"
    fi
  done
  # Upload tokenizer
  if [ -f "$NANOCHAT_BASE_DIR/tokenizer.model" ]; then
    echo "Uploading tokenizer.model..."
    aws s3 cp "$NANOCHAT_BASE_DIR/tokenizer.model" \
      "s3://$R2_BUCKET_NAME/$MODEL_PREFIX/tokenizer.model" \
      --endpoint-url "$R2_ENDPOINT_URL"
  fi
  # Upload report
  if [ -f "report.md" ]; then
    echo "Uploading report.md..."
    aws s3 cp report.md \
      "s3://$R2_BUCKET_NAME/$MODEL_PREFIX/report.md" \
      --endpoint-url "$R2_ENDPOINT_URL"
  fi
  # Create and upload run metadata. mktemp avoids a predictable /tmp name
  # (the original used /tmp/metadata.json and never removed it).
  METADATA_FILE=$(mktemp)
  cat > "$METADATA_FILE" <<EOF
{
"model_name": "nanochat-d20",
"timestamp": "$TIMESTAMP",
"gpu_count": $GPU_COUNT,
"wandb_run": "${WANDB_RUN:-dummy}",
"model_params": "561M",
"training_tokens": "11.2B",
"model_depth": 20,
"upload_date": "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
}
EOF
  aws s3 cp "$METADATA_FILE" \
    "s3://$R2_BUCKET_NAME/$MODEL_PREFIX/metadata.json" \
    --endpoint-url "$R2_ENDPOINT_URL"
  rm -f -- "$METADATA_FILE"
  echo ""
  echo "✓ Model uploaded to: s3://$R2_BUCKET_NAME/$MODEL_PREFIX/"
  aws s3 ls "s3://$R2_BUCKET_NAME/$MODEL_PREFIX/" --recursive --endpoint-url "$R2_ENDPOINT_URL" || echo "Could not list files"
else
  echo "⚠️ R2 upload skipped - credentials not configured"
fi
# Final banner and, optionally, an interactive serving phase.
echo ""
echo "=================================="
echo "Training Complete!"
echo "=================================="
echo "End time: $(date)"
# Show the tail of the generated report so the run summary lands in the logs.
if [[ -f "report.md" ]]; then
  echo ""
  echo "Training Report Summary:"
  echo "------------------------"
  tail -n 30 report.md
fi
# When KEEP_ALIVE=true the container stays up serving the chat UI;
# otherwise it exits so the CaaS job can complete.
if [[ "$KEEP_ALIVE" == "true" ]]; then
  echo ""
  echo "KEEP_ALIVE=true - Starting model server on port 8000..."
  source .venv/bin/activate
  python -m scripts.chat_web --host 0.0.0.0 --port 8000
else
  echo ""
  echo "Training complete. Container will exit."
  echo "Set KEEP_ALIVE=true to start the model server."
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment