Skip to content

Instantly share code, notes, and snippets.

@rajagurunath
Created October 23, 2025 03:45
Show Gist options
  • Save rajagurunath/64744bd359105c1191561f98c469fd9f to your computer and use it in GitHub Desktop.
#!/bin/bash
# nanochat Training Script for IONet CaaS (Gist Version)
# This script is designed to be hosted on GitHub Gist for easy updates
# without rebuilding Docker images
#
# Required env (optional unless noted): R2_ENDPOINT_URL, R2_ACCESS_KEY_ID,
# R2_SECRET_ACCESS_KEY, R2_BUCKET_NAME, WANDB_API_KEY, WANDB_RUN, KEEP_ALIVE
set -eo pipefail # Exit on error; also fail pipelines when any stage fails (e.g. nvidia-smi | wc -l)
echo "=================================="
echo "nanochat IONet Training (Gist)"
echo "=================================="
echo "Start time: $(date)"
# Ensure PATH includes uv and cargo
export PATH="/root/.local/bin:/root/.cargo/bin:$PATH"
# R2 configuration (empty values mean "R2 disabled"; checked before any upload)
R2_ENDPOINT_URL=${R2_ENDPOINT_URL:-""}
R2_ACCESS_KEY_ID=${R2_ACCESS_KEY_ID:-""}
R2_SECRET_ACCESS_KEY=${R2_SECRET_ACCESS_KEY:-""}
R2_BUCKET_NAME=${R2_BUCKET_NAME:-"llm-exploration"}
echo ""
echo "=================================="
echo "Step 1: Pre-flight Checks"
echo "=================================="
# Check GPU availability; GPU_COUNT is used later by the torchrun invocations.
echo "Checking GPU availability..."
if command -v nvidia-smi &> /dev/null; then
  nvidia-smi --query-gpu=name,memory.total --format=csv,noheader
  GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
  echo "✓ Found $GPU_COUNT GPU(s)"
else
  echo "❌ nvidia-smi not found!"
  exit 1
fi
# Verify tools are installed (report-only; missing tools fail later at point of use)
echo "Verifying installed tools..."
echo " uv: $(uv --version 2>&1 || echo 'NOT FOUND')"
echo " rust: $(rustc --version 2>&1 || echo 'NOT FOUND')"
echo " aws: $(aws --version 2>&1 || echo 'NOT FOUND')"
# Configure R2 if credentials provided
if [ -n "$R2_ENDPOINT_URL" ] && [ -n "$R2_ACCESS_KEY_ID" ]; then
  echo "Configuring R2 access..."
  mkdir -p ~/.aws
  cat > ~/.aws/credentials <<EOF
[default]
aws_access_key_id = $R2_ACCESS_KEY_ID
aws_secret_access_key = $R2_SECRET_ACCESS_KEY
EOF
  cat > ~/.aws/config <<EOF
[default]
region = auto
output = json
EOF
  # Test R2 connectivity with a throwaway file (mktemp: unpredictable name,
  # and removed on BOTH branches — the original leaked it on failure).
  echo "Testing R2 connectivity..."
  TEST_FILE=$(mktemp)
  echo "IONet nanochat - R2 test at $(date)" > "$TEST_FILE"
  if aws s3 cp "$TEST_FILE" "s3://$R2_BUCKET_NAME/" --endpoint-url "$R2_ENDPOINT_URL" 2>/dev/null; then
    echo "✓ R2 upload test successful"
    # Best-effort cleanup of the remote test object; must not abort under set -e
    aws s3 rm "s3://$R2_BUCKET_NAME/$(basename "$TEST_FILE")" --endpoint-url "$R2_ENDPOINT_URL" 2>/dev/null || true
  else
    echo "⚠️ R2 upload test failed - continuing anyway"
  fi
  rm -f -- "$TEST_FILE"
else
  echo "⚠️ R2 not configured - uploads will be skipped"
fi
echo ""
echo "=================================="
echo "Step 2: Setup nanochat Environment"
echo "=================================="
cd /workspace
# Clone nanochat (idempotent: skip if a previous run already cloned it)
if [ ! -d "nanochat" ]; then
  echo "Cloning nanochat repository..."
  git clone https://github.com/karpathy/nanochat.git
fi
cd nanochat
# Setup environment variables
export OMP_NUM_THREADS=1
export NANOCHAT_BASE_DIR="${NANOCHAT_BASE_DIR:-$HOME/.cache/nanochat}"
mkdir -p "$NANOCHAT_BASE_DIR"
# Create venv and install dependencies
echo "Setting up Python virtual environment with uv..."
if [ ! -d ".venv" ]; then
  uv venv
fi
echo "Installing Python dependencies (this may take 5-10 minutes)..."
uv sync --extra gpu
# Activate venv
source .venv/bin/activate
# Verify Python environment
echo "Python environment:"
python --version
pip --version
# Test wandb if configured. Read the key from the environment inside Python
# rather than interpolating it into the command line, so the secret is not
# visible in `ps` output and cannot break the -c string quoting.
if [ -n "$WANDB_API_KEY" ]; then
  echo "Configuring wandb..."
  python3 -c "import os, wandb; wandb.login(key=os.environ['WANDB_API_KEY'])" 2>/dev/null && echo "✓ wandb configured" || echo "⚠️ wandb test failed"
fi
# Default the wandb run name: unset/empty becomes "dummy" (wandb disabled),
# anything else is kept as-is — same outcome as the original if/else ladder.
export WANDB_RUN="${WANDB_RUN:-dummy}"
echo ""
echo "=================================="
echo "Step 3: Training Pipeline"
echo "=================================="
echo "This will take approximately 4 hours..."
# Reset report
echo "Initializing training report..."
python -m nanochat.report reset
# Build tokenizer (Rust extension built into the active venv)
echo "Building Rust tokenizer..."
uv run maturin develop --release --manifest-path rustbpe/Cargo.toml
# Download initial dataset (enough shards to start tokenizer training)
echo "Downloading initial dataset (~800MB)..."
python -m nanochat.dataset -n 8
# Download remaining data in background; reaped via `wait` below so a failed
# download aborts the run before pretraining starts on incomplete data.
echo "Starting background dataset download (~24GB)..."
python -m nanochat.dataset -n 240 &
DATASET_DOWNLOAD_PID=$!
# Train tokenizer
echo "Training tokenizer on 2B characters..."
python -m scripts.tok_train --max_chars=2000000000
python -m scripts.tok_eval
# Download eval bundle (idempotent: skipped when already unpacked)
echo "Downloading evaluation bundle..."
EVAL_BUNDLE_URL="https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip"
if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then
  curl -L -o eval_bundle.zip "$EVAL_BUNDLE_URL"
  unzip -q eval_bundle.zip
  rm eval_bundle.zip
  mv eval_bundle "$NANOCHAT_BASE_DIR"
fi
# Wait for dataset download; `wait` propagates the job's exit status, and
# set -e turns a failed download into a hard stop here.
echo "Waiting for dataset download to complete..."
wait "$DATASET_DOWNLOAD_PID"
# Pretraining: base model on the FineWeb-style shards downloaded above.
echo ""
echo "Starting PRETRAINING (d20 model, 561M parameters)..."
echo "This is the longest phase (~2-3 hours)"
torchrun --standalone --nproc_per_node="$GPU_COUNT" -m scripts.base_train -- --depth=20 --run="$WANDB_RUN"
torchrun --standalone --nproc_per_node="$GPU_COUNT" -m scripts.base_loss
torchrun --standalone --nproc_per_node="$GPU_COUNT" -m scripts.base_eval
# Download identity conversations (used by midtraining/SFT data mix)
echo "Downloading identity conversations..."
curl -L -o "$NANOCHAT_BASE_DIR/identity_conversations.jsonl" https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl
# Midtraining
echo ""
echo "Starting MIDTRAINING..."
torchrun --standalone --nproc_per_node="$GPU_COUNT" -m scripts.mid_train -- --run="$WANDB_RUN"
torchrun --standalone --nproc_per_node="$GPU_COUNT" -m scripts.chat_eval -- -i mid
# Supervised finetuning
echo ""
echo "Starting SUPERVISED FINETUNING..."
torchrun --standalone --nproc_per_node="$GPU_COUNT" -m scripts.chat_sft -- --run="$WANDB_RUN"
torchrun --standalone --nproc_per_node="$GPU_COUNT" -m scripts.chat_eval -- -i sft
# Generate final report
echo "Generating final training report..."
python -m nanochat.report generate
echo ""
echo "=================================="
echo "Step 4: Upload to R2"
echo "=================================="
if [ -n "$R2_ENDPOINT_URL" ] && [ -n "$R2_ACCESS_KEY_ID" ]; then
  echo "Uploading model artifacts to R2..."
  # Create timestamp for this run; all artifacts go under one prefix
  TIMESTAMP=$(date +%Y%m%d-%H%M%S)
  MODEL_PREFIX="nanochat-d20-$TIMESTAMP"
  # Upload checkpoints (each stage's checkpoint is optional — skip if absent)
  for checkpoint in base.pt mid.pt sft.pt; do
    if [ -f "$NANOCHAT_BASE_DIR/checkpoints/$checkpoint" ]; then
      echo "Uploading $checkpoint..."
      aws s3 cp "$NANOCHAT_BASE_DIR/checkpoints/$checkpoint" \
        "s3://$R2_BUCKET_NAME/$MODEL_PREFIX/checkpoints/$checkpoint" \
        --endpoint-url "$R2_ENDPOINT_URL"
    fi
  done
  # Upload tokenizer
  if [ -f "$NANOCHAT_BASE_DIR/tokenizer.model" ]; then
    echo "Uploading tokenizer.model..."
    aws s3 cp "$NANOCHAT_BASE_DIR/tokenizer.model" \
      "s3://$R2_BUCKET_NAME/$MODEL_PREFIX/tokenizer.model" \
      --endpoint-url "$R2_ENDPOINT_URL"
  fi
  # Upload report
  if [ -f "report.md" ]; then
    echo "Uploading report.md..."
    aws s3 cp report.md \
      "s3://$R2_BUCKET_NAME/$MODEL_PREFIX/report.md" \
      --endpoint-url "$R2_ENDPOINT_URL"
  fi
  # Create and upload run metadata. mktemp avoids a predictable /tmp name
  # (the original used /tmp/metadata.json and never removed it).
  METADATA_FILE=$(mktemp)
  cat > "$METADATA_FILE" <<EOF
{
"model_name": "nanochat-d20",
"timestamp": "$TIMESTAMP",
"gpu_count": $GPU_COUNT,
"wandb_run": "${WANDB_RUN:-dummy}",
"model_params": "561M",
"training_tokens": "11.2B",
"model_depth": 20,
"upload_date": "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
}
EOF
  aws s3 cp "$METADATA_FILE" \
    "s3://$R2_BUCKET_NAME/$MODEL_PREFIX/metadata.json" \
    --endpoint-url "$R2_ENDPOINT_URL"
  rm -f -- "$METADATA_FILE"
  echo ""
  echo "✓ Model uploaded to: s3://$R2_BUCKET_NAME/$MODEL_PREFIX/"
  aws s3 ls "s3://$R2_BUCKET_NAME/$MODEL_PREFIX/" --recursive --endpoint-url "$R2_ENDPOINT_URL" || echo "Could not list files"
else
  echo "⚠️ R2 upload skipped - credentials not configured"
fi
# Final banner and, optionally, an interactive serving phase.
echo ""
echo "=================================="
echo "Training Complete!"
echo "=================================="
echo "End time: $(date)"
# Show the tail of the generated report so the run summary lands in the logs.
if [[ -f "report.md" ]]; then
  echo ""
  echo "Training Report Summary:"
  echo "------------------------"
  tail -n 30 report.md
fi
# When KEEP_ALIVE=true the container stays up serving the chat UI;
# otherwise it exits so the CaaS job can complete.
if [[ "$KEEP_ALIVE" == "true" ]]; then
  echo ""
  echo "KEEP_ALIVE=true - Starting model server on port 8000..."
  source .venv/bin/activate
  python -m scripts.chat_web --host 0.0.0.0 --port 8000
else
  echo ""
  echo "Training complete. Container will exit."
  echo "Set KEEP_ALIVE=true to start the model server."
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment