Created
October 23, 2025 03:45
-
-
Save rajagurunath/64744bd359105c1191561f98c469fd9f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# nanochat Training Script for IONet CaaS (Gist Version)
# This script is designed to be hosted on GitHub Gist for easy updates
# without rebuilding Docker images
#
# Required env (optional, uploads skipped if absent):
#   R2_ENDPOINT_URL, R2_ACCESS_KEY_ID, R2_SECRET_ACCESS_KEY, R2_BUCKET_NAME
# Optional env: WANDB_API_KEY, WANDB_RUN, KEEP_ALIVE, NANOCHAT_BASE_DIR
set -eo pipefail  # exit on error; fail a pipeline if any stage fails
echo "=================================="
echo "nanochat IONet Training (Gist)"
echo "=================================="
echo "Start time: $(date)"
# Ensure PATH includes uv and cargo
export PATH="/root/.local/bin:/root/.cargo/bin:$PATH"
# R2 (S3-compatible object storage) configuration; empty endpoint/key means
# "not configured" and all uploads are skipped later.
R2_ENDPOINT_URL=${R2_ENDPOINT_URL:-""}
R2_ACCESS_KEY_ID=${R2_ACCESS_KEY_ID:-""}
R2_SECRET_ACCESS_KEY=${R2_SECRET_ACCESS_KEY:-""}
R2_BUCKET_NAME=${R2_BUCKET_NAME:-"llm-exploration"}
echo ""
echo "=================================="
echo "Step 1: Pre-flight Checks"
echo "=================================="
# Check GPU availability -- training below requires at least one GPU, so a
# missing nvidia-smi is fatal. GPU_COUNT is reused by every torchrun call.
echo "Checking GPU availability..."
if command -v nvidia-smi &> /dev/null; then
  nvidia-smi --query-gpu=name,memory.total --format=csv,noheader
  GPU_COUNT=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
  echo "✓ Found $GPU_COUNT GPU(s)"
else
  echo "❌ nvidia-smi not found!"
  exit 1
fi
# Verify tools are installed (informational; missing tools reported, not fatal)
echo "Verifying installed tools..."
echo " uv: $(uv --version 2>&1 || echo 'NOT FOUND')"
echo " rust: $(rustc --version 2>&1 || echo 'NOT FOUND')"
echo " aws: $(aws --version 2>&1 || echo 'NOT FOUND')"
# Configure R2 if credentials provided
if [ -n "$R2_ENDPOINT_URL" ] && [ -n "$R2_ACCESS_KEY_ID" ]; then
  echo "Configuring R2 access..."
  mkdir -p ~/.aws
  cat > ~/.aws/credentials <<EOF
[default]
aws_access_key_id = $R2_ACCESS_KEY_ID
aws_secret_access_key = $R2_SECRET_ACCESS_KEY
EOF
  chmod 600 ~/.aws/credentials  # secrets: owner-only permissions
  cat > ~/.aws/config <<EOF
[default]
region = auto
output = json
EOF
  # Test R2 connectivity with a throwaway file; mktemp avoids predictable
  # names in the shared /tmp directory.
  echo "Testing R2 connectivity..."
  TEST_FILE=$(mktemp /tmp/r2_test_XXXXXX.txt)
  echo "IONet nanochat - R2 test at $(date)" > "$TEST_FILE"
  if aws s3 cp "$TEST_FILE" "s3://$R2_BUCKET_NAME/" --endpoint-url "$R2_ENDPOINT_URL" 2>/dev/null; then
    echo "✓ R2 upload test successful"
    # best-effort cleanup of the remote object; must not abort under set -e
    aws s3 rm "s3://$R2_BUCKET_NAME/$(basename "$TEST_FILE")" --endpoint-url "$R2_ENDPOINT_URL" 2>/dev/null || true
  else
    echo "⚠️ R2 upload test failed - continuing anyway"
  fi
  # remove the local test file on both paths (original leaked it on failure)
  rm -f -- "$TEST_FILE"
else
  echo "⚠️ R2 not configured - uploads will be skipped"
fi
echo ""
echo "=================================="
echo "Step 2: Setup nanochat Environment"
echo "=================================="
cd /workspace
# Clone nanochat (skipped when a previous container run already cloned it)
if [ ! -d "nanochat" ]; then
  echo "Cloning nanochat repository..."
  git clone https://github.com/karpathy/nanochat.git
fi
cd nanochat
# Setup environment variables
export OMP_NUM_THREADS=1
export NANOCHAT_BASE_DIR="${NANOCHAT_BASE_DIR:-$HOME/.cache/nanochat}"
mkdir -p "$NANOCHAT_BASE_DIR"
# Create venv and install dependencies
echo "Setting up Python virtual environment with uv..."
if [ ! -d ".venv" ]; then
  uv venv
fi
echo "Installing Python dependencies (this may take 5-10 minutes)..."
uv sync --extra gpu
# Activate venv
source .venv/bin/activate
# Verify Python environment
echo "Python environment:"
python --version
pip --version
# Test wandb if configured. Read the key from the environment inside Python
# instead of interpolating it into the -c source string: this avoids shell
# quoting/injection problems and keeps the secret out of `ps` argv.
if [ -n "$WANDB_API_KEY" ]; then
  echo "Configuring wandb..."
  python3 -c "import os, wandb; wandb.login(key=os.environ['WANDB_API_KEY'])" 2>/dev/null && echo "✓ wandb configured" || echo "⚠️ wandb test failed"
fi
# Set wandb run name; default to "dummy" when unset or empty (equivalent to
# the original if/else, which also mapped an explicit "dummy" to "dummy").
export WANDB_RUN="${WANDB_RUN:-dummy}"
echo ""
echo "=================================="
echo "Step 3: Training Pipeline"
echo "=================================="
echo "This will take approximately 4 hours..."
# Reset report
echo "Initializing training report..."
python -m nanochat.report reset
# Build tokenizer
echo "Building Rust tokenizer..."
uv run maturin develop --release --manifest-path rustbpe/Cargo.toml
# Download just enough data to train the tokenizer first
echo "Downloading initial dataset (~800MB)..."
python -m nanochat.dataset -n 8
# Download remaining data in background while the tokenizer trains
echo "Starting background dataset download (~24GB)..."
python -m nanochat.dataset -n 240 &
DATASET_DOWNLOAD_PID=$!
# Train tokenizer
echo "Training tokenizer on 2B characters..."
python -m scripts.tok_train --max_chars=2000000000
python -m scripts.tok_eval
# Download eval bundle (skipped if cached from a previous run)
echo "Downloading evaluation bundle..."
EVAL_BUNDLE_URL="https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip"
if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then
  # -f: fail on HTTP errors instead of saving an error page and unzipping it
  curl -fL -o eval_bundle.zip "$EVAL_BUNDLE_URL"
  unzip -q eval_bundle.zip
  rm eval_bundle.zip
  mv eval_bundle "$NANOCHAT_BASE_DIR"
fi
# Wait for the background download; `wait` propagates its exit status, so a
# failed download aborts here with a clear message instead of surfacing later
# as a confusing mid-training data error.
echo "Waiting for dataset download to complete..."
wait "$DATASET_DOWNLOAD_PID" || { echo "❌ Dataset download failed" >&2; exit 1; }
# Pretraining -- the longest phase; all torchrun invocations quote their
# variable arguments so an unexpected value cannot word-split the command.
echo ""
echo "Starting PRETRAINING (d20 model, 561M parameters)..."
echo "This is the longest phase (~2-3 hours)"
torchrun --standalone --nproc_per_node="$GPU_COUNT" -m scripts.base_train -- --depth=20 --run="$WANDB_RUN"
torchrun --standalone --nproc_per_node="$GPU_COUNT" -m scripts.base_loss
torchrun --standalone --nproc_per_node="$GPU_COUNT" -m scripts.base_eval
# Download identity conversations (used by the chat-stage training below);
# -f makes curl fail on HTTP errors rather than saving an error page.
echo "Downloading identity conversations..."
curl -fL -o "$NANOCHAT_BASE_DIR/identity_conversations.jsonl" https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl
# Midtraining
echo ""
echo "Starting MIDTRAINING..."
torchrun --standalone --nproc_per_node="$GPU_COUNT" -m scripts.mid_train -- --run="$WANDB_RUN"
torchrun --standalone --nproc_per_node="$GPU_COUNT" -m scripts.chat_eval -- -i mid
# Supervised finetuning
echo ""
echo "Starting SUPERVISED FINETUNING..."
torchrun --standalone --nproc_per_node="$GPU_COUNT" -m scripts.chat_sft -- --run="$WANDB_RUN"
torchrun --standalone --nproc_per_node="$GPU_COUNT" -m scripts.chat_eval -- -i sft
# Generate final report
echo "Generating final training report..."
python -m nanochat.report generate
echo ""
echo "=================================="
echo "Step 4: Upload to R2"
echo "=================================="
if [ -n "$R2_ENDPOINT_URL" ] && [ -n "$R2_ACCESS_KEY_ID" ]; then
  echo "Uploading model artifacts to R2..."
  # Timestamped prefix so repeated runs never overwrite each other
  TIMESTAMP=$(date +%Y%m%d-%H%M%S)
  MODEL_PREFIX="nanochat-d20-$TIMESTAMP"
  # Upload whichever stage checkpoints exist (a partially completed pipeline
  # still uploads what it produced)
  for checkpoint in base.pt mid.pt sft.pt; do
    if [ -f "$NANOCHAT_BASE_DIR/checkpoints/$checkpoint" ]; then
      echo "Uploading $checkpoint..."
      aws s3 cp "$NANOCHAT_BASE_DIR/checkpoints/$checkpoint" \
        "s3://$R2_BUCKET_NAME/$MODEL_PREFIX/checkpoints/$checkpoint" \
        --endpoint-url "$R2_ENDPOINT_URL"
    fi
  done
  # Upload tokenizer
  if [ -f "$NANOCHAT_BASE_DIR/tokenizer.model" ]; then
    echo "Uploading tokenizer.model..."
    aws s3 cp "$NANOCHAT_BASE_DIR/tokenizer.model" \
      "s3://$R2_BUCKET_NAME/$MODEL_PREFIX/tokenizer.model" \
      --endpoint-url "$R2_ENDPOINT_URL"
  fi
  # Upload report
  if [ -f "report.md" ]; then
    echo "Uploading report.md..."
    aws s3 cp report.md \
      "s3://$R2_BUCKET_NAME/$MODEL_PREFIX/report.md" \
      --endpoint-url "$R2_ENDPOINT_URL"
  fi
  # Create and upload metadata; mktemp avoids a fixed predictable /tmp name
  # and the file is removed after upload (original left it behind).
  METADATA_FILE=$(mktemp /tmp/metadata_XXXXXX.json)
  cat > "$METADATA_FILE" <<EOF
{
  "model_name": "nanochat-d20",
  "timestamp": "$TIMESTAMP",
  "gpu_count": $GPU_COUNT,
  "wandb_run": "${WANDB_RUN:-dummy}",
  "model_params": "561M",
  "training_tokens": "11.2B",
  "model_depth": 20,
  "upload_date": "$(date -u +%Y-%m-%dT%H:%M:%SZ)"
}
EOF
  aws s3 cp "$METADATA_FILE" \
    "s3://$R2_BUCKET_NAME/$MODEL_PREFIX/metadata.json" \
    --endpoint-url "$R2_ENDPOINT_URL"
  rm -f -- "$METADATA_FILE"
  echo ""
  echo "✓ Model uploaded to: s3://$R2_BUCKET_NAME/$MODEL_PREFIX/"
  aws s3 ls "s3://$R2_BUCKET_NAME/$MODEL_PREFIX/" --recursive --endpoint-url "$R2_ENDPOINT_URL" || echo "Could not list files"
else
  echo "⚠️ R2 upload skipped - credentials not configured"
fi
echo ""
echo "=================================="
echo "Training Complete!"
echo "=================================="
echo "End time: $(date)"
# Print report summary
if [ -f "report.md" ]; then
  echo ""
  echo "Training Report Summary:"
  echo "------------------------"
  tail -n 30 report.md
fi
# Optionally serve the model
if [ "$KEEP_ALIVE" = "true" ]; then
  echo ""
  echo "KEEP_ALIVE=true - Starting model server on port 8000..."
  source .venv/bin/activate
  # exec replaces this shell with the server so container signals (SIGTERM
  # on stop) reach the Python process directly instead of a bash wrapper.
  exec python -m scripts.chat_web --host 0.0.0.0 --port 8000
else
  echo ""
  echo "Training complete. Container will exit."
  echo "Set KEEP_ALIVE=true to start the model server."
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment