eadz · April 13, 2026 09:16
diff --git a/llm.rb b/llm.rb
 # frozen_string_literal: true
 # ════════════════════════════════════════════════════════════════════════════
 #   llm.rb — a GPT, trained and sampled in pure, dependency-free Ruby.
 #   Ported from @karpathy's microgpt.py. Every line below is the algorithm;
 #   everything else — tensors, GPUs, frameworks — is merely efficiency.
 # ════════════════════════════════════════════════════════════════════════════

 require 'net/http'
 require 'uri'

 srand 42              # let there be order among chaos
 $stdout.sync = true   # so progress lines flush as they're written

 # ─── Dataset ────────────────────────────────────────────────────────────────
 # Each document is a line in the corpus (e.g. a single name)
 NAMES_URL = 'https://raw.githubusercontent.com/karpathy/makemore/988aa59/names.txt'
 File.write('input.txt', Net::HTTP.get(URI(NAMES_URL))) unless File.exist?('input.txt')

 DOCS = File.readlines('input.txt', chomp: true).reject(&:empty?).shuffle

 # ─── Tokenizer ──────────────────────────────────────────────────────────────
 # Character-level: unique chars become token ids 0..n-1, then a BOS on top.
 UCHARS     = DOCS.join.chars.uniq.sort
 CHAR_TO_ID = UCHARS.each_with_index.to_h
 BOS        = UCHARS.size             # beginning-of-sequence token id
 VOCAB_SIZE = UCHARS.size + 1         # +1 for BOS

 # string ⇄ token-id translation
 ENCODE = ->(doc) { [BOS, *doc.chars.map { CHAR_TO_ID.fetch(it) }, BOS] }
 DECODE = ->(ids) { ids.map { UCHARS[it] }.join }

 puts "num docs:   #{DOCS.size}", "vocab size: #{VOCAB_SIZE}"

 # ─── Hyperparameters ────────────────────────────────────────────────────────
 N_LAYER     = 1                 # transformer depth
 N_EMBD      = 16                # embedding dimension (width)
 BLOCK_SIZE  = 16                # context length of the attention window
 N_HEAD      = 4                 # number of attention heads
 HEAD_DIM    = N_EMBD / N_HEAD   # dimension of each head
 ATTN_SCALE  = HEAD_DIM**0.5     # divisor inside the attention softmax
 INIT_STD    = 0.08              # initial weight spread
 RMSNORM_EPS = 1e-5              # numerical guard inside rmsnorm

 # ─── Run config ─────────────────────────────────────────────────────────────
 NUM_STEPS   = Integer(ENV.fetch('STEPS',       1000))
 NUM_SAMPLES = Integer(ENV.fetch('SAMPLES',     20))
 TEMPERATURE = Float(ENV.fetch(  'TEMPERATURE', 0.5))

 # ─── Autograd ───────────────────────────────────────────────────────────────
 # Recursively apply the chain rule through a computation graph of scalar nodes
 class Value
  attr_accessor :data, :grad
  attr_reader :children, :local_grads

  # data:        scalar value at this node (forward)
  # grad:        derivative of the loss w.r.t. this node (backward)
  # children:    upstream nodes in the compute graph
  # local_grads: ∂self/∂child, paired 1:1 with children
  def initialize(data, children = [], local_grads = [])
    @data, @grad = data.to_f, 0.0
    @children, @local_grads = children, local_grads
  end

  # Lets `1 - value` work as `Value(1) - value`
  def coerce(other) = [Value(other), self]

  def +(other)
    o = Value(other)
    Value.new(@data + o.data, [self, o], [1.0, 1.0])
  end

  def *(other)
    o = Value(other)
    Value.new(@data * o.data, [self, o], [o.data, @data])
  end

  def **(k) = Value.new(@data**k, [self], [k * (@data**(k - 1.0))]) # scalar k
  def -@    = Value.new(-@data,   [self], [-1.0])
  def -(o)  = self + (-o)
  def /(o)  = self * (Value(o)**-1.0)

  def inspect   = "Value(#{@data.round(4)}, grad=#{@grad.round(4)})"
  def to_f      = @data
  def zero_grad = (@grad = 0.0)

  def log = Value.new(Math.log(@data), [self], [1.0 / @data])
  def exp = Math.exp(@data).then { |e| Value.new(e, [self], [e]) }

  def relu
    on = @data.positive?
    Value.new(on ? @data : 0.0, [self], [on ? 1.0 : 0.0])
  end

  def backward
    # Iterative post-order DFS — recursion would blow the stack on deep graphs.
    topo  = []
    seen  = {}.compare_by_identity
    stack = [[self, 0]]
    seen[self] = true

    until stack.empty?
      node, i = stack.last
      if i < node.children.size
        stack.last[1] = i + 1
        child = node.children[i]
        next if seen[child]
        seen[child] = true
        stack << [child, 0]
      else
        topo << stack.pop.first
      end
    end

    @grad = 1.0
    topo.reverse_each do |v|
      v.children.zip(v.local_grads) { |c, g| c.grad += g * v.grad }
    end
  end
 end

 # Kernel-style factory — `Value(x)` mirrors `Tensor(x)` and `Integer("42")`.
 def Value(x) = x.is_a?(Value) ? x : Value.new(x)

 # ─── Tensor: a 1-D vector of Values that does math ─────────────────────────
 # With operator overloading and a handful of methods, every line of the
 # transformer reads like the math it implements.
 class Tensor
  include Enumerable
  attr_reader :elems

  # Bracket constructor — `Tensor[1.0, 2.0]` mirrors `Hash[]`, `Array[]`, `Set[]`.
  def self.[](*elems) = new(elems)

  def initialize(elems) = @elems = elems
  def each(&b)          = @elems.each(&b)
  def [](i, len = nil)  = len ? Tensor.new(@elems[i, len]) : @elems[i]
  def size              = @elems.size

  def inspect
    preview = @elems.first(3).map { it.data.round(3) }.join(', ')
    preview += ', …' if @elems.size > 3
    "Tensor[#{@elems.size}](#{preview})"
  end

  def +(other)  = Tensor.new(@elems.zip(other.elems).map { |a, b| a + b })
  def -(other)  = Tensor.new(@elems.zip(other.elems).map { |a, b| a - b })
  def *(scalar) = Tensor.new(@elems.map { |v| v * scalar })
  def /(scalar) = Tensor.new(@elems.map { |v| v / scalar })

  def dot(other) = @elems.zip(other.elems).sum { |a, b| a * b }
  def relu       = Tensor.new(@elems.map(&:relu))

  def rmsnorm
    ms    = @elems.sum { |v| v * v } / @elems.size.to_f
    scale = (ms + RMSNORM_EPS)**-0.5
    self * scale
  end

  def softmax
    shift = @elems.map(&:data).max
    exps  = @elems.map { |v| (v - shift).exp }
    total = exps.sum
    Tensor.new(exps.map { |e| e / total })
  end

  # Draw an index from this Tensor as a discrete distribution (data only).
  def sample_index
    weights = @elems.map(&:data)
    target  = rand * weights.sum
    running = 0.0
    weights.each_with_index do |w, i|
      running += w
      return i if running >= target
    end
    weights.size - 1
  end
 end

 # Kernel-style factory — lets us write `Tensor(xs)` like `Integer("42")`.
 def Tensor(xs) = xs.is_a?(Tensor) ? xs : Tensor.new(xs)

 # ─── Matrix: rows of Tensors, with a matrix-vector product ─────────────────
 class Matrix
  include Enumerable
  attr_reader :rows

  def initialize(rows) = @rows = rows
  def each(&b)         = @rows.each(&b)
  def [](i)            = @rows[i]
  def size             = @rows.size
  def values           = @rows.flat_map(&:elems) # every Value, flat

  # matrix-vector product: (m × n) · (n) → (m)
  def *(x) = Tensor.new(@rows.map { |row| row.dot(x) })
 end

 # ─── Random helpers ─────────────────────────────────────────────────────────
 # Box-Muller: two uniforms in, one standard normal out.
 def random_gauss(mean = 0.0, std = 1.0)
  u1 = [rand, 1e-10].max # guard log(0)
  u2 = rand
  Math.sqrt(-2.0 * Math.log(u1)) * Math.cos(2.0 * Math::PI * u2) * std + mean
 end

 # ─── Parameters: the model's knowledge, a bag of Values ────────────────────
 def matrix(rows, cols, std = INIT_STD)
  Matrix.new(Array.new(rows) {
    Tensor.new(Array.new(cols) { Value.new(random_gauss(0.0, std)) })
  })
 end

 LAYER_SHAPES = {
  attn_wq: [N_EMBD,     N_EMBD],
  attn_wk: [N_EMBD,     N_EMBD],
  attn_wv: [N_EMBD,     N_EMBD],
  attn_wo: [N_EMBD,     N_EMBD],
  mlp_fc1: [4 * N_EMBD, N_EMBD],
  mlp_fc2: [N_EMBD,     4 * N_EMBD]
 }

 STATE_DICT = {
  wte:     matrix(VOCAB_SIZE, N_EMBD),
  wpe:     matrix(BLOCK_SIZE, N_EMBD),
  lm_head: matrix(VOCAB_SIZE, N_EMBD)
 }
 N_LAYER.times do |li|
  LAYER_SHAPES.each { |name, (r, c)| STATE_DICT[:"layer#{li}.#{name}"] = matrix(r, c) }
 end

 PARAMS = STATE_DICT.values.flat_map(&:values)
 puts "num params: #{PARAMS.size}", ""

 # ─── Model: a decoder-only transformer ──────────────────────────────────────
 # One forward pass: one token in, one distribution over next tokens out.
 # `keys`/`values` are the KV cache — one array per layer, grown as we go.
 def param(name) = STATE_DICT.fetch(name)
 def layer(li, name) = STATE_DICT.fetch(:"layer#{li}.#{name}")

 def fresh_kv = Array.new(N_LAYER) { [] }

 def gpt(token_id, pos_id, keys, values)
  x = (param(:wte)[token_id] + param(:wpe)[pos_id]).rmsnorm

  N_LAYER.times do |li|
    wq, wk, wv, wo, fc1, fc2 =
      %i[attn_wq attn_wk attn_wv attn_wo mlp_fc1 mlp_fc2].map { layer(li, it) }

    # ── (1) Multi-head causal self-attention (pre-norm, with residual) ──
    xn = x.rmsnorm
    q, k, v = wq * xn, wk * xn, wv * xn
    keys[li]   << k  # append to KV cache; causality is implicit because
    values[li] << v  # each step only attends over keys/values seen so far

    heads = Tensor(N_HEAD.times.flat_map { |h|
      lo = h * HEAD_DIM
      qh = q[lo, HEAD_DIM]
      kh = keys[li].map   { it[lo, HEAD_DIM] }
      vh = values[li].map { it[lo, HEAD_DIM] }

      attn = Tensor(kh.map { qh.dot(it) / ATTN_SCALE }).softmax

      # weighted sum over value vectors: Σ_t attn[t] · v_t
      Array.new(HEAD_DIM) { |j| attn.zip(vh).sum { |a, vt| a * vt[j] } }
    })

    x = wo * heads + x

    # ── (2) Feed-forward MLP (pre-norm, with residual) ──
    x = fc2 * (fc1 * x.rmsnorm).relu + x
  end

  param(:lm_head) * x
 end

 # ─── Optimizer: Adam ────────────────────────────────────────────────────────
 LEARNING_RATE = 0.01
 BETA1         = 0.85
 BETA2         = 0.99
 EPS_ADAM      = 1e-8

 m_buf = Array.new(PARAMS.size, 0.0) # first  moment buffer
 v_buf = Array.new(PARAMS.size, 0.0) # second moment buffer

 # ─── Training ───────────────────────────────────────────────────────────────
 training_started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
 NUM_STEPS.times do |step|
  tokens = ENCODE[DOCS[step % DOCS.size]]               # BOS-wrapped token ids
  pairs  = tokens.each_cons(2).first(BLOCK_SIZE)        # (input, target) windows

  # Forward the sequence, building up the compute graph one position at a time
  keys, values = fresh_kv, fresh_kv

  # Mean cross-entropy over the document. May yours be low.
  loss = pairs.each_with_index.sum { |(input, target), pos|
    -gpt(input, pos, keys, values).softmax[target].log
  } / pairs.size

  # Backward pass: fill in ∂loss/∂p for every parameter p
  loss.backward

  # Adam step (with linear learning-rate decay)
  lr_t   = LEARNING_RATE * (1.0 - step.to_f / NUM_STEPS)
  bias_m = 1.0 - BETA1**(step + 1)
  bias_v = 1.0 - BETA2**(step + 1)
  PARAMS.each_with_index do |param, i|
    g = param.grad
    m_buf[i] = BETA1 * m_buf[i] + (1.0 - BETA1) * g
    v_buf[i] = BETA2 * v_buf[i] + (1.0 - BETA2) * g**2
    param.data -= lr_t * (m_buf[i] / bias_m) / ((v_buf[i] / bias_v)**0.5 + EPS_ADAM)
    param.grad  = 0.0
  end

  printf("step %4d / %4d | loss %.4f\r", step + 1, NUM_STEPS, loss.data)
 end
 elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - training_started_at
 printf("\ntrained %d steps in %.1fs (%.2f steps/s)\n", NUM_STEPS, elapsed, NUM_STEPS / elapsed)

 # ─── Inference ──────────────────────────────────────────────────────────────
 # May the model babble back to us.
 puts "", "--- hallucinated names ---"

 NUM_SAMPLES.times do |i|
  keys, values = fresh_kv, fresh_kv
  token_id     = BOS
  generated    = []

  BLOCK_SIZE.times do |pos_id|
    token_id = (gpt(token_id, pos_id, keys, values) / TEMPERATURE).softmax.sample_index
    break if token_id == BOS
    generated << token_id
  end

  printf("sample %2d: %s\n", i + 1, DECODE[generated])
 end
	# frozen_string_literal: true
	# ════════════════════════════════════════════════════════════════════════════
	# llm.rb — a GPT, trained and sampled in pure, dependency-free Ruby.
	# Ported from @karpathy's microgpt.py. Every line below is the algorithm;
	# everything else — tensors, GPUs, frameworks — is merely efficiency.
	# ════════════════════════════════════════════════════════════════════════════

	require 'net/http'
	require 'uri'

	srand 42 # let there be order among chaos
	$stdout.sync = true # so progress lines flush as they're written

	# ─── Dataset ────────────────────────────────────────────────────────────────
	# Each document is a line in the corpus (e.g. a single name)
	NAMES_URL = 'https://raw.githubusercontent.com/karpathy/makemore/988aa59/names.txt'
	File.write('input.txt', Net::HTTP.get(URI(NAMES_URL))) unless File.exist?('input.txt')

	DOCS = File.readlines('input.txt', chomp: true).reject(&:empty?).shuffle

	# ─── Tokenizer ──────────────────────────────────────────────────────────────
	# Character-level: unique chars become token ids 0..n-1, then a BOS on top.
	UCHARS = DOCS.join.chars.uniq.sort
	CHAR_TO_ID = UCHARS.each_with_index.to_h
	BOS = UCHARS.size # beginning-of-sequence token id
	VOCAB_SIZE = UCHARS.size + 1 # +1 for BOS

	# string ⇄ token-id translation
	ENCODE = ->(doc) { [BOS, *doc.chars.map { CHAR_TO_ID.fetch(it) }, BOS] }
	DECODE = ->(ids) { ids.map { UCHARS[it] }.join }

	puts "num docs: #{DOCS.size}", "vocab size: #{VOCAB_SIZE}"

	# ─── Hyperparameters ────────────────────────────────────────────────────────
	N_LAYER = 1 # transformer depth
	N_EMBD = 16 # embedding dimension (width)
	BLOCK_SIZE = 16 # context length of the attention window
	N_HEAD = 4 # number of attention heads
	HEAD_DIM = N_EMBD / N_HEAD # dimension of each head
	ATTN_SCALE = HEAD_DIM**0.5 # divisor inside the attention softmax
	INIT_STD = 0.08 # initial weight spread
	RMSNORM_EPS = 1e-5 # numerical guard inside rmsnorm

	# ─── Run config ─────────────────────────────────────────────────────────────
	NUM_STEPS = Integer(ENV.fetch('STEPS', 1000))
	NUM_SAMPLES = Integer(ENV.fetch('SAMPLES', 20))
	TEMPERATURE = Float(ENV.fetch( 'TEMPERATURE', 0.5))

	# ─── Autograd ───────────────────────────────────────────────────────────────
	# Recursively apply the chain rule through a computation graph of scalar nodes
	class Value
	attr_accessor :data, :grad
	attr_reader :children, :local_grads

	# data: scalar value at this node (forward)
	# grad: derivative of the loss w.r.t. this node (backward)
	# children: upstream nodes in the compute graph
	# local_grads: ∂self/∂child, paired 1:1 with children
	def initialize(data, children = [], local_grads = [])
	@data, @grad = data.to_f, 0.0
	@children, @local_grads = children, local_grads
	end

	# Lets `1 - value` work as `Value(1) - value`
	def coerce(other) = [Value(other), self]

	def +(other)
	o = Value(other)
	Value.new(@data + o.data, [self, o], [1.0, 1.0])
	end

	def *(other)
	o = Value(other)
	Value.new(@data * o.data, [self, o], [o.data, @data])
	end

	def (k) = Value.new(@datak, [self], [k * (@data**(k - 1.0))]) # scalar k
	def -@ = Value.new(-@data, [self], [-1.0])
	def -(o) = self + (-o)
	def /(o) = self * (Value(o)**-1.0)

	def inspect = "Value(#{@data.round(4)}, grad=#{@grad.round(4)})"
	def to_f = @data
	def zero_grad = (@grad = 0.0)

	def log = Value.new(Math.log(@data), [self], [1.0 / @data])
	def exp = Math.exp(@data).then { \|e\| Value.new(e, [self], [e]) }

	def relu
	on = @data.positive?
	Value.new(on ? @data : 0.0, [self], [on ? 1.0 : 0.0])
	end

	def backward
	# Iterative post-order DFS — recursion would blow the stack on deep graphs.
	topo = []
	seen = {}.compare_by_identity
	stack = [[self, 0]]
	seen[self] = true

	until stack.empty?
	node, i = stack.last
	if i < node.children.size
	stack.last[1] = i + 1
	child = node.children[i]
	next if seen[child]
	seen[child] = true
	stack << [child, 0]
	else
	topo << stack.pop.first
	end
	end

	@grad = 1.0
	topo.reverse_each do \|v\|
	v.children.zip(v.local_grads) { \|c, g\| c.grad += g * v.grad }
	end
	end
	end

	# Kernel-style factory — `Value(x)` mirrors `Tensor(x)` and `Integer("42")`.
	def Value(x) = x.is_a?(Value) ? x : Value.new(x)

	# ─── Tensor: a 1-D vector of Values that does math ─────────────────────────
	# With operator overloading and a handful of methods, every line of the
	# transformer reads like the math it implements.
	class Tensor
	include Enumerable
	attr_reader :elems

	# Bracket constructor — `Tensor[1.0, 2.0]` mirrors `Hash[]`, `Array[]`, `Set[]`.
	def self.[](*elems) = new(elems)

	def initialize(elems) = @elems = elems
	def each(&b) = @elems.each(&b)
	def [](i, len = nil) = len ? Tensor.new(@elems[i, len]) : @elems[i]
	def size = @elems.size

	def inspect
	preview = @elems.first(3).map { it.data.round(3) }.join(', ')
	preview += ', …' if @elems.size > 3
	"Tensor[#{@elems.size}](#{preview})"
	end

	def +(other) = Tensor.new(@elems.zip(other.elems).map { \|a, b\| a + b })
	def -(other) = Tensor.new(@elems.zip(other.elems).map { \|a, b\| a - b })
	def (scalar) = Tensor.new(@elems.map { \|v\| v scalar })
	def /(scalar) = Tensor.new(@elems.map { \|v\| v / scalar })

	def dot(other) = @elems.zip(other.elems).sum { \|a, b\| a * b }
	def relu = Tensor.new(@elems.map(&:relu))

	def rmsnorm
	ms = @elems.sum { \|v\| v * v } / @elems.size.to_f
	scale = (ms + RMSNORM_EPS)**-0.5
	self * scale
	end

	def softmax
	shift = @elems.map(&:data).max
	exps = @elems.map { \|v\| (v - shift).exp }
	total = exps.sum
	Tensor.new(exps.map { \|e\| e / total })
	end

	# Draw an index from this Tensor as a discrete distribution (data only).
	def sample_index
	weights = @elems.map(&:data)
	target = rand * weights.sum
	running = 0.0
	weights.each_with_index do \|w, i\|
	running += w
	return i if running >= target
	end
	weights.size - 1
	end
	end

	# Kernel-style factory — lets us write `Tensor(xs)` like `Integer("42")`.
	def Tensor(xs) = xs.is_a?(Tensor) ? xs : Tensor.new(xs)

	# ─── Matrix: rows of Tensors, with a matrix-vector product ─────────────────
	class Matrix
	include Enumerable
	attr_reader :rows

	def initialize(rows) = @rows = rows
	def each(&b) = @rows.each(&b)
	def [](i) = @rows[i]
	def size = @rows.size
	def values = @rows.flat_map(&:elems) # every Value, flat

	# matrix-vector product: (m × n) · (n) → (m)
	def *(x) = Tensor.new(@rows.map { \|row\| row.dot(x) })
	end

	# ─── Random helpers ─────────────────────────────────────────────────────────
	# Box-Muller: two uniforms in, one standard normal out.
	def random_gauss(mean = 0.0, std = 1.0)
	u1 = [rand, 1e-10].max # guard log(0)
	u2 = rand
	Math.sqrt(-2.0 * Math.log(u1)) * Math.cos(2.0 * Math::PI * u2) * std + mean
	end

	# ─── Parameters: the model's knowledge, a bag of Values ────────────────────
	def matrix(rows, cols, std = INIT_STD)
	Matrix.new(Array.new(rows) {
	Tensor.new(Array.new(cols) { Value.new(random_gauss(0.0, std)) })
	})
	end

	LAYER_SHAPES = {
	attn_wq: [N_EMBD, N_EMBD],
	attn_wk: [N_EMBD, N_EMBD],
	attn_wv: [N_EMBD, N_EMBD],
	attn_wo: [N_EMBD, N_EMBD],
	mlp_fc1: [4 * N_EMBD, N_EMBD],
	mlp_fc2: [N_EMBD, 4 * N_EMBD]
	}

	STATE_DICT = {
	wte: matrix(VOCAB_SIZE, N_EMBD),
	wpe: matrix(BLOCK_SIZE, N_EMBD),
	lm_head: matrix(VOCAB_SIZE, N_EMBD)
	}
	N_LAYER.times do \|li\|
	LAYER_SHAPES.each { \|name, (r, c)\| STATE_DICT[:"layer#{li}.#{name}"] = matrix(r, c) }
	end

	PARAMS = STATE_DICT.values.flat_map(&:values)
	puts "num params: #{PARAMS.size}", ""

	# ─── Model: a decoder-only transformer ──────────────────────────────────────
	# One forward pass: one token in, one distribution over next tokens out.
	# `keys`/`values` are the KV cache — one array per layer, grown as we go.
	def param(name) = STATE_DICT.fetch(name)
	def layer(li, name) = STATE_DICT.fetch(:"layer#{li}.#{name}")

	def fresh_kv = Array.new(N_LAYER) { [] }

	def gpt(token_id, pos_id, keys, values)
	x = (param(:wte)[token_id] + param(:wpe)[pos_id]).rmsnorm

	N_LAYER.times do \|li\|
	wq, wk, wv, wo, fc1, fc2 =
	%i[attn_wq attn_wk attn_wv attn_wo mlp_fc1 mlp_fc2].map { layer(li, it) }

	# ── (1) Multi-head causal self-attention (pre-norm, with residual) ──
	xn = x.rmsnorm
	q, k, v = wq * xn, wk * xn, wv * xn
	keys[li] << k # append to KV cache; causality is implicit because
	values[li] << v # each step only attends over keys/values seen so far

	heads = Tensor(N_HEAD.times.flat_map { \|h\|
	lo = h * HEAD_DIM
	qh = q[lo, HEAD_DIM]
	kh = keys[li].map { it[lo, HEAD_DIM] }
	vh = values[li].map { it[lo, HEAD_DIM] }

	attn = Tensor(kh.map { qh.dot(it) / ATTN_SCALE }).softmax

	# weighted sum over value vectors: Σ_t attn[t] · v_t
	Array.new(HEAD_DIM) { \|j\| attn.zip(vh).sum { \|a, vt\| a * vt[j] } }
	})

	x = wo * heads + x

	# ── (2) Feed-forward MLP (pre-norm, with residual) ──
	x = fc2 * (fc1 * x.rmsnorm).relu + x
	end

	param(:lm_head) * x
	end

	# ─── Optimizer: Adam ────────────────────────────────────────────────────────
	LEARNING_RATE = 0.01
	BETA1 = 0.85
	BETA2 = 0.99
	EPS_ADAM = 1e-8

	m_buf = Array.new(PARAMS.size, 0.0) # first moment buffer
	v_buf = Array.new(PARAMS.size, 0.0) # second moment buffer

	# ─── Training ───────────────────────────────────────────────────────────────
	training_started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
	NUM_STEPS.times do \|step\|
	tokens = ENCODE[DOCS[step % DOCS.size]] # BOS-wrapped token ids
	pairs = tokens.each_cons(2).first(BLOCK_SIZE) # (input, target) windows

	# Forward the sequence, building up the compute graph one position at a time
	keys, values = fresh_kv, fresh_kv

	# Mean cross-entropy over the document. May yours be low.
	loss = pairs.each_with_index.sum { \|(input, target), pos\|
	-gpt(input, pos, keys, values).softmax[target].log
	} / pairs.size

	# Backward pass: fill in ∂loss/∂p for every parameter p
	loss.backward

	# Adam step (with linear learning-rate decay)
	lr_t = LEARNING_RATE * (1.0 - step.to_f / NUM_STEPS)
	bias_m = 1.0 - BETA1**(step + 1)
	bias_v = 1.0 - BETA2**(step + 1)
	PARAMS.each_with_index do \|param, i\|
	g = param.grad
	m_buf[i] = BETA1 * m_buf[i] + (1.0 - BETA1) * g
	v_buf[i] = BETA2 * v_buf[i] + (1.0 - BETA2) * g**2
	param.data -= lr_t * (m_buf[i] / bias_m) / ((v_buf[i] / bias_v)**0.5 + EPS_ADAM)
	param.grad = 0.0
	end

	printf("step %4d / %4d \| loss %.4f\r", step + 1, NUM_STEPS, loss.data)
	end
	elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - training_started_at
	printf("\ntrained %d steps in %.1fs (%.2f steps/s)\n", NUM_STEPS, elapsed, NUM_STEPS / elapsed)

	# ─── Inference ──────────────────────────────────────────────────────────────
	# May the model babble back to us.
	puts "", "--- hallucinated names ---"

	NUM_SAMPLES.times do \|i\|
	keys, values = fresh_kv, fresh_kv
	token_id = BOS
	generated = []

	BLOCK_SIZE.times do \|pos_id\|
	token_id = (gpt(token_id, pos_id, keys, values) / TEMPERATURE).softmax.sample_index
	break if token_id == BOS
	generated << token_id
	end

	printf("sample %2d: %s\n", i + 1, DECODE[generated])
	end
No results found