Last active
April 13, 2026 09:16
-
-
Save eadz/4f5dedd55f309d9b28b22e8c8253d4cc to your computer and use it in GitHub Desktop.
llm.rb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # frozen_string_literal: true | |
| # ════════════════════════════════════════════════════════════════════════════ | |
| # llm.rb — a GPT, trained and sampled in pure, dependency-free Ruby. | |
| # Ported from @karpathy's microgpt.py. Every line below is the algorithm; | |
| # everything else — tensors, GPUs, frameworks — is merely efficiency. | |
| # ════════════════════════════════════════════════════════════════════════════ | |
| require 'net/http' | |
| require 'uri' | |
| srand 42 # let there be order among chaos | |
| $stdout.sync = true # so progress lines flush as they're written | |
| # ─── Dataset ──────────────────────────────────────────────────────────────── | |
| # Each document is a line in the corpus (e.g. a single name) | |
| NAMES_URL = 'https://raw.githubusercontent.com/karpathy/makemore/988aa59/names.txt' | |
| File.write('input.txt', Net::HTTP.get(URI(NAMES_URL))) unless File.exist?('input.txt') | |
| DOCS = File.readlines('input.txt', chomp: true).reject(&:empty?).shuffle | |
| # ─── Tokenizer ────────────────────────────────────────────────────────────── | |
| # Character-level: unique chars become token ids 0..n-1, then a BOS on top. | |
| UCHARS = DOCS.join.chars.uniq.sort | |
| CHAR_TO_ID = UCHARS.each_with_index.to_h | |
| BOS = UCHARS.size # beginning-of-sequence token id | |
| VOCAB_SIZE = UCHARS.size + 1 # +1 for BOS | |
| # string ⇄ token-id translation | |
| ENCODE = ->(doc) { [BOS, *doc.chars.map { CHAR_TO_ID.fetch(it) }, BOS] } | |
| DECODE = ->(ids) { ids.map { UCHARS[it] }.join } | |
| puts "num docs: #{DOCS.size}", "vocab size: #{VOCAB_SIZE}" | |
| # ─── Hyperparameters ──────────────────────────────────────────────────────── | |
| N_LAYER = 1 # transformer depth | |
| N_EMBD = 16 # embedding dimension (width) | |
| BLOCK_SIZE = 16 # context length of the attention window | |
| N_HEAD = 4 # number of attention heads | |
| HEAD_DIM = N_EMBD / N_HEAD # dimension of each head | |
| ATTN_SCALE = HEAD_DIM**0.5 # divisor inside the attention softmax | |
| INIT_STD = 0.08 # initial weight spread | |
| RMSNORM_EPS = 1e-5 # numerical guard inside rmsnorm | |
| # ─── Run config ───────────────────────────────────────────────────────────── | |
| NUM_STEPS = Integer(ENV.fetch('STEPS', 1000)) | |
| NUM_SAMPLES = Integer(ENV.fetch('SAMPLES', 20)) | |
| TEMPERATURE = Float(ENV.fetch( 'TEMPERATURE', 0.5)) | |
| # ─── Autograd ─────────────────────────────────────────────────────────────── | |
| # Recursively apply the chain rule through a computation graph of scalar nodes | |
| class Value | |
| attr_accessor :data, :grad | |
| attr_reader :children, :local_grads | |
| # data: scalar value at this node (forward) | |
| # grad: derivative of the loss w.r.t. this node (backward) | |
| # children: upstream nodes in the compute graph | |
| # local_grads: ∂self/∂child, paired 1:1 with children | |
| def initialize(data, children = [], local_grads = []) | |
| @data, @grad = data.to_f, 0.0 | |
| @children, @local_grads = children, local_grads | |
| end | |
| # Lets `1 - value` work as `Value(1) - value` | |
| def coerce(other) = [Value(other), self] | |
| def +(other) | |
| o = Value(other) | |
| Value.new(@data + o.data, [self, o], [1.0, 1.0]) | |
| end | |
| def *(other) | |
| o = Value(other) | |
| Value.new(@data * o.data, [self, o], [o.data, @data]) | |
| end | |
| def **(k) = Value.new(@data**k, [self], [k * (@data**(k - 1.0))]) # scalar k | |
| def -@ = Value.new(-@data, [self], [-1.0]) | |
| def -(o) = self + (-o) | |
| def /(o) = self * (Value(o)**-1.0) | |
| def inspect = "Value(#{@data.round(4)}, grad=#{@grad.round(4)})" | |
| def to_f = @data | |
| def zero_grad = (@grad = 0.0) | |
| def log = Value.new(Math.log(@data), [self], [1.0 / @data]) | |
| def exp = Math.exp(@data).then { |e| Value.new(e, [self], [e]) } | |
| def relu | |
| on = @data.positive? | |
| Value.new(on ? @data : 0.0, [self], [on ? 1.0 : 0.0]) | |
| end | |
| def backward | |
| # Iterative post-order DFS — recursion would blow the stack on deep graphs. | |
| topo = [] | |
| seen = {}.compare_by_identity | |
| stack = [[self, 0]] | |
| seen[self] = true | |
| until stack.empty? | |
| node, i = stack.last | |
| if i < node.children.size | |
| stack.last[1] = i + 1 | |
| child = node.children[i] | |
| next if seen[child] | |
| seen[child] = true | |
| stack << [child, 0] | |
| else | |
| topo << stack.pop.first | |
| end | |
| end | |
| @grad = 1.0 | |
| topo.reverse_each do |v| | |
| v.children.zip(v.local_grads) { |c, g| c.grad += g * v.grad } | |
| end | |
| end | |
| end | |
| # Kernel-style factory — `Value(x)` mirrors `Tensor(x)` and `Integer("42")`. | |
| def Value(x) = x.is_a?(Value) ? x : Value.new(x) | |
| # ─── Tensor: a 1-D vector of Values that does math ───────────────────────── | |
| # With operator overloading and a handful of methods, every line of the | |
| # transformer reads like the math it implements. | |
| class Tensor | |
| include Enumerable | |
| attr_reader :elems | |
| # Bracket constructor — `Tensor[1.0, 2.0]` mirrors `Hash[]`, `Array[]`, `Set[]`. | |
| def self.[](*elems) = new(elems) | |
| def initialize(elems) = @elems = elems | |
| def each(&b) = @elems.each(&b) | |
| def [](i, len = nil) = len ? Tensor.new(@elems[i, len]) : @elems[i] | |
| def size = @elems.size | |
| def inspect | |
| preview = @elems.first(3).map { it.data.round(3) }.join(', ') | |
| preview += ', …' if @elems.size > 3 | |
| "Tensor[#{@elems.size}](#{preview})" | |
| end | |
| def +(other) = Tensor.new(@elems.zip(other.elems).map { |a, b| a + b }) | |
| def -(other) = Tensor.new(@elems.zip(other.elems).map { |a, b| a - b }) | |
| def *(scalar) = Tensor.new(@elems.map { |v| v * scalar }) | |
| def /(scalar) = Tensor.new(@elems.map { |v| v / scalar }) | |
| def dot(other) = @elems.zip(other.elems).sum { |a, b| a * b } | |
| def relu = Tensor.new(@elems.map(&:relu)) | |
| def rmsnorm | |
| ms = @elems.sum { |v| v * v } / @elems.size.to_f | |
| scale = (ms + RMSNORM_EPS)**-0.5 | |
| self * scale | |
| end | |
| def softmax | |
| shift = @elems.map(&:data).max | |
| exps = @elems.map { |v| (v - shift).exp } | |
| total = exps.sum | |
| Tensor.new(exps.map { |e| e / total }) | |
| end | |
| # Draw an index from this Tensor as a discrete distribution (data only). | |
| def sample_index | |
| weights = @elems.map(&:data) | |
| target = rand * weights.sum | |
| running = 0.0 | |
| weights.each_with_index do |w, i| | |
| running += w | |
| return i if running >= target | |
| end | |
| weights.size - 1 | |
| end | |
| end | |
| # Kernel-style factory — lets us write `Tensor(xs)` like `Integer("42")`. | |
| def Tensor(xs) = xs.is_a?(Tensor) ? xs : Tensor.new(xs) | |
| # ─── Matrix: rows of Tensors, with a matrix-vector product ───────────────── | |
| class Matrix | |
| include Enumerable | |
| attr_reader :rows | |
| def initialize(rows) = @rows = rows | |
| def each(&b) = @rows.each(&b) | |
| def [](i) = @rows[i] | |
| def size = @rows.size | |
| def values = @rows.flat_map(&:elems) # every Value, flat | |
| # matrix-vector product: (m × n) · (n) → (m) | |
| def *(x) = Tensor.new(@rows.map { |row| row.dot(x) }) | |
| end | |
| # ─── Random helpers ───────────────────────────────────────────────────────── | |
| # Box-Muller: two uniforms in, one standard normal out. | |
| def random_gauss(mean = 0.0, std = 1.0) | |
| u1 = [rand, 1e-10].max # guard log(0) | |
| u2 = rand | |
| Math.sqrt(-2.0 * Math.log(u1)) * Math.cos(2.0 * Math::PI * u2) * std + mean | |
| end | |
| # ─── Parameters: the model's knowledge, a bag of Values ──────────────────── | |
| def matrix(rows, cols, std = INIT_STD) | |
| Matrix.new(Array.new(rows) { | |
| Tensor.new(Array.new(cols) { Value.new(random_gauss(0.0, std)) }) | |
| }) | |
| end | |
| LAYER_SHAPES = { | |
| attn_wq: [N_EMBD, N_EMBD], | |
| attn_wk: [N_EMBD, N_EMBD], | |
| attn_wv: [N_EMBD, N_EMBD], | |
| attn_wo: [N_EMBD, N_EMBD], | |
| mlp_fc1: [4 * N_EMBD, N_EMBD], | |
| mlp_fc2: [N_EMBD, 4 * N_EMBD] | |
| } | |
| STATE_DICT = { | |
| wte: matrix(VOCAB_SIZE, N_EMBD), | |
| wpe: matrix(BLOCK_SIZE, N_EMBD), | |
| lm_head: matrix(VOCAB_SIZE, N_EMBD) | |
| } | |
| N_LAYER.times do |li| | |
| LAYER_SHAPES.each { |name, (r, c)| STATE_DICT[:"layer#{li}.#{name}"] = matrix(r, c) } | |
| end | |
| PARAMS = STATE_DICT.values.flat_map(&:values) | |
| puts "num params: #{PARAMS.size}", "" | |
| # ─── Model: a decoder-only transformer ────────────────────────────────────── | |
| # One forward pass: one token in, one distribution over next tokens out. | |
| # `keys`/`values` are the KV cache — one array per layer, grown as we go. | |
| def param(name) = STATE_DICT.fetch(name) | |
| def layer(li, name) = STATE_DICT.fetch(:"layer#{li}.#{name}") | |
| def fresh_kv = Array.new(N_LAYER) { [] } | |
| def gpt(token_id, pos_id, keys, values) | |
| x = (param(:wte)[token_id] + param(:wpe)[pos_id]).rmsnorm | |
| N_LAYER.times do |li| | |
| wq, wk, wv, wo, fc1, fc2 = | |
| %i[attn_wq attn_wk attn_wv attn_wo mlp_fc1 mlp_fc2].map { layer(li, it) } | |
| # ── (1) Multi-head causal self-attention (pre-norm, with residual) ── | |
| xn = x.rmsnorm | |
| q, k, v = wq * xn, wk * xn, wv * xn | |
| keys[li] << k # append to KV cache; causality is implicit because | |
| values[li] << v # each step only attends over keys/values seen so far | |
| heads = Tensor(N_HEAD.times.flat_map { |h| | |
| lo = h * HEAD_DIM | |
| qh = q[lo, HEAD_DIM] | |
| kh = keys[li].map { it[lo, HEAD_DIM] } | |
| vh = values[li].map { it[lo, HEAD_DIM] } | |
| attn = Tensor(kh.map { qh.dot(it) / ATTN_SCALE }).softmax | |
| # weighted sum over value vectors: Σ_t attn[t] · v_t | |
| Array.new(HEAD_DIM) { |j| attn.zip(vh).sum { |a, vt| a * vt[j] } } | |
| }) | |
| x = wo * heads + x | |
| # ── (2) Feed-forward MLP (pre-norm, with residual) ── | |
| x = fc2 * (fc1 * x.rmsnorm).relu + x | |
| end | |
| param(:lm_head) * x | |
| end | |
| # ─── Optimizer: Adam ──────────────────────────────────────────────────────── | |
| LEARNING_RATE = 0.01 | |
| BETA1 = 0.85 | |
| BETA2 = 0.99 | |
| EPS_ADAM = 1e-8 | |
| m_buf = Array.new(PARAMS.size, 0.0) # first moment buffer | |
| v_buf = Array.new(PARAMS.size, 0.0) # second moment buffer | |
| # ─── Training ─────────────────────────────────────────────────────────────── | |
| training_started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC) | |
| NUM_STEPS.times do |step| | |
| tokens = ENCODE[DOCS[step % DOCS.size]] # BOS-wrapped token ids | |
| pairs = tokens.each_cons(2).first(BLOCK_SIZE) # (input, target) windows | |
| # Forward the sequence, building up the compute graph one position at a time | |
| keys, values = fresh_kv, fresh_kv | |
| # Mean cross-entropy over the document. May yours be low. | |
| loss = pairs.each_with_index.sum { |(input, target), pos| | |
| -gpt(input, pos, keys, values).softmax[target].log | |
| } / pairs.size | |
| # Backward pass: fill in ∂loss/∂p for every parameter p | |
| loss.backward | |
| # Adam step (with linear learning-rate decay) | |
| lr_t = LEARNING_RATE * (1.0 - step.to_f / NUM_STEPS) | |
| bias_m = 1.0 - BETA1**(step + 1) | |
| bias_v = 1.0 - BETA2**(step + 1) | |
| PARAMS.each_with_index do |param, i| | |
| g = param.grad | |
| m_buf[i] = BETA1 * m_buf[i] + (1.0 - BETA1) * g | |
| v_buf[i] = BETA2 * v_buf[i] + (1.0 - BETA2) * g**2 | |
| param.data -= lr_t * (m_buf[i] / bias_m) / ((v_buf[i] / bias_v)**0.5 + EPS_ADAM) | |
| param.grad = 0.0 | |
| end | |
| printf("step %4d / %4d | loss %.4f\r", step + 1, NUM_STEPS, loss.data) | |
| end | |
| elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - training_started_at | |
| printf("\ntrained %d steps in %.1fs (%.2f steps/s)\n", NUM_STEPS, elapsed, NUM_STEPS / elapsed) | |
| # ─── Inference ────────────────────────────────────────────────────────────── | |
| # May the model babble back to us. | |
| puts "", "--- hallucinated names ---" | |
| NUM_SAMPLES.times do |i| | |
| keys, values = fresh_kv, fresh_kv | |
| token_id = BOS | |
| generated = [] | |
| BLOCK_SIZE.times do |pos_id| | |
| token_id = (gpt(token_id, pos_id, keys, values) / TEMPERATURE).softmax.sample_index | |
| break if token_id == BOS | |
| generated << token_id | |
| end | |
| printf("sample %2d: %s\n", i + 1, DECODE[generated]) | |
| end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment