Skip to content

Instantly share code, notes, and snippets.

@eadz
Last active April 13, 2026 09:16
Show Gist options
  • Select an option

  • Save eadz/4f5dedd55f309d9b28b22e8c8253d4cc to your computer and use it in GitHub Desktop.

Select an option

Save eadz/4f5dedd55f309d9b28b22e8c8253d4cc to your computer and use it in GitHub Desktop.
llm.rb
# frozen_string_literal: true
# ════════════════════════════════════════════════════════════════════════════
# llm.rb — a GPT, trained and sampled in pure, dependency-free Ruby.
# Ported from @karpathy's microgpt.py. Every line below is the algorithm;
# everything else — tensors, GPUs, frameworks — is merely efficiency.
# ════════════════════════════════════════════════════════════════════════════
require 'net/http'
require 'uri'
srand 42 # let there be order among chaos
$stdout.sync = true # so progress lines flush as they're written
# ─── Dataset ────────────────────────────────────────────────────────────────
# Each document is a line in the corpus (e.g. a single name)
NAMES_URL = 'https://raw.githubusercontent.com/karpathy/makemore/988aa59/names.txt'
File.write('input.txt', Net::HTTP.get(URI(NAMES_URL))) unless File.exist?('input.txt')
DOCS = File.readlines('input.txt', chomp: true).reject(&:empty?).shuffle
# ─── Tokenizer ──────────────────────────────────────────────────────────────
# Character-level: unique chars become token ids 0..n-1, then a BOS on top.
UCHARS = DOCS.join.chars.uniq.sort
CHAR_TO_ID = UCHARS.each_with_index.to_h
BOS = UCHARS.size # beginning-of-sequence token id
VOCAB_SIZE = UCHARS.size + 1 # +1 for BOS
# string ⇄ token-id translation
ENCODE = ->(doc) { [BOS, *doc.chars.map { CHAR_TO_ID.fetch(it) }, BOS] }
DECODE = ->(ids) { ids.map { UCHARS[it] }.join }
puts "num docs: #{DOCS.size}", "vocab size: #{VOCAB_SIZE}"
# ─── Hyperparameters ────────────────────────────────────────────────────────
N_LAYER = 1 # transformer depth
N_EMBD = 16 # embedding dimension (width)
BLOCK_SIZE = 16 # context length of the attention window
N_HEAD = 4 # number of attention heads
HEAD_DIM = N_EMBD / N_HEAD # dimension of each head
ATTN_SCALE = HEAD_DIM**0.5 # divisor inside the attention softmax
INIT_STD = 0.08 # initial weight spread
RMSNORM_EPS = 1e-5 # numerical guard inside rmsnorm
# ─── Run config ─────────────────────────────────────────────────────────────
NUM_STEPS = Integer(ENV.fetch('STEPS', 1000))
NUM_SAMPLES = Integer(ENV.fetch('SAMPLES', 20))
TEMPERATURE = Float(ENV.fetch( 'TEMPERATURE', 0.5))
# ─── Autograd ───────────────────────────────────────────────────────────────
# Recursively apply the chain rule through a computation graph of scalar nodes
class Value
attr_accessor :data, :grad
attr_reader :children, :local_grads
# data: scalar value at this node (forward)
# grad: derivative of the loss w.r.t. this node (backward)
# children: upstream nodes in the compute graph
# local_grads: ∂self/∂child, paired 1:1 with children
def initialize(data, children = [], local_grads = [])
@data, @grad = data.to_f, 0.0
@children, @local_grads = children, local_grads
end
# Lets `1 - value` work as `Value(1) - value`
def coerce(other) = [Value(other), self]
def +(other)
o = Value(other)
Value.new(@data + o.data, [self, o], [1.0, 1.0])
end
def *(other)
o = Value(other)
Value.new(@data * o.data, [self, o], [o.data, @data])
end
def **(k) = Value.new(@data**k, [self], [k * (@data**(k - 1.0))]) # scalar k
def -@ = Value.new(-@data, [self], [-1.0])
def -(o) = self + (-o)
def /(o) = self * (Value(o)**-1.0)
def inspect = "Value(#{@data.round(4)}, grad=#{@grad.round(4)})"
def to_f = @data
def zero_grad = (@grad = 0.0)
def log = Value.new(Math.log(@data), [self], [1.0 / @data])
def exp = Math.exp(@data).then { |e| Value.new(e, [self], [e]) }
def relu
on = @data.positive?
Value.new(on ? @data : 0.0, [self], [on ? 1.0 : 0.0])
end
def backward
# Iterative post-order DFS — recursion would blow the stack on deep graphs.
topo = []
seen = {}.compare_by_identity
stack = [[self, 0]]
seen[self] = true
until stack.empty?
node, i = stack.last
if i < node.children.size
stack.last[1] = i + 1
child = node.children[i]
next if seen[child]
seen[child] = true
stack << [child, 0]
else
topo << stack.pop.first
end
end
@grad = 1.0
topo.reverse_each do |v|
v.children.zip(v.local_grads) { |c, g| c.grad += g * v.grad }
end
end
end
# Kernel-style factory — `Value(x)` mirrors `Tensor(x)` and `Integer("42")`.
def Value(x) = x.is_a?(Value) ? x : Value.new(x)
# ─── Tensor: a 1-D vector of Values that does math ─────────────────────────
# With operator overloading and a handful of methods, every line of the
# transformer reads like the math it implements.
class Tensor
include Enumerable
attr_reader :elems
# Bracket constructor — `Tensor[1.0, 2.0]` mirrors `Hash[]`, `Array[]`, `Set[]`.
def self.[](*elems) = new(elems)
def initialize(elems) = @elems = elems
def each(&b) = @elems.each(&b)
def [](i, len = nil) = len ? Tensor.new(@elems[i, len]) : @elems[i]
def size = @elems.size
def inspect
preview = @elems.first(3).map { it.data.round(3) }.join(', ')
preview += ', …' if @elems.size > 3
"Tensor[#{@elems.size}](#{preview})"
end
def +(other) = Tensor.new(@elems.zip(other.elems).map { |a, b| a + b })
def -(other) = Tensor.new(@elems.zip(other.elems).map { |a, b| a - b })
def *(scalar) = Tensor.new(@elems.map { |v| v * scalar })
def /(scalar) = Tensor.new(@elems.map { |v| v / scalar })
def dot(other) = @elems.zip(other.elems).sum { |a, b| a * b }
def relu = Tensor.new(@elems.map(&:relu))
def rmsnorm
ms = @elems.sum { |v| v * v } / @elems.size.to_f
scale = (ms + RMSNORM_EPS)**-0.5
self * scale
end
def softmax
shift = @elems.map(&:data).max
exps = @elems.map { |v| (v - shift).exp }
total = exps.sum
Tensor.new(exps.map { |e| e / total })
end
# Draw an index from this Tensor as a discrete distribution (data only).
def sample_index
weights = @elems.map(&:data)
target = rand * weights.sum
running = 0.0
weights.each_with_index do |w, i|
running += w
return i if running >= target
end
weights.size - 1
end
end
# Kernel-style factory — lets us write `Tensor(xs)` like `Integer("42")`.
def Tensor(xs) = xs.is_a?(Tensor) ? xs : Tensor.new(xs)
# ─── Matrix: rows of Tensors, with a matrix-vector product ─────────────────
class Matrix
include Enumerable
attr_reader :rows
def initialize(rows) = @rows = rows
def each(&b) = @rows.each(&b)
def [](i) = @rows[i]
def size = @rows.size
def values = @rows.flat_map(&:elems) # every Value, flat
# matrix-vector product: (m × n) · (n) → (m)
def *(x) = Tensor.new(@rows.map { |row| row.dot(x) })
end
# ─── Random helpers ─────────────────────────────────────────────────────────
# Box-Muller: two uniforms in, one standard normal out.
def random_gauss(mean = 0.0, std = 1.0)
u1 = [rand, 1e-10].max # guard log(0)
u2 = rand
Math.sqrt(-2.0 * Math.log(u1)) * Math.cos(2.0 * Math::PI * u2) * std + mean
end
# ─── Parameters: the model's knowledge, a bag of Values ────────────────────
def matrix(rows, cols, std = INIT_STD)
Matrix.new(Array.new(rows) {
Tensor.new(Array.new(cols) { Value.new(random_gauss(0.0, std)) })
})
end
LAYER_SHAPES = {
attn_wq: [N_EMBD, N_EMBD],
attn_wk: [N_EMBD, N_EMBD],
attn_wv: [N_EMBD, N_EMBD],
attn_wo: [N_EMBD, N_EMBD],
mlp_fc1: [4 * N_EMBD, N_EMBD],
mlp_fc2: [N_EMBD, 4 * N_EMBD]
}
STATE_DICT = {
wte: matrix(VOCAB_SIZE, N_EMBD),
wpe: matrix(BLOCK_SIZE, N_EMBD),
lm_head: matrix(VOCAB_SIZE, N_EMBD)
}
N_LAYER.times do |li|
LAYER_SHAPES.each { |name, (r, c)| STATE_DICT[:"layer#{li}.#{name}"] = matrix(r, c) }
end
PARAMS = STATE_DICT.values.flat_map(&:values)
puts "num params: #{PARAMS.size}", ""
# ─── Model: a decoder-only transformer ──────────────────────────────────────
# One forward pass: one token in, one distribution over next tokens out.
# `keys`/`values` are the KV cache — one array per layer, grown as we go.
def param(name) = STATE_DICT.fetch(name)
def layer(li, name) = STATE_DICT.fetch(:"layer#{li}.#{name}")
def fresh_kv = Array.new(N_LAYER) { [] }
def gpt(token_id, pos_id, keys, values)
x = (param(:wte)[token_id] + param(:wpe)[pos_id]).rmsnorm
N_LAYER.times do |li|
wq, wk, wv, wo, fc1, fc2 =
%i[attn_wq attn_wk attn_wv attn_wo mlp_fc1 mlp_fc2].map { layer(li, it) }
# ── (1) Multi-head causal self-attention (pre-norm, with residual) ──
xn = x.rmsnorm
q, k, v = wq * xn, wk * xn, wv * xn
keys[li] << k # append to KV cache; causality is implicit because
values[li] << v # each step only attends over keys/values seen so far
heads = Tensor(N_HEAD.times.flat_map { |h|
lo = h * HEAD_DIM
qh = q[lo, HEAD_DIM]
kh = keys[li].map { it[lo, HEAD_DIM] }
vh = values[li].map { it[lo, HEAD_DIM] }
attn = Tensor(kh.map { qh.dot(it) / ATTN_SCALE }).softmax
# weighted sum over value vectors: Σ_t attn[t] · v_t
Array.new(HEAD_DIM) { |j| attn.zip(vh).sum { |a, vt| a * vt[j] } }
})
x = wo * heads + x
# ── (2) Feed-forward MLP (pre-norm, with residual) ──
x = fc2 * (fc1 * x.rmsnorm).relu + x
end
param(:lm_head) * x
end
# ─── Optimizer: Adam ────────────────────────────────────────────────────────
LEARNING_RATE = 0.01
BETA1 = 0.85
BETA2 = 0.99
EPS_ADAM = 1e-8
m_buf = Array.new(PARAMS.size, 0.0) # first moment buffer
v_buf = Array.new(PARAMS.size, 0.0) # second moment buffer
# ─── Training ───────────────────────────────────────────────────────────────
training_started_at = Process.clock_gettime(Process::CLOCK_MONOTONIC)
NUM_STEPS.times do |step|
tokens = ENCODE[DOCS[step % DOCS.size]] # BOS-wrapped token ids
pairs = tokens.each_cons(2).first(BLOCK_SIZE) # (input, target) windows
# Forward the sequence, building up the compute graph one position at a time
keys, values = fresh_kv, fresh_kv
# Mean cross-entropy over the document. May yours be low.
loss = pairs.each_with_index.sum { |(input, target), pos|
-gpt(input, pos, keys, values).softmax[target].log
} / pairs.size
# Backward pass: fill in ∂loss/∂p for every parameter p
loss.backward
# Adam step (with linear learning-rate decay)
lr_t = LEARNING_RATE * (1.0 - step.to_f / NUM_STEPS)
bias_m = 1.0 - BETA1**(step + 1)
bias_v = 1.0 - BETA2**(step + 1)
PARAMS.each_with_index do |param, i|
g = param.grad
m_buf[i] = BETA1 * m_buf[i] + (1.0 - BETA1) * g
v_buf[i] = BETA2 * v_buf[i] + (1.0 - BETA2) * g**2
param.data -= lr_t * (m_buf[i] / bias_m) / ((v_buf[i] / bias_v)**0.5 + EPS_ADAM)
param.grad = 0.0
end
printf("step %4d / %4d | loss %.4f\r", step + 1, NUM_STEPS, loss.data)
end
elapsed = Process.clock_gettime(Process::CLOCK_MONOTONIC) - training_started_at
printf("\ntrained %d steps in %.1fs (%.2f steps/s)\n", NUM_STEPS, elapsed, NUM_STEPS / elapsed)
# ─── Inference ──────────────────────────────────────────────────────────────
# May the model babble back to us.
puts "", "--- hallucinated names ---"
NUM_SAMPLES.times do |i|
keys, values = fresh_kv, fresh_kv
token_id = BOS
generated = []
BLOCK_SIZE.times do |pos_id|
token_id = (gpt(token_id, pos_id, keys, values) / TEMPERATURE).softmax.sample_index
break if token_id == BOS
generated << token_id
end
printf("sample %2d: %s\n", i + 1, DECODE[generated])
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment