microgpt
@karpathy · Last active February 12, 2026
"""
The most atomic way to train and inference a GPT in pure, dependency-free Python.
This file is the complete algorithm.
Everything else is just efficiency.
@karpathy
"""
import os # os.path.exists
import math # math.log, math.exp
import random # random.seed, random.choices, random.gauss, random.shuffle
# Let there be order among chaos
random.seed(42)
# Let there be an input dataset `docs`: list[str] of documents (e.g. a dataset of names)
if not os.path.exists('input.txt'):
    import urllib.request
    names_url = 'https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt'
    urllib.request.urlretrieve(names_url, 'input.txt')
docs = [l.strip() for l in open('input.txt').read().strip().split('\n') if l.strip()] # list[str] of documents
random.shuffle(docs)
print(f"num docs: {len(docs)}")
# Let there be a Tokenizer to translate strings to discrete symbols and back
chars = ['<BOS>'] + sorted(set(''.join(docs))) # character-level tokenizer with a BOS delimiter
vocab_size = len(chars)
stoi = { ch:i for i, ch in enumerate(chars) } # encoding: map string to integer
itos = { i:ch for i, ch in enumerate(chars) } # decoding: map integer to string
BOS = stoi['<BOS>']
print(f"vocab size: {vocab_size}")
# Let there be an Autograd to apply the chain rule recursively across a computation graph and so
# calculate the gradients of the loss with respect to model parameters.
class Value:
    """Stores a single scalar value and its gradient."""
    def __init__(self, data, _children=(), _op=''):
        self.data = data
        self.grad = 0
        self._backward = lambda: None
        self._prev = set(_children)
        self._op = _op # the op that produced this node, for graphviz / debugging / etc
    def __add__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data + other.data, (self, other), '+')
        def _backward():
            self.grad += out.grad
            other.grad += out.grad
        out._backward = _backward
        return out
    def __mul__(self, other):
        other = other if isinstance(other, Value) else Value(other)
        out = Value(self.data * other.data, (self, other), '*')
        def _backward():
            self.grad += other.data * out.grad
            other.grad += self.data * out.grad
        out._backward = _backward
        return out
    def __pow__(self, other):
        assert isinstance(other, (int, float)), "only supporting int/float powers for now"
        out = Value(self.data**other, (self,), f'**{other}')
        def _backward():
            self.grad += (other * self.data**(other-1)) * out.grad
        out._backward = _backward
        return out
    def log(self):
        out = Value(math.log(self.data), (self,), 'log')
        def _backward():
            self.grad += (1 / self.data) * out.grad
        out._backward = _backward
        return out
    def exp(self):
        out = Value(math.exp(self.data), (self,), 'exp')
        def _backward():
            self.grad += out.data * out.grad
        out._backward = _backward
        return out
    def relu(self):
        out = Value(0 if self.data < 0 else self.data, (self,), 'ReLU')
        def _backward():
            self.grad += (out.data > 0) * out.grad
        out._backward = _backward
        return out
    def backward(self):
        # topological order all of the children in the graph
        topo = []
        visited = set()
        def build_topo(v):
            if v not in visited:
                visited.add(v)
                for child in v._prev:
                    build_topo(child)
                topo.append(v)
        build_topo(self)
        # go one variable at a time and apply the chain rule to get its gradient
        self.grad = 1
        for v in reversed(topo):
            v._backward()
    def __neg__(self): return self * -1
    def __radd__(self, other): return self + other
    def __sub__(self, other): return self + (-other)
    def __rsub__(self, other): return other + (-self)
    def __rmul__(self, other): return self * other
    def __truediv__(self, other): return self * other**-1
    def __rtruediv__(self, other): return other * self**-1
    def __repr__(self): return f"Value(data={self.data}, grad={self.grad})"
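# (Illustrative sanity check, not part of the original gist.) A tiny expression exercises the
# autograd: for f(a, b) = (a * b + b).relu(), backward() should give df/da = b and
# df/db = a + 1 when the pre-activation is positive.
_a, _b = Value(2.0), Value(3.0)
_f = (_a * _b + _b).relu()
_f.backward()
assert _a.grad == 3.0 and _b.grad == 3.0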
# Initialize the parameters, to store the knowledge of the model.
n_embd = 16 # embedding dimension
n_head = 4 # number of attention heads
n_layer = 1 # number of layers
block_size = 8 # maximum sequence length
head_dim = n_embd // n_head # dimension of each head
matrix = lambda nout, nin, std=0.02: [[Value(random.gauss(0, std)) for _ in range(nin)] for _ in range(nout)]
state_dict = {'wte': matrix(vocab_size, n_embd), 'wpe': matrix(block_size, n_embd), 'lm_head': matrix(vocab_size, n_embd)}
for i in range(n_layer):
    state_dict[f'layer{i}.attn_wq'] = matrix(n_embd, n_embd)
    state_dict[f'layer{i}.attn_wk'] = matrix(n_embd, n_embd)
    state_dict[f'layer{i}.attn_wv'] = matrix(n_embd, n_embd)
    state_dict[f'layer{i}.attn_wo'] = matrix(n_embd, n_embd, std=0)
    state_dict[f'layer{i}.mlp_fc1'] = matrix(4 * n_embd, n_embd)
    state_dict[f'layer{i}.mlp_fc2'] = matrix(n_embd, 4 * n_embd, std=0)
params = [p for mat in state_dict.values() for row in mat for p in row] # flatten params into a single list[Value]
print(f"num params: {len(params)}")
# Define the model architecture: a stateless function mapping token sequence and parameters to logits over what comes next.
# Follow GPT-2, blessed among the GPTs, with minor differences: layernorm -> rmsnorm, no biases, GeLU -> ReLU^2
def linear(x, w):
    return [sum(wi * xi for wi, xi in zip(wo, x)) for wo in w]
def softmax(logits):
    max_val = max(val.data for val in logits)
    exps = [(val - max_val).exp() for val in logits]
    total = sum(exps)
    return [e / total for e in exps]
def rmsnorm(x):
    ms = sum(xi * xi for xi in x) / len(x)
    scale = (ms + 1e-5) ** -0.5
    return [xi * scale for xi in x]
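# (Illustrative note, not part of the original gist.) softmax subtracts the max logit before
# exponentiating purely for numerical stability: exp(z_i - max z) / sum_j exp(z_j - max z) is
# mathematically identical to exp(z_i) / sum_j exp(z_j), but avoids overflow for large logits.
# rmsnorm rescales x by 1 / sqrt(mean(x_i^2) + eps), so the output has roughly unit root-mean-square.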
def gpt(token_id, pos_id, keys, values):
    tok_emb = state_dict['wte'][token_id] # token embedding
    pos_emb = state_dict['wpe'][pos_id] # position embedding
    x = [t + p for t, p in zip(tok_emb, pos_emb)] # joint token and position embedding
    x = rmsnorm(x)
    for li in range(n_layer):
        # 1) Multi-head attention block
        x_residual = x
        x = rmsnorm(x)
        q = linear(x, state_dict[f'layer{li}.attn_wq'])
        k = linear(x, state_dict[f'layer{li}.attn_wk'])
        v = linear(x, state_dict[f'layer{li}.attn_wv'])
        keys[li].append(k)
        values[li].append(v)
        x_attn = []
        for h in range(n_head):
            hs = h * head_dim
            q_h = q[hs:hs+head_dim]
            k_h = [ki[hs:hs+head_dim] for ki in keys[li]]
            v_h = [vi[hs:hs+head_dim] for vi in values[li]]
            attn_logits = [sum(q_h[j] * k_h[t][j] for j in range(head_dim)) / head_dim**0.5 for t in range(len(k_h))]
            attn_weights = softmax(attn_logits)
            head_out = [sum(attn_weights[t] * v_h[t][j] for t in range(len(v_h))) for j in range(head_dim)]
            x_attn.extend(head_out)
        x = linear(x_attn, state_dict[f'layer{li}.attn_wo'])
        x = [a + b for a, b in zip(x, x_residual)]
        # 2) MLP block
        x_residual = x
        x = rmsnorm(x)
        x = linear(x, state_dict[f'layer{li}.mlp_fc1'])
        x = [xi.relu() ** 2 for xi in x]
        x = linear(x, state_dict[f'layer{li}.mlp_fc2'])
        x = [a + b for a, b in zip(x, x_residual)]
    logits = linear(x, state_dict['lm_head'])
    return logits
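# (Illustrative sanity check, not part of the original gist.) A single forward pass of the
# untrained model: feed the BOS token at position 0 with an empty KV cache and confirm that
# the model returns one logit per vocabulary entry.
_keys, _values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)]
_logits = gpt(BOS, 0, _keys, _values)
assert len(_logits) == vocab_size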
# Let there be Adam, the blessed optimizer and its buffers
learning_rate, beta1, beta2, eps_adam = 1e-2, 0.9, 0.95, 1e-8
m = [0.0] * len(params) # first moment buffer
v = [0.0] * len(params) # second moment buffer
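# (Illustrative note, not part of the original gist.) Each step below applies the standard Adam
# update with bias correction:
#   m_i <- beta1 * m_i + (1 - beta1) * g_i
#   v_i <- beta2 * v_i + (1 - beta2) * g_i^2
#   m_hat = m_i / (1 - beta1^t),  v_hat = v_i / (1 - beta2^t)
#   p_i <- p_i - lr_t * m_hat / (sqrt(v_hat) + eps)
# with a learning rate lr_t that decays linearly to zero over num_steps.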
# Repeat in sequence
num_steps = 500 # number of training steps
for step in range(num_steps):
    # Take single document, tokenize it, surround it with BOS special token on both sides
    doc = docs[step % len(docs)]
    tokens = [BOS] + [stoi[ch] for ch in doc] + [BOS]
    n = min(block_size, len(tokens) - 1)
    # Forward the token sequence through the model, building up the computation graph all the way to the loss.
    keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)]
    losses = []
    for pos_id in range(n):
        token_id, target_id = tokens[pos_id], tokens[pos_id + 1]
        logits = gpt(token_id, pos_id, keys, values)
        probs = softmax(logits)
        loss_t = -probs[target_id].log()
        losses.append(loss_t)
    loss = (1 / n) * sum(losses) # final average loss over the document sequence. May yours be low.
    # Backward the loss, calculating the gradients with respect to all model parameters.
    loss.backward()
    # Adam optimizer update: update the model parameters based on the corresponding gradients.
    lr_t = learning_rate * (1 - step / num_steps)
    for i, p in enumerate(params):
        m[i] = beta1 * m[i] + (1 - beta1) * p.grad
        v[i] = beta2 * v[i] + (1 - beta2) * p.grad ** 2
        m_hat = m[i] / (1 - beta1 ** (step + 1))
        v_hat = v[i] / (1 - beta2 ** (step + 1))
        p.data -= lr_t * m_hat / (v_hat ** 0.5 + eps_adam)
        p.grad = 0
    print(f"step {step+1:4d} / {num_steps:4d} | loss {loss.data:.4f}")
# Inference: may the model babble back to us
temperature = 0.6 # in (0, 1], control the "creativity" of generated text, low to high
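# (Illustrative note, not part of the original gist.) Dividing the logits by the temperature T
# before the softmax samples from p_i proportional to exp(logit_i / T): as T -> 0 this approaches
# greedy (argmax) decoding, while T = 1 samples from the model's unscaled distribution.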
print("\n--- inference ---")
for sample_idx in range(20):
    keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)]
    token_id = BOS
    print(f"sample {sample_idx+1}: ", end="")
    for pos_id in range(block_size):
        logits = gpt(token_id, pos_id, keys, values)
        probs = softmax([l / temperature for l in logits])
        token_id = random.choices(range(vocab_size), weights=[p.data for p in probs])[0]
        if token_id == BOS:
            break
        print(itos[token_id], end="")
    print()
@Nathan-Roll1

Where’s my signed (and numbered) karpathy original print?

@neohe-imer

Here is the same program, but in Haskell:

```haskell
{-# LANGUAGE RecordWildCards #-}
module Main where

import Control.Monad
import Data.Char
import Data.IORef
import qualified Data.Map.Strict as M
import qualified Data.Set as S
import System.Directory
import System.Process
import System.Random
import qualified Data.List as L

data Value = Value
{ vid :: !Int
, vdata :: !(IORef Double)
, vgrad :: !(IORef Double)
, vback :: !(IORef (IO ()))
, vprev :: !(IORef [Value])
, vop :: !String
}

type StateDict = M.Map String [[Value]]

newId :: IORef Int -> IO Int
newId r = atomicModifyIORef' r (\i -> (i + 1, i))

mkValue :: IORef Int -> Double -> [Value] -> String -> IO Value
mkValue idRef x prev op = do
i <- newId idRef
d <- newIORef x
g <- newIORef 0.0
b <- newIORef (pure ())
p <- newIORef prev
pure Value { vid = i, vdata = d, vgrad = g, vback = b, vprev = p, vop = op }

readData :: Value -> IO Double
readData Value{..} = readIORef vdata

readGrad :: Value -> IO Double
readGrad Value{..} = readIORef vgrad

addGrad :: Value -> Double -> IO ()
addGrad Value{..} x = modifyIORef' vgrad (+ x)

setGrad :: Value -> Double -> IO ()
setGrad Value{..} x = writeIORef vgrad x

setData :: Value -> Double -> IO ()
setData Value{..} x = writeIORef vdata x

setBack :: Value -> IO () -> IO ()
setBack Value{..} act = writeIORef vback act

getBack :: Value -> IO (IO ())
getBack Value{..} = readIORef vback

getPrev :: Value -> IO [Value]
getPrev Value{..} = readIORef vprev

valConst :: IORef Int -> Double -> IO Value
valConst idRef x = mkValue idRef x [] "const"

vAdd :: IORef Int -> Value -> Value -> IO Value
vAdd idRef a b = do
ad <- readData a
bd <- readData b
out <- mkValue idRef (ad + bd) [a,b] "+"
setBack out $ do
og <- readGrad out
addGrad a og
addGrad b og
pure out

vMul :: IORef Int -> Value -> Value -> IO Value
vMul idRef a b = do
ad <- readData a
bd <- readData b
out <- mkValue idRef (ad * bd) [a,b] "*"
setBack out $ do
og <- readGrad out
addGrad a (bd * og)
addGrad b (ad * og)
pure out

vNeg :: IORef Int -> Value -> IO Value
vNeg idRef a = do
m1 <- valConst idRef (-1.0)
vMul idRef a m1

vSub :: IORef Int -> Value -> Value -> IO Value
vSub idRef a b = do
nb <- vNeg idRef b
vAdd idRef a nb

vPow :: IORef Int -> Value -> Double -> IO Value
vPow idRef a pwr = do
ad <- readData a
out <- mkValue idRef (ad ** pwr) [a] ("**" ++ show pwr)
setBack out $ do
og <- readGrad out
addGrad a ((pwr * (ad ** (pwr - 1.0))) * og)
pure out

vLog :: IORef Int -> Value -> IO Value
vLog idRef a = do
ad <- readData a
out <- mkValue idRef (log ad) [a] "log"
setBack out $ do
og <- readGrad out
addGrad a ((1.0 / ad) * og)
pure out

vExp :: IORef Int -> Value -> IO Value
vExp idRef a = do
ad <- readData a
let ed = exp ad
out <- mkValue idRef ed [a] "exp"
setBack out $ do
og <- readGrad out
addGrad a (ed * og)
pure out

vReLU :: IORef Int -> Value -> IO Value
vReLU idRef a = do
ad <- readData a
let od = if ad < 0 then 0 else ad
out <- mkValue idRef od [a] "ReLU"
setBack out $ do
og <- readGrad out
addGrad a ((if od > 0 then 1 else 0) * og)
pure out

sumValues :: IORef Int -> [Value] -> IO Value
sumValues idRef xs = do
z <- valConst idRef 0.0
foldM (vAdd idRef) z xs

meanValues :: IORef Int -> [Value] -> IO Value
meanValues idRef xs = do
s <- sumValues idRef xs
n <- valConst idRef (fromIntegral (length xs))
inv <- vPow idRef n (-1.0)
vMul idRef s inv

backward :: Value -> IO ()
backward loss = do
topoRef <- newIORef ([] :: [Value])
let dfs visited v = do
let i = vid v
if S.member i visited
then pure visited
else do
prevs <- getPrev v
visited' <- foldM dfs (S.insert i visited) prevs
modifyIORef' topoRef (v :)
pure visited'
_ <- dfs S.empty loss
topo <- readIORef topoRef
setGrad loss 1.0
forM_ topo $ \v -> getBack v >>= id

boxMuller :: Double -> Double -> (Double, Double)
boxMuller u1 u2 =
let r = sqrt (-2.0 * log (max 1e-12 u1))
t = 2.0 * pi * u2
in (r * cos t, r * sin t)

gauss :: IORef StdGen -> Double -> Double -> IO Double
gauss genRef mu sigma = do
g <- readIORef genRef
let (u1, g1) = randomR (0.0, 1.0) g
(u2, g2) = randomR (0.0, 1.0) g1
(z0, _) = boxMuller u1 u2
writeIORef genRef g2
pure (mu + sigma * z0)

randInt :: IORef StdGen -> Int -> Int -> IO Int
randInt genRef lo hi = do
g <- readIORef genRef
let (x, g') = randomR (lo, hi) g
writeIORef genRef g'
pure x

shuffle :: IORef StdGen -> [a] -> IO [a]
shuffle genRef xs = do
let n = length xs
arr <- newIORef xs
let swapAt i j ys =
let xi = ys !! i
xj = ys !! j
in [ if k == i then xj else if k == j then xi else ys !! k | k <- [0..n-1] ]
forM_ [n-1, n-2 .. 1] $ \i -> do
j <- randInt genRef 0 i
ys <- readIORef arr
writeIORef arr (swapAt i j ys)
readIORef arr

matrix :: IORef Int -> IORef StdGen -> Int -> Int -> Double -> IO [[Value]]
matrix idRef genRef nout nin std = do
forM [1..nout] $ \_ ->
forM [1..nin] $ \_ -> do
x <- gauss genRef 0.0 std
mkValue idRef x [] "param"

linear :: IORef Int -> [Value] -> [[Value]] -> IO [Value]
linear idRef x w = forM w $ \wo -> do
prods <- forM (zip wo x) $ \(wi, xi) -> vMul idRef wi xi
sumValues idRef prods

softmax :: IORef Int -> [Value] -> IO [Value]
softmax idRef logits = do
ds <- mapM readData logits
let mx = maximum ds
mxv <- valConst idRef mx
exps <- forM logits $ \v -> do
dv <- vSub idRef v mxv
vExp idRef dv
total <- sumValues idRef exps
inv <- vPow idRef total (-1.0)
forM exps $ \e -> vMul idRef e inv

rmsnorm :: IORef Int -> [Value] -> IO [Value]
rmsnorm idRef x = do
sqs <- forM x $ \xi -> vMul idRef xi xi
ms <- meanValues idRef sqs
eps <- valConst idRef 1e-5
denom <- vAdd idRef ms eps
scale <- vPow idRef denom (-0.5)
forM x $ \xi -> vMul idRef xi scale

slice :: Int -> Int -> [a] -> [a]
slice s l = take l . drop s

zipWithM' :: (a -> b -> IO c) -> [a] -> [b] -> IO [c]
zipWithM' f as bs = sequence (zipWith f as bs)

gpt :: IORef Int
-> StateDict
-> Int -> Int
-> [[[[Value]]]] -> [[[[Value]]]]
-> Int -> Int -> Int -> Int -> Int
-> IO [Value]
gpt idRef st tokenId posId keys values nLayer nHead headDim nEmbd vocabSize = do
let wte = st M.! "wte"
wpe = st M.! "wpe"
let tokEmb = wte !! tokenId
posEmb = wpe !! posId
x0 <- zipWithM' (vAdd idRef) tokEmb posEmb
x1 <- rmsnorm idRef x0
foldM (\x li -> do
xRes1 <- pure x
xN1 <- rmsnorm idRef x
q <- linear idRef xN1 (st M.! ("layer" ++ show li ++ ".attn_wq"))
k <- linear idRef xN1 (st M.! ("layer" ++ show li ++ ".attn_wk"))
v <- linear idRef xN1 (st M.! ("layer" ++ show li ++ ".attn_wv"))
let keysLi = keys !! li
valuesLi = values !! li
let keysLi' = keysLi ++ [k]
valuesLi' = valuesLi ++ [v]
let keys' = take li keys ++ [keysLi'] ++ drop (li+1) keys
let values' = take li values ++ [valuesLi'] ++ drop (li+1) values
xAttnHeads <- forM [0..nHead-1] $ \h -> do
let hs = h * headDim
let qh = slice hs headDim q
let kh = map (slice hs headDim) (keys' !! li)
let vh = map (slice hs headDim) (values' !! li)
logits <- forM [0..length kh - 1] $ \t -> do
dots <- forM [0..headDim-1] $ \j -> vMul idRef (qh !! j) (kh !! t !! j)
s <- sumValues idRef dots
denom <- valConst idRef (sqrt (fromIntegral headDim))
inv <- vPow idRef denom (-1.0)
vMul idRef s inv
weights <- softmax idRef logits
forM [0..headDim-1] $ \j -> do
terms <- forM [0..length vh - 1] $ \t -> vMul idRef (weights !! t) (vh !! t !! j)
sumValues idRef terms
let xAttn = concat xAttnHeads
xProj <- linear idRef xAttn (st M.! ("layer" ++ show li ++ ".attn_wo"))
x2 <- zipWithM' (vAdd idRef) xProj xRes1
xRes2 <- pure x2
xN2 <- rmsnorm idRef x2
xFc1 <- linear idRef xN2 (st M.! ("layer" ++ show li ++ ".mlp_fc1"))
xAct <- forM xFc1 $ \xi -> do
r <- vReLU idRef xi
vPow idRef r 2.0
xFc2 <- linear idRef xAct (st M.! ("layer" ++ show li ++ ".mlp_fc2"))
zipWithM' (vAdd idRef) xFc2 xRes2
) x1 [0..nLayer-1] >>= \xFinal ->
linear idRef xFinal (st M.! "lm_head")

categorical :: IORef StdGen -> [Double] -> IO Int
categorical genRef ws = do
let total = sum ws
g <- readIORef genRef
let (r, g') = randomR (0.0, total) g
writeIORef genRef g'
pure (go r 0 ws)
where
go _ i [] = max 0 (i - 1)
go r i (w:rest) = if r <= w then i else go (r - w) (i + 1) rest

main :: IO ()
main = do
let seed = 42 :: Int
genRef <- newIORef (mkStdGen seed)
idRef <- newIORef 0

exists <- doesFileExist "input.txt"
unless exists $ do
let url = "https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt"
callCommand ("curl -L " ++ url ++ " -o input.txt")

raw <- readFile "input.txt"
let docs0 = filter (not . null) (map (dropWhileEnd isSpace . dropWhile isSpace) (lines raw))
docs <- shuffle genRef docs0
putStrLn ("num docs: " ++ show (length docs))

let chars = "<BOS>" : map (:[]) (L.sort . S.toList . S.fromList . concat $ docs)
let vocabSize = length chars
let stoi = M.fromList (zip chars [0..])
let itos = M.fromList (zip [0..] chars)
let bos = stoi M.! "<BOS>"
putStrLn ("vocab size: " ++ show vocabSize)

let nEmbd = 16
let nHead = 4
let nLayer = 1
let blockSize = 8
let headDim = nEmbd `div` nHead

wte <- matrix idRef genRef vocabSize nEmbd 0.02
wpe <- matrix idRef genRef blockSize nEmbd 0.02
lm <- matrix idRef genRef vocabSize nEmbd 0.02

let initSD = M.fromList [("wte", wte), ("wpe", wpe), ("lm_head", lm)]

sd <- foldM (\m li -> do
wq <- matrix idRef genRef nEmbd nEmbd 0.02
wk <- matrix idRef genRef nEmbd nEmbd 0.02
wv <- matrix idRef genRef nEmbd nEmbd 0.02
wo <- matrix idRef genRef nEmbd nEmbd 0.0
fc1 <- matrix idRef genRef (4 * nEmbd) nEmbd 0.02
fc2 <- matrix idRef genRef nEmbd (4 * nEmbd) 0.0
pure $
M.insert ("layer" ++ show li ++ ".attn_wq") wq $
M.insert ("layer" ++ show li ++ ".attn_wk") wk $
M.insert ("layer" ++ show li ++ ".attn_wv") wv $
M.insert ("layer" ++ show li ++ ".attn_wo") wo $
M.insert ("layer" ++ show li ++ ".mlp_fc1") fc1 $
M.insert ("layer" ++ show li ++ ".mlp_fc2") fc2 m
) initSD [0..nLayer-1]

let mats = M.elems sd
let params = [ p | mat <- mats, row <- mat, p <- row ]
putStrLn ("num params: " ++ show (length params))

let learningRate = 1e-2
let beta1 = 0.9
let beta2 = 0.95
let epsAdam = 1e-8

mBuf <- newIORef (replicate (length params) 0.0)
vBuf <- newIORef (replicate (length params) 0.0)

let numSteps = 500 :: Int

forM_ [0..numSteps-1] $ \step -> do
let doc = docs !! (step `mod` length docs)
let toks = [bos] ++ map (\ch -> stoi M.! [ch]) doc ++ [bos]
let n = min blockSize (length toks - 1)

let keys0 = replicate nLayer []
let values0 = replicate nLayer []

lossesRef <- newIORef ([] :: [Value])

let loopKV pos keys values
      | pos >= n = pure (keys, values)
      | otherwise = do
          let tokenId = toks !! pos
          let targetId = toks !! (pos + 1)
          logits <- gpt idRef sd tokenId pos keys values nLayer nHead headDim nEmbd vocabSize
          probs <- softmax idRef logits
          lt <- vLog idRef (probs !! targetId)
          nlt <- vNeg idRef lt
          modifyIORef' lossesRef (\ls -> ls ++ [nlt])
          loopKV (pos + 1) keys values

_ <- loopKV 0 keys0 values0

losses <- readIORef lossesRef
loss <- do
  s <- sumValues idRef losses
  invn <- valConst idRef (1.0 / fromIntegral n)
  vMul idRef invn s

backward loss

let lrT = learningRate * (1.0 - fromIntegral step / fromIntegral numSteps)

mList <- readIORef mBuf
vList <- readIORef vBuf

(mList', vList') <- foldM (\(ms, vs) (i, p) -> do
    g <- readGrad p
    let mi = beta1 * (ms !! i) + (1.0 - beta1) * g
    let vi = beta2 * (vs !! i) + (1.0 - beta2) * (g * g)
    let mHat = mi / (1.0 - beta1 ** fromIntegral (step + 1))
    let vHat = vi / (1.0 - beta2 ** fromIntegral (step + 1))
    d <- readData p
    let d' = d - lrT * mHat / (sqrt vHat + epsAdam)
    setData p d'
    setGrad p 0.0
    let ms' = take i ms ++ [mi] ++ drop (i+1) ms
    let vs' = take i vs ++ [vi] ++ drop (i+1) vs
    pure (ms', vs')
  ) (mList, vList) (zip [0..] params)

writeIORef mBuf mList'
writeIORef vBuf vList'

ld <- readData loss
putStrLn ("step " ++ pad 4 (show (step+1)) ++ " / " ++ pad 4 (show numSteps) ++ " | loss " ++ showFF 4 ld)

let temperature = 0.6
putStrLn "\n--- inference ---"
forM_ [1..20::Int] $ \sampleIdx -> do
let keys0 = replicate nLayer []
let values0 = replicate nLayer []
putStr ("sample " ++ show sampleIdx ++ ": ")
let genLoop pos tokenId keys values
| pos >= blockSize = putStrLn ""
| otherwise = do
logits <- gpt idRef sd tokenId pos keys values nLayer nHead headDim nEmbd vocabSize
tempV <- valConst idRef temperature
scaled <- forM logits $ \l -> do
invt <- vPow idRef tempV (-1.0)
vMul idRef l invt
probs <- softmax idRef scaled
ws <- mapM readData probs
next <- categorical genRef ws
if next == bos
then putStrLn ""
else do
let ch = itos M.! next
putStr ch
genLoop (pos + 1) next keys values
genLoop 0 bos keys0 values0

pad :: Int -> String -> String
pad n s = replicate (n - length s) ' ' ++ s

showFF :: Int -> Double -> String
showFF k x =
let p = 10 ^ k
y = fromIntegral (round (x * fromIntegral p) :: Int) / fromIntegral p
in show y

dropWhileEnd :: (a -> Bool) -> [a] -> [a]
dropWhileEnd f = reverse . dropWhile f . reverse
```

@ivanstepanovftw

thank you, functional programmer

@Cohegen

Cohegen commented Feb 12, 2026

Thanks for this awesome distillation

@aziz0x00

I recommend running this with `uv run --python pypy karpathy.py` for speed and better garbage collection; CPython gets heavy on memory.

@davidkimai

davidkimai commented Feb 12, 2026

> Let there be order among chaos
> Let there be Adam, the blessed optimizer and its buffers

Let there be art, and there was art.

Thank you @karpathy for your elegant work!

@mgalgs

mgalgs commented Feb 12, 2026

Hello, future history books 👋

@dkarapetyan-afk

Kinda weird how the dual vector version was deleted when it's actually a lot simpler.

@Nucs

Nucs commented Feb 12, 2026

It's prettier when you call it art.

@davidkimai

davidkimai commented Feb 12, 2026

uv run --python pypy microgpt.py

thanks for this

@mpmisko

mpmisko commented Feb 12, 2026

"final average loss over the document sequence. May yours be low." :)

@davidkimai

now everyone will be saying they trained a neural network from scratch haha @karpathy
