| """ | |
| The most atomic way to train and inference a GPT in pure, dependency-free Python. | |
| This file is the complete algorithm. | |
| Everything else is just efficiency. | |
| @karpathy | |
| """ | |
| import os # os.path.exists | |
| import math # math.log, math.exp | |
| import random # random.seed, random.choices, random.gauss, random.shuffle | |
| # Let there be order among chaos | |
| random.seed(42) | |
| # Let there be an input dataset `docs`: list[str] of documents (e.g. a dataset of names) | |
| if not os.path.exists('input.txt'): | |
| import urllib.request | |
| names_url = 'https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt' | |
| urllib.request.urlretrieve(names_url, 'input.txt') | |
| docs = [l.strip() for l in open('input.txt').read().strip().split('\n') if l.strip()] # list[str] of documents | |
| random.shuffle(docs) | |
| print(f"num docs: {len(docs)}") | |
| # Let there be a Tokenizer to translate strings to discrete symbols and back | |
| chars = ['<BOS>'] + sorted(set(''.join(docs))) # character-level tokenizer with a BOS delimiter | |
| vocab_size = len(chars) | |
| stoi = { ch:i for i, ch in enumerate(chars) } # encoding: map string to integer | |
| itos = { i:ch for i, ch in enumerate(chars) } # decoding: map integer to string | |
| BOS = stoi['<BOS>'] | |
| print(f"vocab size: {vocab_size}") | |
| # Let there be an Autograd to apply the chain rule recursively across a computation graph and so | |
| # calculate the gradients of the loss with respect to model parameters. | |
| class Value: | |
| """Stores a single scalar value and its gradient.""" | |
| def __init__(self, data, _children=(), _op=''): | |
| self.data = data | |
| self.grad = 0 | |
| self._backward = lambda: None | |
| self._prev = set(_children) | |
| self._op = _op # the op that produced this node, for graphviz / debugging / etc | |
| def __add__(self, other): | |
| other = other if isinstance(other, Value) else Value(other) | |
| out = Value(self.data + other.data, (self, other), '+') | |
| def _backward(): | |
| self.grad += out.grad | |
| other.grad += out.grad | |
| out._backward = _backward | |
| return out | |
| def __mul__(self, other): | |
| other = other if isinstance(other, Value) else Value(other) | |
| out = Value(self.data * other.data, (self, other), '*') | |
| def _backward(): | |
| self.grad += other.data * out.grad | |
| other.grad += self.data * out.grad | |
| out._backward = _backward | |
| return out | |
| def __pow__(self, other): | |
| assert isinstance(other, (int, float)), "only supporting int/float powers for now" | |
| out = Value(self.data**other, (self,), f'**{other}') | |
| def _backward(): | |
| self.grad += (other * self.data**(other-1)) * out.grad | |
| out._backward = _backward | |
| return out | |
| def log(self): | |
| out = Value(math.log(self.data), (self,), 'log') | |
| def _backward(): | |
| self.grad += (1 / self.data) * out.grad | |
| out._backward = _backward | |
| return out | |
| def exp(self): | |
| out = Value(math.exp(self.data), (self,), 'exp') | |
| def _backward(): | |
| self.grad += out.data * out.grad | |
| out._backward = _backward | |
| return out | |
| def relu(self): | |
| out = Value(0 if self.data < 0 else self.data, (self,), 'ReLU') | |
| def _backward(): | |
| self.grad += (out.data > 0) * out.grad | |
| out._backward = _backward | |
| return out | |
| def backward(self): | |
| # topological order all of the children in the graph | |
| topo = [] | |
| visited = set() | |
| def build_topo(v): | |
| if v not in visited: | |
| visited.add(v) | |
| for child in v._prev: | |
| build_topo(child) | |
| topo.append(v) | |
| build_topo(self) | |
| # go one variable at a time and apply the chain rule to get its gradient | |
| self.grad = 1 | |
| for v in reversed(topo): | |
| v._backward() | |
| def __neg__(self): return self * -1 | |
| def __radd__(self, other): return self + other | |
| def __sub__(self, other): return self + (-other) | |
| def __rsub__(self, other): return other + (-self) | |
| def __rmul__(self, other): return self * other | |
| def __truediv__(self, other): return self * other**-1 | |
| def __rtruediv__(self, other): return other * self**-1 | |
| def __repr__(self): return f"Value(data={self.data}, grad={self.grad})" | |
| # Initialize the parameters, to store the knowledge of the model. | |
| n_embd = 16 # embedding dimension | |
| n_head = 4 # number of attention heads | |
| n_layer = 1 # number of layers | |
| block_size = 8 # maximum sequence length | |
| head_dim = n_embd // n_head # dimension of each head | |
| matrix = lambda nout, nin, std=0.02: [[Value(random.gauss(0, std)) for _ in range(nin)] for _ in range(nout)] | |
| state_dict = {'wte': matrix(vocab_size, n_embd), 'wpe': matrix(block_size, n_embd), 'lm_head': matrix(vocab_size, n_embd)} | |
| for i in range(n_layer): | |
| state_dict[f'layer{i}.attn_wq'] = matrix(n_embd, n_embd) | |
| state_dict[f'layer{i}.attn_wk'] = matrix(n_embd, n_embd) | |
| state_dict[f'layer{i}.attn_wv'] = matrix(n_embd, n_embd) | |
| state_dict[f'layer{i}.attn_wo'] = matrix(n_embd, n_embd, std=0) | |
| state_dict[f'layer{i}.mlp_fc1'] = matrix(4 * n_embd, n_embd) | |
| state_dict[f'layer{i}.mlp_fc2'] = matrix(n_embd, 4 * n_embd, std=0) | |
| params = [p for mat in state_dict.values() for row in mat for p in row] # flatten params into a single list[Value] | |
| print(f"num params: {len(params)}") | |
| # Define the model architecture: a stateless function mapping token sequence and parameters to logits over what comes next. | |
| # Follow GPT-2, blessed among the GPTs, with minor differences: layernorm -> rmsnorm, no biases, GeLU -> ReLU^2 | |
| def linear(x, w): | |
| return [sum(wi * xi for wi, xi in zip(wo, x)) for wo in w] | |
| def softmax(logits): | |
| max_val = max(val.data for val in logits) | |
| exps = [(val - max_val).exp() for val in logits] | |
| total = sum(exps) | |
| return [e / total for e in exps] | |
| def rmsnorm(x): | |
| ms = sum(xi * xi for xi in x) / len(x) | |
| scale = (ms + 1e-5) ** -0.5 | |
| return [xi * scale for xi in x] | |
| def gpt(token_id, pos_id, keys, values): | |
| tok_emb = state_dict['wte'][token_id] # token embedding | |
| pos_emb = state_dict['wpe'][pos_id] # position embedding | |
| x = [t + p for t, p in zip(tok_emb, pos_emb)] # joint token and position embedding | |
| x = rmsnorm(x) | |
| for li in range(n_layer): | |
| # 1) Multi-head attention block | |
| x_residual = x | |
| x = rmsnorm(x) | |
| q = linear(x, state_dict[f'layer{li}.attn_wq']) | |
| k = linear(x, state_dict[f'layer{li}.attn_wk']) | |
| v = linear(x, state_dict[f'layer{li}.attn_wv']) | |
| keys[li].append(k) | |
| values[li].append(v) | |
| x_attn = [] | |
| for h in range(n_head): | |
| hs = h * head_dim | |
| q_h = q[hs:hs+head_dim] | |
| k_h = [ki[hs:hs+head_dim] for ki in keys[li]] | |
| v_h = [vi[hs:hs+head_dim] for vi in values[li]] | |
| attn_logits = [sum(q_h[j] * k_h[t][j] for j in range(head_dim)) / head_dim**0.5 for t in range(len(k_h))] | |
| attn_weights = softmax(attn_logits) | |
| head_out = [sum(attn_weights[t] * v_h[t][j] for t in range(len(v_h))) for j in range(head_dim)] | |
| x_attn.extend(head_out) | |
| x = linear(x_attn, state_dict[f'layer{li}.attn_wo']) | |
| x = [a + b for a, b in zip(x, x_residual)] | |
| # 2) MLP block | |
| x_residual = x | |
| x = rmsnorm(x) | |
| x = linear(x, state_dict[f'layer{li}.mlp_fc1']) | |
| x = [xi.relu() ** 2 for xi in x] | |
| x = linear(x, state_dict[f'layer{li}.mlp_fc2']) | |
| x = [a + b for a, b in zip(x, x_residual)] | |
| logits = linear(x, state_dict['lm_head']) | |
| return logits | |
| # Let there be Adam, the blessed optimizer and its buffers | |
| learning_rate, beta1, beta2, eps_adam = 1e-2, 0.9, 0.95, 1e-8 | |
| m = [0.0] * len(params) # first moment buffer | |
| v = [0.0] * len(params) # second moment buffer | |
| # Repeat in sequence | |
| num_steps = 500 # number of training steps | |
| for step in range(num_steps): | |
| # Take single document, tokenize it, surround it with BOS special token on both sides | |
| doc = docs[step % len(docs)] | |
| tokens = [BOS] + [stoi[ch] for ch in doc] + [BOS] | |
| n = min(block_size, len(tokens) - 1) | |
| # Forward the token sequence through the model, building up the computation graph all the way to the loss. | |
| keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)] | |
| losses = [] | |
| for pos_id in range(n): | |
| token_id, target_id = tokens[pos_id], tokens[pos_id + 1] | |
| logits = gpt(token_id, pos_id, keys, values) | |
| probs = softmax(logits) | |
| loss_t = -probs[target_id].log() | |
| losses.append(loss_t) | |
| loss = (1 / n) * sum(losses) # final average loss over the document sequence. May yours be low. | |
| # Backward the loss, calculating the gradients with respect to all model parameters. | |
| loss.backward() | |
| # Adam optimizer update: update the model parameters based on the corresponding gradients. | |
| lr_t = learning_rate * (1 - step / num_steps) | |
| for i, p in enumerate(params): | |
| m[i] = beta1 * m[i] + (1 - beta1) * p.grad | |
| v[i] = beta2 * v[i] + (1 - beta2) * p.grad ** 2 | |
| m_hat = m[i] / (1 - beta1 ** (step + 1)) | |
| v_hat = v[i] / (1 - beta2 ** (step + 1)) | |
| p.data -= lr_t * m_hat / (v_hat ** 0.5 + eps_adam) | |
| p.grad = 0 | |
| print(f"step {step+1:4d} / {num_steps:4d} | loss {loss.data:.4f}") | |
| # Inference: may the model babble back to us | |
| temperature = 0.6 # in (0, 1], control the "creativity" of generated text, low to high | |
| print("\n--- inference ---") | |
| for sample_idx in range(20): | |
| keys, values = [[] for _ in range(n_layer)], [[] for _ in range(n_layer)] | |
| token_id = BOS | |
| print(f"sample {sample_idx+1}: ", end="") | |
| for pos_id in range(block_size): | |
| logits = gpt(token_id, pos_id, keys, values) | |
| probs = softmax([l / temperature for l in logits]) | |
| token_id = random.choices(range(vocab_size), weights=[p.data for p in probs])[0] | |
| if token_id == BOS: | |
| break | |
| print(itos[token_id], end="") | |
| print() |
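A side note for anyone who wants to poke at the autograd core in isolation: below is a small, hypothetical sanity check (not part of the gist) that compares the gradient computed by `Value.backward()` against a centered finite difference. It assumes the code above is in scope (e.g. pasted after the `Value` class, or at the bottom of the file, or run in the same interpreter session); the expression `f` and the test constants are made up for illustration.

```python
# Hypothetical sanity check for the Value autograd above (not part of the gist).
# Assumes the Value class from the file above is already defined in this session.
def f(a, b):
    # an arbitrary scalar expression built only from the supported ops
    return (a * b + b.exp()).relu() ** 2 / (a + 3.0)

a, b = Value(1.5), Value(0.7)
out = f(a, b)
out.backward()
autograd_da = a.grad  # d(out)/da via the chain rule

# centered finite difference on the raw floats, for comparison
h = 1e-5
up = f(Value(1.5 + h), Value(0.7)).data
down = f(Value(1.5 - h), Value(0.7)).data
numeric_da = (up - down) / (2 * h)
print(autograd_da, numeric_da)  # the two estimates should agree closely
```

Nothing about the training loop changes; it is just a quick way to convince yourself that the ~60 lines of `Value` really do implement reverse-mode autodiff.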
Incredible!
labeled as an art project. reads like one too.
Wow... it looks like art 🎨
Thank you Andrej. This is beautifully clean and elegant.
Really really cool. World's most viewed gist incoming
Beautiful piece of art!
This code reminds me of the Pascal quote (often attributed to Mark Twain):
"I apologize for such a long letter - I didn't have time to write a short one"
goat
This is awesome. Can we get a non-minified version of this so that it is easier for us (humans) to read? :)
🔥
Absolute banger
absolute cinema
It's beautiful! Atomic GPT, built from the atom up!
very interested in the implications of this
Is there gonna be a redstone port?
you did all of this in 243 lines, that is insane
🐐
This in Julia would be insane
Opportunity for SOTA by deleting a few comments
Is it Python or you that wraps it in 243 lines? 🔥🔥
Beautiful
Now also available here on a single page :)
https://karpathy.ai/microgpt.html
🐑
This is absolute cinema!
Where’s my signed (and numbered) karpathy original print?
this is the same thing, but in Haskell:
```haskell
{-# LANGUAGE RecordWildCards #-}
module Main where

import Control.Monad
import Data.Char
import Data.IORef
import qualified Data.Map.Strict as M
import qualified Data.Set as S
import System.Directory
import System.Process
import System.Random
import qualified Data.List as L

-- A scalar node in the computation graph: value, gradient, backward closure,
-- and the parent nodes that produced it.
data Value = Value
  { vid   :: !Int
  , vdata :: !(IORef Double)
  , vgrad :: !(IORef Double)
  , vback :: !(IORef (IO ()))
  , vprev :: !(IORef [Value])
  , vop   :: !String
  }

type StateDict = M.Map String [[Value]]

newId :: IORef Int -> IO Int
newId r = atomicModifyIORef' r (\i -> (i + 1, i))

mkValue :: IORef Int -> Double -> [Value] -> String -> IO Value
mkValue idRef x prev op = do
  i <- newId idRef
  d <- newIORef x
  g <- newIORef 0.0
  b <- newIORef (pure ())
  p <- newIORef prev
  pure Value { vid = i, vdata = d, vgrad = g, vback = b, vprev = p, vop = op }

readData :: Value -> IO Double
readData Value{..} = readIORef vdata

readGrad :: Value -> IO Double
readGrad Value{..} = readIORef vgrad

addGrad :: Value -> Double -> IO ()
addGrad Value{..} x = modifyIORef' vgrad (+ x)

setGrad :: Value -> Double -> IO ()
setGrad Value{..} x = writeIORef vgrad x

setData :: Value -> Double -> IO ()
setData Value{..} x = writeIORef vdata x

setBack :: Value -> IO () -> IO ()
setBack Value{..} act = writeIORef vback act

getBack :: Value -> IO (IO ())
getBack Value{..} = readIORef vback

getPrev :: Value -> IO [Value]
getPrev Value{..} = readIORef vprev

valConst :: IORef Int -> Double -> IO Value
valConst idRef x = mkValue idRef x [] "const"

-- Differentiable primitive ops, mirroring the Python Value class.
vAdd :: IORef Int -> Value -> Value -> IO Value
vAdd idRef a b = do
  ad <- readData a
  bd <- readData b
  out <- mkValue idRef (ad + bd) [a, b] "+"
  setBack out $ do
    og <- readGrad out
    addGrad a og
    addGrad b og
  pure out

vMul :: IORef Int -> Value -> Value -> IO Value
vMul idRef a b = do
  ad <- readData a
  bd <- readData b
  out <- mkValue idRef (ad * bd) [a, b] "*"
  setBack out $ do
    og <- readGrad out
    addGrad a (bd * og)
    addGrad b (ad * og)
  pure out

vNeg :: IORef Int -> Value -> IO Value
vNeg idRef a = do
  m1 <- valConst idRef (-1.0)
  vMul idRef a m1

vSub :: IORef Int -> Value -> Value -> IO Value
vSub idRef a b = do
  nb <- vNeg idRef b
  vAdd idRef a nb

vPow :: IORef Int -> Value -> Double -> IO Value
vPow idRef a pwr = do
  ad <- readData a
  out <- mkValue idRef (ad ** pwr) [a] ("**" ++ show pwr)
  setBack out $ do
    og <- readGrad out
    addGrad a ((pwr * (ad ** (pwr - 1.0))) * og)
  pure out

vLog :: IORef Int -> Value -> IO Value
vLog idRef a = do
  ad <- readData a
  out <- mkValue idRef (log ad) [a] "log"
  setBack out $ do
    og <- readGrad out
    addGrad a ((1.0 / ad) * og)
  pure out

vExp :: IORef Int -> Value -> IO Value
vExp idRef a = do
  ad <- readData a
  let ed = exp ad
  out <- mkValue idRef ed [a] "exp"
  setBack out $ do
    og <- readGrad out
    addGrad a (ed * og)
  pure out

vReLU :: IORef Int -> Value -> IO Value
vReLU idRef a = do
  ad <- readData a
  let od = if ad < 0 then 0 else ad
  out <- mkValue idRef od [a] "ReLU"
  setBack out $ do
    og <- readGrad out
    addGrad a ((if od > 0 then 1 else 0) * og)
  pure out

sumValues :: IORef Int -> [Value] -> IO Value
sumValues idRef xs = do
  z <- valConst idRef 0.0
  foldM (vAdd idRef) z xs

meanValues :: IORef Int -> [Value] -> IO Value
meanValues idRef xs = do
  s <- sumValues idRef xs
  n <- valConst idRef (fromIntegral (length xs))
  inv <- vPow idRef n (-1.0)
  vMul idRef s inv

-- Reverse-mode autodiff: topologically sort the graph, then run each node's
-- backward closure from the loss down to the leaves.
backward :: Value -> IO ()
backward loss = do
  topoRef <- newIORef ([] :: [Value])
  let dfs visited v = do
        let i = vid v
        if S.member i visited
          then pure visited
          else do
            prevs <- getPrev v
            visited' <- foldM dfs (S.insert i visited) prevs
            modifyIORef' topoRef (v :)
            pure visited'
  _ <- dfs S.empty loss
  topo <- readIORef topoRef
  setGrad loss 1.0
  forM_ topo $ \v -> getBack v >>= id

boxMuller :: Double -> Double -> (Double, Double)
boxMuller u1 u2 =
  let r = sqrt (-2.0 * log (max 1e-12 u1))
      t = 2.0 * pi * u2
  in (r * cos t, r * sin t)

gauss :: IORef StdGen -> Double -> Double -> IO Double
gauss genRef mu sigma = do
  g <- readIORef genRef
  let (u1, g1) = randomR (0.0, 1.0) g
      (u2, g2) = randomR (0.0, 1.0) g1
      (z0, _) = boxMuller u1 u2
  writeIORef genRef g2
  pure (mu + sigma * z0)

randInt :: IORef StdGen -> Int -> Int -> IO Int
randInt genRef lo hi = do
  g <- readIORef genRef
  let (x, g') = randomR (lo, hi) g
  writeIORef genRef g'
  pure x

shuffle :: IORef StdGen -> [a] -> IO [a]
shuffle genRef xs = do
  let n = length xs
  arr <- newIORef xs
  let swapAt i j ys =
        let xi = ys !! i
            xj = ys !! j
        in [ if k == i then xj else if k == j then xi else ys !! k | k <- [0..n-1] ]
  forM_ [n-1, n-2 .. 1] $ \i -> do
    j <- randInt genRef 0 i
    ys <- readIORef arr
    writeIORef arr (swapAt i j ys)
  readIORef arr

matrix :: IORef Int -> IORef StdGen -> Int -> Int -> Double -> IO [[Value]]
matrix idRef genRef nout nin std = do
  forM [1..nout] $ \_ ->
    forM [1..nin] $ \_ -> do
      x <- gauss genRef 0.0 std
      mkValue idRef x [] "param"

linear :: IORef Int -> [Value] -> [[Value]] -> IO [Value]
linear idRef x w = forM w $ \wo -> do
  prods <- forM (zip wo x) $ \(wi, xi) -> vMul idRef wi xi
  sumValues idRef prods

softmax :: IORef Int -> [Value] -> IO [Value]
softmax idRef logits = do
  ds <- mapM readData logits
  let mx = maximum ds
  mxv <- valConst idRef mx
  exps <- forM logits $ \v -> do
    dv <- vSub idRef v mxv
    vExp idRef dv
  total <- sumValues idRef exps
  inv <- vPow idRef total (-1.0)
  forM exps $ \e -> vMul idRef e inv

rmsnorm :: IORef Int -> [Value] -> IO [Value]
rmsnorm idRef x = do
  sqs <- forM x $ \xi -> vMul idRef xi xi
  ms <- meanValues idRef sqs
  eps <- valConst idRef 1e-5
  denom <- vAdd idRef ms eps
  scale <- vPow idRef denom (-0.5)
  forM x $ \xi -> vMul idRef xi scale

slice :: Int -> Int -> [a] -> [a]
slice s l = take l . drop s

zipWithM' :: (a -> b -> IO c) -> [a] -> [b] -> IO [c]
zipWithM' f as bs = sequence (zipWith f as bs)

-- One forward pass of the transformer for a single token. Returns the logits
-- together with the KV cache extended by this position's keys and values, so
-- the caller can thread the cache through the sequence.
gpt :: IORef Int
    -> StateDict
    -> Int -> Int
    -> [[[Value]]] -> [[[Value]]]
    -> Int -> Int -> Int -> Int -> Int
    -> IO ([Value], [[[Value]]], [[[Value]]])
gpt idRef st tokenId posId keys values nLayer nHead headDim nEmbd vocabSize = do
  let wte = st M.! "wte"
      wpe = st M.! "wpe"
  let tokEmb = wte !! tokenId
      posEmb = wpe !! posId
  x0 <- zipWithM' (vAdd idRef) tokEmb posEmb
  x1 <- rmsnorm idRef x0
  (xFinal, keysOut, valuesOut) <-
    foldM
      (\(x, ks, vs) li -> do
          xRes1 <- pure x
          xN1 <- rmsnorm idRef x
          q <- linear idRef xN1 (st M.! ("layer" ++ show li ++ ".attn_wq"))
          k <- linear idRef xN1 (st M.! ("layer" ++ show li ++ ".attn_wk"))
          v <- linear idRef xN1 (st M.! ("layer" ++ show li ++ ".attn_wv"))
          let keysLi   = ks !! li
              valuesLi = vs !! li
          let keysLi'   = keysLi ++ [k]
              valuesLi' = valuesLi ++ [v]
          let keys'   = take li ks ++ [keysLi'] ++ drop (li+1) ks
          let values' = take li vs ++ [valuesLi'] ++ drop (li+1) vs
          xAttnHeads <- forM [0..nHead-1] $ \h -> do
            let hs = h * headDim
            let qh = slice hs headDim q
            let kh = map (slice hs headDim) (keys' !! li)
            let vh = map (slice hs headDim) (values' !! li)
            logits <- forM [0..length kh - 1] $ \t -> do
              dots <- forM [0..headDim-1] $ \j -> vMul idRef (qh !! j) (kh !! t !! j)
              s <- sumValues idRef dots
              denom <- valConst idRef (sqrt (fromIntegral headDim))
              inv <- vPow idRef denom (-1.0)
              vMul idRef s inv
            weights <- softmax idRef logits
            forM [0..headDim-1] $ \j -> do
              terms <- forM [0..length vh - 1] $ \t -> vMul idRef (weights !! t) (vh !! t !! j)
              sumValues idRef terms
          let xAttn = concat xAttnHeads
          xProj <- linear idRef xAttn (st M.! ("layer" ++ show li ++ ".attn_wo"))
          x2 <- zipWithM' (vAdd idRef) xProj xRes1
          xRes2 <- pure x2
          xN2 <- rmsnorm idRef x2
          xFc1 <- linear idRef xN2 (st M.! ("layer" ++ show li ++ ".mlp_fc1"))
          xAct <- forM xFc1 $ \xi -> do
            r <- vReLU idRef xi
            vPow idRef r 2.0
          xFc2 <- linear idRef xAct (st M.! ("layer" ++ show li ++ ".mlp_fc2"))
          x3 <- zipWithM' (vAdd idRef) xFc2 xRes2
          pure (x3, keys', values'))
      (x1, keys, values)
      [0..nLayer-1]
  logits <- linear idRef xFinal (st M.! "lm_head")
  pure (logits, keysOut, valuesOut)

categorical :: IORef StdGen -> [Double] -> IO Int
categorical genRef ws = do
  let total = sum ws
  g <- readIORef genRef
  let (r, g') = randomR (0.0, total) g
  writeIORef genRef g'
  pure (go r 0 ws)
  where
    go _ i [] = max 0 (i - 1)
    go r i (w:rest) = if r <= w then i else go (r - w) (i + 1) rest

main :: IO ()
main = do
  let seed = 42 :: Int
  genRef <- newIORef (mkStdGen seed)
  idRef <- newIORef 0
  -- download the names dataset on first run
  exists <- doesFileExist "input.txt"
  unless exists $ do
    let url = "https://raw.githubusercontent.com/karpathy/makemore/refs/heads/master/names.txt"
    callCommand ("curl -L " ++ url ++ " -o input.txt")
  raw <- readFile "input.txt"
  let docs0 = filter (not . null) (map (dropWhileEnd isSpace . dropWhile isSpace) (lines raw))
  docs <- shuffle genRef docs0
  putStrLn ("num docs: " ++ show (length docs))
  -- character-level tokenizer with a "<BOS>" delimiter, mirroring the Python version
  let chars = "<BOS>" : map (:[]) (L.sort . S.toList . S.fromList . concat $ docs)
  let vocabSize = length chars
  let stoi = M.fromList (zip chars [0..])
  let itos = M.fromList (zip [0..] chars)
  let bos = stoi M.! "<BOS>"
  putStrLn ("vocab size: " ++ show vocabSize)
  let nEmbd = 16
  let nHead = 4
  let nLayer = 1
  let blockSize = 8
  let headDim = nEmbd `div` nHead
  wte <- matrix idRef genRef vocabSize nEmbd 0.02
  wpe <- matrix idRef genRef blockSize nEmbd 0.02
  lm <- matrix idRef genRef vocabSize nEmbd 0.02
  let initSD = M.fromList [("wte", wte), ("wpe", wpe), ("lm_head", lm)]
  sd <- foldM (\m li -> do
          wq <- matrix idRef genRef nEmbd nEmbd 0.02
          wk <- matrix idRef genRef nEmbd nEmbd 0.02
          wv <- matrix idRef genRef nEmbd nEmbd 0.02
          wo <- matrix idRef genRef nEmbd nEmbd 0.0
          fc1 <- matrix idRef genRef (4 * nEmbd) nEmbd 0.02
          fc2 <- matrix idRef genRef nEmbd (4 * nEmbd) 0.0
          pure $
            M.insert ("layer" ++ show li ++ ".attn_wq") wq $
            M.insert ("layer" ++ show li ++ ".attn_wk") wk $
            M.insert ("layer" ++ show li ++ ".attn_wv") wv $
            M.insert ("layer" ++ show li ++ ".attn_wo") wo $
            M.insert ("layer" ++ show li ++ ".mlp_fc1") fc1 $
            M.insert ("layer" ++ show li ++ ".mlp_fc2") fc2 m
        ) initSD [0..nLayer-1]
  let mats = M.elems sd
  let params = [ p | mat <- mats, row <- mat, p <- row ]
  putStrLn ("num params: " ++ show (length params))
  -- Adam optimizer hyperparameters and moment buffers
  let learningRate = 1e-2
  let beta1 = 0.9
  let beta2 = 0.95
  let epsAdam = 1e-8
  mBuf <- newIORef (replicate (length params) 0.0)
  vBuf <- newIORef (replicate (length params) 0.0)
  let numSteps = 500 :: Int
  forM_ [0..numSteps-1] $ \step -> do
    let doc = docs !! (step `mod` length docs)
    let toks = [bos] ++ map (\ch -> stoi M.! [ch]) doc ++ [bos]
    let n = min blockSize (length toks - 1)
    let keys0 = replicate nLayer []
    let values0 = replicate nLayer []
    lossesRef <- newIORef ([] :: [Value])
    let loopKV pos keys values
          | pos >= n = pure (keys, values)
          | otherwise = do
              let tokenId = toks !! pos
              let targetId = toks !! (pos + 1)
              (logits, keys', values') <- gpt idRef sd tokenId pos keys values nLayer nHead headDim nEmbd vocabSize
              probs <- softmax idRef logits
              lt <- vLog idRef (probs !! targetId)
              nlt <- vNeg idRef lt
              modifyIORef' lossesRef (\ls -> ls ++ [nlt])
              -- carry the extended KV cache forward to the next position
              loopKV (pos + 1) keys' values'
    _ <- loopKV 0 keys0 values0
    losses <- readIORef lossesRef
    loss <- do
      s <- sumValues idRef losses
      invn <- valConst idRef (1.0 / fromIntegral n)
      vMul idRef invn s
    backward loss
    let lrT = learningRate * (1.0 - fromIntegral step / fromIntegral numSteps)
    mList <- readIORef mBuf
    vList <- readIORef vBuf
    (mList', vList') <- foldM (\(ms, vs) (i, p) -> do
          g <- readGrad p
          let mi = beta1 * (ms !! i) + (1.0 - beta1) * g
          let vi = beta2 * (vs !! i) + (1.0 - beta2) * (g * g)
          let mHat = mi / (1.0 - beta1 ** fromIntegral (step + 1))
          let vHat = vi / (1.0 - beta2 ** fromIntegral (step + 1))
          d <- readData p
          let d' = d - lrT * mHat / (sqrt vHat + epsAdam)
          setData p d'
          setGrad p 0.0
          let ms' = take i ms ++ [mi] ++ drop (i+1) ms
          let vs' = take i vs ++ [vi] ++ drop (i+1) vs
          pure (ms', vs')
        ) (mList, vList) (zip [0..] params)
    writeIORef mBuf mList'
    writeIORef vBuf vList'
    ld <- readData loss
    putStrLn ("step " ++ pad 4 (show (step+1)) ++ " / " ++ pad 4 (show numSteps) ++ " | loss " ++ showFF 4 ld)
  let temperature = 0.6
  putStrLn "\n--- inference ---"
  forM_ [1..20::Int] $ \sampleIdx -> do
    let keys0 = replicate nLayer []
    let values0 = replicate nLayer []
    putStr ("sample " ++ show sampleIdx ++ ": ")
    let genLoop pos tokenId keys values
          | pos >= blockSize = putStrLn ""
          | otherwise = do
              (logits, keys', values') <- gpt idRef sd tokenId pos keys values nLayer nHead headDim nEmbd vocabSize
              tempV <- valConst idRef temperature
              scaled <- forM logits $ \l -> do
                invt <- vPow idRef tempV (-1.0)
                vMul idRef l invt
              probs <- softmax idRef scaled
              ws <- mapM readData probs
              next <- categorical genRef ws
              if next == bos
                then putStrLn ""
                else do
                  let ch = itos M.! next
                  putStr ch
                  genLoop (pos + 1) next keys' values'
    genLoop 0 bos keys0 values0

pad :: Int -> String -> String
pad n s = replicate (n - length s) ' ' ++ s

showFF :: Int -> Double -> String
showFF k x =
  let p = 10 ^ k
      y = fromIntegral (round (x * fromIntegral p) :: Int) / fromIntegral p
  in show y

dropWhileEnd :: (a -> Bool) -> [a] -> [a]
dropWhileEnd f = reverse . dropWhile f . reverse
```
thank you, functional programmer
Thanks for this awesome distillation
GOAT