Skip to content

Instantly share code, notes, and snippets.

View wolfecameron's full-sized avatar

Cameron R. Wolfe wolfecameron

View GitHub Profile
@wolfecameron
wolfecameron / transformer_ffnn.py
Last active March 6, 2025 19:18
Feed-forward layer of a transformer.
"""
Source: https://github.com/karpathy/nanoGPT/blob/master/model.py
"""
from torch import nn
class MLP(nn.Module):
def __init__(
self,
@wolfecameron
wolfecameron / decoder_only_block.py
Last active March 6, 2025 22:11
Implementation of a decoder-only transformer block.
"""
Source: https://github.com/karpathy/nanoGPT/blob/master/model.py
"""
from torch import nn
class Block(nn.Module):
def __init__(
self,
d,
@wolfecameron
wolfecameron / gpt.py
Last active February 20, 2026 08:24
Implementation of a GPT-style decoder only transformer.
"""
Source: https://github.com/karpathy/nanoGPT/blob/master/model.py
"""
import torch
from torch import nn
import torch.nn.functional as F
class GPT(nn.Module):
@wolfecameron
wolfecameron / masked_self_attention.py
Last active August 24, 2025 07:18
Basic PyTorch implementation of masked self-attention with a single attention head.
"""
Source: https://github.com/karpathy/nanoGPT/blob/master/model.py
"""
import math
import torch
from torch import nn
import torch.nn.functional as F
class MaskedSelfAttention(nn.Module):
import torch
from transformers import AutoTokenizer
# load the Llama-3.1-8B tokenizer (original comment said "llama-3.2", but the
# checkpoint loaded below is meta-llama/Llama-3.1-8B)
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-3.1-8B')
# raw text to be converted into token IDs
text = "This raw text will be tokenized"
# create tokens using tokenizer (NOTE(review): the tokenization call itself is
# cut off in this preview — presumably tokenizer(text) follows; confirm in gist)
@wolfecameron
wolfecameron / expert_layer.py
Last active March 6, 2025 22:01
Expert layer for a MoE-based transformer.
"""
Based upon ColossalAI OpenMoE
"""
import torch
from torch import nn
class MLPExperts(nn.Module):
def __init__(
@wolfecameron
wolfecameron / basic_softmax_router.py
Last active March 6, 2025 18:47
Implementation of a basic softmax routing mechanism for an MoE.
import torch
from torch import nn
from torch.nn import functional as F
class BasicSoftmaxRouter(nn.Module):
def __init__(
self,
d,
n_exp = 8,
top_k = 2,
@wolfecameron
wolfecameron / full_softmax_router.py
Last active March 6, 2025 21:15
Implementation of a fully-functional softmax routing mechanism with expert capacity.
import math
import torch
from torch import nn
from torch.nn import functional as F
class Router(nn.Module):
def __init__(
self,
d,
@wolfecameron
wolfecameron / load_balancing_loss.py
Created March 6, 2025 18:31
An implementation of the MoE load balancing loss in PyTorch.
"""
Computes Switch Transformer auxiliary loss (https://arxiv.org/abs/2101.03961)
See equations (4)-(6) on page 7
"""
import torch
import torch.nn.functional as F
# constants
B = 16 # batch size
@wolfecameron
wolfecameron / router_z_loss.py
Created March 6, 2025 18:52
An implementation of the MoE router z-loss in PyTorch.
"""
Computes ST-MoE router z loss (https://arxiv.org/abs/2202.08906)
See equation (5) on page 7
"""
import torch
# constants
B = 16 # batch size
C = 256 # sequence length