Cameron R. Wolfe (wolfecameron)

@wolfecameron
wolfecameron / cross_attention.py
Last active April 2, 2025 03:54
An implementation of cross-attention in PyTorch.
import math
import torch
from torch import nn
import torch.nn.functional as F

class CrossAttention(nn.Module):
    def __init__(self, d):
        """
        Arguments:
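The preview above is cut off by the gist viewer, so a minimal sketch of single-head cross-attention follows. The single head, the shared dimension d for both sequences, and the absence of masking and dropout are assumptions made for brevity, not details taken from the gist.

import math

import torch
from torch import nn
import torch.nn.functional as F

class CrossAttention(nn.Module):
    def __init__(self, d):
        """
        Arguments:
        d: size of the embedding dimension (assumed shared by both sequences)
        """
        super().__init__()
        self.d = d

        # queries come from one sequence, keys/values from the other
        self.w_q = nn.Linear(d, d, bias=False)
        self.w_k = nn.Linear(d, d, bias=False)
        self.w_v = nn.Linear(d, d, bias=False)

    def forward(self, x_q, x_kv):
        # x_q: [B, T_q, d] (e.g., decoder states); x_kv: [B, T_kv, d] (e.g., encoder states)
        q = self.w_q(x_q)
        k = self.w_k(x_kv)
        v = self.w_v(x_kv)

        # scaled dot-product attention across the two sequences
        scores = q @ k.transpose(-2, -1) / math.sqrt(self.d)  # [B, T_q, T_kv]
        attn = F.softmax(scores, dim=-1)
        return attn @ v                                        # [B, T_q, d]

For example, CrossAttention(64)(torch.randn(2, 5, 64), torch.randn(2, 7, 64)) returns a [2, 5, 64] tensor: one attended vector per query token.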
@wolfecameron
wolfecameron / bidir_self_attn.py
Last active March 29, 2025 17:36
An implementation of bidirectional self-attention in PyTorch.
import math
import torch
from torch import nn
import torch.nn.functional as F

class SelfAttention(nn.Module):
    def __init__(self, d):
        """
        Arguments:
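This preview is also truncated, so here is a minimal single-head sketch of bidirectional self-attention. Because it is bidirectional, no causal mask is applied; the fused query/key/value projection and the lack of dropout are assumptions made for brevity.

import math

import torch
from torch import nn
import torch.nn.functional as F

class SelfAttention(nn.Module):
    def __init__(self, d):
        """
        Arguments:
        d: size of the embedding dimension (single attention head assumed)
        """
        super().__init__()
        self.d = d

        # one projection produces queries, keys, and values together
        self.w_qkv = nn.Linear(d, 3 * d, bias=False)

    def forward(self, x):
        # x: [B, T, d]; every token attends to every other token (no causal mask)
        q, k, v = self.w_qkv(x).split(self.d, dim=-1)

        scores = q @ k.transpose(-2, -1) / math.sqrt(self.d)  # [B, T, T]
        attn = F.softmax(scores, dim=-1)
        return attn @ v                                        # [B, T, d]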
@wolfecameron
wolfecameron / moe_block.py
Created March 6, 2025 22:24
MoE block for an MoE-based decoder-only transformer model in PyTorch.
from torch import nn

class MoEBlock(nn.Module):
    def __init__(
        self,
        d,
        H,
        C,
        n_exp,
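The preview stops partway through the constructor signature, so the sketch below shows one simplified way an MoE feed-forward block can be built: a linear softmax router picks the top-k experts per token, and each expert is a small two-layer MLP. Treating H as the expert hidden width, dropping the capacity argument C, and leaving out the attention sub-layer of the full decoder block are assumptions made for brevity, not details taken from the gist.

import torch
from torch import nn
import torch.nn.functional as F

class MoEBlock(nn.Module):
    def __init__(self, d, H, n_exp=8, top_k=2):
        """
        Arguments (hypothetical meanings):
        d: embedding dimension
        H: hidden dimension of each expert MLP
        n_exp: number of experts
        top_k: number of experts used per token
        """
        super().__init__()
        self.top_k = top_k

        # linear router that scores every expert for every token
        self.router = nn.Linear(d, n_exp, bias=False)

        # each expert is an independent two-layer MLP
        self.experts = nn.ModuleList(
            nn.Sequential(nn.Linear(d, H), nn.GELU(), nn.Linear(H, d))
            for _ in range(n_exp)
        )

    def forward(self, x):
        # x: [B, T, d] -> flatten tokens for routing
        B, T, d = x.shape
        tokens = x.reshape(-1, d)                          # [B*T, d]

        probs = F.softmax(self.router(tokens), dim=-1)     # [B*T, n_exp]
        top_p, top_i = probs.topk(self.top_k, dim=-1)      # both [B*T, top_k]

        out = torch.zeros_like(tokens)
        for k in range(self.top_k):
            for e, expert in enumerate(self.experts):
                sel = top_i[:, k] == e                     # tokens routed to expert e
                if sel.any():
                    out[sel] += top_p[sel, k].unsqueeze(-1) * expert(tokens[sel])
        return out.reshape(B, T, d)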
@wolfecameron
wolfecameron / expert_layer.py
Created March 6, 2025 21:17
PyTorch implementation of a feed-forward expert layer within an MoE.
"""
Based upon ColossalAI OpenMoE
"""
from torch import nn
class MOELayer(nn.Module):
def __init__(
self,
d,
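The preview again ends inside the constructor, so below is a compact sketch of an MoE layer in the Switch-Transformer style: every token is sent to its single highest-probability expert, and the expert output is scaled by that router probability. The d_ff argument and the top-1 routing choice are assumptions for this sketch, not the gist's actual design.

import torch
from torch import nn
import torch.nn.functional as F

class MOELayer(nn.Module):
    def __init__(self, d, d_ff, n_exp=8):
        super().__init__()
        # linear router plus one independent MLP per expert
        self.router = nn.Linear(d, n_exp, bias=False)
        self.experts = nn.ModuleList(
            nn.Sequential(nn.Linear(d, d_ff), nn.GELU(), nn.Linear(d_ff, d))
            for _ in range(n_exp)
        )

    def forward(self, x):
        # x: [B, T, d]
        B, T, d = x.shape
        tokens = x.reshape(-1, d)                          # [B*T, d]

        probs = F.softmax(self.router(tokens), dim=-1)     # [B*T, n_exp]
        gate, expert_idx = probs.max(dim=-1)               # top-1 expert per token

        out = torch.zeros_like(tokens)
        for e, expert in enumerate(self.experts):
            sel = expert_idx == e
            if sel.any():
                # scale each expert output by its router probability (the "gate")
                out[sel] = gate[sel].unsqueeze(-1) * expert(tokens[sel])
        return out.reshape(B, T, d)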
@wolfecameron
wolfecameron / router_z_loss.py
Created March 6, 2025 18:52
An implementation of the MoE router z-loss in PyTorch.
"""
Computes ST-MoE router z loss (https://arxiv.org/abs/2202.08906)
See equation (5) on page 7
"""
import torch
# constants
B = 16 # batch size
C = 256 # sequence length
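Completing the idea in the truncated preview, the sketch below computes the ST-MoE router z-loss: for each token, take the log-sum-exp of its router logits, square it, and average over all tokens. The random logits and the value of n_exp are placeholders for illustration.

import torch

# constants (placeholder values)
B = 16       # batch size
C = 256      # sequence length
n_exp = 8    # number of experts

# router logits for every token in the batch: [B, C, n_exp]
logits = torch.randn(B, C, n_exp)

# z-loss penalizes large router logits: mean over tokens of (log-sum-exp)^2
z = torch.logsumexp(logits, dim=-1)   # [B, C]
router_z_loss = (z ** 2).mean()
print(router_z_loss)

In the ST-MoE paper this term is added to the training loss with a small coefficient (1e-3).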
@wolfecameron
wolfecameron / load_balancing_loss.py
Created March 6, 2025 18:31
An implementation of the MoE load balancing loss in PyTorch.
"""
Computes Switch Transformer auxiliary loss (https://arxiv.org/abs/2101.03961)
See equations (4)-(6) on page 7
"""
import torch
import torch.nn.functional as F
# constants
B = 16 # batch size
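The sketch below completes the Switch Transformer auxiliary loss described above: f_i is the fraction of tokens whose top-1 expert is expert i, P_i is the mean router probability assigned to expert i, and the loss is n_exp times their dot product. The random logits and constants are placeholders for illustration.

import torch
import torch.nn.functional as F

# constants (placeholder values)
B = 16       # batch size
C = 256      # sequence length
n_exp = 8    # number of experts

# router logits and probabilities for every token: [B, C, n_exp]
logits = torch.randn(B, C, n_exp)
probs = F.softmax(logits, dim=-1)

# f_i: fraction of tokens dispatched (top-1) to expert i
top1 = probs.argmax(dim=-1)                            # [B, C]
f = F.one_hot(top1, n_exp).float().mean(dim=(0, 1))    # [n_exp]

# P_i: mean router probability assigned to expert i
P = probs.mean(dim=(0, 1))                             # [n_exp]

# auxiliary loss: n_exp * sum_i f_i * P_i (scaled by a coefficient alpha in practice)
load_balancing_loss = n_exp * torch.sum(f * P)
print(load_balancing_loss)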
@wolfecameron
wolfecameron / full_softmax_router.py
Last active March 6, 2025 21:15
Implementation of a fully-functional softmax routing mechanism with expert capacity.
import math
import torch
from torch import nn
from torch.nn import functional as F

class Router(nn.Module):
    def __init__(
        self,
        d,
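The preview ends inside the constructor, so here is a readable (loop-based, deliberately unoptimized) sketch of a softmax router with top-k selection and a per-expert capacity limit. The capacity formula (capacity_factor * top_k * n_tokens / n_exp) and the drop-on-overflow behavior follow the Switch/ST-MoE convention and are assumptions about this gist rather than its exact logic.

import torch
from torch import nn
from torch.nn import functional as F

class Router(nn.Module):
    def __init__(self, d, n_exp=8, top_k=2, capacity_factor=1.25):
        super().__init__()
        self.n_exp = n_exp
        self.top_k = top_k
        self.capacity_factor = capacity_factor
        self.w_g = nn.Linear(d, n_exp, bias=False)   # linear gating layer

    def forward(self, x):
        # x: [B, T, d] -> route each of the B*T tokens
        B, T, d = x.shape
        tokens = x.reshape(-1, d)
        n_tokens = tokens.shape[0]

        # per-expert capacity: how many tokens each expert may accept
        capacity = int(self.capacity_factor * self.top_k * n_tokens / self.n_exp)

        probs = F.softmax(self.w_g(tokens), dim=-1)    # [N, n_exp]
        top_p, top_i = probs.topk(self.top_k, dim=-1)  # both [N, top_k]

        # build a dispatch mask, dropping tokens that overflow an expert's capacity
        dispatch = torch.zeros(n_tokens, self.n_exp, dtype=torch.bool, device=x.device)
        used = [0] * self.n_exp
        for t in range(n_tokens):
            for k in range(self.top_k):
                e = int(top_i[t, k])
                if used[e] < capacity:
                    dispatch[t, e] = True
                    used[e] += 1

        # gate values only for (token, expert) pairs that were actually dispatched
        gates = probs * dispatch.float()               # [N, n_exp]
        return gates, dispatch

Real implementations replace the Python loop with cumulative-sum tricks over one-hot masks, but the dropped-token behavior is the same.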
@wolfecameron
wolfecameron / basic_softmax_router.py
Last active March 6, 2025 18:47
Implementation of a basic softmax routing mechanism for an MoE.
import torch
from torch import nn
from torch.nn import functional as F

class BasicSoftmaxRouter(nn.Module):
    def __init__(
        self,
        d,
        n_exp=8,
        top_k=2,
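A minimal sketch of the basic router described above follows: a single linear gating layer, a softmax over the experts, and a top-k selection per token. Returning both the top-k probabilities and the expert indices is an assumption about the interface.

import torch
from torch import nn
from torch.nn import functional as F

class BasicSoftmaxRouter(nn.Module):
    def __init__(self, d, n_exp=8, top_k=2):
        """
        Arguments:
        d: embedding dimension
        n_exp: number of experts to route between
        top_k: number of experts selected per token
        """
        super().__init__()
        self.top_k = top_k
        self.w_g = nn.Linear(d, n_exp, bias=False)   # linear gating layer

    def forward(self, x):
        # x: [B, T, d] -> expert probabilities for every token
        probs = F.softmax(self.w_g(x), dim=-1)           # [B, T, n_exp]

        # keep only the k most probable experts per token
        top_p, top_i = probs.topk(self.top_k, dim=-1)    # both [B, T, top_k]
        return top_p, top_i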
@wolfecameron
wolfecameron / expert_layer.py
Last active March 6, 2025 22:01
Expert layer for an MoE-based transformer.
"""
Based upon ColossalAI OpenMoE
"""
import torch
from torch import nn
class MLPExperts(nn.Module):
def __init__(
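The preview cuts off at the constructor, so the sketch below shows the usual batched-expert trick from the OpenMoE-style code the gist cites: every expert's MLP weights are stacked into 3-D tensors so all experts run in a single torch.bmm call. The d_ff and bias arguments and the GELU activation are assumptions.

import torch
from torch import nn
import torch.nn.functional as F

class MLPExperts(nn.Module):
    def __init__(self, d, n_exp=8, d_ff=None, bias=False):
        super().__init__()
        d_ff = d_ff or 4 * d
        self.bias = bias

        # one weight matrix per expert, stacked along the first dimension
        self.w1 = nn.Parameter(torch.randn(n_exp, d, d_ff) * 0.02)
        self.w2 = nn.Parameter(torch.randn(n_exp, d_ff, d) * 0.02)
        if bias:
            self.b1 = nn.Parameter(torch.zeros(n_exp, 1, d_ff))
            self.b2 = nn.Parameter(torch.zeros(n_exp, 1, d))

    def forward(self, x):
        # x: [n_exp, tokens_per_expert, d] -- tokens already grouped by expert
        h = torch.bmm(x, self.w1)            # [n_exp, tokens, d_ff]
        if self.bias:
            h = h + self.b1
        h = F.gelu(h)
        out = torch.bmm(h, self.w2)          # [n_exp, tokens, d]
        if self.bias:
            out = out + self.b2
        return out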
import torch
from transformers import AutoTokenizer
# load the llama-3.1 tokenizer
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-3.1-8B')
# raw text
text = "This raw text will be tokenized"
# create tokens using tokenizer
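The snippet above ends before the tokenization call itself; a hedged completion is below, using only standard Hugging Face tokenizer methods (encode, convert_ids_to_tokens, decode). Access to the gated meta-llama checkpoint on the Hugging Face Hub is assumed.

from transformers import AutoTokenizer

# load the llama-3.1 tokenizer (gated model: requires accepting the license on the Hub)
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-3.1-8B')

# raw text
text = "This raw text will be tokenized"

# create tokens using the tokenizer
token_ids = tokenizer.encode(text)                     # list of integer token IDs
tokens = tokenizer.convert_ids_to_tokens(token_ids)    # the corresponding string tokens
print(token_ids)
print(tokens)

# map the token IDs back to text
print(tokenizer.decode(token_ids))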