Skip to content

Instantly share code, notes, and snippets.

View wolfecameron's full-sized avatar

Cameron R. Wolfe wolfecameron

View GitHub Profile
@wolfecameron
wolfecameron / transformer_ffnn.py
Last active March 6, 2025 19:18
Feed-forward layer of a transformer.
"""
Source: https://github.com/karpathy/nanoGPT/blob/master/model.py
"""
from torch import nn
class MLP(nn.Module):
def __init__(
self,
@wolfecameron
wolfecameron / decoder_only_block.py
Last active March 6, 2025 22:11
Implementation of a decoder-only transformer block.
"""
Source: https://github.com/karpathy/nanoGPT/blob/master/model.py
"""
from torch import nn
class Block(nn.Module):
def __init__(
self,
d,
@wolfecameron
wolfecameron / gpt.py
Last active February 20, 2026 08:24
Implementation of a GPT-style decoder only transformer.
"""
Source: https://github.com/karpathy/nanoGPT/blob/master/model.py
"""
import torch
from torch import nn
import torch.nn.functional as F
class GPT(nn.Module):
@wolfecameron
wolfecameron / masked_self_attention.py
Last active August 24, 2025 07:18
Basic PyTorch implementation of masked self-attention with a single attention head.
"""
Source: https://github.com/karpathy/nanoGPT/blob/master/model.py
"""
import math
import torch
from torch import nn
import torch.nn.functional as F
class MaskedSelfAttention(nn.Module):
import torch
from transformers import AutoTokenizer
# load the Llama-3.1-8B tokenizer (original comment said "llama-3.2", but the
# checkpoint loaded below is meta-llama/Llama-3.1-8B)
tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-3.1-8B')
# raw text to be converted into token IDs
text = "This raw text will be tokenized"
# create tokens using tokenizer (NOTE(review): the tokenization call itself is
# cut off in this preview — presumably tokenizer(text) follows; confirm in gist)
@wolfecameron
wolfecameron / expert_layer.py
Last active March 6, 2025 22:01
Expert layer for a MoE-based transformer.
"""
Based upon ColossalAI OpenMoE
"""
import torch
from torch import nn
class MLPExperts(nn.Module):
def __init__(
@wolfecameron
wolfecameron / basic_softmax_router.py
Last active March 6, 2025 18:47
Implementation of a basic softmax routing mechanism for an MoE.
import torch
from torch import nn
from torch.nn import functional as F
class BasicSoftmaxRouter(nn.Module):
def __init__(
self,
d,
n_exp = 8,
top_k = 2,
@wolfecameron
wolfecameron / full_softmax_router.py
Last active March 6, 2025 21:15
Implementation of a fully-functional softmax routing mechanism with expert capacity.
import math
import torch
from torch import nn
from torch.nn import functional as F
class Router(nn.Module):
def __init__(
self,
d,
@wolfecameron
wolfecameron / load_balancing_loss.py
Created March 6, 2025 18:31
An implementation of the MoE load balancing loss in PyTorch.
"""
Computes Switch Transformer auxiliary loss (https://arxiv.org/abs/2101.03961)
See equations (4)-(6) on page 7
"""
import torch
import torch.nn.functional as F
# constants
B = 16 # batch size
@wolfecameron
wolfecameron / router_z_loss.py
Created March 6, 2025 18:52
An implementation of the MoE router z-loss in PyTorch.
"""
Computes ST-MoE router z loss (https://arxiv.org/abs/2202.08906)
See equation (5) on page 7
"""
import torch
# constants
B = 16 # batch size
C = 256 # sequence length