User: Design a simple language model based on the transformer architecture. You should include extensive comments that explain each part step by step for a newcomer to the field of machine learning.
Time: 43.49 seconds
Usage: {"prompt_tokens"=>40, "completion_tokens"=>4927, "total_tokens"=>4967, "completion_tokens_details"=>{"reasoning_tokens"=>896}}
Answer:
# Import necessary libraries
import torch # PyTorch library, provides tensors and neural network functionalities
import torch.nn as nn # Import the neural network module
import torch.nn.functional as F # Import functional module for activation functions
import math # Standard math library, used below for the square root and logarithm
# Let's begin by defining the positional encoding
# Positional encoding is used to add information about the position of each token in the sequence
# Since transformers have no inherent sense of position, we need to add this information
class PositionalEncoding(nn.Module):
def __init__(self, d_model, max_len=5000):
"""
Initializes the positional encoding module.
Args:
d_model: The dimension of the embedding vector.
max_len: The maximum length of the sequence.
"""
super(PositionalEncoding, self).__init__()
# Create a matrix of shape (max_len, d_model) to hold the positional encodings
pe = torch.zeros(max_len, d_model)
# Create a vector of positions [0, 1, 2, ..., max_len-1]. Shape: (max_len, 1)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
# Create a vector of dimensions [0, 1, 2, ..., d_model/2 -1]
# Each dimension will have a specific frequency
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
# Apply sine to even indices in the array; 2i
pe[:, 0::2] = torch.sin(position * div_term)
# Apply cosine to odd indices in the array; 2i+1
pe[:, 1::2] = torch.cos(position * div_term)
# Add a new dimension at the beginning for batch size compatibility
pe = pe.unsqueeze(0)
# Register pe as a buffer, which means it will not be updated during training
self.register_buffer('pe', pe)
def forward(self, x):
"""
Adds positional encoding to the input embeddings.
Args:
x: The input embeddings. Shape: (batch_size, seq_len, d_model)
Returns:
The embeddings with positional encoding added.
"""
# x is of shape [batch_size, seq_len, d_model]
# We need to make sure that pe is of the same shape as x
x = x + self.pe[:, :x.size(1), :]
return x
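# A quick illustrative check (optional, not used by the model below): pass a batch of all-zero
# "embeddings" through the module and confirm the shape is unchanged. The sizes 1, 10 and 16
# are arbitrary example values chosen for this sketch.
_pos_demo = PositionalEncoding(d_model=16)
_pos_out = _pos_demo(torch.zeros(1, 10, 16))
print(_pos_out.shape) # torch.Size([1, 10, 16]); each row now contains the sin/cos position code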
# Now, define the scaled dot-product attention mechanism
# This is the core of the transformer architecture
def scaled_dot_product_attention(query, key, value, mask=None):
"""
Calculate the attention weights.
Args:
query: Queries of shape (..., seq_len_q, depth)
key: Keys of shape (..., seq_len_k, depth)
value: Values of shape (..., seq_len_v, depth_v)
mask: Optional boolean tensor broadcastable to (..., seq_len_q, seq_len_k); positions where it is 0/False are hidden from attention
Returns:
output: Weighted sum of values
attention_weights: Attention weights
"""
# Calculate the dot products between the queries and the keys
matmul_qk = torch.matmul(query, key.transpose(-2, -1))
# Scale matmul_qk by the square root of the depth
d_k = key.size(-1)
scores = matmul_qk / math.sqrt(d_k)
# Apply the mask if provided
if mask is not None:
# Add a large negative value to the masked positions so that the softmax gives them zero weight
scores = scores.masked_fill(mask == 0, -1e9)
# Apply softmax to get the attention weights
attention_weights = F.softmax(scores, dim=-1)
# Multiply the attention weights by the values to get the output
output = torch.matmul(attention_weights, value)
return output, attention_weights
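# A quick illustrative check (optional, not part of the model): because of the softmax, the
# attention weights form a probability distribution over the keys, so they sum to 1 along the
# last dimension. The sizes 1, 3 and 4 are arbitrary example values.
_q = torch.rand(1, 3, 4) # (batch, seq_len_q, depth)
_k = torch.rand(1, 3, 4) # (batch, seq_len_k, depth)
_v = torch.rand(1, 3, 4) # (batch, seq_len_v, depth_v)
_out, _weights = scaled_dot_product_attention(_q, _k, _v)
print(_weights.sum(dim=-1)) # tensor([[1., 1., 1.]]) (up to floating-point rounding)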
# Define the Multi-Head Attention layer
class MultiHeadAttention(nn.Module):
def __init__(self, d_model, num_heads):
"""
Initializes the Multi-Head Attention layer.
Args:
d_model: The dimension of the embedding vector.
num_heads: The number of attention heads.
"""
super(MultiHeadAttention, self).__init__()
self.num_heads = num_heads
self.d_model = d_model
# Ensure that d_model is divisible by num_heads
assert d_model % num_heads == 0
# The dimension of each head
self.depth = d_model // num_heads
# Linear layers for queries, keys, and values
self.wq = nn.Linear(d_model, d_model) # Weight matrix for queries
self.wk = nn.Linear(d_model, d_model) # Weight matrix for keys
self.wv = nn.Linear(d_model, d_model) # Weight matrix for values
# Final linear layer after concatenating the heads
self.dense = nn.Linear(d_model, d_model)
def split_heads(self, x, batch_size):
"""
Split the last dimension into (num_heads, depth).
Transpose the result so that the shape is (batch_size, num_heads, seq_len, depth)
Args:
x: Tensor with shape (batch_size, seq_len, d_model)
batch_size: The batch size, used to reshape the tensor.
Returns:
Tensor with shape (batch_size, num_heads, seq_len, depth)
"""
x = x.view(batch_size, -1, self.num_heads, self.depth)
# Transpose for the shape (batch_size, num_heads, seq_len, depth)
return x.transpose(1,2)
def forward(self, v, k, q, mask):
"""
Perform the multi-head attention.
Args:
v: Values tensor
k: Keys tensor
q: Queries tensor
mask: Mask tensor
Returns:
output: The output tensor after attention
attention_weights: The attention weights from each head
"""
batch_size = q.size(0)
# Pass through the linear layers
q = self.wq(q) # (batch_size, seq_len_q, d_model)
k = self.wk(k) # (batch_size, seq_len_k, d_model)
v = self.wv(v) # (batch_size, seq_len_v, d_model)
# Split into multiple heads
q = self.split_heads(q, batch_size) # (batch_size, num_heads, seq_len_q, depth)
k = self.split_heads(k, batch_size) # (batch_size, num_heads, seq_len_k, depth)
v = self.split_heads(v, batch_size) # (batch_size, num_heads, seq_len_v, depth)
# Apply scaled dot-product attention
scaled_attention, attention_weights = scaled_dot_product_attention(q, k, v, mask)
# Concatenate the heads
scaled_attention = scaled_attention.transpose(1,2).contiguous().view(batch_size, -1, self.d_model)
# Pass through the final linear layer
output = self.dense(scaled_attention) # (batch_size, seq_len_q, d_model)
return output, attention_weights
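# Illustrative shape check (optional): run a random batch through one multi-head attention
# layer. The sizes 1, 5, 512 and 8 are example values only.
_x = torch.rand(1, 5, 512) # (batch_size, seq_len, d_model)
_mha = MultiHeadAttention(d_model=512, num_heads=8)
_mha_out, _mha_weights = _mha(_x, _x, _x, mask=None)
print(_mha_out.shape) # torch.Size([1, 5, 512]) -- same shape as the input
print(_mha_weights.shape) # torch.Size([1, 8, 5, 5]) -- one 5x5 attention map per head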
# Define the Feed Forward Neural Network
class PointwiseFeedForward(nn.Module):
def __init__(self, d_model, dff):
"""
Initializes the Feed Forward Neural Network.
Args:
d_model: The dimension of the embedding vector.
dff: The dimension of the hidden layer.
"""
super(PointwiseFeedForward, self).__init__()
self.linear1 = nn.Linear(d_model, dff)
self.linear2 = nn.Linear(dff, d_model)
def forward(self, x):
"""
Pass the input through the feed forward network.
Args:
x: The input tensor.
Returns:
The output tensor after feed forward network.
"""
return self.linear2(F.relu(self.linear1(x)))
# Define the Transformer Decoder Block
class DecoderLayer(nn.Module):
def __init__(self, d_model, num_heads, dff, dropout_rate=0.1):
"""
Initializes a single decoder layer.
Args:
d_model: The dimension of the embedding vector.
num_heads: The number of attention heads.
dff: The dimension of the feed forward network.
dropout_rate: Dropout rate.
"""
super(DecoderLayer, self).__init__()
# Multi-head self attention
self.mha = MultiHeadAttention(d_model, num_heads)
# Feed forward network
self.ffn = PointwiseFeedForward(d_model, dff)
# Layer normalization
self.layernorm1 = nn.LayerNorm(d_model)
self.layernorm2 = nn.LayerNorm(d_model)
# Dropout
self.dropout1 = nn.Dropout(dropout_rate)
self.dropout2 = nn.Dropout(dropout_rate)
def forward(self, x, mask):
"""
Forward pass through the decoder layer.
Args:
x: The input tensor. Shape: (batch_size, seq_len, d_model)
mask: The mask tensor.
Returns:
The output tensor after passing through this decoder layer.
"""
# Multi-head self attention (with look-ahead mask)
attn_output, _ = self.mha(x, x, x, mask) # (batch_size, seq_len, d_model)
attn_output = self.dropout1(attn_output)
# Add & Norm
out1 = self.layernorm1(x + attn_output) # (batch_size, seq_len, d_model)
# Feed forward network
ffn_output = self.ffn(out1) # (batch_size, seq_len, d_model)
ffn_output = self.dropout2(ffn_output)
# Add & Norm
out2 = self.layernorm2(out1 + ffn_output) # (batch_size, seq_len, d_model)
return out2
# Finally, define the Transformer decoder consisting of multiple decoder layers
class Decoder(nn.Module):
def __init__(self, vocab_size, d_model, num_layers, num_heads, dff, maximum_position_encoding, dropout_rate=0.1):
"""
Initializes the Transformer decoder.
Args:
vocab_size: Size of the vocabulary.
d_model: The dimension of the embedding vector.
num_layers: Number of decoder layers.
num_heads: Number of attention heads.
dff: Dimension of the feed forward network.
maximum_position_encoding: Maximum length of the input sequences.
dropout_rate: Dropout rate.
"""
super(Decoder, self).__init__()
self.d_model = d_model
self.num_layers = num_layers
# Embedding layer converts token indices to embeddings
self.embedding = nn.Embedding(vocab_size, d_model)
# Positional encoding adds information about the position of each token
self.pos_encoding = PositionalEncoding(d_model, maximum_position_encoding)
# Dropout layer
self.dropout = nn.Dropout(dropout_rate)
# Stack of decoder layers
self.dec_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, dff, dropout_rate) for _ in range(num_layers)])
def forward(self, x, mask):
"""
Forward pass through the decoder.
Args:
x: Input tensor of token indices. Shape: (batch_size, seq_len)
mask: Mask tensor.
Returns:
The output tensor from the decoder. Shape: (batch_size, seq_len, d_model)
"""
# Embed the input tokens
x = self.embedding(x) # (batch_size, seq_len, d_model)
x = x * math.sqrt(self.d_model) # Scale embeddings
# Add positional encoding
x = self.pos_encoding(x)
x = self.dropout(x)
# Pass through the decoder layers
for i in range(self.num_layers):
x = self.dec_layers[i](x, mask)
# x shape: (batch_size, seq_len, d_model)
return x
# Define the full Transformer model for language modeling
class TransformerLanguageModel(nn.Module):
def __init__(self, vocab_size, d_model=512, num_layers=6, num_heads=8, dff=2048, maximum_position_encoding=5000, dropout_rate=0.1):
"""
Initializes the Transformer language model.
Args:
vocab_size: Size of the vocabulary.
d_model: The dimension of the embedding vector.
num_layers: Number of decoder layers.
num_heads: Number of attention heads.
dff: Dimension of the feed forward network.
maximum_position_encoding: Maximum length of the input sequences.
dropout_rate: Dropout rate.
"""
super(TransformerLanguageModel, self).__init__()
# The decoder
self.decoder = Decoder(vocab_size, d_model, num_layers, num_heads, dff, maximum_position_encoding, dropout_rate)
# Final linear layer to project the decoder output to vocabulary size
self.final_layer = nn.Linear(d_model, vocab_size)
def create_masks(self, target_seq):
"""
Creates a mask for the target sequence to mask out future tokens (look-ahead mask).
Args:
target_seq: The target sequence tensor. Shape: (batch_size, seq_len)
Returns:
A boolean mask tensor. Shape: (1, 1, seq_len, seq_len), broadcastable over batch and heads.
True marks a position that may be attended to; False marks a future position to hide.
"""
seq_len = target_seq.size(1)
# Create a lower-triangular matrix of ones: position i may attend to positions 0..i only.
# The zeros in the upper triangle are the future positions; they become False here, which
# matches the `mask == 0` convention used in scaled_dot_product_attention above.
mask = torch.tril(torch.ones((seq_len, seq_len), device=target_seq.device)).bool()
# Add two leading dimensions so the mask broadcasts over batch size and attention heads
return mask.unsqueeze(0).unsqueeze(0) # Shape: (1, 1, seq_len, seq_len)
def forward(self, target_seq):
"""
Forward pass through the Transformer language model.
Args:
target_seq: The input sequence tensor. Shape: (batch_size, seq_len)
Returns:
The output logits for each token in the sequence. Shape: (batch_size, seq_len, vocab_size)
"""
# Create masks
mask = self.create_masks(target_seq)
# Pass through the decoder
dec_output = self.decoder(target_seq, mask) # (batch_size, seq_len, d_model)
# Project the output to vocabulary size
final_output = self.final_layer(dec_output) # (batch_size, seq_len, vocab_size)
return final_output
# Now, let's assume we have a vocabulary size of 10,000
vocab_size = 10000
# Instantiate the model
model = TransformerLanguageModel(vocab_size)
# For demonstration, let's create a dummy input
# Suppose we have a batch size of 2 and sequence length of 5
dummy_input = torch.randint(0, vocab_size, (2, 5)) # Random token indices in the range [0, vocab_size)
# Pass the dummy input through the model
output_logits = model(dummy_input)
# The output_logits is of shape (batch_size, seq_len, vocab_size)
print(output_logits.shape) # Should print: torch.Size([2, 5, 10000])
# This output can then be used to compute the loss and update the model weights during training
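# Optional illustration of the comment above: one way to turn these logits into a
# language-modeling loss. We predict token t+1 from token t, so the targets are simply the
# inputs shifted left by one position. This is a sketch, not part of the model definition.
targets = dummy_input[:, 1:] # (batch_size, seq_len - 1)
shifted_logits = output_logits[:, :-1, :] # Drop the last position, which has no target
loss = F.cross_entropy(shifted_logits.reshape(-1, vocab_size), targets.reshape(-1))
print(loss.item()) # For an untrained model this is close to log(vocab_size), roughly 9.2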
Explanation:
In this code, we've built a simple language model based on the Transformer architecture using PyTorch. Here's a step-by-step explanation of each part:
- Imports:
  - We import the PyTorch libraries needed for tensor operations and neural network layers, plus Python's standard math module used for the square-root and logarithm scaling factors.
- Positional Encoding:
  - Transformers lack inherent positional information, so we add positional encoding to provide the model with information about the position of each token in the sequence.
  - The PositionalEncoding class computes sine and cosine functions of different frequencies and adds them to the token embeddings.
- Scaled Dot-Product Attention:
  - The scaled_dot_product_attention function calculates attention weights and outputs for given queries, keys, and values.
  - It scales the dot products and applies a mask to prevent attending to future tokens during training.
- Multi-Head Attention:
  - The MultiHeadAttention class implements the multi-head attention mechanism by projecting the queries, keys, and values into multiple subspaces (heads), performing attention in parallel, and then concatenating the results.
  - This allows the model to attend to information from different representation subspaces jointly.
- Pointwise Feed Forward Network:
  - The PointwiseFeedForward class defines a simple feed-forward network applied to each position separately and identically.
  - It consists of two linear transformations with a ReLU activation in between.
- Decoder Layer:
  - The DecoderLayer class combines the multi-head attention and feed-forward network with residual connections and layer normalization.
  - It represents a single layer of the Transformer decoder.
- Decoder:
  - The Decoder class stacks multiple decoder layers and includes the embedding and positional encoding.
  - It processes the input sequence and produces encoded representations.
- Transformer Language Model:
  - The TransformerLanguageModel class wraps the decoder and adds a final linear layer to project the decoder outputs to the vocabulary size.
  - It includes a method to create masks that prevent the model from attending to future tokens (look-ahead mask).
- Model Initialization and Forward Pass:
  - We instantiate the model with a specified vocabulary size.
  - We create a dummy input to demonstrate how to pass data through the model.
  - The output logits can be used to compute the loss (e.g., using CrossEntropyLoss) and update the model during training.
Note for Newcomers:
- Tokens and Vocabulary:
  - Text data is typically converted into sequences of integers called tokens, where each unique word or character corresponds to a unique index in the vocabulary (a toy example appears in the sketches after this list).
- Embedding Layer:
  - Converts token indices into dense vectors (embeddings) that the model can process.
- Masking:
  - Masks are used to prevent the model from looking ahead during training, ensuring that predictions for a position i depend only on positions less than i (illustrated after this list).
- Training:
  - During training, you would feed batches of input sequences into the model, compute the loss against the target sequences, and update the model parameters using an optimizer like Adam (a minimal training step appears after this list).
- Inference:
  - During inference or text generation, you would use the model to predict the next token in a sequence, possibly using techniques like greedy decoding or beam search (see the greedy-decoding sketch after this list).
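To make these notes concrete, here are a few small, optional sketches. They are illustrative only: the toy vocabulary, the tiny example sizes, and the placeholder random batch are invented for this explanation and are not part of the model code above.

# 1) Tokens, vocabulary, and embeddings: a made-up six-word vocabulary and a single sentence.
toy_vocab = {"<pad>": 0, "the": 1, "cat": 2, "sat": 3, "on": 4, "mat": 5}
sentence = ["the", "cat", "sat", "on", "the", "mat"]
token_ids = torch.tensor([[toy_vocab[w] for w in sentence]]) # Shape: (1, 6)
embedding = nn.Embedding(num_embeddings=len(toy_vocab), embedding_dim=8)
token_vectors = embedding(token_ids) # Shape: (1, 6, 8) -- one 8-dimensional vector per token

# 2) The look-ahead (causal) mask for a sequence of length 4, as built by create_masks:
# row i shows which positions token i may attend to (1 = allowed, 0 = hidden future token).
print(torch.tril(torch.ones(4, 4)))
# tensor([[1., 0., 0., 0.],
#         [1., 1., 0., 0.],
#         [1., 1., 1., 0.],
#         [1., 1., 1., 1.]])

# 3) One training step with Adam and cross-entropy. The random batch stands in for real
# token ids from your own data pipeline; model and vocab_size come from the code above.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()
batch = torch.randint(0, vocab_size, (2, 5)) # Placeholder for a real batch of token ids
logits = model(batch) # (batch_size, seq_len, vocab_size)
# Predict token t+1 from token t: compare each position's logits with the next token
step_loss = criterion(logits[:, :-1, :].reshape(-1, vocab_size), batch[:, 1:].reshape(-1))
optimizer.zero_grad()
step_loss.backward()
optimizer.step()

# 4) Greedy decoding: repeatedly feed the sequence generated so far into the model and
# append the most likely next token. The two starting token ids are arbitrary.
model.eval()
generated = torch.tensor([[1, 2]]) # An invented two-token prompt
with torch.no_grad():
    for _ in range(3): # Generate three more tokens
        next_logits = model(generated)[:, -1, :] # Logits for the last position only
        next_id = next_logits.argmax(dim=-1, keepdim=True) # Pick the most likely token
        generated = torch.cat([generated, next_id], dim=1)
print(generated) # The prompt followed by three generated token ids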
This simplified Transformer model provides a foundational understanding of how language models based on the Transformer architecture operate. In practice, models like GPT-2 and GPT-3 are built on similar principles but with much larger sizes and additional optimizations.