@pashu123
Created March 15, 2025 10:27
import numpy as np
def attention(Q, K, V):
"""
Computes attention: softmax(QK^T)V
Args:
Q: Query matrix of shape (batch_size, seq_len_q, d)
K: Key matrix of shape (batch_size, seq_len_k, d)
V: Value matrix of shape (batch_size, seq_len_k, d)
Returns:
Attention output of shape (batch_size, seq_len_q, d)
"""
# Compute QK^T (dot product of queries and keys)
K_transpose = np.transpose(K, (0, 2, 1))
scores = np.matmul(Q, K_transpose)
# Apply softmax to get attention weights
weights = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
weights = weights / np.sum(weights, axis=-1, keepdims=True)
# Multiply weights by values
output = np.matmul(weights, V)
return output
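
# A minimal sketch, not part of the original gist, illustrating the max-subtraction
# trick used in `attention` above: softmax(x) == softmax(x - max(x)), but the shifted
# form avoids overflow for large scores. The values below are made up for illustration.
_x = np.array([1000.0, 1001.0, 1002.0])
with np.errstate(over="ignore", invalid="ignore"):
    _naive = np.exp(_x) / np.sum(np.exp(_x))  # overflows: inf / inf -> nan
_stable = np.exp(_x - _x.max()) / np.sum(np.exp(_x - _x.max()))  # well-behaved
assert np.isnan(_naive).all() and np.isclose(_stable.sum(), 1.0)
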
def block_attention(Q, K, V):
    O = np.zeros(Q.shape)
    seq_len = Q.shape[1]
    step_size = 4
    assert seq_len % step_size == 0, "seq_len should be divisible by step_size"
    # Loop over query blocks of seq_len.
    for i_block in range(0, seq_len, step_size):
        oi = O[:, i_block : i_block + step_size, :]
        q_block = Q[:, i_block : i_block + step_size, :]
        # Running row max (M) and running softmax denominator (L) for this query block.
        M = np.array([-np.inf] * step_size)
        L = np.zeros((step_size,))
        for j_block in range(0, seq_len, step_size):
            k_block = K[:, j_block : j_block + step_size, :]
            k_block_transposed = np.transpose(k_block, (0, 2, 1))
            v_block = V[:, j_block : j_block + step_size, :]
            # Calculate the attention scores for this tile.
            s_ij = np.matmul(q_block, k_block_transposed)
            # Find the max along each row of the tile.
            curr_max = s_ij.max(axis=2, keepdims=False)
            # Note: ravel() flattens (batch_size, step_size) to (step_size,),
            # so this online-softmax implementation assumes batch_size == 1.
            new_max = np.maximum(M, curr_max).ravel()
            exp_S = np.exp(s_ij - new_max[..., None])
            # Rescale previously accumulated terms to the new running max.
            exp_old = np.exp(M - new_max)
            new_L = exp_old * L + np.sum(exp_S, axis=-1)
            # Update output with this block's contribution.
            oi = exp_old[..., None] * oi + exp_S @ v_block
            # Update the running max and denominator.
            M, L = new_max, new_L
        # Normalize by the accumulated softmax denominator.
        O[:, i_block : i_block + step_size, :] = oi / L[..., None]
    return O
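
# A minimal sketch, not part of the original gist, of the online-softmax recurrence
# that block_attention applies to whole tiles, shown here for a single score row
# processed in two chunks. _m/_l/_o mirror M/L/oi above; the data is made up.
_rng = np.random.default_rng(0)
_scores = _rng.standard_normal(8)
_values = _rng.standard_normal((8, 4))
_m, _l, _o = -np.inf, 0.0, np.zeros(4)
for _start in range(0, 8, 4):
    _s = _scores[_start : _start + 4]
    _v = _values[_start : _start + 4]
    _m_new = max(_m, _s.max())          # new running max
    _p = np.exp(_s - _m_new)            # exponentials of the current chunk
    _rescale = np.exp(_m - _m_new)      # rescales previously accumulated terms
    _l = _rescale * _l + _p.sum()       # running softmax denominator
    _o = _rescale * _o + _p @ _v        # running (unnormalized) output
    _m = _m_new
_w = np.exp(_scores - _scores.max())
assert np.allclose(_o / _l, (_w / _w.sum()) @ _values)
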
# Set parameters
batch_size = 1
head_dim = 1  # not used in this example
seq_len = 8
embedding_dim = 128
# Set random seed for reproducibility
np.random.seed(3)
# Create random query, key, and value matrices
# For self-attention, these would normally be projections of the same input
# but for this example, we'll just create them directly
Q = np.random.randn(batch_size, seq_len, embedding_dim)
K = np.random.randn(batch_size, seq_len, embedding_dim)
V = np.random.randn(batch_size, seq_len, embedding_dim)
# Compute attention
output = attention(Q, K, V)
block_output = block_attention(Q, K, V)
# Print shapes to verify
print(f"Q shape: {Q.shape}")
print(f"K shape: {K.shape}")
print(f"V shape: {V.shape}")
print(f"Output shape: {output.shape}")
# Print a sample of the attention output (first token's embedding)
print("\nSample of attention output (first token):")
print(output[0, 0, :10]) # Print first 10 values of the first token's output
print("\nSample of block attention output (first token):")
print(block_output[0, 0, :10]) # Print first 10 values of the first token's output
# Verify that the standard and block attention outputs match
np.testing.assert_allclose(output, block_output, rtol=1e-7, atol=1e-7)