import numpy as np


def attention(Q, K, V):
    """
    Computes attention: softmax(QK^T)V

    Args:
        Q: Query matrix of shape (batch_size, seq_len_q, d)
        K: Key matrix of shape (batch_size, seq_len_k, d)
        V: Value matrix of shape (batch_size, seq_len_k, d)

    Returns:
        Attention output of shape (batch_size, seq_len_q, d)
    """
    # Compute QK^T (dot product of queries and keys).
    # Note: this follows the docstring exactly and omits the usual 1/sqrt(d) scaling.
    K_transpose = np.transpose(K, (0, 2, 1))
    scores = np.matmul(Q, K_transpose)

    # Apply a numerically stable softmax to get attention weights.
    weights = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
    weights = weights / np.sum(weights, axis=-1, keepdims=True)

    # Multiply weights by values.
    output = np.matmul(weights, V)
    return output
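
# The softmax above subtracts each row's max before exponentiating. This is the
# usual numerically stable softmax: shifting the scores by any constant leaves
# the result unchanged. A minimal sanity check of that shift-invariance (the
# underscore-prefixed names are throwaway demo values, not used by
# attention/block_attention below):
_x = np.array([3.0, 1.0, 0.5, 10.0])
_softmax_plain = np.exp(_x) / np.sum(np.exp(_x))
_softmax_shifted = np.exp(_x - _x.max()) / np.sum(np.exp(_x - _x.max()))
np.testing.assert_allclose(_softmax_plain, _softmax_shifted, rtol=1e-10)
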
def block_attention(Q, K, V):
    """
    Computes the same softmax(QK^T)V as attention(), but one block of query
    rows and key/value rows at a time, using an online softmax: a running row
    max M and running denominator L are maintained, and the partial output is
    rescaled whenever a new block raises the max.
    """
    O = np.zeros(Q.shape)
    seq_len = Q.shape[1]
    step_size = 4
    assert seq_len % step_size == 0, "seq_len should be divisible by step_size"

    # Loop over blocks of query rows.
    for i_block in range(0, seq_len, step_size):
        oi = O[:, i_block : i_block + step_size, :]
        q_block = Q[:, i_block : i_block + step_size, :]
        M = np.array([-np.inf] * step_size)  # running row max
        L = np.zeros((step_size,))           # running softmax denominator

        # Loop over blocks of key/value rows.
        for j_block in range(0, seq_len, step_size):
            k_block = K[:, j_block : j_block + step_size, :]
            k_block_transposed = np.transpose(k_block, (0, 2, 1))
            v_block = V[:, j_block : j_block + step_size, :]

            # Attention scores for this (query block, key block) pair.
            s_ij = np.matmul(q_block, k_block_transposed)

            # Update the running max along each row of the block
            # (kept per batch and per row so batch_size > 1 also works).
            curr_max = s_ij.max(axis=2, keepdims=False)
            new_max = np.maximum(M, curr_max)

            # Exponentiate against the new max and rescale previous partial sums.
            exp_S = np.exp(s_ij - new_max[..., None])
            exp_old = np.exp(M - new_max)
            new_L = exp_old * L + np.sum(exp_S, axis=-1)

            # Update output with this block's contribution.
            oi = exp_old[..., None] * oi + exp_S @ v_block

            # Carry the running statistics forward.
            M, L = new_max, new_L

        # Normalize the accumulated output for this query block.
        O[:, i_block : i_block + step_size, :] = oi / L[..., None]
    return O
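
# block_attention is an online-softmax formulation of the same computation: it
# keeps a running row max and running denominator, and rescales the accumulated
# partial output by exp(M_old - M_new) whenever a new block raises the max.
# A minimal 1-D sketch of that recurrence (m_run, l_run, acc and the _row/_vals
# arrays are illustrative names only, not used elsewhere in this file):
_row = np.random.randn(8)        # one row of scores (one row of QK^T)
_vals = np.random.randn(8, 4)    # the matching value rows
m_run, l_run, acc = -np.inf, 0.0, np.zeros(4)
for j in range(0, 8, 4):         # process the row in blocks of 4
    s_blk = _row[j : j + 4]
    v_blk = _vals[j : j + 4]
    m_new = max(m_run, s_blk.max())
    p = np.exp(s_blk - m_new)                        # block exponentials
    l_run = np.exp(m_run - m_new) * l_run + p.sum()  # rescale old denominator
    acc = np.exp(m_run - m_new) * acc + p @ v_blk    # rescale old numerator
    m_run = m_new
_w = np.exp(_row - _row.max())
np.testing.assert_allclose(acc / l_run, (_w / _w.sum()) @ _vals, rtol=1e-7)
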
# Set parameters
batch_size = 1
head_dim = 1
seq_len = 8
embedding_dim = 128

# Set random seed for reproducibility
np.random.seed(3)

# Create random query, key, and value matrices.
# For self-attention, these would normally be projections of the same input,
# but for this example, we'll just create them directly.
Q = np.random.randn(batch_size, seq_len, embedding_dim)
K = np.random.randn(batch_size, seq_len, embedding_dim)
V = np.random.randn(batch_size, seq_len, embedding_dim)

# Compute attention
output = attention(Q, K, V)
block_output = block_attention(Q, K, V)

# Print shapes to verify
print(f"Q shape: {Q.shape}")
print(f"K shape: {K.shape}")
print(f"V shape: {V.shape}")
print(f"Output shape: {output.shape}")

# Print a sample of the attention output (first token's embedding)
print("\nSample of attention output (first token):")
print(output[0, 0, :10])  # First 10 values of the first token's output
print("\nSample of block attention output (first token):")
print(block_output[0, 0, :10])  # First 10 values of the first token's output
# Verify that the naive output and the block-wise output agree
np.testing.assert_allclose(output, block_output, rtol=1e-7, atol=1e-7)