@pashu123
Created March 15, 2025 10:27
import numpy as np
def attention(Q, K, V):
"""
Computes attention: softmax(QK^T)V
Args:
Q: Query matrix of shape (batch_size, seq_len_q, d)
K: Key matrix of shape (batch_size, seq_len_k, d)
V: Value matrix of shape (batch_size, seq_len_k, d)
Returns:
Attention output of shape (batch_size, seq_len_q, d)
"""
# Compute QK^T (dot product of queries and keys)
K_transpose = np.transpose(K, (0, 2, 1))
scores = np.matmul(Q, K_transpose)
# Apply softmax to get attention weights
weights = np.exp(scores - np.max(scores, axis=-1, keepdims=True))
weights = weights / np.sum(weights, axis=-1, keepdims=True)
# Multiply weights by values
output = np.matmul(weights, V)
return output
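
# A minimal sketch, not part of the original gist, illustrating the max-subtraction
# trick used in `attention` above: softmax(x) == softmax(x - max(x)), but the shifted
# form avoids overflow for large scores. The values below are made up for illustration.
_x = np.array([1000.0, 1001.0, 1002.0])
with np.errstate(over="ignore", invalid="ignore"):
    _naive = np.exp(_x) / np.sum(np.exp(_x))  # overflows: inf / inf -> nan
_stable = np.exp(_x - _x.max()) / np.sum(np.exp(_x - _x.max()))  # well-behaved
assert np.isnan(_naive).all() and np.isclose(_stable.sum(), 1.0)
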
def block_attention(Q, K, V):
    O = np.zeros(Q.shape)
    seq_len = Q.shape[1]
    step_size = 4
    assert seq_len % step_size == 0, "seq_len should be divisible by step_size"
    # Loop over query blocks of seq_len.
    for i_block in range(0, seq_len, step_size):
        oi = O[:, i_block : i_block + step_size, :]
        q_block = Q[:, i_block : i_block + step_size, :]
        # Running row max (M) and running softmax denominator (L) for this query block.
        M = np.array([-np.inf] * step_size)
        L = np.zeros((step_size,))
        for j_block in range(0, seq_len, step_size):
            k_block = K[:, j_block : j_block + step_size, :]
            k_block_transposed = np.transpose(k_block, (0, 2, 1))
            v_block = V[:, j_block : j_block + step_size, :]
            # Calculate the attention scores for this tile.
            s_ij = np.matmul(q_block, k_block_transposed)
            # Find the max along each row of the tile.
            curr_max = s_ij.max(axis=2, keepdims=False)
            # Note: ravel() flattens (batch_size, step_size) to (step_size,),
            # so this online-softmax implementation assumes batch_size == 1.
            new_max = np.maximum(M, curr_max).ravel()
            exp_S = np.exp(s_ij - new_max[..., None])
            # Rescale previously accumulated terms to the new running max.
            exp_old = np.exp(M - new_max)
            new_L = exp_old * L + np.sum(exp_S, axis=-1)
            # Update output with this block's contribution.
            oi = exp_old[..., None] * oi + exp_S @ v_block
            # Update the running max and denominator.
            M, L = new_max, new_L
        # Normalize by the accumulated softmax denominator.
        O[:, i_block : i_block + step_size, :] = oi / L[..., None]
    return O
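
# A minimal sketch, not part of the original gist, of the online-softmax recurrence
# that block_attention applies to whole tiles, shown here for a single score row
# processed in two chunks. _m/_l/_o mirror M/L/oi above; the data is made up.
_rng = np.random.default_rng(0)
_scores = _rng.standard_normal(8)
_values = _rng.standard_normal((8, 4))
_m, _l, _o = -np.inf, 0.0, np.zeros(4)
for _start in range(0, 8, 4):
    _s = _scores[_start : _start + 4]
    _v = _values[_start : _start + 4]
    _m_new = max(_m, _s.max())          # new running max
    _p = np.exp(_s - _m_new)            # exponentials of the current chunk
    _rescale = np.exp(_m - _m_new)      # rescales previously accumulated terms
    _l = _rescale * _l + _p.sum()       # running softmax denominator
    _o = _rescale * _o + _p @ _v        # running (unnormalized) output
    _m = _m_new
_w = np.exp(_scores - _scores.max())
assert np.allclose(_o / _l, (_w / _w.sum()) @ _values)
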
# Set parameters
batch_size = 1
head_dim = 1  # not used in this example
seq_len = 8
embedding_dim = 128
# Set random seed for reproducibility
np.random.seed(3)
# Create random query, key, and value matrices
# For self-attention, these would normally be projections of the same input
# but for this example, we'll just create them directly
Q = np.random.randn(batch_size, seq_len, embedding_dim)
K = np.random.randn(batch_size, seq_len, embedding_dim)
V = np.random.randn(batch_size, seq_len, embedding_dim)
# Compute attention
output = attention(Q, K, V)
block_output = block_attention(Q, K, V)
# Print shapes to verify
print(f"Q shape: {Q.shape}")
print(f"K shape: {K.shape}")
print(f"V shape: {V.shape}")
print(f"Output shape: {output.shape}")
# Print a sample of the attention output (first token's embedding)
print("\nSample of attention output (first token):")
print(output[0, 0, :10]) # Print first 10 values of the first token's output
print("\nSample of block attention output (first token):")
print(block_output[0, 0, :10]) # Print first 10 values of the first token's output
# Verify that the standard and block attention outputs match
np.testing.assert_allclose(output, block_output, rtol=1e-7, atol=1e-7)