@jgoodie
Created January 19, 2025 19:09
multi-headed self-attention
import scipy.special
import numpy as np
from sklearn.preprocessing import OneHotEncoder
sentence = "the otter swam across the river to the other bank"
tokens = sentence.lower().split()
# Vocabulary: unique tokens, in order of first appearance
vocab = list(dict.fromkeys(tokens))
# One-hot encode the tokens, then transpose so each column of X is one token vector (X is D x N)
encoder = OneHotEncoder(categories=[vocab], sparse_output=False)
X = encoder.fit_transform(np.array(tokens).reshape(-1, 1)).T
# Set seed so we get the same random numbers
np.random.seed(3)
# Dimension of each input (D) and number of inputs (N)
D, N = X.shape
# Number of attention heads
H = 2
# Dimension of the queries, keys, and values in each head
H_D = D // H
# Per-head weights and biases for the queries, keys, and values
omega_q = np.random.normal(size=(H, H_D, D))
omega_k = np.random.normal(size=(H, H_D, D))
omega_v = np.random.normal(size=(H, H_D, D))
beta_q = np.random.normal(size=(H, H_D, 1))
beta_k = np.random.normal(size=(H, H_D, 1))
beta_v = np.random.normal(size=(H, H_D, 1))
# Output projection applied to the concatenated heads
omega_c = np.random.normal(size=(D, D))
def multi_head_self_attention(X, omega_q, omega_k, omega_v, omega_c, beta_q, beta_k, beta_v, n_heads):
    # Compute self-attention for each head
    head_outputs = []
    for h in range(n_heads):
        # Queries, keys, and values for head h
        q = beta_q[h] + omega_q[h] @ X
        k = beta_k[h] + omega_k[h] @ X
        v = beta_v[h] + omega_v[h] @ X
        # Scaled dot products between every key and every query
        dp = k.T @ q
        sdp = dp / np.sqrt(q.shape[0])
        # Softmax over each column so the weights for a query sum to one
        attention_weights = scipy.special.softmax(sdp, axis=0)
        # Weighted sum of the values
        sa = v @ attention_weights
        head_outputs.append(sa.T)
    # Concatenate the heads and apply the output projection
    outputs = np.concatenate(head_outputs, axis=1).T
    mhsa = omega_c @ outputs
    return mhsa
attention_output = multi_head_self_attention(X, omega_q, omega_k, omega_v, omega_c, beta_q, beta_k, beta_v, H)
print(f"X shape: {X.shape}")
print(f"Multi-Headed Attention shape: {attention_output.shape}")