jgoodie · January 19, 2025 17:24
diff --git a/single_headed_attention_exp.py b/single_headed_attention_exp.py
 import scipy
 import numpy as np
 from sklearn.preprocessing import OneHotEncoder


 sentence = "the otter swam across the river to the other bank"

 # Tokenise once and derive the vocabulary from the same (lower-cased) tokens,
 # so every token is guaranteed to be a known category for the encoder.
 tokens = sentence.lower().split()
 # dict.fromkeys de-duplicates while keeping first-seen order.
 d = dict.fromkeys(tokens)
 vocab = list(d.keys())

 encoder = OneHotEncoder(categories=[vocab], sparse_output=False)
 # fit_transform returns one row per token (n_tokens, n_vocab); transpose so
 # that each *column* is a token's one-hot vector, which is the (D, N) layout
 # the unpacking below and the omega @ X products rely on.
 X = encoder.fit_transform(np.array(tokens).reshape(-1, 1)).T

 # D: dimension of each token vector (vocabulary size), N: number of tokens.
 D, N = X.shape

 # Choose random values for the parameters
 omega_q = np.random.normal(size=(D, D))
 omega_k = np.random.normal(size=(D, D))
 omega_v = np.random.normal(size=(D, D))
 beta_q = np.random.normal(size=(D, 1))
 beta_k = np.random.normal(size=(D, 1))
 beta_v = np.random.normal(size=(D, 1))

 def single_head_attention(X, beta_q, beta_k, beta_v, omega_q, omega_k, omega_v):
     """Compute scaled dot-product self-attention for a single head.

     The projections are formed as ``beta + omega @ X``, so each column of
     ``X`` is treated as one input vector and each ``beta_*`` is broadcast
     across the columns.

     Args:
         X: 2-D array whose columns are the input vectors.
         beta_q, beta_k, beta_v: Bias terms added to the query, key and
             value projections respectively.
         omega_q, omega_k, omega_v: Matrices that left-multiply ``X`` to
             produce the query, key and value projections.

     Returns:
         A tuple ``(attention_output, attention_weights)`` where
         ``attention_weights`` is the column-wise softmax of the scaled
         key/query dot products and ``attention_output`` is
         ``value @ attention_weights``.
     """
     query = beta_q + omega_q @ X
     key = beta_k + omega_k @ X
     value = beta_v + omega_v @ X

     # Entry [i, j] is the dot product of key i with query j, scaled by the
     # square root of the number of rows in the query projection.
     scaled_dp = (key.T @ query) / np.sqrt(query.shape[0])
     # Normalise down each column so the weights belonging to one query sum to 1.
     attention_weights = scipy.special.softmax(scaled_dp, axis=0)
     # Mix the value columns using the weights (the original referenced an
     # undefined name ``attention`` here, raising NameError on every call).
     attention_output = value @ attention_weights
     return attention_output, attention_weights


 # Run the single attention head on the encoded sentence.
 attention_output, attention_weights = single_head_attention(
     X=X,
     beta_q=beta_q,
     beta_k=beta_k,
     beta_v=beta_v,
     omega_q=omega_q,
     omega_k=omega_k,
     omega_v=omega_v,
 )

 # Report the shape of the input, of one weight/bias pair, and of both results.
 shape_report = [
     f"X shape: {X.shape}",
     f"Q weights shape: {omega_q.shape}",
     f"Beta weights shape: {beta_q.shape}",
     f"Attention weights shape: {attention_weights.shape}",
     f"Attention out shape: {attention_output.shape}",
 ]
 print("\n".join(shape_report))
	import scipy
	import numpy as np
	from sklearn.preprocessing import OneHotEncoder


	sentence = "the otter swam across the river to the other bank"

	# Tokenise once and derive the vocabulary from the same (lower-cased) tokens,
	# so every token is guaranteed to be a known category for the encoder.
	tokens = sentence.lower().split()
	# dict.fromkeys de-duplicates while keeping first-seen order.
	d = dict.fromkeys(tokens)
	vocab = list(d.keys())

	encoder = OneHotEncoder(categories=[vocab], sparse_output=False)
	# fit_transform returns one row per token (n_tokens, n_vocab); transpose so
	# that each *column* is a token's one-hot vector, which is the (D, N) layout
	# the unpacking below and the omega @ X products rely on.
	X = encoder.fit_transform(np.array(tokens).reshape(-1, 1)).T

	# D: dimension of each token vector (vocabulary size), N: number of tokens.
	D, N = X.shape

	# Choose random values for the parameters
	omega_q = np.random.normal(size=(D, D))
	omega_k = np.random.normal(size=(D, D))
	omega_v = np.random.normal(size=(D, D))
	beta_q = np.random.normal(size=(D, 1))
	beta_k = np.random.normal(size=(D, 1))
	beta_v = np.random.normal(size=(D, 1))

	def single_head_attention(X, beta_q, beta_k, beta_v, omega_q, omega_k, omega_v):
	    """Compute scaled dot-product self-attention for a single head.

	    The projections are formed as ``beta + omega @ X``, so each column of
	    ``X`` is treated as one input vector and each ``beta_*`` is broadcast
	    across the columns.

	    Args:
	        X: 2-D array whose columns are the input vectors.
	        beta_q, beta_k, beta_v: Bias terms added to the query, key and
	            value projections respectively.
	        omega_q, omega_k, omega_v: Matrices that left-multiply ``X`` to
	            produce the query, key and value projections.

	    Returns:
	        A tuple ``(attention_output, attention_weights)`` where
	        ``attention_weights`` is the column-wise softmax of the scaled
	        key/query dot products and ``attention_output`` is
	        ``value @ attention_weights``.
	    """
	    query = beta_q + omega_q @ X
	    key = beta_k + omega_k @ X
	    value = beta_v + omega_v @ X

	    # Entry [i, j] is the dot product of key i with query j, scaled by the
	    # square root of the number of rows in the query projection.
	    scaled_dp = (key.T @ query) / np.sqrt(query.shape[0])
	    # Normalise down each column so the weights belonging to one query sum to 1.
	    attention_weights = scipy.special.softmax(scaled_dp, axis=0)
	    # Mix the value columns using the weights (the original referenced an
	    # undefined name ``attention`` here, raising NameError on every call).
	    attention_output = value @ attention_weights
	    return attention_output, attention_weights


	# Run the single attention head on the encoded sentence.
	attention_output, attention_weights = single_head_attention(
	    X=X,
	    beta_q=beta_q,
	    beta_k=beta_k,
	    beta_v=beta_v,
	    omega_q=omega_q,
	    omega_k=omega_k,
	    omega_v=omega_v,
	)

	# Report the shape of the input, of one weight/bias pair, and of both results.
	shape_report = [
	    f"X shape: {X.shape}",
	    f"Q weights shape: {omega_q.shape}",
	    f"Beta weights shape: {beta_q.shape}",
	    f"Attention weights shape: {attention_weights.shape}",
	    f"Attention out shape: {attention_output.shape}",
	]
	print("\n".join(shape_report))