<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
<head>
<meta charset="utf-8" />
<meta name="generator" content="nbh" />
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes" />
<title>Attention sinks from the graph perspective</title>
<!-- KaTeX -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/KaTeX/0.16.9/katex.min.js"
    integrity="sha512-LQNxIMR5rXv7o+b1l8+N1EZMfhG7iFZ9HhnbJkTp4zjNr5Wvst75AqUeFDxeRUa7l5vEDyUiAip//r+EFLLCyA=="
    crossorigin="anonymous" referrerpolicy="no-referrer"></script>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/KaTeX/0.16.9/katex.min.css"
    integrity="sha512-fHwaWebuwA7NSF5Qg/af4UeDx9XqUpYpOGgubo3yWu+b2IQR4UeQwbb42Ti7gVAjNtVoI/I9TEoYeu9omwcC6g=="
    crossorigin="anonymous" referrerpolicy="no-referrer" />
<script src="https://cdnjs.cloudflare.com/ajax/libs/KaTeX/0.16.9/contrib/auto-render.min.js"
    integrity="sha512-iWiuBS5nt6r60fCz26Nd0Zqe0nbk1ZTIQbl3Kv7kYsX+yKMUFHzjaH2+AnM6vp2Xs+gNmaBAVWJjSmuPw76Efg=="
    crossorigin="anonymous" referrerpolicy="no-referrer"></script>
<!-- Highlight.js -->
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.11.1/styles/default.min.css">
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.11.1/highlight.min.js"
    integrity="sha512-EBLzUL8XLl+va/zAsmXwS7Z2B1F9HUHkZwyS/VKwh3S7T/U0nF4BaU29EP/ZSf6zgiIxYAnKLu6bJ8dqpmX5uw=="
    crossorigin="anonymous" referrerpolicy="no-referrer"></script>
<link href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.11.1/styles/github-dark.min.css"
    media="(prefers-color-scheme: dark)" rel="stylesheet">
<link href="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.11.1/styles/github.min.css"
    media="(prefers-color-scheme: light), (prefers-color-scheme: no-preference)" rel="stylesheet">
<!-- Python highlight -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/highlight.js/11.11.1/languages/python.min.js"
    integrity="sha512-/uCTceIDOniHf+VUKbCnP/x6GQSRrm4GwUtQYMgKa9yIZPGzlR04flSsD+2or7bPn44VY9inIHI4cwNCcZmJDw=="
    crossorigin="anonymous" referrerpolicy="no-referrer"></script>
<!-- Tufte CSS -->
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/tufte-css/1.8.0/tufte.min.css"
    integrity="sha512-F5lKjC1GKbwLFXdThwMWx8yF8TX/WVrdhWYN9PWb6eb5hIRLmO463nrpqLnEUHxy2EHIzfC4dq/mncHD6ndR+g=="
    crossorigin="anonymous" referrerpolicy="no-referrer" />
</head>
<body>
<article>
<section>
<p><img src="https://publish-01.obsidian.md/access/fe81f5f6c89de29a46e04761b30da84c/ChatGPT%20Image%2024%20ago%202025%2C%2016_42_51.png" alt="Banner"></p><h1>Attention Sinks<span class="heading-link-symbol" aria-hidden="false"/></h1><p>Attention sinks have recently come back to the forefront of architecture discussion, especially due to their appearance in <a href="https://github.com/openai/gpt-oss">gpt-oss</a> (although in a different form than the effect we're discussing today).</p><p>As a mechanism, attention sinks are easy to describe: when trained, decoder-only transformer models tend to allocate a disproportionate amount of attention to the first few tokens, and especially to the first.</p><p>The practical side of this effect is well studied, and it is often attributed to the model "offloading" probability mass onto the early tokens to avoid allocating it spuriously elsewhere. Recent works, like <a href="https://arxiv.org/abs/2504.20966">Softpick</a>, provide architectural choices that prevent sinks from forming. While this explanation may sound convincing at first glance, my intuition is still bothered by it: what do you mean, the model "offloads"? Of course it doesn't explore that possibility intentionally; there must be some mechanism by which attention sinks are either advantageous or the result of an intrinsic bias of the model.</p><p>In this blogpost, we will argue that there is a significant bias in decoder-only transformers that may be to blame, at least partially, for this phenomenon. This will also allow us to introduce a series of blogposts focused on analyzing transformers through the lens of message passing on graphs.</p><hr><h2>Attention as message-passing<span class="heading-link-symbol" aria-hidden="false"/></h2><p><a href="https://arxiv.org/abs/2506.22084">Recent work by Chaitanya K. Joshi</a> has finally spared us from having to independently formalize a well-known property of Transformers (and especially of attention layers): that they are a special case of Graph Neural Networks (just like pretty much anything else, to be fair).</p><p>As a setting for our discussion, though, we will go over another angle from which attention can be seen as message passing on a graph.</p><p>Most people are introduced to (multi-headed) self-attention directly via the <a href="https://arxiv.org/abs/1706.03762">Attention Is All You Need</a> paper. While this is generally good practice in my opinion, it steers the reader toward interpreting attention as the simplest way of making tokens interact in a transformer, or as just a soft version of a dictionary lookup. Neither interpretation is wrong, but such views often drown out some interesting geometric details that lie in attention itself.</p><hr><h3>Multiheaded attention<span class="heading-link-symbol" aria-hidden="false"/></h3><p>Say you have <span>$n$</span> tokens, with an embedding dimension <span>$d$</span>.</p><p>Let our input tokens be shaped as a matrix <span>$X \in \mathbb{R}^{n \times d}$</span>. We first process <span>$X$</span> with three different linear projections, namely <span>$W_q$</span>, <span>$W_k$</span>, and <span>$W_v$</span>, ending up with the respective matrices:</p><ul>
<li><span>$Q \in \mathbb{R}^{n \times d_q}$</span></li>
<li><span>$K \in \mathbb{R}^{n \times d_k}$</span></li>
<li><span>$V \in \mathbb{R}^{n \times d_v}$</span></li>
</ul><p>We then perform the well-known attention operation:</p><p>$$
\text{attention}(X) = \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_k}}\right)V
$$</p><p>Let’s take a look at <span>$\alpha = QK^{\top}$</span>. If we rewrite it component-wise, we get:</p><p>$$
\alpha_{ij} = \sum_{l=1}^{d_k} Q_{il}(K^{\top})_{lj} = \sum_{l=1}^{d_k} Q_{il}K_{jl}
$$</p><p>Noting that the rows of <span>$Q$</span> and <span>$K$</span> are <span>$q_i$</span> and <span>$k_j$</span> respectively, we see:</p><p>$$
\alpha_{ij} = \langle q_i, k_j \rangle
$$</p><p>So the entries of the attention matrix <span>$\alpha$</span> are simply dot products between projected token embeddings.</p>
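<p>To make this concrete, here is a minimal NumPy sketch (with made-up dimensions; the names are illustrative, not from any particular codebase) that builds the score matrix and checks that each entry really is a dot product between a projected query and a projected key:</p><pre><code class="language-python">import numpy as np

rng = np.random.default_rng(0)
n, d, d_k = 6, 16, 8             # tokens, embedding dim, head dim (arbitrary)

X = rng.normal(size=(n, d))      # token embeddings, one row per token
W_q = rng.normal(size=(d, d_k))
W_k = rng.normal(size=(d, d_k))

Q = X @ W_q                      # projected queries
K = X @ W_k                      # projected keys
alpha = Q @ K.T                  # score matrix: alpha[i, j] = dot(q_i, k_j)

# entry (i, j) is just the dot product of row i of Q with row j of K
i, j = 2, 4
assert np.allclose(alpha[i, j], Q[i] @ K[j])
</code></pre>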
<hr><h3>Point-cloud interpretation<span class="heading-link-symbol" aria-hidden="false"/></h3><p>Now, reinterpret these operations geometrically.</p><p>Take</p><p>$$
X \in \mathbb{R}^{\,n\times d}
$$</p><p>and treat the rows of <span>$X$</span> as a <strong>point cloud</strong>:</p><p>$$
X =
\begin{bmatrix}
x_1^{\!\top}\\
x_2^{\!\top}\\
\vdots\\
x_n^{\!\top}
\end{bmatrix},
\qquad
x_i \in \mathbb{R}^d.
$$</p><p>Constructing the <span>$Q, K, V$</span> matrices:</p><p>$$Q = X W_q \in \mathbb{R}^{\,n\times d_q}$$</p><p>$$K = X W_k \in \mathbb{R}^{\,n\times d_k}$$</p><p>$$V = X W_v \in \mathbb{R}^{\,n\times d_v}$$</p><p>We can view this as projecting the cloud in three different ways. Then we define an adjacency-like structure:</p><p>$$
\alpha_{ij} = \langle q_i, k_j \rangle = q_i k_j^{\top}
$$</p><p>Stacking all scores:</p><p>$$
\alpha = QK^{\top} \in \mathbb{R}^{\,n\times n}
$$</p><p>The more two points align in query–key space, the stronger their connection.</p><p>After softmax:</p><p>$$
A_{ij} =
\frac{\exp\!\bigl(\alpha_{ij}/\sqrt{d_k}\bigr)}
{\sum_{j'=1}^n \exp\!\bigl(\alpha_{ij'}/\sqrt{d_k}\bigr)}\,,
\qquad
A = \mathrm{softmax}\!\Bigl(\tfrac{\alpha}{\sqrt{d_k}}\Bigr)
$$</p><p>Each row of <span>$A$</span> is a probability distribution over neighbors.</p><p>Finally:</p><p>$$
\text{attention}(X) = AV
$$</p><p>This can be interpreted as diffusion on a graph: each node’s value is updated as a weighted average of its neighbors.</p>
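<p>Continuing the sketch from above (same hypothetical names), we can check that the scaled, softmaxed score matrix is row-stochastic and apply one diffusion step:</p><pre><code class="language-python">W_v = rng.normal(size=(d, d_k))
V = X @ W_v                      # projected values

scores = alpha / np.sqrt(d_k)
A = np.exp(scores - scores.max(axis=1, keepdims=True))  # numerically stable softmax
A = A / A.sum(axis=1, keepdims=True)

# each row of A is a probability distribution over the tokens
assert np.allclose(A.sum(axis=1), 1.0)

out = A @ V                      # one diffusion step: every output row is a
                                 # weighted average of its neighbors' values
</code></pre>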
<hr><h2>Causal Transformers and Attention Sinks<span class="heading-link-symbol" aria-hidden="false"/></h2><p>In causal Transformers, masking turns the attention graph into a <strong>Directed Acyclic Graph (DAG)</strong>: edges only point from later tokens to earlier ones, plus self-loops. The strictly causal part of that structure is nilpotent, and the first token, which can only attend to itself, acts as an absorbing state of the random walk defined by <span>$A$</span>: repeated applications of <span>$A$</span> push probability mass leftward toward earlier tokens, and ultimately toward the first.</p><p>This explains why attention sinks naturally emerge.</p>
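<p>This pull toward the first token is easy to simulate with a toy model (a random row-stochastic matrix under a causal mask, not a trained Transformer; <code>rng</code> is reused from the sketches above): raising such an <span>$A$</span> to higher powers piles its mass onto the first column, mirroring the figures below.</p><pre><code class="language-python">def causal_attention_matrix(n, rng):
    """Random scores, causal mask, then row-wise softmax."""
    scores = rng.normal(size=(n, n))
    mask = np.tril(np.ones((n, n), dtype=bool))   # True on and below the diagonal
    scores = np.where(mask, scores, -np.inf)      # forbid attending to the future
    A = np.exp(scores - scores.max(axis=1, keepdims=True))
    return A / A.sum(axis=1, keepdims=True)

A = causal_attention_matrix(8, rng)
for k in [1, 2, 4, 8]:
    A_k = np.linalg.matrix_power(A, k)
    # the share of attention landing on the first token grows with k
    print(k, np.round(A_k[:, 0], 3))
</code></pre>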
<p><img src="https://publish-01.obsidian.md/access/fe81f5f6c89de29a46e04761b30da84c/new_A%5E1.png" alt="Masked A^1"> <img src="https://publish-01.obsidian.md/access/fe81f5f6c89de29a46e04761b30da84c/Pasted%20image%2020250822184715.png" alt="Masked A^2"> <img src="new_A^4.png" alt="Masked A^4"> <img src="new_A^8.png" alt="Masked A^8"></p><p><em>Fig. 1–4: Simulated attention matrices across powers, showing mass concentrating on early tokens.</em></p><hr><h2>Wrapping up<span class="heading-link-symbol" aria-hidden="false"/></h2><p>We identified a possible mechanism biasing causal Transformers toward allocating attention to their first few tokens. Despite the simplifications involved, this intrinsic bias explains attention sinks as a structural effect, not just a learned feature.</p><hr><h3>Acknowledgements<span class="heading-link-symbol" aria-hidden="false"/></h3>
<p>Thanks a lot to <a href="https://x.com/thelokasiffers">thelakosiffers</a>, <a href="https://x.com/Niccolg92">Niccolò</a>, <a href="https://x.com/fabmilo">Fabrizio</a>, <a href="https://x.com/Cyndesama">Cynde</a>, <a href="https://x.com/f14bertolotti">Francesco</a>, and <a href="https://x.com/zmkzmkz">Zed</a> for their precious feedback!</p><hr><h3>Suggested citation<span class="heading-link-symbol" aria-hidden="false"/></h3><pre><code class="language-bibtex">@misc{pappone2025attentionsinks,
  author       = {Francesco Pappone},
  title        = {Attention sinks from the graph perspective},
  year         = {2025},
  month        = {August},
  day          = {24},
  institution  = {Università La Sapienza di Roma -- PSTP Technoscience},
  howpublished = {\url{https://publish.obsidian.md/the-tensor-throne/Transformers+as+GNNs/Attention+sinks+from+the+graph+perspective}},
  note         = {Blogpost}
}</code></pre>
</section>
<script>
hljs.highlightAll();
/**
 * Utility function to calculate the current theme setting.
 * Look for a local storage value.
 * Fall back to system setting.
 * Fall back to light mode.
 */
function calculateSettingAsThemeString({ localStorageTheme, systemSettingDark }) {
    if (localStorageTheme !== null) {
        return localStorageTheme;
    }
    if (systemSettingDark.matches) {
        return "dark";
    }
    return "light";
}
/**
 * Utility function to update the button text and aria-label.
 */
function updateButton({ buttonEl, isDark }) {
    const newCta = isDark ? "Change to light theme" : "Change to dark theme";
    // use an aria-label if you are omitting text on the button
    // and using a sun/moon icon, for example
    buttonEl.setAttribute("aria-label", newCta);
    buttonEl.innerText = newCta;
}
/**
 * Utility function to update the theme setting on the body element.
 */
function updateThemeOnHTMLElement({ theme }) {
    document.querySelector("body").setAttribute("data-theme", theme);
}
/**
 * On page load:
 * 1. Grab what we need from the DOM and system settings.
 */
const button = document.querySelector("[data-theme-toggle]");
const localStorageTheme = localStorage.getItem("theme");
const systemSettingDark = window.matchMedia("(prefers-color-scheme: dark)");
/**
 * 2. Work out the current site settings.
 */
let currentThemeSetting = calculateSettingAsThemeString({ localStorageTheme, systemSettingDark });
/**
 * 3. Apply the theme. The toggle button is optional (it is commented out
 * below), so guard against it being absent to avoid a TypeError on load.
 */
updateThemeOnHTMLElement({ theme: currentThemeSetting });
if (button) {
    updateButton({ buttonEl: button, isDark: currentThemeSetting === "dark" });
    /**
     * 4. Add an event listener to toggle the theme.
     */
    button.addEventListener("click", () => {
        const newTheme = currentThemeSetting === "dark" ? "light" : "dark";
        localStorage.setItem("theme", newTheme);
        updateButton({ buttonEl: button, isDark: newTheme === "dark" });
        updateThemeOnHTMLElement({ theme: newTheme });
        currentThemeSetting = newTheme;
    });
}
</script>
<script>
document.addEventListener("DOMContentLoaded", function () {
    renderMathInElement(document.body, {
        // customised options
        // • auto-render specific keys, e.g.:
        delimiters: [
            { left: '$$', right: '$$', display: true },
            { left: '\\(', right: '\\)', display: false },
            { left: '$', right: '$', display: false },
            { left: '\\[', right: '\\]', display: true }
        ],
        // • rendering keys, e.g.:
        throwOnError: false
    });
});
</script>
<!-- <button type="button" data-theme-toggle aria-label="Change to light theme">
    Change to light theme (or icon here)
</button> -->
</body>
</html>