Created
February 22, 2026 04:43
-
-
Save jbdamask/d53ae439673fbbd6b44da266eb27e478 to your computer and use it in GitHub Desktop.
NowIGetIt: attentionisalluneed.pdf
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <!DOCTYPE html> | |
| <html lang="en"> | |
| <head> | |
| <meta charset="UTF-8"> | |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> | |
| <title>Attention Is All You Need — Interactive Explainer</title> | |
| <style> | |
| @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700;800;900&family=JetBrains+Mono:wght@400;500;600&display=swap'); | |
| * { margin: 0; padding: 0; box-sizing: border-box; } | |
| /* Design tokens: the dark-theme palette shared by the rules below via var(). */ | |
| :root { | |
| --bg: #0a0a0f; /* page background (body, scrollbar track) */ | |
| --bg2: #12121a; /* raised panels: cards, demo boxes, stat cards */ | |
| --bg3: #1a1a2e; /* inset surfaces: tokens, bar tracks, formula blocks */ | |
| --accent: #6c63ff; /* primary accent: labels, active buttons, link hover */ | |
| --accent2: #ff6584; /* secondary accent: gradient stop, decoder tag, "old" list bullets */ | |
| --accent3: #43e97b; /* encoder tag and "new" list bullets */ | |
| --accent4: #38f9d7; /* gradient end stop and formula text */ | |
| --text: #e8e8f0; /* default body text */ | |
| --text2: #9595b0; /* muted text: descriptions, captions, nav links */ | |
| --gold: #ffd700; /* NOTE(review): no var(--gold) in the visible part of this file; the output layer hard-codes rgba(255,215,0,…) — confirm usage further down */ | |
| } | |
| /* Smooth in-page anchor scrolling, applied only when the user has not requested reduced motion. */ | |
| @media (prefers-reduced-motion: no-preference) { | |
| html { scroll-behavior: smooth; } | |
| } | |
| body { | |
| font-family: 'Inter', sans-serif; | |
| background: var(--bg); | |
| color: var(--text); | |
| overflow-x: hidden; | |
| line-height: 1.7; | |
| } | |
| /* Scrollbar */ | |
| ::-webkit-scrollbar { width: 6px; } | |
| ::-webkit-scrollbar-track { background: var(--bg); } | |
| ::-webkit-scrollbar-thumb { background: var(--accent); border-radius: 3px; } | |
| /* Canvas bg */ | |
| #bgCanvas { | |
| position: fixed; | |
| top: 0; left: 0; | |
| width: 100%; height: 100%; | |
| z-index: 0; | |
| pointer-events: none; | |
| } | |
| .content { position: relative; z-index: 1; } | |
| /* NAV */ | |
| /* Fixed, translucent top bar laid out as a 60px-high flex row above the page content. */ | |
| nav { | |
| position: fixed; top: 0; left: 0; right: 0; | |
| z-index: 100; | |
| background: rgba(10,10,15,0.85); | |
| /* Prefixed form first: Safari releases before 18 only recognise -webkit-backdrop-filter. */ | |
| -webkit-backdrop-filter: blur(20px); | |
| backdrop-filter: blur(20px); | |
| border-bottom: 1px solid rgba(108,99,255,0.15); | |
| padding: 0 2rem; | |
| display: flex; | |
| align-items: center; | |
| height: 60px; | |
| /* NOTE(review): transform is transitioned but never set in this stylesheet — presumably a script moves the bar; confirm. */ | |
| transition: transform 0.3s; | |
| } | |
| /* Logo text filled with a gradient: the background is clipped to the glyphs and the text fill is made transparent. */ | |
| nav .logo { | |
| font-weight: 800; | |
| font-size: 1.1rem; | |
| background: linear-gradient(135deg, var(--accent), var(--accent2)); | |
| -webkit-background-clip: text; | |
| /* Standard property alongside the prefixed one so the clip does not depend on the -webkit- alias. */ | |
| background-clip: text; | |
| -webkit-text-fill-color: transparent; | |
| letter-spacing: -0.5px; | |
| } | |
| nav .nav-links { | |
| display: flex; gap: 1.5rem; margin-left: auto; | |
| list-style: none; | |
| } | |
| nav .nav-links a { | |
| color: var(--text2); | |
| text-decoration: none; | |
| font-size: 0.85rem; | |
| font-weight: 500; | |
| transition: color 0.3s; | |
| } | |
| nav .nav-links a:hover { color: var(--accent); } | |
| /* HERO */ | |
| .hero { | |
| min-height: 100vh; | |
| display: flex; | |
| flex-direction: column; | |
| align-items: center; | |
| justify-content: center; | |
| text-align: center; | |
| padding: 2rem; | |
| position: relative; | |
| } | |
| .hero-badge { | |
| display: inline-block; | |
| padding: 0.4rem 1.2rem; | |
| border: 1px solid rgba(108,99,255,0.4); | |
| border-radius: 50px; | |
| font-size: 0.8rem; | |
| font-weight: 500; | |
| color: var(--accent); | |
| margin-bottom: 2rem; | |
| background: rgba(108,99,255,0.08); | |
| animation: pulse-border 3s infinite; | |
| } | |
| @keyframes pulse-border { | |
| 0%, 100% { border-color: rgba(108,99,255,0.4); } | |
| 50% { border-color: rgba(108,99,255,0.8); } | |
| } | |
| .hero h1 { | |
| font-size: clamp(2.5rem, 7vw, 5.5rem); | |
| font-weight: 900; | |
| line-height: 1.05; | |
| margin-bottom: 1.5rem; | |
| letter-spacing: -2px; | |
| } | |
| /* Animated gradient text in the hero heading: an oversized gradient is clipped to the glyphs and panned by gradient-shift. */ | |
| .hero h1 .gradient { | |
| background: linear-gradient(135deg, var(--accent), var(--accent2), var(--accent4)); | |
| -webkit-background-clip: text; | |
| /* Standard property alongside the prefixed one so the clip does not depend on the -webkit- alias. */ | |
| background-clip: text; | |
| -webkit-text-fill-color: transparent; | |
| /* 200% size gives the background-position animation room to move. */ | |
| background-size: 200% 200%; | |
| animation: gradient-shift 5s ease infinite; | |
| } | |
| @keyframes gradient-shift { | |
| 0%, 100% { background-position: 0% 50%; } | |
| 50% { background-position: 100% 50%; } | |
| } | |
| .hero p { | |
| max-width: 600px; | |
| color: var(--text2); | |
| font-size: 1.15rem; | |
| margin-bottom: 3rem; | |
| } | |
| .hero-cta { | |
| display: inline-flex; | |
| align-items: center; | |
| gap: 0.5rem; | |
| padding: 0.9rem 2rem; | |
| background: linear-gradient(135deg, var(--accent), #8b5cf6); | |
| color: white; | |
| border: none; | |
| border-radius: 50px; | |
| font-size: 1rem; | |
| font-weight: 600; | |
| cursor: pointer; | |
| text-decoration: none; | |
| transition: transform 0.3s, box-shadow 0.3s; | |
| box-shadow: 0 0 30px rgba(108,99,255,0.3); | |
| } | |
| .hero-cta:hover { | |
| transform: translateY(-2px); | |
| box-shadow: 0 0 50px rgba(108,99,255,0.5); | |
| } | |
| .scroll-hint { | |
| position: absolute; | |
| bottom: 2rem; | |
| animation: bounce 2s ease infinite; | |
| color: var(--text2); | |
| font-size: 0.8rem; | |
| } | |
| @keyframes bounce { | |
| 0%, 100% { transform: translateY(0); } | |
| 50% { transform: translateY(10px); } | |
| } | |
| /* SECTIONS */ | |
| section { | |
| max-width: 1100px; | |
| margin: 0 auto; | |
| padding: 6rem 2rem; | |
| } | |
| .section-label { | |
| font-size: 0.75rem; | |
| text-transform: uppercase; | |
| letter-spacing: 3px; | |
| color: var(--accent); | |
| font-weight: 600; | |
| margin-bottom: 1rem; | |
| } | |
| .section-title { | |
| font-size: clamp(1.8rem, 4vw, 3rem); | |
| font-weight: 800; | |
| margin-bottom: 1.5rem; | |
| letter-spacing: -1px; | |
| line-height: 1.15; | |
| } | |
| .section-desc { | |
| color: var(--text2); | |
| font-size: 1.05rem; | |
| max-width: 700px; | |
| margin-bottom: 3rem; | |
| } | |
| /* CARDS */ | |
| .card-grid { | |
| display: grid; | |
| grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); | |
| gap: 1.5rem; | |
| } | |
| .card { | |
| background: var(--bg2); | |
| border: 1px solid rgba(255,255,255,0.06); | |
| border-radius: 16px; | |
| padding: 2rem; | |
| transition: transform 0.3s, border-color 0.3s, box-shadow 0.3s; | |
| position: relative; | |
| overflow: hidden; | |
| } | |
| .card:hover { | |
| transform: translateY(-4px); | |
| border-color: rgba(108,99,255,0.3); | |
| box-shadow: 0 20px 60px rgba(0,0,0,0.3); | |
| } | |
| .card-icon { | |
| width: 48px; height: 48px; | |
| border-radius: 12px; | |
| display: flex; | |
| align-items: center; | |
| justify-content: center; | |
| font-size: 1.5rem; | |
| margin-bottom: 1.2rem; | |
| } | |
| .card h3 { | |
| font-size: 1.15rem; | |
| font-weight: 700; | |
| margin-bottom: 0.7rem; | |
| } | |
| .card p { | |
| color: var(--text2); | |
| font-size: 0.92rem; | |
| line-height: 1.65; | |
| } | |
| /* INTERACTIVE ATTENTION DEMO */ | |
| .attention-demo { | |
| background: var(--bg2); | |
| border: 1px solid rgba(255,255,255,0.06); | |
| border-radius: 20px; | |
| padding: 2.5rem; | |
| margin-top: 2rem; | |
| } | |
| .demo-header { | |
| display: flex; | |
| align-items: center; | |
| justify-content: space-between; | |
| margin-bottom: 2rem; | |
| flex-wrap: wrap; | |
| gap: 1rem; | |
| } | |
| .demo-header h3 { | |
| font-size: 1.3rem; | |
| font-weight: 700; | |
| } | |
| .demo-controls { | |
| display: flex; gap: 0.5rem; | |
| } | |
| .demo-btn { | |
| padding: 0.5rem 1rem; | |
| border: 1px solid rgba(255,255,255,0.1); | |
| background: rgba(255,255,255,0.04); | |
| color: var(--text); | |
| border-radius: 8px; | |
| font-size: 0.8rem; | |
| cursor: pointer; | |
| transition: all 0.3s; | |
| font-family: inherit; | |
| } | |
| .demo-btn:hover, .demo-btn.active { | |
| background: var(--accent); | |
| border-color: var(--accent); | |
| color: white; | |
| } | |
| .sentence-row { | |
| display: flex; | |
| gap: 0.5rem; | |
| flex-wrap: wrap; | |
| margin-bottom: 1rem; | |
| justify-content: center; | |
| } | |
| .word-token { | |
| padding: 0.5rem 0.8rem; | |
| background: var(--bg3); | |
| border: 1px solid rgba(255,255,255,0.08); | |
| border-radius: 8px; | |
| cursor: pointer; | |
| transition: all 0.3s; | |
| font-size: 0.9rem; | |
| font-weight: 500; | |
| position: relative; | |
| user-select: none; | |
| } | |
| .word-token:hover { | |
| border-color: var(--accent); | |
| background: rgba(108,99,255,0.15); | |
| } | |
| .word-token.selected { | |
| background: var(--accent); | |
| border-color: var(--accent); | |
| color: white; | |
| box-shadow: 0 0 20px rgba(108,99,255,0.4); | |
| } | |
| .attention-viz { | |
| position: relative; | |
| margin: 2rem 0; | |
| min-height: 100px; | |
| } | |
| #attentionCanvas { | |
| width: 100%; | |
| height: 120px; | |
| display: block; | |
| } | |
| .target-row { | |
| display: flex; | |
| gap: 0.5rem; | |
| flex-wrap: wrap; | |
| justify-content: center; | |
| } | |
| .target-token { | |
| padding: 0.5rem 0.8rem; | |
| border-radius: 8px; | |
| font-size: 0.9rem; | |
| font-weight: 500; | |
| transition: all 0.3s; | |
| border: 1px solid transparent; | |
| } | |
| .attention-bar { | |
| height: 4px; | |
| background: linear-gradient(90deg, var(--accent), var(--accent4)); | |
| border-radius: 2px; | |
| margin-top: 4px; | |
| transition: width 0.5s ease; | |
| } | |
| /* ARCHITECTURE INTERACTIVE */ | |
| .arch-container { | |
| display: flex; | |
| gap: 2rem; | |
| margin-top: 2rem; | |
| flex-wrap: wrap; | |
| } | |
| .arch-diagram { | |
| flex: 1; | |
| min-width: 300px; | |
| background: var(--bg2); | |
| border: 1px solid rgba(255,255,255,0.06); | |
| border-radius: 20px; | |
| padding: 2rem; | |
| position: relative; | |
| } | |
| .arch-info { | |
| flex: 1; | |
| min-width: 300px; | |
| } | |
| .arch-layer { | |
| padding: 0.8rem 1.2rem; | |
| margin: 0.5rem 0; | |
| border-radius: 10px; | |
| cursor: pointer; | |
| transition: all 0.3s; | |
| border: 1px solid rgba(255,255,255,0.06); | |
| font-size: 0.9rem; | |
| font-weight: 500; | |
| text-align: center; | |
| position: relative; | |
| } | |
| .arch-layer:hover { | |
| transform: scale(1.02); | |
| } | |
| .arch-layer.active { | |
| border-color: var(--accent); | |
| box-shadow: 0 0 20px rgba(108,99,255,0.2); | |
| } | |
| .arch-layer .layer-tag { | |
| position: absolute; | |
| right: 8px; | |
| top: 50%; | |
| transform: translateY(-50%); | |
| font-size: 0.65rem; | |
| background: rgba(108,99,255,0.2); | |
| color: var(--accent); | |
| padding: 0.15rem 0.5rem; | |
| border-radius: 4px; | |
| } | |
| .layer-detail { | |
| display: none; | |
| background: var(--bg2); | |
| border: 1px solid rgba(255,255,255,0.06); | |
| border-radius: 16px; | |
| padding: 2rem; | |
| animation: fadeIn 0.4s; | |
| } | |
| .layer-detail.visible { display: block; } | |
| @keyframes fadeIn { | |
| from { opacity: 0; transform: translateY(10px); } | |
| to { opacity: 1; transform: translateY(0); } | |
| } | |
| .layer-detail h4 { | |
| font-size: 1.2rem; | |
| margin-bottom: 0.8rem; | |
| display: flex; | |
| align-items: center; | |
| gap: 0.5rem; | |
| } | |
| .layer-detail p { | |
| color: var(--text2); | |
| font-size: 0.92rem; | |
| line-height: 1.7; | |
| } | |
| /* COMPARISON BARS */ | |
| .compare-section { | |
| margin-top: 3rem; | |
| } | |
| .compare-item { | |
| margin-bottom: 1.5rem; | |
| } | |
| .compare-label { | |
| display: flex; | |
| justify-content: space-between; | |
| margin-bottom: 0.5rem; | |
| font-size: 0.9rem; | |
| } | |
| .compare-label .score { | |
| font-weight: 700; | |
| font-family: 'JetBrains Mono', monospace; | |
| } | |
| .compare-bar-bg { | |
| height: 32px; | |
| background: var(--bg3); | |
| border-radius: 8px; | |
| overflow: hidden; | |
| position: relative; | |
| } | |
| .compare-bar-fill { | |
| height: 100%; | |
| border-radius: 8px; | |
| transition: width 1.5s ease; | |
| width: 0%; | |
| display: flex; | |
| align-items: center; | |
| padding-left: 1rem; | |
| font-size: 0.75rem; | |
| font-weight: 600; | |
| color: white; | |
| } | |
| /* POSITIONAL ENCODING VIZ */ | |
| .pe-container { | |
| margin-top: 2rem; | |
| background: var(--bg2); | |
| border: 1px solid rgba(255,255,255,0.06); | |
| border-radius: 20px; | |
| padding: 2rem; | |
| overflow: hidden; | |
| } | |
| #peCanvas { | |
| width: 100%; | |
| height: 200px; | |
| display: block; | |
| border-radius: 10px; | |
| } | |
| .pe-controls { | |
| display: flex; | |
| gap: 1rem; | |
| margin-top: 1rem; | |
| align-items: center; | |
| flex-wrap: wrap; | |
| } | |
| .pe-controls label { | |
| font-size: 0.85rem; | |
| color: var(--text2); | |
| } | |
| /* Range slider track: native styling is switched off so the custom 6px track applies. */ | |
| .pe-controls input[type="range"] { | |
| -webkit-appearance: none; | |
| appearance: none; | |
| background: var(--bg3); | |
| height: 6px; | |
| border-radius: 3px; | |
| outline: none; | |
| width: 150px; | |
| } | |
| /* The default outline is removed above, so give keyboard users a visible focus ring instead. */ | |
| .pe-controls input[type="range"]:focus-visible { | |
| outline: 2px solid var(--accent); | |
| outline-offset: 4px; | |
| } | |
| /* Slider thumb for WebKit/Blink browsers. */ | |
| .pe-controls input[type="range"]::-webkit-slider-thumb { | |
| -webkit-appearance: none; | |
| appearance: none; | |
| width: 16px; height: 16px; | |
| border-radius: 50%; | |
| background: var(--accent); | |
| cursor: pointer; | |
| } | |
| /* Firefox does not match ::-webkit-slider-thumb, so the same thumb is repeated for ::-moz-range-thumb. */ | |
| .pe-controls input[type="range"]::-moz-range-thumb { | |
| width: 16px; height: 16px; | |
| border: none; | |
| border-radius: 50%; | |
| background: var(--accent); | |
| cursor: pointer; | |
| } | |
| /* MULTI-HEAD VIZ */ | |
| .multihead-grid { | |
| display: grid; | |
| grid-template-columns: repeat(auto-fill, minmax(110px, 1fr)); | |
| gap: 0.8rem; | |
| margin-top: 1.5rem; | |
| } | |
| .head-box { | |
| aspect-ratio: 1; | |
| border-radius: 12px; | |
| display: flex; | |
| flex-direction: column; | |
| align-items: center; | |
| justify-content: center; | |
| cursor: pointer; | |
| transition: all 0.3s; | |
| border: 2px solid transparent; | |
| position: relative; | |
| overflow: hidden; | |
| } | |
| .head-box:hover { | |
| transform: scale(1.05); | |
| } | |
| .head-box.active { | |
| border-color: white; | |
| box-shadow: 0 0 25px rgba(255,255,255,0.15); | |
| } | |
| .head-box canvas { | |
| width: 100%; | |
| height: 100%; | |
| border-radius: 10px; | |
| } | |
| .head-label { | |
| position: absolute; | |
| bottom: 6px; | |
| font-size: 0.7rem; | |
| font-weight: 600; | |
| background: rgba(0,0,0,0.6); | |
| padding: 0.15rem 0.5rem; | |
| border-radius: 4px; | |
| } | |
| /* TIMELINE */ | |
| .timeline { | |
| position: relative; | |
| margin-top: 3rem; | |
| } | |
| .timeline::before { | |
| content: ''; | |
| position: absolute; | |
| left: 20px; | |
| top: 0; | |
| bottom: 0; | |
| width: 2px; | |
| background: linear-gradient(to bottom, var(--accent), var(--accent2)); | |
| } | |
| .timeline-item { | |
| position: relative; | |
| padding-left: 60px; | |
| margin-bottom: 2.5rem; | |
| } | |
| .timeline-dot { | |
| position: absolute; | |
| left: 12px; | |
| top: 4px; | |
| width: 18px; | |
| height: 18px; | |
| border-radius: 50%; | |
| background: var(--accent); | |
| border: 3px solid var(--bg); | |
| box-shadow: 0 0 15px rgba(108,99,255,0.5); | |
| } | |
| .timeline-item h4 { | |
| font-size: 1rem; | |
| margin-bottom: 0.3rem; | |
| } | |
| .timeline-item p { | |
| color: var(--text2); | |
| font-size: 0.88rem; | |
| } | |
| /* STATS */ | |
| .stats-grid { | |
| display: grid; | |
| grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); | |
| gap: 1.5rem; | |
| margin-top: 2rem; | |
| } | |
| .stat-card { | |
| background: var(--bg2); | |
| border: 1px solid rgba(255,255,255,0.06); | |
| border-radius: 16px; | |
| padding: 2rem; | |
| text-align: center; | |
| transition: transform 0.3s; | |
| } | |
| .stat-card:hover { transform: translateY(-4px); } | |
| /* Large monospace figure in a stat card, filled with a gradient clipped to the digits. */ | |
| .stat-number { | |
| font-size: 2.5rem; | |
| font-weight: 900; | |
| font-family: 'JetBrains Mono', monospace; | |
| margin-bottom: 0.3rem; | |
| background: linear-gradient(135deg, var(--accent), var(--accent4)); | |
| -webkit-background-clip: text; | |
| /* Standard property alongside the prefixed one so the clip does not depend on the -webkit- alias. */ | |
| background-clip: text; | |
| -webkit-text-fill-color: transparent; | |
| } | |
| .stat-label { | |
| color: var(--text2); | |
| font-size: 0.85rem; | |
| } | |
| /* RNN vs Transformer */ | |
| .vs-container { | |
| display: grid; | |
| grid-template-columns: 1fr auto 1fr; | |
| gap: 2rem; | |
| align-items: center; | |
| margin-top: 2rem; | |
| } | |
| .vs-box { | |
| background: var(--bg2); | |
| border: 1px solid rgba(255,255,255,0.06); | |
| border-radius: 16px; | |
| padding: 2rem; | |
| } | |
| .vs-badge { | |
| font-size: 2rem; | |
| font-weight: 900; | |
| color: var(--accent); | |
| text-align: center; | |
| } | |
| .vs-box h4 { | |
| font-size: 1.1rem; | |
| margin-bottom: 1rem; | |
| text-align: center; | |
| } | |
| .vs-list { | |
| list-style: none; | |
| } | |
| .vs-list li { | |
| padding: 0.5rem 0; | |
| font-size: 0.88rem; | |
| color: var(--text2); | |
| display: flex; | |
| align-items: center; | |
| gap: 0.5rem; | |
| } | |
| .vs-list li::before { | |
| content: ''; | |
| width: 6px; height: 6px; | |
| border-radius: 50%; | |
| flex-shrink: 0; | |
| } | |
| .vs-old li::before { background: var(--accent2); } | |
| .vs-new li::before { background: var(--accent3); } | |
| /* FOOTER */ | |
| footer { | |
| text-align: center; | |
| padding: 3rem 2rem; | |
| border-top: 1px solid rgba(255,255,255,0.06); | |
| color: var(--text2); | |
| font-size: 0.85rem; | |
| } | |
| footer a { | |
| color: var(--accent); | |
| text-decoration: none; | |
| } | |
| /* Responsive */ | |
| @media (max-width: 768px) { | |
| .vs-container { | |
| grid-template-columns: 1fr; | |
| } | |
| .vs-badge { display: none; } | |
| nav .nav-links { display: none; } | |
| .arch-container { flex-direction: column; } | |
| } | |
| /* Utility */ | |
| .glow-text { | |
| text-shadow: 0 0 40px rgba(108,99,255,0.3); | |
| } | |
| .fade-in { | |
| opacity: 0; | |
| transform: translateY(30px); | |
| transition: opacity 0.8s, transform 0.8s; | |
| } | |
| .fade-in.visible { | |
| opacity: 1; | |
| transform: translateY(0); | |
| } | |
| /* Code block */ | |
| .formula-block { | |
| background: var(--bg3); | |
| border: 1px solid rgba(255,255,255,0.06); | |
| border-radius: 12px; | |
| padding: 1.5rem; | |
| font-family: 'JetBrains Mono', monospace; | |
| font-size: 0.95rem; | |
| text-align: center; | |
| margin: 1.5rem 0; | |
| overflow-x: auto; | |
| color: var(--accent4); | |
| } | |
| .tag { | |
| display: inline-block; | |
| padding: 0.2rem 0.6rem; | |
| border-radius: 6px; | |
| font-size: 0.72rem; | |
| font-weight: 600; | |
| text-transform: uppercase; | |
| letter-spacing: 0.5px; | |
| } | |
| .tag-encoder { background: rgba(67,233,123,0.15); color: var(--accent3); } | |
| .tag-decoder { background: rgba(255,101,132,0.15); color: var(--accent2); } | |
| .tag-attention { background: rgba(108,99,255,0.15); color: var(--accent); } | |
| .divider { | |
| width: 60px; | |
| height: 3px; | |
| background: linear-gradient(90deg, var(--accent), var(--accent2)); | |
| border-radius: 2px; | |
| margin: 2rem 0; | |
| } | |
| </style> | |
| </head> | |
| <body> | |
| <canvas id="bgCanvas"></canvas> | |
| <nav> | |
| <div class="logo">⚡ Transformer Explained</div> | |
| <ul class="nav-links"> | |
| <li><a href="#problem">The Problem</a></li> | |
| <li><a href="#attention">Attention</a></li> | |
| <li><a href="#architecture">Architecture</a></li> | |
| <li><a href="#demo">Demo</a></li> | |
| <li><a href="#results">Results</a></li> | |
| </ul> | |
| </nav> | |
| <div class="content"> | |
| <!-- HERO --> | |
| <div class="hero"> | |
| <div class="hero-badge">📄 NeurIPS 2017 · Vaswani et al.</div> | |
| <h1 class="glow-text"> | |
| <span class="gradient">Attention</span> Is<br>All You Need | |
| </h1> | |
| <p>The revolutionary 2017 paper that introduced the <strong>Transformer</strong> architecture — the foundation behind GPT, BERT, and virtually every modern AI system. Let's break it down.</p> | |
| <a href="#problem" class="hero-cta"> | |
| Explore the Paper ↓ | |
| </a> | |
| <div class="scroll-hint">scroll to begin</div> | |
| </div> | |
| <!-- THE PROBLEM --> | |
| <section id="problem" class="fade-in"> | |
| <div class="section-label">01 — The Problem</div> | |
| <h2 class="section-title">Why did we need something new?</h2> | |
| <p class="section-desc">Before the Transformer, AI models for language (like translation) processed words <em>one at a time</em>, like reading a book aloud. This was painfully slow and made it hard to understand how distant words relate to each other.</p> | |
| <div class="vs-container"> | |
| <div class="vs-box"> | |
| <h4>🐌 Before: RNNs</h4> | |
| <ul class="vs-list vs-old"> | |
| <li>Process words one-by-one, sequentially</li> | |
| <li>Slow — can't use modern GPUs fully</li> | |
| <li>Forget things from long ago in a sentence</li> | |
| <li>Hard to see long-range connections</li> | |
| <li>Training takes weeks</li> | |
| </ul> | |
| </div> | |
| <div class="vs-badge">VS</div> | |
| <div class="vs-box"> | |
| <h4>⚡ After: Transformer</h4> | |
| <ul class="vs-list vs-new"> | |
| <li>Process ALL words at once, in parallel</li> | |
| <li>Fast — fully utilizes GPU power</li> | |
| <li>Every word can "see" every other word</li> | |
| <li>Learns relationships regardless of distance</li> | |
| <li>Training in hours/days, not weeks</li> | |
| </ul> | |
| </div> | |
| </div> | |
| <!-- RNN Animation --> | |
| <div class="attention-demo" style="margin-top:2rem;"> | |
| <h3 style="margin-bottom:1rem;">⏱ Sequential vs Parallel Processing</h3> | |
| <p style="color:var(--text2);font-size:0.9rem;margin-bottom:1.5rem;">Watch how an RNN processes words one by one, while a Transformer handles them all simultaneously.</p> | |
| <div style="display:flex;gap:2rem;flex-wrap:wrap;"> | |
| <div style="flex:1;min-width:250px;"> | |
| <div style="font-size:0.8rem;color:var(--accent2);font-weight:600;margin-bottom:0.5rem;">RNN (Sequential)</div> | |
| <div id="rnnRow" class="sentence-row" style="justify-content:flex-start;"></div> | |
| </div> | |
| <div style="flex:1;min-width:250px;"> | |
| <div style="font-size:0.8rem;color:var(--accent3);font-weight:600;margin-bottom:0.5rem;">Transformer (Parallel)</div> | |
| <div id="transRow" class="sentence-row" style="justify-content:flex-start;"></div> | |
| </div> | |
| </div> | |
| <button class="demo-btn" id="playSeqBtn" style="margin-top:1rem;" onclick="playSequential()">▶ Play Animation</button> | |
| </div> | |
| </section> | |
| <!-- ATTENTION --> | |
| <section id="attention" class="fade-in"> | |
| <div class="section-label">02 — The Key Idea</div> | |
| <h2 class="section-title">What is "Attention"?</h2> | |
| <p class="section-desc">Attention is a mechanism that lets the model ask: <em>"Which other words should I focus on to understand this word better?"</em> It's like having a spotlight that can highlight relevant parts of a sentence.</p> | |
| <div class="card-grid"> | |
| <div class="card"> | |
| <div class="card-icon" style="background:rgba(108,99,255,0.15);">🔍</div> | |
| <h3>Query (Q)</h3> | |
| <p>The word currently "asking" a question — "What should I pay attention to?" Think of it as a search query.</p> | |
| </div> | |
| <div class="card"> | |
| <div class="card-icon" style="background:rgba(67,233,123,0.15);">🔑</div> | |
| <h3>Key (K)</h3> | |
| <p>Labels on each word that say "this is what I contain." Queries are compared against keys to find relevant matches.</p> | |
| </div> | |
| <div class="card"> | |
| <div class="card-icon" style="background:rgba(255,101,132,0.15);">💎</div> | |
| <h3>Value (V)</h3> | |
| <p>The actual information each word carries. Once we find the relevant keys, we retrieve their values.</p> | |
| </div> | |
| </div> | |
| <div class="formula-block"> | |
| Attention(Q, K, V) = softmax( Q·Kᵀ / √d<sub>k</sub> ) · V | |
| </div> | |
| <p style="color:var(--text2);font-size:0.88rem;text-align:center;">In plain English: compare queries with keys, normalize the scores, then use them to weight the values.</p> | |
| <!-- Interactive Attention Demo --> | |
| <div class="attention-demo" style="margin-top:2.5rem;"> | |
| <div class="demo-header"> | |
| <h3>🎯 Interactive Self-Attention</h3> | |
| <div class="demo-controls"> | |
| <button class="demo-btn active" onclick="setDemoSentence(0, this)">Sentence 1</button> | |
| <button class="demo-btn" onclick="setDemoSentence(1, this)">Sentence 2</button> | |
| <button class="demo-btn" onclick="setDemoSentence(2, this)">Sentence 3</button> | |
| </div> | |
| </div> | |
| <p style="color:var(--text2);font-size:0.88rem;margin-bottom:1.5rem;">Click any word to see what it "pays attention to." Brighter = more attention.</p> | |
| <div class="sentence-row" id="attnSourceRow"></div> | |
| <div style="text-align:center;padding:0.5rem 0;color:var(--text2);font-size:0.75rem;">↕ attention weights ↕</div> | |
| <div class="sentence-row" id="attnTargetRow"></div> | |
| </div> | |
| </section> | |
| <!-- MULTI-HEAD ATTENTION --> | |
| <section class="fade-in"> | |
| <div class="section-label">03 — Multi-Head Attention</div> | |
| <h2 class="section-title">Eight pairs of eyes are better than one</h2> | |
| <p class="section-desc">Instead of running attention once, the Transformer runs it <strong>8 times in parallel</strong> — each "head" can learn to focus on different relationships: grammar, meaning, position, coreference, and more.</p> | |
| <div class="attention-demo"> | |
| <h3 style="margin-bottom:0.5rem;">🧠 Multi-Head Attention Heatmaps</h3> | |
| <p style="color:var(--text2);font-size:0.88rem;margin-bottom:1rem;">Each head learns a different pattern. Click a head to explore.</p> | |
| <div class="multihead-grid" id="multiheadGrid"></div> | |
| <div id="headDescription" style="margin-top:1.5rem;padding:1rem;background:var(--bg3);border-radius:10px;display:none;"> | |
| <p style="color:var(--text2);font-size:0.9rem;" id="headDescText"></p> | |
| </div> | |
| </div> | |
| </section> | |
| <!-- ARCHITECTURE --> | |
| <section id="architecture" class="fade-in"> | |
| <div class="section-label">04 — The Architecture</div> | |
| <h2 class="section-title">Inside the Transformer</h2> | |
| <p class="section-desc">The Transformer has two main parts: an <strong>Encoder</strong> (reads the input) and a <strong>Decoder</strong> (generates the output). Click each layer to learn what it does.</p> | |
| <div class="arch-container"> | |
| <div class="arch-diagram"> | |
| <div style="text-align:center;font-size:0.75rem;color:var(--text2);margin-bottom:1rem;">ENCODER</div> | |
| <div class="arch-layer" data-layer="input" onclick="showLayer('input')" style="background:rgba(108,99,255,0.08);"> | |
| 📝 Input Embedding | |
| <span class="layer-tag">Input</span> | |
| </div> | |
| <div class="arch-layer" data-layer="posenc" onclick="showLayer('posenc')" style="background:rgba(56,249,215,0.08);"> | |
| 📍 + Positional Encoding | |
| <span class="layer-tag">Position</span> | |
| </div> | |
| <div style="border-left:2px solid rgba(67,233,123,0.3);margin-left:1.5rem;padding-left:0.5rem;"> | |
| <div style="font-size:0.7rem;color:var(--accent3);margin:0.5rem 0;">×6 layers</div> | |
| <div class="arch-layer" data-layer="selfattn" onclick="showLayer('selfattn')" style="background:rgba(67,233,123,0.08);"> | |
| 🔗 Self-Attention | |
| <span class="layer-tag">Attention</span> | |
| </div> | |
| <div class="arch-layer" data-layer="ffn" onclick="showLayer('ffn')" style="background:rgba(67,233,123,0.05);"> | |
| ⚙️ Feed-Forward Network | |
| <span class="layer-tag">FFN</span> | |
| </div> | |
| <div class="arch-layer" data-layer="addnorm" onclick="showLayer('addnorm')" style="background:rgba(67,233,123,0.03);"> | |
| ➕ Add & Layer Norm | |
| <span class="layer-tag">Residual</span> | |
| </div> | |
| </div> | |
| <div style="text-align:center;margin:1.5rem 0 0.5rem;font-size:0.75rem;color:var(--text2);">DECODER</div> | |
| <div style="border-left:2px solid rgba(255,101,132,0.3);margin-left:1.5rem;padding-left:0.5rem;"> | |
| <div style="font-size:0.7rem;color:var(--accent2);margin:0.5rem 0;">×6 layers</div> | |
| <div class="arch-layer" data-layer="maskedattn" onclick="showLayer('maskedattn')" style="background:rgba(255,101,132,0.08);"> | |
| 🎭 Masked Self-Attention | |
| <span class="layer-tag">Masked</span> | |
| </div> | |
| <div class="arch-layer" data-layer="crossattn" onclick="showLayer('crossattn')" style="background:rgba(255,101,132,0.06);"> | |
| 🔀 Cross-Attention | |
| <span class="layer-tag">Enc→Dec</span> | |
| </div> | |
| <div class="arch-layer" data-layer="ffn2" onclick="showLayer('ffn2')" style="background:rgba(255,101,132,0.04);"> | |
| ⚙️ Feed-Forward Network | |
| <span class="layer-tag">FFN</span> | |
| </div> | |
| </div> | |
| <div class="arch-layer" data-layer="output" onclick="showLayer('output')" style="background:rgba(255,215,0,0.08);"> | |
| 🎯 Linear + Softmax → Output | |
| <span class="layer-tag">Output</span> | |
| </div> | |
| </div> | |
| <div class="arch-info"> | |
| <div id="layerDetail-default" class="layer-detail visible"> | |
| <h4>👈 Click a layer to explore</h4> | |
| <p>Each component of the Transformer plays a crucial role. Click on any layer in the architecture diagram to learn what it does and why it matters.</p> | |
| <div class="divider"></div> | |
| <div class="stats-grid" style="margin-top:1rem;"> | |
| <div class="stat-card"> | |
| <div class="stat-number">6</div> | |
| <div class="stat-label">Layers per stack</div> | |
| </div> | |
| <div class="stat-card"> | |
| <div class="stat-number">512</div> | |
| <div class="stat-label">Model dimension</div> | |
| </div> | |
| <div class="stat-card"> | |
| <div class="stat-number">8</div> | |
| <div class="stat-label">Attention heads</div> | |
| </div> | |
| <div class="stat-card"> | |
| <div class="stat-number">65M</div> | |
| <div class="stat-label">Parameters</div> | |
| </div> | |
| </div> | |
| </div> | |
| <div id="layerDetail-input" class="layer-detail"> | |
| <h4><span class="tag tag-encoder">Encoder</span> Input Embedding</h4> | |
| <p>Words can't be fed directly to a neural network — they need to be converted to numbers. Each word (or token) is mapped to a <strong>512-dimensional vector</strong>. Think of it as giving each word a unique "fingerprint" in a 512-dimensional space where similar words are closer together.</p> | |
| <div class="formula-block">word → vector of 512 numbers</div> | |
| <p>For example, "king" and "queen" would have similar vectors because they share meaning.</p> | |
| </div> | |
| <div id="layerDetail-posenc" class="layer-detail"> | |
| <h4>📍 Positional Encoding</h4> | |
| <p>Since the Transformer processes all words simultaneously (not sequentially), it has no inherent sense of word order. <strong>Positional encodings</strong> are added to tell the model "this is the 1st word, this is the 2nd word," etc.</p> | |
| <div class="formula-block">PE(pos, 2i) = sin(pos / 10000^(2i/512))<br>PE(pos, 2i+1) = cos(pos / 10000^(2i/512))</div> | |
| <p>These use sine and cosine waves at different frequencies — a clever mathematical trick that lets the model learn relative positions.</p> | |
| </div> | |
| <div id="layerDetail-selfattn" class="layer-detail"> | |
| <h4><span class="tag tag-attention">Attention</span> Self-Attention</h4> | |
| <p>This is the magic ingredient! Each word looks at <strong>every other word</strong> in the sentence to understand context. The word "bank" means something different in "river bank" vs. "bank account" — self-attention helps figure this out.</p> | |
| <p style="margin-top:0.8rem;">Each word generates a Query, Key, and Value. Queries are matched against all Keys to determine how much each word should "attend" to every other word.</p> | |
| </div> | |
| <div id="layerDetail-ffn" class="layer-detail"> | |
| <h4>⚙️ Feed-Forward Network</h4> | |
| <p>After attention gathers contextual information, each position gets processed through a simple neural network independently. It's like each word getting its own "thinking time" to process all the context it just gathered.</p> | |
| <div class="formula-block">FFN(x) = max(0, x·W₁ + b₁)·W₂ + b₂</div> | |
| <p>The inner layer expands to 2048 dimensions (4× the model size), giving the model more "thinking space."</p> | |
| </div> | |
| <div id="layerDetail-addnorm" class="layer-detail"> | |
| <h4>➕ Add & Layer Norm</h4> | |
| <p><strong>Residual connections</strong> add the input of each sub-layer back to its output. This prevents information loss as data flows through many layers — like having a "highway" that lets the original signal pass through.</p> | |
| <p style="margin-top:0.8rem;"><strong>Layer normalization</strong> keeps the values in a stable range, preventing training from becoming unstable. Together, these tricks enable the model to be deep (6 layers) without degradation.</p> | |
| </div> | |
| <div id="layerDetail-maskedattn" class="layer-detail"> | |
| <h4><span class="tag tag-decoder">Decoder</span> Masked Self-Attention</h4> | |
| <p>The decoder generates output one word at a time. When predicting the 5th word, it should only see words 1–4 (not peek at future words!). <strong>Masking</strong> blocks attention to future positions.</p> | |
| <p style="margin-top:0.8rem;">Think of it like writing a sentence left to right — you can only use words you've already written to decide what comes next.</p> | |
| </div> | |
| <div id="layerDetail-crossattn" class="layer-detail"> | |
| <h4>🔀 Cross-Attention (Encoder→Decoder)</h4> | |
| <p>This is the bridge between encoder and decoder. The decoder sends Queries (what it's looking for), while Keys and Values come from the encoder (the input sentence). This lets the decoder focus on relevant parts of the input when generating each output word.</p> | |
| <p style="margin-top:0.8rem;">For translation: when generating "Bonjour," the decoder attends heavily to the input word "Hello."</p> | |
| </div> | |
| <div id="layerDetail-ffn2" class="layer-detail"> | |
| <h4>⚙️ Decoder Feed-Forward</h4> | |
| <p>Same as the encoder's FFN — each position gets independent processing. This gives the model capacity to transform the attended information into useful representations for generating the next output word.</p> | |
| </div> | |
| <div id="layerDetail-output" class="layer-detail"> | |
| <h4>🎯 Output Layer</h4> | |
| <p>The final layer maps the decoder's representation to a probability distribution over the entire vocabulary. In the simplest (greedy) decoding, the word with the highest probability is chosen as the next output token.</p> | |
| <p style="margin-top:0.8rem;">A <strong>softmax</strong> function ensures all probabilities sum to 1. Beam search explores multiple possibilities to find the best overall sequence.</p> | |
| </div> | |
| </div> | |
| </div> | |
| </section> | |
| <!-- POSITIONAL ENCODING VIZ --> | |
| <section class="fade-in"> | |
| <div class="section-label">05 — Positional Encoding</div> | |
| <h2 class="section-title">Teaching word order with waves</h2> | |
| <p class="section-desc">Since the Transformer sees all words at once, it needs a way to know word order. The solution: add unique wave patterns to each word's position. Each row is a position, each column a dimension.</p> | |
| <div class="pe-container"> | |
| <canvas id="peCanvas"></canvas> | |
| <div class="pe-controls"> | |
| <label>Positions: <span id="posCount">30</span></label> | |
| <input type="range" id="posSlider" min="10" max="80" value="30" oninput="updatePE()"> | |
| <label>Dimensions: <span id="dimCount">64</span></label> | |
| <input type="range" id="dimSlider" min="16" max="128" value="64" oninput="updatePE()"> | |
| </div> | |
| </div> | |
| </section> | |
| <!-- INTERACTIVE TRANSLATION DEMO --> | |
| <section id="demo" class="fade-in"> | |
| <div class="section-label">06 — See It In Action</div> | |
| <h2 class="section-title">Translation with attention</h2> | |
| <p class="section-desc">Here's a simplified view of how attention works during English→French translation. The model attends to different input words while generating each output word.</p> | |
| <div class="attention-demo"> | |
| <h3 style="margin-bottom:1rem;">🌍 English → French Translation</h3> | |
| <div style="margin-bottom:1.5rem;"> | |
| <div style="font-size:0.75rem;color:var(--accent3);font-weight:600;margin-bottom:0.5rem;">INPUT (English)</div> | |
| <div class="sentence-row" id="transSourceRow" style="justify-content:flex-start;"></div> | |
| </div> | |
| <div id="translationViz" style="min-height:60px;margin-bottom:1rem;"> | |
| <canvas id="translationCanvas" width="900" height="80" style="width:100%;height:80px;"></canvas> | |
| </div> | |
| <div> | |
| <div style="font-size:0.75rem;color:var(--accent2);font-weight:600;margin-bottom:0.5rem;">OUTPUT (French) — click a word</div> | |
| <div class="sentence-row" id="transTargetRow" style="justify-content:flex-start;"></div> | |
| </div> | |
| </div> | |
| </section> | |
| <!-- RESULTS --> | |
| <section id="results" class="fade-in"> | |
| <div class="section-label">07 — Results</div> | |
| <h2 class="section-title">Crushing the competition</h2> | |
| <p class="section-desc">The Transformer didn't just match existing models — it <strong>destroyed</strong> them while training in a fraction of the time.</p> | |
| <div class="stats-grid"> | |
| <div class="stat-card"> | |
| <div class="stat-number">28.4</div> | |
| <div class="stat-label">BLEU score (EN→DE)<br>+2.0 over previous best</div> | |
| </div> | |
| <div class="stat-card"> | |
| <div class="stat-number">41.8</div> | |
| <div class="stat-label">BLEU score (EN→FR)<br>New state-of-the-art</div> | |
| </div> | |
| <div class="stat-card"> | |
| <div class="stat-number">3.5</div> | |
| <div class="stat-label">Days to train on 8 GPUs<br>vs weeks for competitors</div> | |
| </div> | |
| <div class="stat-card"> | |
| <div class="stat-number">10×</div> | |
| <div class="stat-label">Less compute needed<br>than competing models</div> | |
| </div> | |
| </div> | |
| <div class="compare-section"> | |
| <h3 style="font-size:1.2rem;margin-bottom:1.5rem;">📊 BLEU Score Comparison (EN→DE)</h3> | |
| <div class="compare-item"> | |
| <div class="compare-label"> | |
| <span>ByteNet</span> | |
| <span class="score">23.75</span> | |
| </div> | |
| <div class="compare-bar-bg"> | |
| <div class="compare-bar-fill" data-width="56" style="background:linear-gradient(90deg,#4a4a6a,#6a6a8a);">23.75</div> | |
| </div> | |
| </div> | |
| <div class="compare-item"> | |
| <div class="compare-label"> | |
| <span>GNMT + RL</span> | |
| <span class="score">24.6</span> | |
| </div> | |
| <div class="compare-bar-bg"> | |
| <div class="compare-bar-fill" data-width="58" style="background:linear-gradient(90deg,#5a5a7a,#7a7a9a);">24.6</div> | |
| </div> | |
| </div> | |
| <div class="compare-item"> | |
| <div class="compare-label"> | |
| <span>ConvS2S</span> | |
| <span class="score">25.16</span> | |
| </div> | |
| <div class="compare-bar-bg"> | |
| <div class="compare-bar-fill" data-width="63" style="background:linear-gradient(90deg,#6a6a9a,#8a8aaa);">25.16</div> | |
| </div> | |
| </div> | |
| <div class="compare-item"> | |
| <div class="compare-label"> | |
| <span>MoE (Mixture of Experts)</span> | |
| <span class="score">26.03</span> | |
| </div> | |
| <div class="compare-bar-bg"> | |
| <div class="compare-bar-fill" data-width="67" style="background:linear-gradient(90deg,#7a7aaa,#9a9aba);">26.03</div> | |
| </div> | |
| </div> | |
| <div class="compare-item"> | |
| <div class="compare-label"> | |
| <span>ConvS2S Ensemble</span> | |
| <span class="score">26.36</span> | |
| </div> | |
| <div class="compare-bar-bg"> | |
| <div class="compare-bar-fill" data-width="69" style="background:linear-gradient(90deg,#8a8aba,#aaaacc);">26.36</div> | |
| </div> | |
| </div> | |
| <div class="compare-item"> | |
| <div class="compare-label"> | |
| <span>Transformer (base)</span> | |
| <span class="score" style="color:var(--accent);">27.3</span> | |
| </div> | |
| <div class="compare-bar-bg"> | |
| <div class="compare-bar-fill" data-width="78" style="background:linear-gradient(90deg,var(--accent),#8b5cf6);">27.3</div> | |
| </div> | |
| </div> | |
| <div class="compare-item"> | |
| <div class="compare-label"> | |
| <span>🏆 Transformer (big)</span> | |
| <span class="score" style="color:var(--gold);">28.4</span> | |
| </div> | |
| <div class="compare-bar-bg"> | |
| <div class="compare-bar-fill" data-width="90" style="background:linear-gradient(90deg,var(--gold),#ff8c00);">28.4 🏆</div> | |
| </div> | |
| </div> | |
| </div> | |
| </section> | |
| <!-- IMPACT --> | |
| <section class="fade-in"> | |
| <div class="section-label">08 — Legacy & Impact</div> | |
| <h2 class="section-title">The paper that changed everything</h2> | |
| <p class="section-desc">The Transformer became the foundation for virtually all modern AI language models. Here's what it spawned:</p> | |
| <div class="timeline"> | |
| <div class="timeline-item"> | |
| <div class="timeline-dot"></div> | |
| <h4>June 2017 — "Attention Is All You Need"</h4> | |
| <p>Published by Google researchers. Introduces the Transformer for machine translation.</p> | |
| </div> | |
| <div class="timeline-item"> | |
| <div class="timeline-dot" style="background:var(--accent3);"></div> | |
| <h4>2018 — BERT (Google)</h4> | |
| <p>Uses the Transformer encoder. Revolutionized NLP benchmarks and search engines.</p> | |
| </div> | |
| <div class="timeline-item"> | |
| <div class="timeline-dot" style="background:var(--accent2);"></div> | |
| <h4>2018-2020 — GPT, GPT-2, GPT-3 (OpenAI)</h4> | |
| <p>Uses the Transformer decoder. Showed that scaling up leads to emergent abilities.</p> | |
| </div> | |
| <div class="timeline-item"> | |
| <div class="timeline-dot" style="background:var(--accent4);"></div> | |
| <h4>2020+ — Vision Transformers (ViT)</h4> | |
| <p>Transformers expand beyond text to images, protein folding (AlphaFold), and more.</p> | |
| </div> | |
| <div class="timeline-item"> | |
| <div class="timeline-dot" style="background:var(--gold);"></div> | |
| <h4>2022-2024 — ChatGPT, GPT-4, Claude, Gemini</h4> | |
| <p>Transformer-based models become mainstream, powering the AI revolution we see today.</p> | |
| </div> | |
| </div> | |
| <div class="card-grid" style="margin-top:3rem;"> | |
| <div class="card"> | |
| <div class="card-icon" style="background:rgba(255,215,0,0.15);">📈</div> | |
| <h3>100,000+ Citations</h3> | |
| <p>One of the most cited AI papers in history. Its ideas permeate virtually every area of modern machine learning.</p> | |
| </div> | |
| <div class="card"> | |
| <div class="card-icon" style="background:rgba(56,249,215,0.15);">🌐</div> | |
| <h3>Beyond Language</h3> | |
| <p>Transformers now work on images, audio, video, protein structures, code generation, robotics, and more.</p> | |
| </div> | |
| <div class="card"> | |
| <div class="card-icon" style="background:rgba(108,99,255,0.15);">💡</div> | |
| <h3>One Key Insight</h3> | |
| <p>You don't need complex recurrence or convolution. A simple attention mechanism, applied right, is all you need.</p> | |
| </div> | |
| </div> | |
| </section> | |
| <!-- KEY TAKEAWAYS --> | |
| <section class="fade-in"> | |
| <div class="section-label">TL;DR</div> | |
| <h2 class="section-title">Key Takeaways</h2> | |
| <div class="card-grid"> | |
| <div class="card" style="border-left:3px solid var(--accent);"> | |
| <h3>1. Ditch sequential processing</h3> | |
| <p>Processing words one by one is slow. Processing them all at once with attention is massively faster and more effective.</p> | |
| </div> | |
| <div class="card" style="border-left:3px solid var(--accent3);"> | |
| <h3>2. Self-attention is powerful</h3> | |
| <p>Letting every word directly attend to every other word captures relationships that sequential models struggle with.</p> | |
| </div> | |
| <div class="card" style="border-left:3px solid var(--accent2);"> | |
| <h3>3. Multi-head = multi-perspective</h3> | |
| <p>Running multiple attention heads in parallel lets the model capture grammar, meaning, and structure simultaneously.</p> | |
| </div> | |
| <div class="card" style="border-left:3px solid var(--accent4);"> | |
| <h3>4. Simplicity scales</h3> | |
| <p>The Transformer's clean, simple design made it easy to scale to billions of parameters — enabling modern LLMs.</p> | |
| </div> | |
| </div> | |
| </section> | |
| </div> | |
| <footer> | |
| <p>Interactive explainer for <a href="https://arxiv.org/abs/1706.03762" target="_blank" rel="noopener noreferrer">"Attention Is All You Need" (Vaswani et al., 2017)</a></p> | |
| <p style="margin-top:0.5rem;">Built for educational purposes. Paper by Vaswani, Shazeer, Parmar, Uszkoreit, Jones, Gomez, Kaiser & Polosukhin.</p> | |
| </footer> | |
| <script> | |
| // ===== BG CANVAS ===== | |
| const bgCanvas = document.getElementById('bgCanvas'); | |
| const bgCtx = bgCanvas.getContext('2d'); | |
| let particles = []; | |
| function resizeBg() { | |
| bgCanvas.width = window.innerWidth; | |
| bgCanvas.height = window.innerHeight; | |
| } | |
| resizeBg(); | |
| window.addEventListener('resize', resizeBg); | |
| for (let i = 0; i < 60; i++) { | |
| particles.push({ | |
| x: Math.random() * window.innerWidth, | |
| y: Math.random() * window.innerHeight, | |
| vx: (Math.random() - 0.5) * 0.3, | |
| vy: (Math.random() - 0.5) * 0.3, | |
| r: Math.random() * 2 + 0.5, | |
| opacity: Math.random() * 0.3 + 0.05 | |
| }); | |
| } | |
| function drawBg() { | |
| bgCtx.clearRect(0, 0, bgCanvas.width, bgCanvas.height); | |
| particles.forEach(p => { | |
| p.x += p.vx; | |
| p.y += p.vy; | |
| if (p.x < 0) p.x = bgCanvas.width; | |
| if (p.x > bgCanvas.width) p.x = 0; | |
| if (p.y < 0) p.y = bgCanvas.height; | |
| if (p.y > bgCanvas.height) p.y = 0; | |
| bgCtx.beginPath(); | |
| bgCtx.arc(p.x, p.y, p.r, 0, Math.PI * 2); | |
| bgCtx.fillStyle = `rgba(108, 99, 255, ${p.opacity})`; | |
| bgCtx.fill(); | |
| }); | |
| // Draw connections | |
| for (let i = 0; i < particles.length; i++) { | |
| for (let j = i + 1; j < particles.length; j++) { | |
| const dx = particles[i].x - particles[j].x; | |
| const dy = particles[i].y - particles[j].y; | |
| const d = Math.sqrt(dx * dx + dy * dy); | |
| if (d < 150) { | |
| bgCtx.beginPath(); | |
| bgCtx.moveTo(particles[i].x, particles[i].y); | |
| bgCtx.lineTo(particles[j].x, particles[j].y); | |
| bgCtx.strokeStyle = `rgba(108, 99, 255, ${0.06 * (1 - d / 150)})`; | |
| bgCtx.lineWidth = 0.5; | |
| bgCtx.stroke(); | |
| } | |
| } | |
| } | |
| requestAnimationFrame(drawBg); | |
| } | |
| drawBg(); | |
| // ===== SCROLL ANIMATIONS ===== | |
| const observer = new IntersectionObserver((entries) => { | |
| entries.forEach(entry => { | |
| if (entry.isIntersecting) { | |
| entry.target.classList.add('visible'); | |
| // Animate comparison bars | |
| entry.target.querySelectorAll('.compare-bar-fill').forEach(bar => { | |
| setTimeout(() => { | |
| bar.style.width = bar.dataset.width + '%'; | |
| }, 200); | |
| }); | |
| } | |
| }); | |
| }, { threshold: 0.1 }); | |
| document.querySelectorAll('.fade-in').forEach(el => observer.observe(el)); | |
| // ===== SEQUENTIAL VS PARALLEL ===== | |
| const seqWords = ['The', 'cat', 'sat', 'on', 'the', 'mat']; | |
| function initSeqDemo() { | |
| const rnnRow = document.getElementById('rnnRow'); | |
| const transRow = document.getElementById('transRow'); | |
| rnnRow.innerHTML = ''; | |
| transRow.innerHTML = ''; | |
| seqWords.forEach(w => { | |
| const t1 = document.createElement('div'); | |
| t1.className = 'word-token'; | |
| t1.textContent = w; | |
| t1.style.opacity = '0.3'; | |
| rnnRow.appendChild(t1); | |
| const t2 = document.createElement('div'); | |
| t2.className = 'word-token'; | |
| t2.textContent = w; | |
| t2.style.opacity = '0.3'; | |
| transRow.appendChild(t2); | |
| }); | |
| } | |
| initSeqDemo(); | |
| function playSequential() { | |
| initSeqDemo(); | |
| const rnnTokens = document.getElementById('rnnRow').children; | |
| const transTokens = document.getElementById('transRow').children; | |
| // RNN: one by one | |
| for (let i = 0; i < seqWords.length; i++) { | |
| setTimeout(() => { | |
| rnnTokens[i].style.opacity = '1'; | |
| rnnTokens[i].classList.add('selected'); | |
| setTimeout(() => rnnTokens[i].classList.remove('selected'), 350); | |
| }, i * 400); | |
| } | |
| // Transformer: all at once after a small delay | |
| setTimeout(() => { | |
| for (let i = 0; i < seqWords.length; i++) { | |
| transTokens[i].style.opacity = '1'; | |
| transTokens[i].classList.add('selected'); | |
| setTimeout(() => { | |
| transTokens[i].classList.remove('selected'); | |
| }, 500); | |
| } | |
| }, 300); | |
| } | |
| // ===== INTERACTIVE ATTENTION DEMO ===== | |
| // Hard-coded example sentences for the self-attention demo. | |
| //   words: the tokens rendered in both rows. | |
| //   attn:  keyed by the index of the selected word; attn[q][k] is the weight | |
| //          renderAttention() uses to shade word k when word q is selected. | |
| const sentences = [ | |
| { | |
| words: ['The', 'animal', 'didn\'t', 'cross', 'the', 'street', 'because', 'it', 'was', 'too', 'tired'], | |
| attn: { | |
| 0: [0.6,0.05,0.02,0.02,0.1,0.05,0.02,0.03,0.03,0.03,0.05], | |
| 1: [0.1,0.5,0.02,0.05,0.02,0.08,0.02,0.08,0.03,0.03,0.07], | |
| 2: [0.03,0.05,0.5,0.15,0.02,0.05,0.05,0.03,0.05,0.02,0.05], | |
| 3: [0.02,0.1,0.1,0.4,0.03,0.15,0.03,0.02,0.05,0.03,0.07], | |
| 4: [0.05,0.02,0.02,0.02,0.3,0.35,0.03,0.03,0.05,0.05,0.08], | |
| 5: [0.02,0.05,0.03,0.15,0.15,0.4,0.02,0.02,0.05,0.03,0.08], | |
| 6: [0.03,0.05,0.1,0.08,0.02,0.08,0.4,0.05,0.08,0.05,0.06], | |
| 7: [0.05,0.45,0.02,0.03,0.02,0.05,0.08,0.15,0.02,0.03,0.1], | |
| 8: [0.02,0.08,0.05,0.03,0.02,0.03,0.05,0.12,0.4,0.1,0.1], | |
| 9: [0.03,0.02,0.03,0.02,0.02,0.05,0.05,0.03,0.15,0.45,0.15], | |
| 10: [0.02,0.15,0.05,0.03,0.02,0.03,0.05,0.1,0.1,0.1,0.35], | |
| } | |
| }, | |
| { | |
| words: ['The', 'Law', 'will', 'never', 'be', 'perfect', 'but', 'its', 'application', 'should', 'be', 'just'], | |
| attn: { | |
| 0: [0.5,0.15,0.05,0.03,0.03,0.05,0.02,0.02,0.03,0.03,0.03,0.06], | |
| 1: [0.2,0.4,0.03,0.02,0.03,0.08,0.02,0.05,0.08,0.02,0.03,0.04], | |
| 2: [0.05,0.08,0.4,0.1,0.15,0.05,0.02,0.02,0.03,0.03,0.03,0.04], | |
| 3: [0.03,0.02,0.1,0.45,0.08,0.12,0.03,0.02,0.03,0.02,0.03,0.07], | |
| 4: [0.02,0.03,0.05,0.05,0.35,0.3,0.02,0.02,0.03,0.05,0.05,0.03], | |
| 5: [0.02,0.05,0.03,0.1,0.12,0.45,0.02,0.02,0.03,0.03,0.05,0.08], | |
| 6: [0.03,0.03,0.03,0.05,0.05,0.08,0.35,0.05,0.08,0.1,0.08,0.07], | |
| 7: [0.05,0.35,0.02,0.02,0.02,0.03,0.05,0.2,0.15,0.03,0.03,0.05], | |
| 8: [0.02,0.1,0.02,0.02,0.02,0.03,0.05,0.15,0.4,0.08,0.05,0.06], | |
| 9: [0.02,0.02,0.05,0.02,0.05,0.03,0.05,0.03,0.1,0.4,0.15,0.08], | |
| 10:[0.02,0.02,0.03,0.02,0.08,0.05,0.03,0.02,0.05,0.12,0.4,0.16], | |
| 11:[0.02,0.03,0.02,0.03,0.1,0.15,0.05,0.02,0.05,0.08,0.1,0.35], | |
| } | |
| }, | |
| { | |
| words: ['I', 'looked', 'at', 'my', 'phone', 'and', 'saw', 'that', 'she', 'had', 'called'], | |
| attn: { | |
| 0: [0.5,0.1,0.02,0.15,0.05,0.02,0.03,0.02,0.03,0.03,0.05], | |
| 1: [0.12,0.35,0.1,0.03,0.08,0.02,0.08,0.02,0.05,0.05,0.1], | |
| 2: [0.03,0.2,0.3,0.02,0.15,0.05,0.05,0.05,0.03,0.05,0.07], | |
| 3: [0.2,0.03,0.02,0.4,0.15,0.02,0.02,0.02,0.03,0.03,0.08], | |
| 4: [0.05,0.05,0.05,0.15,0.4,0.02,0.03,0.02,0.03,0.05,0.15], | |
| 5: [0.03,0.05,0.02,0.02,0.03,0.45,0.15,0.08,0.05,0.05,0.07], | |
| 6: [0.08,0.15,0.02,0.02,0.05,0.05,0.35,0.05,0.05,0.08,0.1], | |
| 7: [0.03,0.03,0.02,0.02,0.02,0.03,0.1,0.45,0.1,0.1,0.1], | |
| 8: [0.03,0.02,0.02,0.02,0.02,0.03,0.05,0.08,0.45,0.15,0.13], | |
| 9: [0.02,0.03,0.02,0.02,0.02,0.02,0.05,0.05,0.15,0.4,0.22], | |
| 10:[0.03,0.05,0.02,0.02,0.05,0.02,0.08,0.05,0.15,0.18,0.35], | |
| } | |
| } | |
| ]; | |
| // Index into `sentences` of the sentence currently shown. | |
| let currentSentence = 0; | |
| // Index of the clicked word in the current sentence, or null when none is selected. | |
| let selectedWord = null; | |
| function setDemoSentence(idx, btn) { | |
| currentSentence = idx; | |
| selectedWord = null; | |
| document.querySelectorAll('.demo-controls .demo-btn').forEach(b => b.classList.remove('active')); | |
| if (btn) btn.classList.add('active'); | |
| renderAttention(); | |
| } | |
| function renderAttention() { | |
| const s = sentences[currentSentence]; | |
| const src = document.getElementById('attnSourceRow'); | |
| const tgt = document.getElementById('attnTargetRow'); | |
| src.innerHTML = ''; | |
| tgt.innerHTML = ''; | |
| s.words.forEach((w, i) => { | |
| const tok = document.createElement('div'); | |
| tok.className = 'word-token' + (selectedWord === i ? ' selected' : ''); | |
| tok.textContent = w; | |
| tok.onclick = () => { selectedWord = i; renderAttention(); }; | |
| src.appendChild(tok); | |
| const ttok = document.createElement('div'); | |
| ttok.className = 'word-token'; | |
| ttok.textContent = w; | |
| ttok.style.position = 'relative'; | |
| if (selectedWord !== null && s.attn[selectedWord]) { | |
| const weight = s.attn[selectedWord][i]; | |
| const alpha = Math.min(weight * 2.5, 1); | |
| ttok.style.background = `rgba(108, 99, 255, ${alpha * 0.6})`; | |
| ttok.style.borderColor = `rgba(108, 99, 255, ${alpha})`; | |
| if (alpha > 0.4) ttok.style.color = 'white'; | |
| const bar = document.createElement('div'); | |
| bar.className = 'attention-bar'; | |
| bar.style.width = (weight * 100) + '%'; | |
| ttok.appendChild(bar); | |
| } | |
| tgt.appendChild(ttok); | |
| }); | |
| } | |
| renderAttention(); | |
| // ===== MULTI-HEAD ATTENTION VIZ ===== | |
| // Caption shown in #headDescText when a head box is clicked; entry h belongs | |
| // to the head labelled "Head h+1" and to heatmap pattern h in drawHeadHeatmap(). | |
| const headDescriptions = [ | |
| "Head 1: Focuses on the NEXT word — learning local/adjacent relationships and bigram patterns.", | |
| "Head 2: Focuses on the PREVIOUS word — capturing backward-looking context.", | |
| "Head 3: Attends to semantically related words — grouping nouns, verbs, and related concepts.", | |
| "Head 4: Focuses on punctuation and sentence boundaries — learning structural breaks.", | |
| "Head 5: Resolves coreference — connecting pronouns like 'it' and 'its' to their referents.", | |
| "Head 6: Attends to the beginning of sentence — capturing the topic/subject.", | |
| "Head 7: Long-distance dependencies — connecting words far apart (e.g., 'making...difficult').", | |
| "Head 8: Attends broadly/uniformly — providing a general context summary.", | |
| ]; | |
| function initMultiHead() { | |
| const grid = document.getElementById('multiheadGrid'); | |
| grid.innerHTML = ''; | |
| for (let h = 0; h < 8; h++) { | |
| const box = document.createElement('div'); | |
| box.className = 'head-box'; | |
| const canvas = document.createElement('canvas'); | |
| canvas.width = 100; | |
| canvas.height = 100; | |
| box.appendChild(canvas); | |
| const label = document.createElement('div'); | |
| label.className = 'head-label'; | |
| label.textContent = `Head ${h + 1}`; | |
| box.appendChild(label); | |
| box.onclick = () => { | |
| document.querySelectorAll('.head-box').forEach(b => b.classList.remove('active')); | |
| box.classList.add('active'); | |
| const desc = document.getElementById('headDescription'); | |
| const text = document.getElementById('headDescText'); | |
| desc.style.display = 'block'; | |
| text.textContent = headDescriptions[h]; | |
| }; | |
| grid.appendChild(box); | |
| drawHeadHeatmap(canvas, h); | |
| } | |
| } | |
| function drawHeadHeatmap(canvas, headIdx) { | |
| const ctx = canvas.getContext('2d'); | |
| const n = 10; // 10x10 grid | |
| const cellW = canvas.width / n; | |
| const cellH = canvas.height / n; | |
| const colors = [ | |
| [108, 99, 255], | |
| [255, 101, 132], | |
| [67, 233, 123], | |
| [56, 249, 215], | |
| [255, 215, 0], | |
| [255, 140, 0], | |
| [147, 112, 219], | |
| [0, 191, 255], | |
| ]; | |
| const color = colors[headIdx]; | |
| for (let i = 0; i < n; i++) { | |
| for (let j = 0; j < n; j++) { | |
| let val; | |
| switch(headIdx) { | |
| case 0: val = j === (i + 1) % n ? 0.85 : Math.random() * 0.15; break; // next word | |
| case 1: val = j === Math.max(0, i - 1) ? 0.85 : Math.random() * 0.15; break; // prev word | |
| case 2: val = Math.abs(i - j) < 2 ? 0.3 + Math.random() * 0.5 : Math.random() * 0.15; break; // semantic | |
| case 3: val = (j === 0 || j === n-1) ? 0.5 + Math.random() * 0.3 : Math.random() * 0.1; break; // boundaries | |
| case 4: { // coreference | |
| const dist = Math.abs(i - j); | |
| val = (dist === 3 || dist === 5) ? 0.6 + Math.random() * 0.3 : Math.random() * 0.12; | |
| break; | |
| } | |
| case 5: val = j < 2 ? 0.4 + Math.random() * 0.4 : Math.random() * 0.12; break; // beginning | |
| case 6: { // long distance | |
| const d = Math.abs(i - j); | |
| val = d > 4 ? 0.3 + Math.random() * 0.5 : Math.random() * 0.1; | |
| break; | |
| } | |
| case 7: val = 0.08 + Math.random() * 0.12; break; // uniform | |
| } | |
| ctx.fillStyle = `rgba(${color[0]}, ${color[1]}, ${color[2]}, ${val})`; | |
| ctx.fillRect(j * cellW, i * cellH, cellW - 0.5, cellH - 0.5); | |
| } | |
| } | |
| } | |
| initMultiHead(); | |
| // ===== ARCHITECTURE LAYER DETAILS ===== | |
| function showLayer(id) { | |
| document.querySelectorAll('.layer-detail').forEach(d => d.classList.remove('visible')); | |
| document.querySelectorAll('.arch-layer').forEach(l => l.classList.remove('active')); | |
| const detail = document.getElementById('layerDetail-' + id); | |
| if (detail) detail.classList.add('visible'); | |
| const layer = document.querySelector(`.arch-layer[data-layer="${id}"]`); | |
| if (layer) layer.classList.add('active'); | |
| } | |
| // ===== POSITIONAL ENCODING VIZ ===== | |
| function updatePE() { | |
| const canvas = document.getElementById('peCanvas'); | |
| const ctx = canvas.getContext('2d'); | |
| const positions = parseInt(document.getElementById('posSlider').value); | |
| const dims = parseInt(document.getElementById('dimSlider').value); | |
| document.getElementById('posCount').textContent = positions; | |
| document.getElementById('dimCount').textContent = dims; | |
| canvas.width = canvas.offsetWidth * 2; | |
| canvas.height = 400; | |
| ctx.scale(2, 2); | |
| const w = canvas.offsetWidth; | |
| const h = 200; | |
| const cellW = w / dims; | |
| const cellH = h / positions; | |
| for (let pos = 0; pos < positions; pos++) { | |
| for (let i = 0; i < dims; i++) { | |
| const angle = pos / Math.pow(10000, (2 * Math.floor(i/2)) / 512); | |
| const val = i % 2 === 0 ? Math.sin(angle) : Math.cos(angle); | |
| const norm = (val + 1) / 2; | |
| const r = Math.floor(norm * 108 + (1-norm) * 10); | |
| const g = Math.floor(norm * 99 + (1-norm) * 10); | |
| const b = Math.floor(norm * 255 + (1-norm) * 30); | |
| ctx.fillStyle = `rgb(${r},${g},${b})`; | |
| ctx.fillRect(i * cellW, pos * cellH, cellW + 0.5, cellH + 0.5); | |
| } | |
| } | |
| } | |
| setTimeout(updatePE, 100); | |
| window.addEventListener('resize', () => setTimeout(updatePE, 100)); | |
| // ===== TRANSLATION DEMO ===== | |
| // Hard-coded example for the translation demo: source tokens, target tokens, | |
| // and one weight row per target token. transAttn[t][s] is the weight of target | |
| // word t on source word s; it drives the source highlight in renderTransAttn() | |
| // and the curve opacity/thickness in drawTransLines(). | |
| const transSource = ['The', 'cat', 'sat', 'on', 'the', 'mat', '.']; | |
| const transTarget = ['Le', 'chat', 'était', 'assis', 'sur', 'le', 'tapis', '.']; | |
| const transAttn = [ | |
| [0.8, 0.02, 0.02, 0.02, 0.08, 0.02, 0.04], // Le -> The | |
| [0.02, 0.75, 0.03, 0.02, 0.02, 0.02, 0.14], // chat -> cat | |
| [0.02, 0.05, 0.6, 0.15, 0.02, 0.02, 0.14], // était -> sat | |
| [0.02, 0.03, 0.55, 0.2, 0.02, 0.02, 0.16], // assis -> sat/on | |
| [0.02, 0.02, 0.02, 0.7, 0.08, 0.02, 0.14], // sur -> on | |
| [0.05, 0.02, 0.02, 0.02, 0.72, 0.02, 0.15], // le -> the | |
| [0.02, 0.02, 0.02, 0.02, 0.02, 0.75, 0.15], // tapis -> mat | |
| [0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.88], // . -> . | |
| ]; | |
| // Index into transTarget of the clicked output word, or null when none is selected. | |
| let selectedTrans = null; | |
| function initTransDemo() { | |
| const srcRow = document.getElementById('transSourceRow'); | |
| const tgtRow = document.getElementById('transTargetRow'); | |
| srcRow.innerHTML = ''; | |
| tgtRow.innerHTML = ''; | |
| transSource.forEach((w, i) => { | |
| const tok = document.createElement('div'); | |
| tok.className = 'word-token'; | |
| tok.textContent = w; | |
| tok.id = 'tsrc-' + i; | |
| srcRow.appendChild(tok); | |
| }); | |
| transTarget.forEach((w, i) => { | |
| const tok = document.createElement('div'); | |
| tok.className = 'word-token'; | |
| tok.textContent = w; | |
| tok.onclick = () => { selectedTrans = i; renderTransAttn(); }; | |
| tgtRow.appendChild(tok); | |
| }); | |
| } | |
| function renderTransAttn() { | |
| // Reset source tokens | |
| transSource.forEach((w, i) => { | |
| const tok = document.getElementById('tsrc-' + i); | |
| if (selectedTrans !== null && transAttn[selectedTrans]) { | |
| const weight = transAttn[selectedTrans][i]; | |
| const alpha = Math.min(weight * 2, 1); | |
| tok.style.background = `rgba(67, 233, 123, ${alpha * 0.6})`; | |
| tok.style.borderColor = `rgba(67, 233, 123, ${alpha})`; | |
| if (alpha > 0.4) tok.style.color = 'white'; | |
| else tok.style.color = ''; | |
| } else { | |
| tok.style.background = ''; | |
| tok.style.borderColor = ''; | |
| tok.style.color = ''; | |
| } | |
| }); | |
| // Update target tokens | |
| const tgtRow = document.getElementById('transTargetRow'); | |
| Array.from(tgtRow.children).forEach((tok, i) => { | |
| if (i === selectedTrans) { | |
| tok.classList.add('selected'); | |
| } else { | |
| tok.classList.remove('selected'); | |
| } | |
| }); | |
| // Draw lines on canvas | |
| drawTransLines(); | |
| } | |
| function drawTransLines() { | |
| const canvas = document.getElementById('translationCanvas'); | |
| const ctx = canvas.getContext('2d'); | |
| const rect = canvas.getBoundingClientRect(); | |
| canvas.width = rect.width * 2; | |
| canvas.height = rect.height * 2; | |
| ctx.scale(2, 2); | |
| ctx.clearRect(0, 0, rect.width, rect.height); | |
| if (selectedTrans === null) return; | |
| const srcRow = document.getElementById('transSourceRow'); | |
| const tgtRow = document.getElementById('transTargetRow'); | |
| const srcToks = Array.from(srcRow.children); | |
| const tgtTok = tgtRow.children[selectedTrans]; | |
| if (!tgtTok) return; | |
| const canvasRect = canvas.getBoundingClientRect(); | |
| const tgtX = tgtTok.getBoundingClientRect().left + tgtTok.offsetWidth / 2 - canvasRect.left; | |
| const tgtY = rect.height; | |
| srcToks.forEach((tok, i) => { | |
| const srcX = tok.getBoundingClientRect().left + tok.offsetWidth / 2 - canvasRect.left; | |
| const srcY = 0; | |
| const weight = transAttn[selectedTrans][i]; | |
| const alpha = Math.min(weight * 2.5, 1); | |
| ctx.beginPath(); | |
| ctx.moveTo(srcX, srcY); | |
| ctx.quadraticCurveTo((srcX + tgtX) / 2, rect.height / 2, tgtX, tgtY); | |
| ctx.strokeStyle = `rgba(67, 233, 123, ${alpha})`; | |
| ctx.lineWidth = weight * 5; | |
| ctx.stroke(); | |
| }); | |
| } | |
| initTransDemo(); | |
| window.addEventListener('resize', () => { | |
| if (selectedTrans !== null) renderTransAttn(); | |
| setTimeout(updatePE, 100); | |
| }); | |
| // ===== COMPARISON BAR ANIMATION ON SCROLL ===== | |
| // Already handled by intersection observer above | |
| // ===== SMOOTH SCROLL FOR NAV ===== | |
| document.querySelectorAll('nav a[href^="#"]').forEach(a => { | |
| a.addEventListener('click', (e) => { | |
| e.preventDefault(); | |
| const target = document.querySelector(a.getAttribute('href')); | |
| if (target) target.scrollIntoView({ behavior: 'smooth', block: 'start' }); | |
| }); | |
| }); | |
| </script> | |
| </body> | |
| </html> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment